xpath

网站	备注
lxml	etree 解析网页，html = etree.HTML(text)，result = html.xpath('//div')
xpath语法
python 应用语法

选择

寻找节点

语法	含义
nodename(节点名字)	直接根据写的节点名字查找节点,如：div
//	在当前节点下的子孙节点中寻找,如：//div
/	在当前节点下的子节点中寻找,如：/div
.	代表当前节点（可省略不写，就像我们有时候写的相对路径），如：./div
..	当前节点的父节点，如：../div

python

from lxml import etree

text = '''
<body>
    <div>这时测试的div</div>
    <div>
        <div>
            这是嵌套的div标签
            <p>
                这时嵌套的p标签
            </p>
        </div>
    </div>
    <p>这时测试的p</p>
</body>
'''

# Parse the markup string so XPath queries can be run against it.
html = etree.HTML(text)

# "//" searches every descendant node; "div" is the tag name to match.
div_expr = '//div'
result = html.xpath(div_expr)
print(result)
# Output: a list with the three matched <div> elements
# (the memory addresses differ from run to run):
# [<Element div at 0x1e4cadbf608>, <Element div at 0x1e4cae512c8>, <Element div at 0x1e4cae51348>]

筛选节点

方法名\符号	作用
@	获取属性或者筛选属性,如：@class，hello_tag = html.xpath('//div[@class="hello"]')，html.xpath('//input[@name="btn"]')
contains	判断属性中是否含有某个值（用于多值判断），如：contains(@class,"hello")

python

from lxml import etree

text = '''
<div class="hello">
    <p>Hello,this is used to tested</p>
</div>
<div class="hello test hi">
    <div>
        <div>你好，这是用于测试的html代码</div>
    </div>
</div>

<div class="button">
    <div class="menu">
        <input name="btn" type="button" value="按钮" />
    </div>
</div>
'''

# Parse the markup string so XPath queries can be run against it.
html = etree.HTML(text)

# --- Filtering on a single attribute value ---

# Select the <div> whose class attribute is exactly "hello".
# Note: filter conditions (predicates) always go inside square brackets.
hello_tag = html.xpath('//div[@class="hello"]')
print(hello_tag)  # Output: [<Element div at 0x2ba41e6d088>] -- one matching tag

# Select the <input> whose name attribute is "btn".
input_tag = html.xpath('//input[@name="btn"]')
print(input_tag)  # Output: [<Element input at 0x1751d29df08>] -- one matching tag

# Select every <div> whose class attribute CONTAINS "hello". Unlike the exact
# comparison above, this also matches the multi-valued class "hello test hi".
hello_tags = html.xpath('//div[contains(@class,"hello")]')
print(hello_tags)  # Output: [<Element div at 0x1348272d248>, <Element div at 0x1348272d6c8>] -- two matching tags

按序选择

方法	作用	示例
last()	获取最后一个标签	`html.xpath('//li[last()]')`
1	获取第一个标签	`html.xpath('//li[1]')`
position() < = > num	筛选多个标签（具体见实例）	`html.xpath('//li[position() < 6]')`

注意：这里的序号是从1开始，而不是从0开始。

python

from lxml import etree

text = '''
<ul>
    <li>1</li>
    <li>2</li>
    <li>3</li>
    <li>4</li>
    <li>5</li>
    <li>6</li>
    <li>7</li>
    <li>8</li>
</ul>     
'''

# Parse the list markup once; every query below runs against this tree.
html = etree.HTML(text)

# First <li>: there is no first() function, so the literal index 1 is used
# (positions start at 1, not 0).
first_expr = '//li[1]'
first_tag = html.xpath(first_expr)
print(first_tag)

# Last <li>: last() evaluates to the position of the final sibling.
last_expr = '//li[last()]'
last_tag = html.xpath(last_expr)
print(last_tag)

# First five <li> tags: keep the positions that are strictly below 6.
leading_expr = '//li[position() < 6]'
li_tags = html.xpath(leading_expr)
print(li_tags)

逻辑和计算

and、or、>、>=等。当然也是可以写入一些计算方法的，如：+、-等。

python

from lxml import etree

text = '''
<ul>
    <li>1</li>
    <li>2</li>
    <li>3</li>
    <li>4</li>
    <li>5</li>
    <li>6</li>
    <li>7</li>
    <li>8</li>
</ul>     
'''

# Parse the list markup once; every query below runs against this tree.
html = etree.HTML(text)

# Second <li>: compare the position with "=".
second_expr = '//li[position() = 2]'
second_tag = html.xpath(second_expr)
print(second_tag)

# First and second <li>: combine two position tests with "or".
first_two_expr = '//li[position() = 1 or position() = 2]'
tags = html.xpath(first_two_expr)
print(tags)

# First three <li> tags: keep the positions below 4 using "<".
first_three_expr = '//li[position()<4]'
three_tags = html.xpath(first_three_expr)
print(three_tags)

获取属性值或文本内容

我们寻找标签、筛选标签的最终目的就是获取它的属性或者文本内容。下面讲解获取文本和属性的方法。

方法	作用
@	获取属性或者筛选属性
text()	获取文本

获取属性值或文本内容

方法	作用
@	获取属性或者筛选属性
text()	获取文本

python

from lxml import etree

text = '''
<div class="hello">
    <p>Hello,this is used to tested</p>
</div>
<div class="hello test hi">
    <div>
        <div>你好，这是用于测试的html代码</div>
    </div>
</div>

<div class="button">
    <div class="menu">
        <input name="btn" type="button" value="按钮" />
    </div>
</div>
'''

# Parse the markup string so XPath queries can be run against it.
html = etree.HTML(text)

# Text of the <p> inside the first <div>. text() is the last step of the
# path, placed after the element whose text is wanted.
content = html.xpath('//div/p/text()')
print(content)  # Output: ['Hello,this is used to tested'] -- still returned as a list

# Text of the second <div>. Compare the two queries below.
content_two = html.xpath('//div[position() = 2]/text()')
print(content_two)  # Output: ['\n    ', '\n']

content_three = html.xpath('//div[position() = 2]//text()')
print(content_three)  # Output: ['\n    ', '\n        ', '你好，这是用于测试的html代码', '\n    ', '\n']
# The only difference is "/" versus "//" before text(): "/" returns just the
# text nodes that are direct children of the <div>, while "//" also returns
# the text nodes of all of its descendants.

获取属性示例：

python

from lxml import etree

text = '''
<div class="hello" name="test">
    <p>Hello,this is used to tested</p>
</div>
<div class="hello test hi">
    <div>
        <div>你好，这是用于测试的html代码</div>
    </div>
</div>

<div class="button">
    <div class="menu">
        <input name="btn" type="button" value="按钮" />
    </div>
</div>
'''

# Parse the markup string so XPath queries can be run against it.
html = etree.HTML(text)

# name attribute of the <div> whose class is exactly "hello".
# A trailing "/@attr" step returns attribute values instead of elements.
first_div_name = html.xpath('//div[@class="hello"]/@name')
print(first_div_name)  # Output: ['test']

# name attribute of the <input> tag.
input_tag_name = html.xpath('//input/@name')
print(input_tag_name)  # Output: ['btn']

xpath ​

选择 ​

寻找节点 ​

筛选节点 ​

按序选择 ​

逻辑和计算 ​

获取属性值或文本内容 ​

获取属性值或文本内容 ​

xpath

选择

寻找节点

筛选节点

按序选择

逻辑和计算

获取属性值或文本内容

获取属性值或文本内容