BeautifulSoup

from bs4 import BeautifulSoup
import requests

res = requests.get('https://www.something.com/')
soup = BeautifulSoup(res.text, 'lxml')
print(soup.title)
print(soup.title.name)
print(soup.title.parent)  # 找到父级

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="Dormouse"><b>The Dormouse's story</b></p>


<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')
print(list(soup.p.children))  # 子节点可以调用contents 和 children属性

父级

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>


<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<p>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
</p>
</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')
print(soup.a.parent)  # 找到父级

多个父级

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>


<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<p>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
</p>
</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')
print(list(soup.a.parents))  # 找到父级

兄弟

next_sibling、previous_sibling、next_siblings、previous_siblings这四个属性来获取。

而next_siblings和previous_siblings是获取前面和后面的兄弟节点，返回的类型依然是生成器类型。

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>


<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>hello
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')
print(soup.a.next_sibling)  # next_sibling上个兄弟元素
print(list(soup.a.next_siblings))
print(soup.a.previous_sibling)  # previous_sibling下个兄弟元素
print(list(soup.a.previous_siblings))

方法选择器

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="Dormouse"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>,
<a href="http://example.com/lacie" class="sister" id="link2"><span>Lacie</span></a> and
<a href="http://example.com/tillie" class="sister" id="link3"><span>Tillie</span></a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')
print(soup.find_all(name='p'))
print(soup.find_all(attrs={'id': 'link1'}))
print(soup.find_all(attrs={'name': 'Dormouse'}))
print(soup.find_all(class_='sister'))
print(soup.find_all(id='link1'))

BeautifulSoup 后继元素

使用descendants属性，我们可以获得标签的所有后代（所有级别的子级）。

#!/usr/bin/python3

from bs4 import BeautifulSoup

with open("index.html", "r") as f:

    contents = f.read()

    soup = BeautifulSoup(contents, 'lxml')

    root = soup.body

    root_childs = [e.name for e in root.descendants if e.name is not None]
    print(root_childs) # ['h2', 'ul', 'li', 'li', 'li', 'li', 'li', 'p', 'p']