python
BeautifulSoup Document 정리 1
iosroid
2019. 10. 2. 18:32
웹 크롤링에 많이 쓰이는 파이썬 라이브러리 BeautifulSoup 공식 문서의 예제 코드들을 모았다.
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Example snippets collected from the Beautiful Soup documentation.

https://www.crummy.com/software/BeautifulSoup/bs4/doc/

attributes:
.contents, .children, .descendants, .string, .strings, .stripped_strings,
.parent, .parents,
.next_sibling, .previous_sibling, .next_siblings, .previous_siblings,
.next_element, .previous_element, .next_elements, .previous_elements,
"""
from bs4 import BeautifulSoup

# Sample markup shared by every example in this script.
# NOTE(review): the last markup line contains a stray </b>; it is kept exactly
# as in the original example (the expected find_all('a') output recorded later
# in this file shows no trace of it).
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<b><!--Hey, buddy. Want to buy a used parser?--></b>
<p class="title"><b>The Dormouse's story</b></p>
<p class="body strikeout"></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<a><c>text1</c><d>text2</d></b></a>
"""

# Parse with the standard-library backed parser; all examples use this tree.
soup = BeautifulSoup(html_doc, 'html.parser')
## A Tag object corresponds to an XML or HTML tag in the original document
# <head> tag
print("soup.head:", soup.head)
# <head><title>The Dormouse's story</title></head>

# <title> tag
print("soup.title:", soup.title)
# <title>The Dormouse's story</title>

## Every tag has a name, accessible as '.name'
print("soup.title.name:", soup.title.name)
# u'title'

## NavigableString
## If a tag has only one child, and that child is a NavigableString,
## the child is made available as .string.
print("soup.title.string:", soup.title.string)
# u'The Dormouse's story'

## If a tag's only child is another tag, and that tag has a .string,
## then the parent tag is considered to have the same .string as its child
print("soup.head.string:", soup.head.string)
# u'The Dormouse's story'

## You can't edit a string in place, but you can replace one string with another
## (this mutates the shared tree: later examples see 'My story' as the title)
soup.title.string.replace_with('My story')
print("soup.title.string:", soup.title.string)
# My story

## element's parent
print("soup.title.parent:", soup.title.parent)
# <head><title>My story</title></head>

## The title string itself has a parent: the <title> tag that contains it
print("soup.title.string.parent:", soup.title.string.parent)
# <title>My story</title>

print("soup.title.parent.name:", soup.title.parent.name)
# u'head'

## Walk every ancestor of <title>, from nearest to the document root
print("soup.title.parents:")
for parent in soup.title.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)
# head
# html
# [document]
## comment
## The first <b> tag contains only an HTML comment, exposed through .string
print("soup.b.string:", soup.b.string)
# Hey, buddy. Want to buy a used parser?

print("soup.p:", soup.p)
# <p class="title"><b>The Dormouse's story</b></p>

## Attribute of tag
print("soup.p['class']:", soup.p['class'])
# ['title']

print("soup.find_all('p') tag.attrs:")
for tag in soup.find_all('p'):
    print(tag.attrs)
# {'class': ['title']}
# {'class': ['body', 'strikeout']}
# {'class': ['story']}
# {'class': ['story']}

# print(soup.p.attrs)
# {'class': ['title']}

## Can add, remove, modify a tag's attribute
soup.p['class'] = 'noclass'
soup.p['another-attribute'] = 1
print("soup.p:", soup.p)
# <p another-attribute="1" class="noclass"><b>The Dormouse's story</b></p>

## Remove both attributes again; the first <p> stays attribute-less afterwards
del soup.p['class']
del soup.p['another-attribute']
print("soup.p:", soup.p)
# <p><b>The Dormouse's story</b></p>
## A tag's children are available in a list called .contents
print("soup.p.contents:", soup.p.contents, len(soup.p.contents))
# [<b>The Dormouse's story</b>] 1

print("soup.p.contents[0]:", soup.p.contents[0])
# <b>The Dormouse's story</b>

## iterate over a tag's children using the .children generator
print("soup.p.children:")
for child in soup.p.children:
    print(child)
# <b>The Dormouse's story</b>

## Direct children of the document object itself
print("len(list(soup.children)):", len(list(soup.children)))
# 3
# NOTE(review): <html> is never closed in html_doc, so html.parser presumably
# yields only two top-level children (the leading '\n' and <html>) -- confirm
# the recorded value above by running the script.
## find
## Attribute-style access returns the first tag with that name
print("soup.a:", soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

## find_all() returns every matching tag
print("soup.find_all('a'):", soup.find_all('a'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>,
# <a><c>text1</c><d>text2</d></a>]

## find() with a keyword argument filters on that attribute
print("soup.find(id='link3'):", soup.find(id='link3'))
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

## .get() returns None for a missing attribute (the last <a> has no href)
print("soup.find_all('a') link.get('href'):")
for link in soup.find_all('a'):
    print(link.get('href'))
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie
# None
## All the text of the document, concatenated
print("soup.get_text():")
print(soup.get_text())
#
# My story
#
#
# The Dormouse's story
#
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
# ...
# text1text2
#

## .strings yields every text node, whitespace-only ones included
print("soup.strings:")
for string in soup.strings:
    print(repr(string))
# '\n'
# 'My story'
# '\n'
# '\n'
# '\n'
# "The Dormouse's story"
# '\n'
# '\n'
# 'Once upon a time there were three little sisters; and their names were\n'
# 'Elsie'
# ',\n'
# 'Lacie'
# ' and\n'
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '\n'
# '...'
# '\n'
# 'text1'
# 'text2'
# '\n'

## .stripped_strings skips whitespace-only strings and strips the rest
print("soup.stripped_strings:")
for string in soup.stripped_strings:
    print(repr(string))
# 'My story'
# "The Dormouse's story"
# 'Once upon a time there were three little sisters; and their names were'
# 'Elsie'
# ','
# 'Lacie'
# 'and'
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '...'
# 'text1'
# 'text2'
## The .descendants attribute lets you iterate over all of a tag's children, recursively
print("soup.p.descendants:")
for child in soup.p.descendants:
    print("descendants:", child)
# <b>The Dormouse's story</b>
# The Dormouse's story

## Every node in the whole tree: tags, strings and the comment
print("len(list(soup.descendants)):", len(list(soup.descendants)))
# 38
## to navigate between page elements that are on the same level of the parse tree
print("soup.c.next_sibling:", soup.c.next_sibling)
# <d>text2</d>

print("soup.d.previous_sibling:", soup.d.previous_sibling)
# <c>text1</c>

## <a> tag's sibling is not the second <a> tag but its string:
## the comma and newline that separate the first <a> tag from the second
print("soup.a.next_sibling:", soup.a.next_sibling)
# u',\n'

## The second <a> tag is actually the .next_sibling of the comma
print("soup.a.next_sibling.next_sibling:", soup.a.next_sibling.next_sibling)
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

## iterate over a tag's siblings
print("soup.a.next_siblings:")
for sibling in soup.a.next_siblings:
    print(repr(sibling))
# ',\n'
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# ' and\n'
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# ';\nand they lived at the bottom of a well.'

## .previous_siblings walks backwards, nearest sibling first
print("soup.find(id='link3').previous_siblings:")
for sibling in soup.find(id='link3').previous_siblings:
    print(repr(sibling))
# ' and\n'
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# ',\n'
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# 'Once upon a time there were three little sisters; and their names were\n'
## .next_element attribute of a string or tag points
## to whatever was parsed immediately afterwards
## .previous_element attribute points to whatever element
## was parsed immediately before this one
print("soup.a.next_sibling:", soup.a.next_sibling)
# u',\n'

## Unlike .next_sibling, .next_element descends into the tag's own content
print("soup.a.next_element:", soup.a.next_element)
# Elsie

print("soup.a.previous_element:", soup.a.previous_element)
# Once upon a time there were three little sisters; and their names were

## Everything parsed after <c>, in document order, until the end of the tree
print("soup.c.next_elements:")
for element in soup.c.next_elements:
    print(repr(element))
# 'text1'
# <d>text2</d>
# 'text2'
# '\n'