조용한 담장

BeautifulSoup Document 정리 1 본문

python

BeautifulSoup Document 정리 1

iosroid 2019. 10. 2. 18:32

웹 크롤링에 많이 쓰이는 파이썬 라이브러리 BeautifulSoup 공식 문서의 예제 코드들을 모았다.

 

https://www.crummy.com/software/BeautifulSoup/bs4/doc/

 

#!/usr/bin/env python3
"""
Example snippets collected from the official BeautifulSoup documentation.

https://www.crummy.com/software/BeautifulSoup/bs4/doc/

Attributes demonstrated:
.contents, .children, .descendants, .string, .strings, .stripped_strings,
.parent, .parents,
.next_sibling, .previous_sibling, .next_siblings, .previous_siblings,
.next_element, .previous_element, .next_elements, .previous_elements,

The comment lines following each print() show the expected output (value
only, without the label that print() prepends).
"""
from bs4 import BeautifulSoup

# NOTE: the last line of the document contains a stray </b> that has no
# matching opening tag; the expected output of find_all('a') further down
# shows the parsed <a> element without it.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<b><!--Hey, buddy. Want to buy a used parser?--></b>
<p class="title"><b>The Dormouse's story</b></p>
<p class="body strikeout"></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<a><c>text1</c><d>text2</d></b></a>
"""
soup = BeautifulSoup(html_doc, 'html.parser')

## A Tag object corresponds to an XML or HTML tag in the original document
# <head> tag
print("soup.head:", soup.head)
# <head><title>The Dormouse's story</title></head>
# <title> tag
print("soup.title:", soup.title)
# <title>The Dormouse's story</title>

## Every tag has a name, accessible as '.name'
print("soup.title.name:", soup.title.name)
# u'title'

## NavigableString
## If a tag has only one child, and that child is a NavigableString,
## the child is made available as .string.
print("soup.title.string:", soup.title.string)
# u'The Dormouse's story'

## If a tag's only child is another tag, and that tag has a .string,
## then the parent tag is considered to have the same .string as its child
print("soup.head.string:", soup.head.string)
# u'The Dormouse's story'

## You can't edit a string in place, but you can replace one string with another
soup.title.string.replace_with('My story')
print("soup.title.string:", soup.title.string)
# My story

## element's parent
print("soup.title.parent:", soup.title.parent)
# <head><title>My story</title></head>

## The title string itself has a parent: the <title> tag that contains it
print("soup.title.string.parent:", soup.title.string.parent)
# <title>My story</title>
print("soup.title.parent.name:", soup.title.parent.name)
# u'head'

## .parents walks upwards from the element through every ancestor
print("soup.title.parents:")
for parent in soup.title.parents:
    # Guard against a None entry so that .name is never read from None.
    if parent is None:
        print(parent)
    else:
        print(parent.name)
# head
# html
# [document]

## comment
print("soup.b.string:", soup.b.string)
# Hey, buddy. Want to buy a used parser?
print("soup.p:", soup.p)
# <p class="title"><b>The Dormouse's story</b></p>

## Attribute of tag
print("soup.p['class']:", soup.p['class'])
# ['title']
print("soup.find_all('p') tag.attrs:")
for tag in soup.find_all('p'):
    print(tag.attrs)
# {'class': ['title']}
# {'class': ['body', 'strikeout']}
# {'class': ['story']}
# {'class': ['story']}
# print(soup.p.attrs)
# {'class': ['title']}

## Can add, remove, modify a tag's attribute
soup.p['class'] = 'noclass'
soup.p['another-attribute'] = 1
print("soup.p:", soup.p)
# <p another-attribute="1" class="noclass"><b>The Dormouse's story</b></p>
del soup.p['class']
del soup.p['another-attribute']
print("soup.p:", soup.p)
# <p><b>The Dormouse's story</b></p>

## A tag's children are available in a list called .contents
print("soup.p.contents:", soup.p.contents, len(soup.p.contents))
# [<b>The Dormouse's story</b>] 1
print("soup.p.contents[0]:", soup.p.contents[0])
# <b>The Dormouse's story</b>

## iterate over a tag's children using the .children generator
print("soup.p.children:")
for child in soup.p.children:
    print(child)
# <b>The Dormouse's story</b>
print("len(list(soup.children)):", len(list(soup.children)))
# 3

## find
print("soup.a:", soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print("soup.find_all('a'):", soup.find_all('a'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>,
# <a><c>text1</c><d>text2</d></a>]
print("soup.find(id='link3'):", soup.find(id='link3'))
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

## .get() returns None for a missing attribute (the last <a> has no href)
print("soup.find_all('a') link.get('href'):")
for link in soup.find_all('a'):
    print(link.get('href'))
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie
# None

## all the text of the document
print("soup.get_text():")
print(soup.get_text())
#
# My story
#
#
# The Dormouse's story
#
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
# ...
# text1text2
#

## .strings yields every string in the document, whitespace included
print("soup.strings:")
for string in soup.strings:
    print(repr(string))
# '\n'
# 'My story'
# '\n'
# '\n'
# '\n'
# "The Dormouse's story"
# '\n'
# '\n'
# 'Once upon a time there were three little sisters; and their names were\n'
# 'Elsie'
# ',\n'
# 'Lacie'
# ' and\n'
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '\n'
# '...'
# '\n'
# 'text1'
# 'text2'
# '\n'

## .stripped_strings skips whitespace-only strings and strips the others
print("soup.stripped_strings:")
for string in soup.stripped_strings:
    print(repr(string))
# 'My story'
# "The Dormouse's story"
# 'Once upon a time there were three little sisters; and their names were'
# 'Elsie'
# ','
# 'Lacie'
# 'and'
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '...'
# 'text1'
# 'text2'

## The .descendants attribute lets you iterate over all of a tag's children, recursively
print("soup.p.descendants:")
for child in soup.p.descendants:
    print("descendants:", child)
# <b>The Dormouse's story</b>
# The Dormouse's story
print("len(list(soup.descendants)):", len(list(soup.descendants)))
# 38

## to navigate between page elements that are on the same level of the parse tree
print("soup.c.next_sibling:", soup.c.next_sibling)
# <d>text2</d>
print("soup.d.previous_sibling:", soup.d.previous_sibling)
# <c>text1</c>

## The first <a> tag's next sibling is not the second <a> tag but a string:
## the comma and newline that separate the first <a> tag from the second
print("soup.a.next_sibling:", soup.a.next_sibling)
# u',\n'

## The second <a> tag is actually the .next_sibling of the comma
print("soup.a.next_sibling.next_sibling:", soup.a.next_sibling.next_sibling)
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

## iterate over a tag's siblings
print("soup.a.next_siblings:")
for sibling in soup.a.next_siblings:
    print(repr(sibling))
# ',\n'
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# ' and\n'
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# ';\nand they lived at the bottom of a well.'
print("soup.find(id='link3').previous_siblings:")
for sibling in soup.find(id='link3').previous_siblings:
    print(repr(sibling))
# ' and\n'
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# ',\n'
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# 'Once upon a time there were three little sisters; and their names were\n'

## .next_element attribute of a string or tag points
## to whatever was parsed immediately afterwards
## .previous_element attribute points to whatever element
## was parsed immediately before this one
print("soup.a.next_sibling:", soup.a.next_sibling)
# u',\n'
print("soup.a.next_element:", soup.a.next_element)
# Elsie
print("soup.a.previous_element:", soup.a.previous_element)
# Once upon a time there were three little sisters; and their names were
print("soup.c.next_elements:")
for element in soup.c.next_elements:
    print(repr(element))
# 'text1'
# <d>text2</d>
# 'text2'
# '\n'

 

Comments