BeautifulSoup Document 정리 1

python

BeautifulSoup Document 정리 1

iosroid 2019. 10. 2. 18:32

웹 크롤링에 많이 쓰이는 파이썬 라이브러리 BeautifulSoup 공식 문서의 예제 코드들을 모았다.

https://www.crummy.com/software/BeautifulSoup/bs4/doc/

	#!/usr/bin/env python3

	"""Tour of Beautiful Soup's tree-navigation API.

	The examples follow the official documentation:
	https://www.crummy.com/software/BeautifulSoup/bs4/doc/

	Every print() call is followed by a comment showing the output it is
	expected to produce.

	attributes:
	.contents, .children, .descendants, .string, .strings, .stripped_strings,
	.parent, .parents,
	.next_sibling, .previous_sibling, .next_siblings, .previous_siblings,
	.next_element, .previous_element, .next_elements, .previous_elements,

	"""

	from bs4 import BeautifulSoup

	# NOTE(review): the last markup line contains a </b> with no matching <b>.
	# The expected outputs recorded below presumably depend on the exact markup,
	# so do not "fix" it without re-running the script and updating the comments.
	html_doc = """
	<html><head><title>The Dormouse's story</title></head>
	<body>
	<b><!--Hey, buddy. Want to buy a used parser?--></b>
	<p class="title"><b>The Dormouse's story</b></p>
	<p class="body strikeout"></p>

	<p class="story">Once upon a time there were three little sisters; and their names were
	<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
	<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
	<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
	and they lived at the bottom of a well.</p>

	<p class="story">...</p>
	<a><c>text1</c><d>text2</d></b></a>
	"""

	soup = BeautifulSoup(html_doc, 'html.parser')


	## A Tag object corresponds to an XML or HTML tag in the original document
	# <head> tag
	print("soup.head:", soup.head)
	# <head><title>The Dormouse's story</title></head>
	# <title> tag
	print("soup.title:", soup.title)
	# <title>The Dormouse's story</title>


	## Every tag has a name, accessible as '.name'
	print("soup.title.name:", soup.title.name)
	# title


	## NavigableString
	## If a tag has only one child, and that child is a NavigableString,
	## the child is made available as .string.
	print("soup.title.string:", soup.title.string)
	# The Dormouse's story


	## If a tag's only child is another tag, and that tag has a .string,
	## then the parent tag is considered to have the same .string as its child
	print("soup.head.string:", soup.head.string)
	# The Dormouse's story


	## You can't edit a string in place, but you can replace one string with another.
	## This mutates the tree: every later output shows 'My story' as the title.
	soup.title.string.replace_with('My story')
	print("soup.title.string:", soup.title.string)
	# My story


	## element's parent
	print("soup.title.parent:", soup.title.parent)
	# <head><title>My story</title></head>
	## The title string itself has a parent: the <title> tag that contains it
	print("soup.title.string.parent:", soup.title.string.parent)
	# <title>My story</title>
	print("soup.title.parent.name:", soup.title.parent.name)
	# head
	## .parents walks up to the BeautifulSoup object (named '[document]') and
	## then stops; it never yields None, so no None check is needed.
	print("soup.title.parents:")
	for parent in soup.title.parents:
	    print(parent.name)
	# head
	# html
	# [document]


	## comment
	## The first <b> tag holds only a comment, which is exposed through .string
	print("soup.b.string:", soup.b.string)
	# Hey, buddy. Want to buy a used parser?


	print("soup.p:", soup.p)
	# <p class="title"><b>The Dormouse's story</b></p>


	## Attribute of tag
	print("soup.p['class']:", soup.p['class'])
	# ['title']
	print("soup.find_all('p') tag.attrs:")
	for tag in soup.find_all('p'):
	    print(tag.attrs)
	# {'class': ['title']}
	# {'class': ['body', 'strikeout']}
	# {'class': ['story']}
	# {'class': ['story']}
	## For the first <p> alone, soup.p.attrs is {'class': ['title']}
	## Can add, remove, modify a tag's attribute
	soup.p['class'] = 'noclass'
	soup.p['another-attribute'] = 1
	print("soup.p:", soup.p)
	# <p another-attribute="1" class="noclass"><b>The Dormouse's story</b></p>
	del soup.p['class']
	del soup.p['another-attribute']
	print("soup.p:", soup.p)
	# <p><b>The Dormouse's story</b></p>


	## A tag's children are available in a list called .contents
	print("soup.p.contents:", soup.p.contents, len(soup.p.contents))
	# [<b>The Dormouse's story</b>] 1
	print("soup.p.contents[0]:", soup.p.contents[0])
	# <b>The Dormouse's story</b>


	## iterate over a tag's children using the .children generator
	print("soup.p.children:")
	for child in soup.p.children:
	    print(child)
	# <b>The Dormouse's story</b>
	print("len(list(soup.children)):", len(list(soup.children)))
	# 3


	## find
	print("soup.a:", soup.a)
	# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
	print("soup.find_all('a'):", soup.find_all('a'))
	# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
	# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
	# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>,
	# <a><c>text1</c><d>text2</d></a>]
	print("soup.find(id='link3'):", soup.find(id='link3'))
	# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
	## .get() returns None for a missing attribute, hence the last line below
	print("soup.find_all('a') link.get('href'):")
	for link in soup.find_all('a'):
	    print(link.get('href'))
	# http://example.com/elsie
	# http://example.com/lacie
	# http://example.com/tillie
	# None


	print("soup.get_text():")
	print(soup.get_text())
	#
	# My story
	#
	#
	# The Dormouse's story
	#
	# Once upon a time there were three little sisters; and their names were
	# Elsie,
	# Lacie and
	# Tillie;
	# and they lived at the bottom of a well.
	# ...
	# text1text2
	#
	## repr() is used so that whitespace-only strings stay visible
	print("soup.strings:")
	for text in soup.strings:
	    print(repr(text))
	# '\n'
	# 'My story'
	# '\n'
	# '\n'
	# '\n'
	# "The Dormouse's story"
	# '\n'
	# '\n'
	# 'Once upon a time there were three little sisters; and their names were\n'
	# 'Elsie'
	# ',\n'
	# 'Lacie'
	# ' and\n'
	# 'Tillie'
	# ';\nand they lived at the bottom of a well.'
	# '\n'
	# '...'
	# '\n'
	# 'text1'
	# 'text2'
	# '\n'
	print("soup.stripped_strings:")
	for text in soup.stripped_strings:
	    print(repr(text))
	# 'My story'
	# "The Dormouse's story"
	# 'Once upon a time there were three little sisters; and their names were'
	# 'Elsie'
	# ','
	# 'Lacie'
	# 'and'
	# 'Tillie'
	# ';\nand they lived at the bottom of a well.'
	# '...'
	# 'text1'
	# 'text2'


	## The .descendants attribute lets you iterate over all of a tag's children, recursively
	print("soup.p.descendants:")
	for child in soup.p.descendants:
	    print("descendants:", child)
	# <b>The Dormouse's story</b>
	# The Dormouse's story
	print("len(list(soup.descendants)):", len(list(soup.descendants)))
	# 38


	## to navigate between page elements that are on the same level of the parse tree
	print("soup.c.next_sibling:", soup.c.next_sibling)
	# <d>text2</d>
	print("soup.d.previous_sibling:", soup.d.previous_sibling)
	# <c>text1</c>
	## The first <a> tag's next sibling is not the second <a> tag but a string:
	## the comma and newline that separate the first <a> tag from the second
	print("soup.a.next_sibling:", soup.a.next_sibling)
	# ',\n'  (repr shown; print() emits the comma followed by a line break)
	## The second <a> tag is actually the .next_sibling of the comma
	print("soup.a.next_sibling.next_sibling:", soup.a.next_sibling.next_sibling)
	# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>


	## iterate over a tag's siblings
	print("soup.a.next_siblings:")
	for sibling in soup.a.next_siblings:
	    print(repr(sibling))
	# ',\n'
	# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
	# ' and\n'
	# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
	# ';\nand they lived at the bottom of a well.'
	print("soup.find(id='link3').previous_siblings:")
	for sibling in soup.find(id='link3').previous_siblings:
	    print(repr(sibling))
	# ' and\n'
	# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
	# ',\n'
	# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
	# 'Once upon a time there were three little sisters; and their names were\n'


	## .next_element attribute of a string or tag points
	## to whatever was parsed immediately afterwards
	## .previous_element attribute points to whatever element
	## was parsed immediately before this one
	print("soup.a.next_sibling:", soup.a.next_sibling)
	# ',\n'  (repr shown; print() emits the comma followed by a line break)
	print("soup.a.next_element:", soup.a.next_element)
	# Elsie
	print("soup.a.previous_element:", soup.a.previous_element)
	# Once upon a time there were three little sisters; and their names were
	print("soup.c.next_elements:")
	for element in soup.c.next_elements:
	    print(repr(element))
	# 'text1'
	# <d>text2</d>
	# 'text2'
	# '\n'

view raw beautifulSoupDocs1.py hosted with ❤ by GitHub

저작자표시 비영리 변경금지 (새창열림)