Notice
Recent Posts
Recent Comments
Link
일 | 월 | 화 | 수 | 목 | 금 | 토 |
---|---|---|---|---|---|---|
1 | 2 | 3 | 4 | 5 | ||
6 | 7 | 8 | 9 | 10 | 11 | 12 |
13 | 14 | 15 | 16 | 17 | 18 | 19 |
20 | 21 | 22 | 23 | 24 | 25 | 26 |
27 | 28 | 29 | 30 |
Tags
- texttheme
- set
- 플러터
- kotlin
- 웹크롤러
- List
- text
- pushnamed
- variable
- Collection
- 다트
- import
- crawler
- function
- 클래스
- 코틀린
- python
- textstyle
- 콜렉션
- 파이썬
- Class
- Android
- 함수
- package
- Flutter
- ML
- 크롤러
- map
- animation
- DART
Archives
- Today
- Total
조용한 담장
BeautifulSoup Document 정리 1 본문
웹 크롤링에 많이 쓰이는 파이썬 라이브러리 BeautifulSoup 공식 문서의 예제 코드들을 모았다.
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Example snippets collected from the Beautiful Soup documentation.

https://www.crummy.com/software/BeautifulSoup/bs4/doc/

attributes:
.contents, .children, .descendants, .string, .strings, .stripped_strings,
.parent, .parents,
.next_sibling, .previous_sibling, .next_siblings, .previous_siblings,
.next_element, .previous_element, .next_elements, .previous_elements,
"""
from bs4 import BeautifulSoup

# Sample document parsed by every example below. The stray </b> in the last
# line is kept as-is from the original script.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<b><!--Hey, buddy. Want to buy a used parser?--></b>
<p class="title"><b>The Dormouse's story</b></p>
<p class="body strikeout"></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<a><c>text1</c><d>text2</d></b></a>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

## A Tag object corresponds to an XML or HTML tag in the original document
# <head> tag
print("soup.head:", soup.head)
# <head><title>The Dormouse's story</title></head>

# <title> tag
print("soup.title:", soup.title)
# <title>The Dormouse's story</title>

## Every tag has a name, accessible as '.name'
print("soup.title.name:", soup.title.name)
# u'title'

## NavigableString
## If a tag has only one child, and that child is a NavigableString,
## the child is made available as .string.
print("soup.title.string:", soup.title.string)
# u'The Dormouse's story'

## If a tag's only child is another tag, and that tag has a .string,
## then the parent tag is considered to have the same .string as its child
print("soup.head.string:", soup.head.string)
# u'The Dormouse's story'

## You can't edit a string in place, but you can replace one string with another
soup.title.string.replace_with('My story')
print("soup.title.string:", soup.title.string)
# My story

## element's parent
print("soup.title.parent:", soup.title.parent)
# <head><title>My story</title></head>

## The title string itself has a parent: the <title> tag that contains it
print("soup.title.string.parent:", soup.title.string.parent)
# <title>My story</title>

print("soup.title.parent.name:", soup.title.parent.name)
# u'head'

print("soup.title.parents:")
for parent in soup.title.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)
# head
# html
# [document]

## comment
print("soup.b.string:", soup.b.string)
# Hey, buddy. Want to buy a used parser?

print("soup.p:", soup.p)
# <p class="title"><b>The Dormouse's story</b></p>

## Attribute of tag
print("soup.p['class']:", soup.p['class'])
# ['title']

print("soup.find_all('p') tag.attrs:")
for tag in soup.find_all('p'):
    print(tag.attrs)
# {'class': ['title']}
# {'class': ['body', 'strikeout']}
# {'class': ['story']}
# {'class': ['story']}

# print(soup.p.attrs)
# {'class': ['title']}

## Can add, remove, modify a tag's attribute
soup.p['class'] = 'noclass'
soup.p['another-attribute'] = 1
print("soup.p:", soup.p)
# <p another-attribute="1" class="noclass"><b>The Dormouse's story</b></p>

del soup.p['class']
del soup.p['another-attribute']
print("soup.p:", soup.p)
# <p><b>The Dormouse's story</b></p>

## A tag's children are available in a list called .contents
print("soup.p.contents:", soup.p.contents, len(soup.p.contents))
# [<b>The Dormouse's story</b>] 1

print("soup.p.contents[0]:", soup.p.contents[0])
# <b>The Dormouse's story</b>

## iterate over a tag's children using the .children generator
print("soup.p.children:")
for child in soup.p.children:
    print(child)
# <b>The Dormouse's story</b>

print("len(list(soup.children)):", len(list(soup.children)))
# 3

## find
print("soup.a:", soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

print("soup.find_all('a'):", soup.find_all('a'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>,
# <a><c>text1</c><d>text2</d></a>]

print("soup.find(id='link3'):", soup.find(id='link3'))
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

print("soup.find_all('a') link.get('href'):")
for link in soup.find_all('a'):
    print(link.get('href'))
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie
# None

print("soup.get_text():")
print(soup.get_text())
#
# My story
#
#
# The Dormouse's story
#
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
# ...
# text1text2
#

print("soup.strings:")
for string in soup.strings:
    print(repr(string))
# '\n'
# 'My story'
# '\n'
# '\n'
# '\n'
# "The Dormouse's story"
# '\n'
# '\n'
# 'Once upon a time there were three little sisters; and their names were\n'
# 'Elsie'
# ',\n'
# 'Lacie'
# ' and\n'
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '\n'
# '...'
# '\n'
# 'text1'
# 'text2'
# '\n'

print("soup.stripped_strings:")
for string in soup.stripped_strings:
    print(repr(string))
# 'My story'
# "The Dormouse's story"
# 'Once upon a time there were three little sisters; and their names were'
# 'Elsie'
# ','
# 'Lacie'
# 'and'
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '...'
# 'text1'
# 'text2'

## The .descendants attribute lets you iterate over all of a tag's children, recursively
print("soup.p.descendants:")
for child in soup.p.descendants:
    print("descendants:", child)
# <b>The Dormouse's story</b>
# The Dormouse's story

print("len(list(soup.descendants)):", len(list(soup.descendants)))
# 38

## to navigate between page elements that are on the same level of the parse tree
print("soup.c.next_sibling:", soup.c.next_sibling)
# <d>text2</d>

print("soup.d.previous_sibling:", soup.d.previous_sibling)
# <c>text1</c>

## The first <a> tag's next sibling is not the second <a> tag but a string:
## the comma and newline that separate the first <a> tag from the second
print("soup.a.next_sibling:", soup.a.next_sibling)
# u',\n'

## The second <a> tag is actually the .next_sibling of the comma
print("soup.a.next_sibling.next_sibling:", soup.a.next_sibling.next_sibling)
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

## iterate over a tag's siblings
print("soup.a.next_siblings:")
for sibling in soup.a.next_siblings:
    print(repr(sibling))
# ',\n'
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# ' and\n'
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# ';\nand they lived at the bottom of a well.'

print("soup.find(id='link3').previous_siblings:")
for sibling in soup.find(id='link3').previous_siblings:
    print(repr(sibling))
# ' and\n'
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# ',\n'
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# 'Once upon a time there were three little sisters; and their names were\n'

## .next_element attribute of a string or tag points
## to whatever was parsed immediately afterwards
## .previous_element attribute points to whatever element
## was parsed immediately before this one
print("soup.a.next_sibling:", soup.a.next_sibling)
# u',\n'

print("soup.a.next_element:", soup.a.next_element)
# Elsie

print("soup.a.previous_element:", soup.a.previous_element)
# Once upon a time there were three little sisters; and their names were

print("soup.c.next_elements:")
for element in soup.c.next_elements:
    print(repr(element))
# 'text1'
# <d>text2</d>
# 'text2'
# '\n'
'python' 카테고리의 다른 글
tkinter와 cffi 간단한 툴 제작 (0) | 2024.04.11 |
---|---|
python2 기본 환경에서 python3 사용하려면 feat. AI (1) | 2024.03.12 |
Python: f-String (Literal String Interpolation) (0) | 2020.04.28 |
BeautifulSoup Document 정리 2 (0) | 2019.10.02 |
Scrapy : python web crawler (0) | 2019.10.01 |