BeautifulSoup Documentation Notes 2
This post collects the example code from the official documentation of BeautifulSoup, a Python library widely used for building web crawlers.
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
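Before the documentation examples, here is a minimal crawler-style usage sketch. This part is not from the official documentation; it assumes the third-party requests package is installed and uses a placeholder URL: fetch a page and hand its HTML to BeautifulSoup.

# minimal sketch (not from the official documentation):
# assumes the third-party 'requests' package; the URL is only a placeholder
import requests
from bs4 import BeautifulSoup

html = requests.get('https://example.com').text
soup = BeautifulSoup(html, 'html.parser')
for a in soup.find_all('a'):
    # print each hyperlink target found in the page
    print(a.get('href'))

The examples collected from the documentation follow.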
#!/usr/bin/env python3
"""
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
find(), find_all()
find_parents(), find_parent()
find_next_siblings(), find_next_sibling()
find_previous_siblings(), find_previous_sibling()
find_all_next(), find_next()
find_all_previous(), find_previous()
"""
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="body strikeout"></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
### Searching tree
## string
print("soup.find_all('b'):", soup.find_all('b'))
# soup.find_all('b'): [<b>The Dormouse's story</b>]
## regular expression
import re
for tag in soup.find_all(re.compile('^b')):
    print("tag.name:", tag.name)
# tag.name: body
# tag.name: b
for tag in soup.find_all(re.compile('t')):
    print("tag.name:", tag.name)
# tag.name: html
# tag.name: title
## list
print("soup.find_all(['a', 'b']):", soup.find_all(['a', 'b']))
# soup.find_all(['a', 'b']):
# [<b>The Dormouse's story</b>,
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
## True
for tag in soup.find_all(True):
    print("tag.name:", tag.name)
# tag.name: html
# tag.name: head
# tag.name: title
# tag.name: body
# tag.name: p
# tag.name: b
# tag.name: p
# tag.name: a
# tag.name: a
# tag.name: a
# tag.name: p
## function
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')
print("soup.find_all(has_class_but_no_id):", soup.find_all(has_class_but_no_id))
# soup.find_all(has_class_but_no_id):
# [<p class="title"><b>The Dormouse's story</b></p>,
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well.</p>,
# <p class="story">...</p>]
def not_lacie(href):
    return href and not re.compile('lacie').search(href)
print("soup.find_all(href=not_lacie):", soup.find_all(href=not_lacie))
# soup.find_all(href=not_lacie):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
from bs4 import NavigableString
def surrounded_by_strings(tag):
    return (isinstance(tag.next_element, NavigableString)
            and isinstance(tag.previous_element, NavigableString))
for tag in soup.find_all(surrounded_by_strings):
    print("tag.name:", tag.name)
# tag.name: body
# tag.name: p
# tag.name: a
# tag.name: a
# tag.name: a
# tag.name: p
### find_all()
## argument: name, string, limit, recursive and keyword
print("soup.find_all('title'):", soup.find_all('title'))
# soup.find_all('title'): [<title>The Dormouse's story</title>]
print("soup.find_all('p', 'title'):", soup.find_all('p', 'title'))
# soup.find_all('p', 'title'): [<p class="title"><b>The Dormouse's story</b></p>]
print("soup.find_all('a'):", soup.find_all('a'))
# soup.find_all('a'):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print("soup.find_all(id='link2'):", soup.find_all(id='link2'))
# soup.find_all(id='link2'):
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
import re
print("soup.find(string=re.compile('sisters')):", soup.find(string=re.compile('sisters')))
# soup.find(string=re.compile('sisters')):
# Once upon a time there were three little sisters; and their names were
### name argument
print("soup.find_all('title'):", soup.find_all('title'))
# soup.find_all('title'): [<title>The Dormouse's story</title>]
### keyword arguments
print("soup.find_all(id='link2'):", soup.find_all(id='link2'))
# soup.find_all(id='link2'): [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
print("soup.find_all(href=re.compile('elsie')):", soup.find_all(href=re.compile('elsie')))
# soup.find_all(href=re.compile('elsie')):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
print("soup.find_all(id=True):", soup.find_all(id=True))
# soup.find_all(id=True):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print("soup.find_all(href=re.compile('elsie'), id='link1'):", soup.find_all(href=re.compile('elsie'), id='link1'))
# soup.find_all(href=re.compile('elsie'), id='link1'):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser')
## data_soup.find_all(data-foo="value")
# SyntaxError: keyword can't be an expression
print("data_soup.find_all(attrs={'data-foo': 'value'}):", data_soup.find_all(attrs={'data-foo': 'value'}))
# data_soup.find_all(attrs={'data-foo': 'value'}): [<div data-foo="value">foo!</div>]
name_soup = BeautifulSoup('<input name="email"/>', 'html.parser')
print("name_soup.find_all(name='email'):", name_soup.find_all(name='email'))
# name_soup.find_all(name='email'): []
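## note: find_all()'s name argument refers to the tag name itself, so it cannot
## target an HTML attribute that happens to be called 'name'; pass attrs instead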
print("name_soup.find_all(attrs={'name': 'email'}):", name_soup.find_all(attrs={'name': 'email'})) | |
# name_soup.find_all(attrs={'name': 'email'}): [<input name="email"/>] | |
### Searching by CSS class | |
## you can search by CSS class using the keyword argument class_ | |
print("soup.find_all('a', class_='sister'):", soup.find_all('a', class_='sister')) | |
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, | |
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, | |
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] | |
soup.find_all(class_=re.compile("itl")) | |
print("soup.find_all(class_=re.compile('itl')):", soup.find_all(class_=re.compile('itl'))) | |
# soup.find_all(class_=re.compile('itl')): [<p class="title"><b>The Dormouse's story</b></p>] | |
def has_six_characters(css_class): | |
return css_class is not None and len(css_class) == 6 | |
print("soup.find_all(class_=has_six_characters):", soup.find_all(class_=has_six_characters)) | |
# soup.find_all(class_=has_six_characters): | |
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, | |
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, | |
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] | |
## a single tag can have multiple values for its “class” attribute | |
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser') | |
print("css_soup.find_all('p', class_='strikeout'):", css_soup.find_all('p', class_='strikeout')) | |
# css_soup.find_all('p', class_='strikeout'): [<p class="body strikeout"></p>] | |
print("css_soup.find_all('p', class_='body'):", css_soup.find_all('p', class_='body')) | |
# css_soup.find_all('p', class_='body'): [<p class="body strikeout"></p>] | |
## You can also search for the exact string value of the class attribute | |
print("css_soup.find_all('p', class_='body strikeout'):", css_soup.find_all('p', class_='body strikeout')) | |
# css_soup.find_all('p', class_='body strikeout'): [<p class="body strikeout"></p>] | |
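## the exact string match is order sensitive; a reordered value matches nothing
print("css_soup.find_all('p', class_='strikeout body'):", css_soup.find_all('p', class_='strikeout body'))
# css_soup.find_all('p', class_='strikeout body'): []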
## CSS selector to search for tags that match two or more CSS classes
print("css_soup.select('p.strikeout.body'):", css_soup.select('p.strikeout.body'))
# css_soup.select('p.strikeout.body'): [<p class="body strikeout"></p>]
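## extra sketch, not in the doc excerpt above: select() accepts general CSS selectors as well
print("soup.select('p.story > a#link2'):", soup.select('p.story > a#link2'))
# soup.select('p.story > a#link2'): [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]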
### string argument
## search for strings instead of tags.
print("soup.find_all(string='Elsie'):", soup.find_all(string='Elsie'))
# soup.find_all(string='Elsie'): ['Elsie']
print("soup.find_all(string=['Tillie', 'Elsie', 'Lacie']):", soup.find_all(string=['Tillie', 'Elsie', 'Lacie']))
# soup.find_all(string=['Tillie', 'Elsie', 'Lacie']): ['Elsie', 'Lacie', 'Tillie']
print("soup.find_all(string=re.compile('Dormouse')):", soup.find_all(string=re.compile('Dormouse')))
# soup.find_all(string=re.compile('Dormouse')): ["The Dormouse's story", "The Dormouse's story"]
def is_the_only_string_within_a_tag(s):
    """Return True if this string is the only child of its parent tag."""
    return (s == s.parent.string)
print("soup.find_all(string=is_the_only_string_within_a_tag):", soup.find_all(string=is_the_only_string_within_a_tag))
# soup.find_all(string=is_the_only_string_within_a_tag): ["The Dormouse's story", "The Dormouse's story", 'Elsie', 'Lacie', 'Tillie', '...']
## find all tags whose .string matches your value for string.
print("soup.find_all('a', string='Elsie'):", soup.find_all('a', string='Elsie'))
# soup.find_all('a', string='Elsie'): [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
## before 4.4.0, text instead of string
## soup.find_all("a", text="Elsie")
# [<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>]
## limit argument
print("soup.find_all('a', limit=2):", soup.find_all('a', limit=2))
# soup.find_all('a', limit=2):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
## recursive argument
soup.html.find_all("title")
print("soup.html.find_all('title'):", soup.html.find_all('title'))
# soup.html.find_all('title'): [<title>The Dormouse's story</title>]
soup.html.find_all("title", recursive=False)
print("soup.html.find_all('title', recursive=False):", soup.html.find_all('title', recursive=False))
# soup.html.find_all('title', recursive=False): []
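## <title> is not a direct child of <html> (it sits under <head>), so the non-recursive search finds nothing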
## Calling a tag is like calling find_all()
soup.find_all('a') # == soup('a')
soup.title.find_all(string=True) # == soup.title(string=True)
## find(name, attrs, recursive, string, **kwargs)
soup.find('title') # == soup.find_all('title', limit=1)
# <title>The Dormouse's story</title>
print(soup.find('nosuchtag'))
# None
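## note: when nothing matches, find_all() returns an empty list while find() returns None
print(soup.find_all('nosuchtag'))
# []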
soup.head.title # == soup.find('head').find('title')
# <title>The Dormouse's story</title>
## find_parents(name, attrs, string, limit, **kwargs)
## find_parent(name, attrs, string, **kwargs)
a_string = soup.find(string='Lacie')
print("a_string:", a_string)
# a_string: Lacie
print("a_string.find_parents('a'):", a_string.find_parents('a'))
# a_string.find_parents('a'): [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
print("a_string.find_parent('p'):", a_string.find_parent('p'))
# a_string.find_parent('p'):
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
print("a_string.find_parents('p', class='title'):", a_string.find_parents('p', class_='title')) | |
# a_string.find_parents('p', class='title'): [] | |
## find_next_siblings(name, attrs, string, limit, **kwargs)
## find_next_sibling(name, attrs, string, **kwargs)
first_link = soup.a
print("first_link:", first_link)
# first_link: <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print("first_link.find_next_siblings('a'):", first_link.find_next_siblings('a'))
# first_link.find_next_siblings('a'):
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
first_story_paragraph = soup.find('p', 'story')
print("first_story_paragraph.find_next_sibling('p'):", first_story_paragraph.find_next_sibling('p'))
# first_story_paragraph.find_next_sibling('p'): <p class="story">...</p>
## find_previous_siblings(name, attrs, string, limit, **kwargs)
## find_previous_sibling(name, attrs, string, **kwargs)
last_link = soup.find('a', id='link3')
print("last_link:", last_link)
# last_link: <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
print("last_link.find_previous_siblings('a'):", last_link.find_previous_siblings('a'))
# last_link.find_previous_siblings('a'):
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
first_story_paragraph = soup.find('p', 'story')
print("first_story_paragraph.find_previous_sibling('p'):", first_story_paragraph.find_previous_sibling('p'))
# first_story_paragraph.find_previous_sibling('p'): <p class="title"><b>The Dormouse's story</b></p>
## find_all_next(name, attrs, string, limit, **kwargs)
## find_next(name, attrs, string, **kwargs)
first_link = soup.a
print("first_link:", first_link)
# first_link: <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print("first_link.find_all_next(string=True):", first_link.find_all_next(string=True))
# first_link.find_all_next(string=True):
# ['Elsie', ',\n', 'Lacie', ' and\n', 'Tillie',
# ';\nand they lived at the bottom of a well.', '\n', '...', '\n']
print("first_link.find_next('p'):", first_link.find_next('p'))
# first_link.find_next('p'): <p class="story">...</p>
### find_all_previous(name, attrs, string, limit, **kwargs)
### find_previous(name, attrs, string, **kwargs)
first_link = soup.a
print("first_link:", first_link)
# first_link: <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print("first_link.find_all_previous('p'):", first_link.find_all_previous('p'))
# first_link.find_all_previous('p'):
# [<p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>,
# <p class="title"><b>The Dormouse's story</b></p>]
print("first_link.find_previous('title'):", first_link.find_previous('title'))
# first_link.find_previous('title'): <title>The Dormouse's story</title>