조용한 담장

BeautifulSoup Document 정리 2 본문

python

BeautifulSoup Document 정리 2

iosroid 2019. 10. 2. 18:34

웹 크롤러 개발에 많이 사용하는 파이썬 라이브러리(Python library) BeautifulSoup의 공식 문서에 실린 예제 코드들을 모았다.

 

https://www.crummy.com/software/BeautifulSoup/bs4/doc/

 

#!/usr/bin/env python3
"""
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
find(), find_all()
find_parents(), find_parent()
find_next_siblings(), find_next_sibling()
find_previous_siblings(), find_previous_sibling()
find_all_next(), find_next()
find_all_previous(), find_previous()
"""
from bs4 import BeautifulSoup
# Sample document shared by every example below: a title paragraph, an empty
# paragraph carrying two CSS classes, and two "story" paragraphs, the first of
# which contains three <a class="sister"> links.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="body strikeout"></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# 'html.parser' selects the parser bundled with the Python standard library.
soup = BeautifulSoup(html_doc, 'html.parser')
### Searching the tree
## Filter: string -- selects tags whose name equals the string
print("soup.find_all('b'):", soup.find_all('b'))
# soup.find_all('b'): [<b>The Dormouse's story</b>]
## regular expression
import re
# Filter: compiled regular expression -- each tag name is tested with the
# pattern's search(), so '^b' keeps names that start with "b" ...
for tag in soup.find_all(re.compile(r'^b')):
    print("tag.name:", tag.name)
# tag.name: body
# tag.name: b
# ... while an unanchored 't' keeps every name that contains a "t".
for tag in soup.find_all(re.compile(r't')):
    print("tag.name:", tag.name)
# tag.name: html
# tag.name: title
## Filter: list -- a tag is selected when its name matches ANY item in the list
print("soup.find_all(['a', 'b']):", soup.find_all(['a', 'b']))
# soup.find_all(['a', 'b']):
# [<b>The Dormouse's story</b>,
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
## Filter: True -- selects every tag in the document (text strings are skipped)
for tag in soup.find_all(True):
    print("tag.name:", tag.name)
# html_doc holds four <p> tags (title, empty "body strikeout", two "story"),
# so "p" is printed four times in document order.
# tag.name: html
# tag.name: head
# tag.name: title
# tag.name: body
# tag.name: p
# tag.name: b
# tag.name: p
# tag.name: p
# tag.name: a
# tag.name: a
# tag.name: a
# tag.name: p
## function
def has_class_but_no_id(tag):
    """Return True if *tag* defines a "class" attribute but no "id" attribute.

    Written to be passed to ``find_all()`` as a filter function; it relies
    only on the tag's ``has_attr()`` method.
    """
    return tag.has_attr('class') and not tag.has_attr('id')
# Passing a function as the filter: every tag with a class but no id is kept.
# That includes the empty <p class="body strikeout"> paragraph from html_doc;
# the three <a> tags are excluded on their own because each has an id.
print("soup.find_all(has_class_but_no_id):", soup.find_all(has_class_but_no_id))
# soup.find_all(has_class_but_no_id):
# [<p class="title"><b>The Dormouse's story</b></p>,
# <p class="body strikeout"></p>,
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well.</p>,
# <p class="story">...</p>]
def not_lacie(href):
    """Attribute filter: truthy for an href value that does not contain "lacie".

    A falsy *href* (``None`` or an empty string) is returned unchanged, which
    the caller treats as "no match"; otherwise the result is a bool.
    """
    # The leading ``href and`` guard keeps search() from being called on None.
    return href and not re.compile('lacie').search(href)
# A function given as a keyword argument (here href=...) is used to filter on
# that attribute; the link whose href contains "lacie" is dropped.
print("soup.find_all(href=not_lacie):", soup.find_all(href=not_lacie))
# soup.find_all(href=not_lacie):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
from bs4 import NavigableString
def surrounded_by_strings(tag):
    """Return True if both neighbours of *tag* in parse order are strings.

    Checks ``tag.next_element`` and ``tag.previous_element``; both must be
    ``NavigableString`` instances for the tag to qualify.
    """
    return (isinstance(tag.next_element, NavigableString)
            and isinstance(tag.previous_element, NavigableString))
for tag in soup.find_all(surrounded_by_strings):
    print("tag.name:", tag.name)
# The empty <p class="body strikeout"> in html_doc sits between two newline
# strings, so it qualifies as well and "p" appears twice before the links.
# tag.name: body
# tag.name: p
# tag.name: p
# tag.name: a
# tag.name: a
# tag.name: a
# tag.name: p
### find_all()
## Arguments: name, attrs, recursive, string, limit and arbitrary keyword arguments
print("soup.find_all('title'):", soup.find_all('title'))
# soup.find_all('title'): [<title>The Dormouse's story</title>]
# The second positional argument here selects by CSS class ("title").
print("soup.find_all('p', 'title'):", soup.find_all('p', 'title'))
# soup.find_all('p', 'title'): [<p class="title"><b>The Dormouse's story</b></p>]
print("soup.find_all('a'):", soup.find_all('a'))
# soup.find_all('a'):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print("soup.find_all(id='link2'):", soup.find_all(id='link2'))
# soup.find_all(id='link2'):
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
# Repeated import: `re` is already imported earlier in this script.
import re
# string=... searches text nodes instead of tags; find() returns the first hit.
print("soup.find(string=re.compile('sisters')):", soup.find(string=re.compile('sisters')))
# soup.find(string=re.compile('sisters')):
# Once upon a time there were three little sisters; and their names were
### name argument
## Restricts the search to tags with the given name
print("soup.find_all('title'):", soup.find_all('title'))
# soup.find_all('title'): [<title>The Dormouse's story</title>]
### keyword arguments
## An unrecognised keyword argument becomes a filter on the attribute of that name
print("soup.find_all(id='link2'):", soup.find_all(id='link2'))
# soup.find_all(id='link2'): [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
print("soup.find_all(href=re.compile('elsie')):", soup.find_all(href=re.compile('elsie')))
# soup.find_all(href=re.compile('elsie')):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# True matches any tag that has the attribute at all, whatever its value.
print("soup.find_all(id=True):", soup.find_all(id=True))
# soup.find_all(id=True):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# Several keyword filters can be combined; a tag must satisfy all of them.
print("soup.find_all(href=re.compile('elsie'), id='link1'):", soup.find_all(href=re.compile('elsie'), id='link1'))
# soup.find_all(href=re.compile('elsie'), id='link1'):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# Attribute names that are not valid Python identifiers (e.g. "data-foo")
# cannot be written as keyword arguments; pass them through attrs={...}.
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser')
## data_soup.find_all(data-foo="value")
# SyntaxError: keyword can't be an expression
# (the exact SyntaxError wording depends on the Python version)
print("data_soup.find_all(attrs={'data-foo': 'value'}):", data_soup.find_all(attrs={'data-foo': 'value'}))
# data_soup.find_all(attrs={'data-foo': 'value'}): [<div data-foo="value">foo!</div>]
# The keyword `name` is find_all()'s tag-name argument, so name='email' looks
# for <email> tags; search the HTML "name" attribute through attrs instead.
name_soup = BeautifulSoup('<input name="email"/>', 'html.parser')
print("name_soup.find_all(name='email'):", name_soup.find_all(name='email'))
# name_soup.find_all(name='email'): []
print("name_soup.find_all(attrs={'name': 'email'}):", name_soup.find_all(attrs={'name': 'email'}))
# name_soup.find_all(attrs={'name': 'email'}): [<input name="email"/>]
### Searching by CSS class
## "class" is a reserved word in Python, so the keyword argument is spelled class_
print("soup.find_all('a', class_='sister'):", soup.find_all('a', class_='sister'))
# soup.find_all('a', class_='sister'):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# Bare call: its result is discarded when run as a script (only an interactive
# session would echo it); the print on the next line shows the same search.
soup.find_all(class_=re.compile("itl"))
print("soup.find_all(class_=re.compile('itl')):", soup.find_all(class_=re.compile('itl')))
# soup.find_all(class_=re.compile('itl')): [<p class="title"><b>The Dormouse's story</b></p>]
def has_six_characters(css_class):
    """Return True if *css_class* is not None and is exactly six characters long.

    Meant to be passed as ``class_=has_six_characters``; the ``None`` check
    covers tags that have no class attribute.
    """
    return css_class is not None and len(css_class) == 6
# class_ accepts a function too; "sister" is the only six-character class in
# html_doc, so only the three links are returned.
print("soup.find_all(class_=has_six_characters):", soup.find_all(class_=has_six_characters))
# soup.find_all(class_=has_six_characters):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
## A single tag can have multiple values for its "class" attribute;
## searching for any one of them matches the tag.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
print("css_soup.find_all('p', class_='strikeout'):", css_soup.find_all('p', class_='strikeout'))
# css_soup.find_all('p', class_='strikeout'): [<p class="body strikeout"></p>]
print("css_soup.find_all('p', class_='body'):", css_soup.find_all('p', class_='body'))
# css_soup.find_all('p', class_='body'): [<p class="body strikeout"></p>]
## You can also search for the exact string value of the class attribute
## (the value is compared as written in the markup, classes in the same order)
print("css_soup.find_all('p', class_='body strikeout'):", css_soup.find_all('p', class_='body strikeout'))
# css_soup.find_all('p', class_='body strikeout'): [<p class="body strikeout"></p>]
## Use a CSS selector to require two or more classes regardless of their order
print("css_soup.select('p.strikeout.body'):", css_soup.select('p.strikeout.body'))
# css_soup.select('p.strikeout.body'): [<p class="body strikeout"></p>]
### string argument
## Searches for strings (text nodes) instead of tags; accepts the same kinds
## of filter as the name argument: string, list, regular expression, function.
print("soup.find_all(string='Elsie'):", soup.find_all(string='Elsie'))
# soup.find_all(string='Elsie'): ['Elsie']
# Results come back in document order, not in the order of the list.
print("soup.find_all(string=['Tillie', 'Elsie', 'Lacie']):", soup.find_all(string=['Tillie', 'Elsie', 'Lacie']))
# soup.find_all(string=['Tillie', 'Elsie', 'Lacie']): ['Elsie', 'Lacie', 'Tillie']
# Two hits: the text of <title> and the text of <b>.
print("soup.find_all(string=re.compile('Dormouse')):", soup.find_all(string=re.compile('Dormouse')))
# soup.find_all(string=re.compile('Dormouse')): ["The Dormouse's story", "The Dormouse's story"]
def is_the_only_string_within_a_tag(s):
    """Return True if this string is the only child of its parent tag.

    Implemented by comparing *s* with ``s.parent.string``; the comparison is
    False whenever the parent's ``.string`` is not equal to *s* (for example
    when it is ``None``).
    """
    return (s == s.parent.string)
# A function passed as string= is called with each text node.
print("soup.find_all(string=is_the_only_string_within_a_tag):", soup.find_all(string=is_the_only_string_within_a_tag))
# soup.find_all(string=is_the_only_string_within_a_tag): ["The Dormouse's story", "The Dormouse's story", 'Elsie', 'Lacie', 'Tillie', '...']
## Combined with a tag name, string= finds the tags whose .string matches the value.
print("soup.find_all('a', string='Elsie'):", soup.find_all('a', string='Elsie'))
# soup.find_all('a', string='Elsie'): [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
## Before Beautiful Soup 4.4.0 this argument was called text instead of string:
## soup.find_all("a", text="Elsie")
# [<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>]
## limit argument
## Stops the search once the given number of results has been found.
print("soup.find_all('a', limit=2):", soup.find_all('a', limit=2))
# soup.find_all('a', limit=2):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
## recursive argument
## By default all descendants are searched; recursive=False looks at direct
## children only. <title> sits inside <head>, so it is a grandchild of <html>.
# Bare call: result discarded when run as a script; the print below shows it.
soup.html.find_all("title")
print("soup.html.find_all('title'):", soup.html.find_all('title'))
# soup.html.find_all('title'): [<title>The Dormouse's story</title>]
# Bare call: result discarded when run as a script; the print below shows it.
soup.html.find_all("title", recursive=False)
print("soup.html.find_all('title', recursive=False):", soup.html.find_all('title', recursive=False))
# soup.html.find_all('title', recursive=False): []
## Calling a tag is like calling find_all()
## (the two statements below are bare expressions; they print nothing and only
## illustrate the equivalent shorthand named in the trailing comment)
soup.find_all('a') # == soup('a')
soup.title.find_all(string=True) # == soup.title(string=True)
## find(name, attrs, recursive, string, **kwargs)
## Returns the first match itself rather than a list (value not printed here).
soup.find('title') # == soup.find_all('title', limit=1)
# <title>The Dormouse's story</title>
# When nothing matches, find() returns None instead of an empty list.
print(soup.find('nosuchtag'))
# None
# Attribute-style navigation is a chain of find() calls (value not printed here).
soup.head.title # == soup.find('head').find('title')
# <title>The Dormouse's story</title>
## find_parents(name, attrs, string, limit, **kwargs)
## find_parent(name, attrs, string, **kwargs)
## These search upward through the ancestors of the starting element.
a_string = soup.find(string='Lacie')
print("a_string:", a_string)
# a_string: Lacie
print("a_string.find_parents('a'):", a_string.find_parents('a'))
# a_string.find_parents('a'): [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
print("a_string.find_parent('p'):", a_string.find_parent('p'))
# a_string.find_parent('p'):
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
# The label now quotes the keyword actually used (class_, not class).
# No ancestor <p> of "Lacie" has class "title", hence the empty list.
print("a_string.find_parents('p', class_='title'):", a_string.find_parents('p', class_='title'))
# a_string.find_parents('p', class_='title'): []
## find_next_siblings(name, attrs, string, limit, **kwargs)
## find_next_sibling(name, attrs, string, **kwargs)
## These search the siblings that come after the starting element.
# soup.a is the first <a> tag in the document.
first_link = soup.a
print("first_link:", first_link)
# first_link: <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print("first_link.find_next_siblings('a'):", first_link.find_next_siblings('a'))
# first_link.find_next_siblings('a'):
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# find('p', 'story') returns the first <p> whose class is "story".
first_story_paragraph = soup.find('p', 'story')
print("first_story_paragraph.find_next_sibling('p'):", first_story_paragraph.find_next_sibling('p'))
# first_story_paragraph.find_next_sibling('p'): <p class="story">...</p>
## find_previous_siblings(name, attrs, string, limit, **kwargs)
## find_previous_sibling(name, attrs, string, **kwargs)
## These search the siblings that come before the starting element.
last_link = soup.find('a', id='link3')
print("last_link:", last_link)
# last_link: <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# Results are listed nearest-first, i.e. in reverse document order.
print("last_link.find_previous_siblings('a'):", last_link.find_previous_siblings('a'))
# last_link.find_previous_siblings('a'):
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
first_story_paragraph = soup.find('p', 'story')
# In html_doc the <p> immediately before the first "story" paragraph is the
# empty <p class="body strikeout">, so that is the sibling returned.
print("first_story_paragraph.find_previous_sibling('p'):", first_story_paragraph.find_previous_sibling('p'))
# first_story_paragraph.find_previous_sibling('p'): <p class="body strikeout"></p>
## find_all_next(name, attrs, string, limit, **kwargs)
## find_next(name, attrs, string, **kwargs)
## These search everything that comes after the starting element in the
## document, including the element's own contents.
first_link = soup.a
print("first_link:", first_link)
# first_link: <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# string=True matches every text node; the link's own text 'Elsie' comes first.
print("first_link.find_all_next(string=True):", first_link.find_all_next(string=True))
# first_link.find_all_next(string=True):
# ['Elsie', ',\n', 'Lacie', ' and\n', 'Tillie',
# ';\nand they lived at the bottom of a well.', '\n', '...', '\n']
# The next <p> after the first link is the final "..." paragraph, not the
# paragraph that contains the link.
print("first_link.find_next('p'):", first_link.find_next('p'))
# first_link.find_next('p'): <p class="story">...</p>
### find_all_previous(name, attrs, string, limit, **kwargs)
### find_previous(name, attrs, string, **kwargs)
### These search everything that comes before the starting element in the document.
first_link = soup.a
print("first_link:", first_link)
# first_link: <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# Results are listed nearest-first. The paragraph that contains the link is
# included, followed by the empty <p class="body strikeout"> and the title <p>.
print("first_link.find_all_previous('p'):", first_link.find_all_previous('p'))
# first_link.find_all_previous('p'):
# [<p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>,
# <p class="body strikeout"></p>,
# <p class="title"><b>The Dormouse's story</b></p>]
print("first_link.find_previous('title'):", first_link.find_previous('title'))
# first_link.find_previous('title'): <title>The Dormouse's story</title>
Comments