Content¶
You can handle multiple Elements at once.
from scrapbook import Element, Content
import requests
class Twitter(Content):
username = Element(
xpath='//*[@id="page-container"]/div[2]/div/div'
'/div[1]/div/div/div/div[1]/h2/a/span/b/text()',
)
screen_name = Element(
xpath='//*[@id="page-container"]/div[2]/div/div/'
'div[1]/div/div/div/div[1]/h1/a',
)
response = requests.get('https://twitter.com/odoku')
data = Twitter().parse(response.text)
print(data)
Include filter/parser functions¶
Filter and parser functions that an Element refers to by name can be defined as methods on the Content class.
class Page(Content):
username = Element(
xpath='//*[@id="username"]',
parse='parse_username',
filter='filter_username',
)
def parse_username(self, selector):
return selector.xpath('./text()').extract_first()
def filter_username(self, value):
return value.replace('username: ', '').strip()
Nest¶
Content can be nested.
class Profile(Content):
username = Element(xpath='./path/to/username/text()')
screen_name = Element(xpath='./path/to/screen_name/text()')
class Page(Content):
profile = Profile(xpath='//*[@id="profile"]')
Inheritance¶
Content supports inheritance.
class Common(Content):
title = Element(xpath='/path/to/title/text()')
class ProjectPage(Common):
name = Element(xpath='/path/to/name/text()')
class TeamPage(Common):
name = Element(xpath='/path/to/name/text()')
Arguments¶
Content(
xpath: Optional[str] = None,
filter: Union[Callable, str, list[Union[Callable, str]]] = scrapbook.filters.through,
many: bool = False,
)
xpath¶
Specify the xpath of the element you want to parse. The element matched by that xpath is then passed to each Element contained in the Content.
class Page(Content):
username = Element(xpath='./span[1]/text()')
page = Page(xpath='//*[@id="profile"]')
data = page.parse(html)
filter¶
You can do arbitrary processing on the acquired value. As with Element, multiple filters can be specified.
class Page(Content):
username = Element(xpath='./span[1]/text()')
def rename(value):
alias = {'username': 'account'}
return {alias.get(k, k): v for k, v in value.items()}
page = Page(xpath='//*[@id="profile"]', filter=rename)
data = page.parse(html)
many¶
If the xpath matches multiple elements, you can get them as a list by specifying many=True.
class Comment(Content):
text = Element(xpath='./text()')
class Article(Content):
title = Element(xpath='//*[@id="title"]')
content = Element(xpath='//*[@id="content"]')
comments = Comment(xpath='//*[@id="content-list"]/li', many=True)
article = Article()
data = article.parse(html)
Methods¶
parse¶
parse(
html: Union[str, parsel.Selector, parsel.SelectorList],
object: Optional[Any],
)
Parse html.
class Page(Content):
content = Element(xpath='/html/body/p/text()')
html = '<html><body><p>Hello!</p></body></html>'
page = Page()
data = page.parse(html) # {'content': 'Hello!'}
When the object argument is given, the parsed values are mapped onto that object.
instance = PageModel()
page = Page()
instance = page.parse(html, object=instance)
Class Methods¶
inline¶
inline(
xpath: Optional[str] = None,
filter: Union[Callable, str, list[Union[Callable, str]]] = scrapbook.filters.through,
**attrs: Dict[str, Any]
)
Returns an instance of a dynamically generated Content class.
class Page(Content):
content = Content.inline(
text=Element(xpath='/html/body/p/text()', filter='twice'),
)
def twice(self, value):
return value * 2
html = '<html><body><p>Hello!</p></body></html>'
page = Page()
data = page.parse(html) # {'content': {'text': 'Hello!Hello!'}}