Content¶

You can handle multiple Elements at once.

from scrapbook import Element, Content
import requests


class Twitter(Content):
    """Content that groups two Elements so they are parsed together."""

    # Selects text nodes (the xpath ends in /text()) under the <h2> heading.
    username = Element(
        xpath='//*[@id="page-container"]/div[2]/div/div'
              '/div[1]/div/div/div/div[1]/h2/a/span/b/text()',
    )
    # Selects the <a> node under the <h1> heading (no /text() step here).
    screen_name = Element(
        xpath='//*[@id="page-container"]/div[2]/div/div/'
              'div[1]/div/div/div/div[1]/h1/a',
    )


# Pass an explicit timeout: without one, requests can wait indefinitely
# for an unresponsive server.
response = requests.get('https://twitter.com/odoku', timeout=10)
# Stop on HTTP errors (4xx/5xx) instead of parsing an error page.
response.raise_for_status()
# Parse the downloaded markup with every Element declared on Twitter.
data = Twitter().parse(response.text)

print(data)

Include filter/parser functions¶

Filter and parser functions that an Element refers to by name can be defined as methods on the Content.

class Page(Content):
    """Content whose Element names its parser and filter as methods here."""

    # 'parse' and 'filter' are given as method names (strings); the methods
    # with those names are defined below on this class.
    username = Element(
        filter='filter_username',
        parse='parse_username',
        xpath='//*[@id="username"]',
    )

    def parse_username(self, selector):
        """Return the first text node directly under ``selector``."""
        text_nodes = selector.xpath('./text()')
        return text_nodes.extract_first()

    def filter_username(self, value):
        """Remove the ``'username: '`` label and surrounding whitespace."""
        without_label = value.replace('username: ', '')
        return without_label.strip()

Nest¶

Content can be nested.

class Profile(Content):
    """Content meant to be nested inside another Content."""

    # Both xpaths start with './', i.e. they are relative to a context node
    # (presumably the node selected by the xpath given to Profile(...)).
    username = Element(xpath='./path/to/username/text()')
    screen_name = Element(xpath='./path/to/screen_name/text()')

class Page(Content):
    """Content that nests another Content as one of its fields."""

    # A Content instance is used as a field, just like an Element would be.
    profile = Profile(xpath='//*[@id="profile"]')

Inheritance¶

Content supports inheritance.

class Common(Content):
    """Base Content declaring the Element shared by its subclasses."""

    title = Element(xpath='/path/to/title/text()')

class ProjectPage(Common):
    """Subclass of Common that declares an additional ``name`` Element."""

    name = Element(xpath='/path/to/name/text()')

class TeamPage(Common):
    """Second subclass of Common, also declaring its own ``name`` Element."""

    name = Element(xpath='/path/to/name/text()')

Arguments¶

Content(
    xpath: Optional[str] = None,
    filter: Union[Callable, str, list[Union[Callable, str]]] = scrapbook.filters.through,
    many: bool = False,
)

xpath¶

Specify the xpath of the element you want to parse. The element matched by this xpath is passed to the Elements included in the Content, so their xpaths can be written relative to it.

class Page(Content):
    """Content with one Element addressed by a relative xpath."""

    # './' makes the xpath relative to the node the Content is applied to.
    username = Element(xpath='./span[1]/text()')

# The xpath given here selects the node handed to the Elements of Page.
page = Page(xpath='//*[@id="profile"]')
# `html` is the markup to parse; it is not defined in this snippet.
data = page.parse(html)

filter¶

You can do arbitrary processing on the acquired value. As with Element, multiple filters can be specified.

class Page(Content):
    """Content with a single ``username`` Element."""

    # './' makes the xpath relative to the node the Content is applied to.
    username = Element(xpath='./span[1]/text()')

def rename(value):
    """Return a new dict built from ``value`` with aliased keys renamed.

    Keys found in the alias table are replaced by their alias; every other
    key is kept unchanged. Values are carried over as they are.
    """
    alias = {'username': 'account'}
    renamed = {}
    for key, item in value.items():
        # Fall back to the original key when no alias is registered.
        renamed[alias.get(key, key)] = item
    return renamed

# The filter is applied to the value acquired by the Content; rename calls
# .items() on it, so that value is expected to be a mapping.
page = Page(xpath='//*[@id="profile"]', filter=rename)
# `html` is the markup to parse; it is not defined in this snippet.
data = page.parse(html)

many¶

If the xpath matches multiple elements, you can get them as a list by specifying many=True.

class Comment(Content):
    """Content describing a single comment node."""

    # Text nodes directly under the node this Content is applied to.
    text = Element(xpath='./text()')

class Article(Content):
    """Content combining plain Elements with a repeated nested Content."""

    title = Element(xpath='//*[@id="title"]')
    content = Element(xpath='//*[@id="content"]')
    # many=True: the xpath may match several <li> nodes, and the result is
    # returned as a list with one entry per matched node.
    comments = Comment(xpath='//*[@id="content-list"]/li', many=True)

# No xpath is given, so the Content is applied to the document as passed in.
article = Article()
# `html` is the markup to parse; it is not defined in this snippet.
data = article.parse(html)

Methods¶

parse¶

parse(
    html: Union[str, parsel.Selector, parsel.SelectorList],
    object: Optional[Any] = None,
)

Parse html.

class Page(Content):
    """Content with one Element addressed by an absolute xpath."""

    # Selects the text of <p> elements directly under <body>.
    content = Element(xpath='/html/body/p/text()')

# Minimal document containing the <p> that Page.content points at.
html = '<html><body><p>Hello!</p></body></html>'
page = Page()
# The result is keyed by the Element names declared on Page.
data = page.parse(html)  # {'content': 'Hello!'}

Map the value to the object specified in the object argument.

# PageModel is not defined in this snippet; it stands for the object that
# should receive the parsed values.
instance = PageModel()
page = Page()
# With object=..., parse maps the values onto the given object; the return
# value is assigned back to `instance` here.
instance = page.parse(html, object=instance)

Class Methods¶

inline¶

inline(
    xpath: Optional[str] = None,
    filter: Union[Callable, str, list[Union[Callable, str]]] = scrapbook.filters.through,
    **attrs: Dict[str, Any]
)

Returns an instance of dynamically generated Content class.

class Page(Content):
    """Content whose ``content`` field is built with ``Content.inline``."""

    # The inline Content declares a ``text`` Element whose filter is given
    # by name; a method with that name is defined below on Page.
    content = Content.inline(
        text=Element(filter='twice', xpath='/html/body/p/text()'),
    )

    def twice(self, value):
        """Return ``value`` repeated two times."""
        doubled = value * 2
        return doubled

# Minimal document containing the <p> that the inline ``text`` Element reads.
html = '<html><body><p>Hello!</p></body></html>'
page = Page()
# The inline Content appears as a nested dict under the ``content`` key.
data = page.parse(html)  # {'content': {'text': 'Hello!Hello!'}}