Karim shoair committed on
Commit ·
4e52121
1
Parent(s): 1ba0746
Adding `css_first` and `xpath_first` for easier usage
Browse files- README.md +3 -3
- scrapling/parser.py +52 -0
README.md
CHANGED
|
@@ -59,7 +59,7 @@ quotes = page.css('.quote').css('.text::text') # Chained selectors
|
|
| 59 |
quotes = [element.text for element in page.css('.quote').css('.text')] # Slower than bulk query above
|
| 60 |
|
| 61 |
# Get the first quote element
|
| 62 |
-
quote = page.css('.quote').first
|
| 63 |
|
| 64 |
# Working with elements
|
| 65 |
quote.html_content # Inner HTML
|
|
@@ -244,8 +244,8 @@ To increase the complexity a little bit, let's say we want to get all books' dat
|
|
| 244 |
```python
|
| 245 |
>>> for product in page.find_by_text('Tipping the Velvet').parent.parent.find_similar():
|
| 246 |
print({
|
| 247 |
-
"name": product.
|
| 248 |
-
"price": product.
|
| 249 |
"stock": product.css('.availability::text')[-1].clean()
|
| 250 |
})
|
| 251 |
{'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'}
|
|
|
|
| 59 |
quotes = [element.text for element in page.css('.quote').css('.text')] # Slower than bulk query above
|
| 60 |
|
| 61 |
# Get the first quote element
|
| 62 |
+
quote = page.css_first('.quote') # or page.css('.quote').first or [0] or .get()
|
| 63 |
|
| 64 |
# Working with elements
|
| 65 |
quote.html_content # Inner HTML
|
|
|
|
| 244 |
```python
|
| 245 |
>>> for product in page.find_by_text('Tipping the Velvet').parent.parent.find_similar():
|
| 246 |
print({
|
| 247 |
+
"name": product.css_first('h3 a::text'),
|
| 248 |
+
"price": product.css_first('.price_color').re_first(r'[\d\.]+'),
|
| 249 |
"stock": product.css('.availability::text')[-1].clean()
|
| 250 |
})
|
| 251 |
{'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'}
|
scrapling/parser.py
CHANGED
|
@@ -394,6 +394,58 @@ class Adaptor(SelectorsGeneration):
|
|
| 394 |
return self.__convert_results(score_table[highest_probability])
|
| 395 |
return []
|
| 396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
def css(self, selector: str, identifier: str = '',
|
| 398 |
auto_match: bool = False, auto_save: bool = False, percentage: int = 0
|
| 399 |
) -> Union['Adaptors[Adaptor]', List]:
|
|
|
|
| 394 |
return self.__convert_results(score_table[highest_probability])
|
| 395 |
return []
|
| 396 |
|
| 397 |
+
def css_first(self, selector: str, identifier: str = '',
|
| 398 |
+
auto_match: bool = False, auto_save: bool = False, percentage: int = 0
|
| 399 |
+
) -> Union['Adaptors[Adaptor]', List, None]:
|
| 400 |
+
"""Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
|
| 401 |
+
|
| 402 |
+
**Important:
|
| 403 |
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
| 404 |
+
and want to relocate the same element(s)**
|
| 405 |
+
|
| 406 |
+
:param selector: The CSS3 selector to be used.
|
| 407 |
+
:param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
|
| 408 |
+
:param identifier: A string that will be used to save/retrieve element's data in auto-matching
|
| 409 |
+
otherwise the selector will be used.
|
| 410 |
+
:param auto_save: Automatically save new elements for `auto_match` later
|
| 411 |
+
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 412 |
+
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 413 |
+
number unless you really know what you are doing!
|
| 414 |
+
|
| 415 |
+
:return: The first result of the CSS query if there is one, otherwise `None`
|
| 416 |
+
"""
|
| 417 |
+
try:
|
| 418 |
+
return self.css(selector, identifier, auto_match, auto_save, percentage)[0]
|
| 419 |
+
except (IndexError, TypeError,):
|
| 420 |
+
return None
|
| 421 |
+
|
| 422 |
+
def xpath_first(self, selector: str, identifier: str = '',
|
| 423 |
+
auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
|
| 424 |
+
) -> Union['Adaptors[Adaptor]', List, None]:
|
| 425 |
+
"""Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
|
| 426 |
+
|
| 427 |
+
**Important:
|
| 428 |
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
| 429 |
+
and want to relocate the same element(s)**
|
| 430 |
+
|
| 431 |
+
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
| 432 |
+
|
| 433 |
+
:param selector: The XPath selector to be used.
|
| 434 |
+
:param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
|
| 435 |
+
:param identifier: A string that will be used to save/retrieve element's data in auto-matching
|
| 436 |
+
otherwise the selector will be used.
|
| 437 |
+
:param auto_save: Automatically save new elements for `auto_match` later
|
| 438 |
+
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 439 |
+
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 440 |
+
number unless you really know what you are doing!
|
| 441 |
+
|
| 442 |
+
:return: The first result of the XPath query if there is one, otherwise `None`
|
| 443 |
+
"""
|
| 444 |
+
try:
|
| 445 |
+
return self.xpath(selector, identifier, auto_match, auto_save, percentage, **kwargs)[0]
|
| 446 |
+
except (IndexError, TypeError,):
|
| 447 |
+
return None
|
| 448 |
+
|
| 449 |
def css(self, selector: str, identifier: str = '',
|
| 450 |
auto_match: bool = False, auto_save: bool = False, percentage: int = 0
|
| 451 |
) -> Union['Adaptors[Adaptor]', List]:
|