Karim shoair committed on
Commit ·
fb54505
1
Parent(s): 2be5705
Updating the keywords whitelisting logic in `find`/`find_all` functions
Browse files
- scrapling/parser.py +7 -3
scrapling/parser.py
CHANGED
|
@@ -551,9 +551,10 @@ class Adaptor(SelectorsGeneration):
|
|
| 551 |
"""
|
| 552 |
# Attributes that are Python reserved words and can't be used directly
|
| 553 |
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
|
|
|
|
| 554 |
whitelisted = {
|
| 555 |
-
'id_': 'id',
|
| 556 |
'class_': 'class',
|
|
|
|
| 557 |
}
|
| 558 |
|
| 559 |
if not args and not kwargs:
|
|
@@ -582,14 +583,17 @@ class Adaptor(SelectorsGeneration):
|
|
| 582 |
|
| 583 |
if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
|
| 584 |
raise TypeError('Only string values are accepted for arguments')
|
| 585 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 586 |
|
| 587 |
# It's easier and faster to build a selector than traversing the tree
|
| 588 |
tags = tags or ['']
|
| 589 |
for tag in tags:
|
| 590 |
selector = tag
|
| 591 |
for key, value in attributes.items():
|
| 592 |
-
key = whitelisted.get(key, key)
|
| 593 |
value = value.replace('"', r'\"') # Escape double quotes in user input
|
| 594 |
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
| 595 |
selector += '[{}="{}"]'.format(key, value)
|
|
|
|
| 551 |
"""
|
| 552 |
# Attributes that are Python reserved words and can't be used directly
|
| 553 |
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
|
| 554 |
+
# https://www.w3schools.com/python/python_ref_keywords.asp
|
| 555 |
whitelisted = {
|
|
|
|
| 556 |
'class_': 'class',
|
| 557 |
+
'for_': 'for',
|
| 558 |
}
|
| 559 |
|
| 560 |
if not args and not kwargs:
|
|
|
|
| 583 |
|
| 584 |
if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
|
| 585 |
raise TypeError('Only string values are accepted for arguments')
|
| 586 |
+
|
| 587 |
+
for attribute_name, value in kwargs.items():
|
| 588 |
+
# Only replace names for kwargs, replacing them in dictionaries doesn't make sense
|
| 589 |
+
attribute_name = whitelisted.get(attribute_name, attribute_name)
|
| 590 |
+
attributes[attribute_name] = value
|
| 591 |
|
| 592 |
# It's easier and faster to build a selector than traversing the tree
|
| 593 |
tags = tags or ['']
|
| 594 |
for tag in tags:
|
| 595 |
selector = tag
|
| 596 |
for key, value in attributes.items():
|
|
|
|
| 597 |
value = value.replace('"', r'\"') # Escape double quotes in user input
|
| 598 |
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
| 599 |
selector += '[{}="{}"]'.format(key, value)
|