Karim shoair committed on
Commit ·
1d98b51
1
Parent(s): 9ac26c8
style: General type hints fixes and imports optimizing
Browse files- ruff.toml +1 -1
- scrapling/core/_types.py +1 -0
- scrapling/core/custom_types.py +8 -5
- scrapling/core/storage.py +3 -3
- scrapling/core/utils.py +6 -6
- scrapling/engines/_browsers/_validators.py +1 -1
- scrapling/parser.py +13 -13
ruff.toml
CHANGED
|
@@ -15,7 +15,7 @@ target-version = "py39"
|
|
| 15 |
|
| 16 |
[lint]
|
| 17 |
select = ["E", "F", "W"]
|
| 18 |
-
ignore = ["E501", "F401"]
|
| 19 |
|
| 20 |
[format]
|
| 21 |
# Like Black, use double quotes for strings.
|
|
|
|
| 15 |
|
| 16 |
[lint]
|
| 17 |
select = ["E", "F", "W"]
|
| 18 |
+
ignore = ["E501", "F401", "F811"]
|
| 19 |
|
| 20 |
[format]
|
| 21 |
# Like Black, use double quotes for strings.
|
scrapling/core/_types.py
CHANGED
|
@@ -4,6 +4,7 @@ Type definitions for type checking purposes.
|
|
| 4 |
|
| 5 |
from typing import (
|
| 6 |
TYPE_CHECKING,
|
|
|
|
| 7 |
Any,
|
| 8 |
Callable,
|
| 9 |
Dict,
|
|
|
|
| 4 |
|
| 5 |
from typing import (
|
| 6 |
TYPE_CHECKING,
|
| 7 |
+
overload,
|
| 8 |
Any,
|
| 9 |
Callable,
|
| 10 |
Dict,
|
scrapling/core/custom_types.py
CHANGED
|
@@ -7,14 +7,15 @@ from orjson import dumps, loads
|
|
| 7 |
|
| 8 |
from scrapling.core._types import (
|
| 9 |
Dict,
|
| 10 |
-
Iterable,
|
| 11 |
List,
|
|
|
|
|
|
|
| 12 |
Literal,
|
| 13 |
-
Optional,
|
| 14 |
Pattern,
|
|
|
|
|
|
|
|
|
|
| 15 |
SupportsIndex,
|
| 16 |
-
TypeVar,
|
| 17 |
-
Union,
|
| 18 |
)
|
| 19 |
from scrapling.core.utils import _is_iterable, flatten
|
| 20 |
from scrapling.core._html_utils import _replace_entities
|
|
@@ -341,7 +342,9 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
|
|
| 341 |
"""Acts like the standard dictionary `.get()` method"""
|
| 342 |
return self._data.get(key, default)
|
| 343 |
|
| 344 |
-
def search_values(
|
|
|
|
|
|
|
| 345 |
"""Search current attributes by values and return a dictionary of each matching item
|
| 346 |
:param keyword: The keyword to search for in the attribute values
|
| 347 |
:param partial: If True, the function will search if keyword in each value instead of perfect match
|
|
|
|
| 7 |
|
| 8 |
from scrapling.core._types import (
|
| 9 |
Dict,
|
|
|
|
| 10 |
List,
|
| 11 |
+
Union,
|
| 12 |
+
TypeVar,
|
| 13 |
Literal,
|
|
|
|
| 14 |
Pattern,
|
| 15 |
+
Iterable,
|
| 16 |
+
Optional,
|
| 17 |
+
Generator,
|
| 18 |
SupportsIndex,
|
|
|
|
|
|
|
| 19 |
)
|
| 20 |
from scrapling.core.utils import _is_iterable, flatten
|
| 21 |
from scrapling.core._html_utils import _replace_entities
|
|
|
|
| 342 |
"""Acts like the standard dictionary `.get()` method"""
|
| 343 |
return self._data.get(key, default)
|
| 344 |
|
| 345 |
+
def search_values(
|
| 346 |
+
self, keyword: str, partial: bool = False
|
| 347 |
+
) -> Generator["AttributesHandler", None, None]:
|
| 348 |
"""Search current attributes by values and return a dictionary of each matching item
|
| 349 |
:param keyword: The keyword to search for in the attribute values
|
| 350 |
:param partial: If True, the function will search if keyword in each value instead of perfect match
|
scrapling/core/storage.py
CHANGED
|
@@ -9,7 +9,7 @@ from orjson import dumps, loads
|
|
| 9 |
from tldextract import extract as tld
|
| 10 |
|
| 11 |
from scrapling.core.utils import _StorageTools, log
|
| 12 |
-
from scrapling.core._types import Dict, Optional, Union
|
| 13 |
|
| 14 |
|
| 15 |
class StorageSystemMixin(ABC):
|
|
@@ -106,7 +106,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 106 |
""")
|
| 107 |
self.connection.commit()
|
| 108 |
|
| 109 |
-
def save(self, element: HtmlElement, identifier: str):
|
| 110 |
"""Saves the elements unique properties to the storage for retrieval and relocation later
|
| 111 |
|
| 112 |
:param element: The element itself which we want to save to storage.
|
|
@@ -126,7 +126,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 126 |
self.cursor.fetchall()
|
| 127 |
self.connection.commit()
|
| 128 |
|
| 129 |
-
def retrieve(self, identifier: str) -> Optional[Dict]:
|
| 130 |
"""Using the identifier, we search the storage and return the unique properties of the element
|
| 131 |
|
| 132 |
:param identifier: This is the identifier that will be used to retrieve the element from the storage. See
|
|
|
|
| 9 |
from tldextract import extract as tld
|
| 10 |
|
| 11 |
from scrapling.core.utils import _StorageTools, log
|
| 12 |
+
from scrapling.core._types import Dict, Optional, Union, Any
|
| 13 |
|
| 14 |
|
| 15 |
class StorageSystemMixin(ABC):
|
|
|
|
| 106 |
""")
|
| 107 |
self.connection.commit()
|
| 108 |
|
| 109 |
+
def save(self, element: HtmlElement, identifier: str) -> None:
|
| 110 |
"""Saves the elements unique properties to the storage for retrieval and relocation later
|
| 111 |
|
| 112 |
:param element: The element itself which we want to save to storage.
|
|
|
|
| 126 |
self.cursor.fetchall()
|
| 127 |
self.connection.commit()
|
| 128 |
|
| 129 |
+
def retrieve(self, identifier: str) -> Optional[Dict[str, Any]]:
|
| 130 |
"""Using the identifier, we search the storage and return the unique properties of the element
|
| 131 |
|
| 132 |
:param identifier: This is the identifier that will be used to retrieve the element from the storage. See
|
scrapling/core/utils.py
CHANGED
|
@@ -5,7 +5,7 @@ from itertools import chain
|
|
| 5 |
import orjson
|
| 6 |
from lxml import html
|
| 7 |
|
| 8 |
-
from scrapling.core._types import Any, Dict, Iterable, Union
|
| 9 |
|
| 10 |
# Using cache on top of a class is a brilliant way to achieve a Singleton design pattern without much code
|
| 11 |
from functools import lru_cache # isort:skip
|
|
@@ -41,8 +41,8 @@ def setup_logger():
|
|
| 41 |
log = setup_logger()
|
| 42 |
|
| 43 |
|
| 44 |
-
def is_jsonable(content: Union[bytes, str]) -> bool:
|
| 45 |
-
if
|
| 46 |
content = content.decode()
|
| 47 |
|
| 48 |
try:
|
|
@@ -52,14 +52,14 @@ def is_jsonable(content: Union[bytes, str]) -> bool:
|
|
| 52 |
return False
|
| 53 |
|
| 54 |
|
| 55 |
-
def flatten(lst: Iterable):
|
| 56 |
return list(chain.from_iterable(lst))
|
| 57 |
|
| 58 |
|
| 59 |
-
def _is_iterable(
|
| 60 |
# This will be used only in regex functions to make sure it's iterable but not string/bytes
|
| 61 |
return isinstance(
|
| 62 |
-
|
| 63 |
(
|
| 64 |
list,
|
| 65 |
tuple,
|
|
|
|
| 5 |
import orjson
|
| 6 |
from lxml import html
|
| 7 |
|
| 8 |
+
from scrapling.core._types import Any, Dict, Iterable, Union, List
|
| 9 |
|
| 10 |
# Using cache on top of a class is a brilliant way to achieve a Singleton design pattern without much code
|
| 11 |
from functools import lru_cache # isort:skip
|
|
|
|
| 41 |
log = setup_logger()
|
| 42 |
|
| 43 |
|
| 44 |
+
def is_jsonable(content: bytes | str) -> bool:
|
| 45 |
+
if isinstance(content, bytes):
|
| 46 |
content = content.decode()
|
| 47 |
|
| 48 |
try:
|
|
|
|
| 52 |
return False
|
| 53 |
|
| 54 |
|
| 55 |
+
def flatten(lst: Iterable[Any]) -> List[Any]:
|
| 56 |
return list(chain.from_iterable(lst))
|
| 57 |
|
| 58 |
|
| 59 |
+
def _is_iterable(obj: Any) -> bool:
|
| 60 |
# This will be used only in regex functions to make sure it's iterable but not string/bytes
|
| 61 |
return isinstance(
|
| 62 |
+
obj,
|
| 63 |
(
|
| 64 |
list,
|
| 65 |
tuple,
|
scrapling/engines/_browsers/_validators.py
CHANGED
|
@@ -82,7 +82,7 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
| 82 |
"""Configuration struct for validation"""
|
| 83 |
|
| 84 |
max_pages: int = 1
|
| 85 |
-
headless:
|
| 86 |
block_images: bool = False
|
| 87 |
disable_resources: bool = False
|
| 88 |
block_webrtc: bool = False
|
|
|
|
| 82 |
"""Configuration struct for validation"""
|
| 83 |
|
| 84 |
max_pages: int = 1
|
| 85 |
+
headless: bool = True # noqa: F821
|
| 86 |
block_images: bool = False
|
| 87 |
disable_resources: bool = False
|
| 88 |
block_webrtc: bool = False
|
scrapling/parser.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
-
import inspect
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
-
import
|
| 5 |
from difflib import SequenceMatcher
|
| 6 |
from urllib.parse import urljoin
|
| 7 |
|
|
@@ -18,16 +17,17 @@ from lxml.etree import (
|
|
| 18 |
|
| 19 |
from scrapling.core._types import (
|
| 20 |
Any,
|
| 21 |
-
Callable,
|
| 22 |
Dict,
|
| 23 |
-
Generator,
|
| 24 |
-
Iterable,
|
| 25 |
List,
|
| 26 |
-
Optional,
|
| 27 |
-
Pattern,
|
| 28 |
-
SupportsIndex,
|
| 29 |
Tuple,
|
| 30 |
Union,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
)
|
| 32 |
from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
|
| 33 |
from scrapling.core.mixins import SelectorsGeneration
|
|
@@ -248,7 +248,7 @@ class Selector(SelectorsGeneration):
|
|
| 248 |
|
| 249 |
def __handle_elements(
|
| 250 |
self, result: List[Union[HtmlElement, _ElementUnicodeResult]]
|
| 251 |
-
) -> Union["Selectors", "TextHandlers"
|
| 252 |
"""Used internally in all functions to convert results to type (Selectors|TextHandlers) in bulk when possible"""
|
| 253 |
if not len(
|
| 254 |
result
|
|
@@ -761,7 +761,7 @@ class Selector(SelectorsGeneration):
|
|
| 761 |
patterns.add(arg)
|
| 762 |
|
| 763 |
elif callable(arg):
|
| 764 |
-
if len(
|
| 765 |
functions.append(arg)
|
| 766 |
else:
|
| 767 |
raise TypeError(
|
|
@@ -914,7 +914,7 @@ class Selector(SelectorsGeneration):
|
|
| 914 |
return round((score / checks) * 100, 2)
|
| 915 |
|
| 916 |
@staticmethod
|
| 917 |
-
def __calculate_dict_diff(dict1:
|
| 918 |
"""Used internally to calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
|
| 919 |
score = (
|
| 920 |
SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
|
|
@@ -1210,11 +1210,11 @@ class Selectors(List[Selector]):
|
|
| 1210 |
|
| 1211 |
__slots__ = ()
|
| 1212 |
|
| 1213 |
-
@
|
| 1214 |
def __getitem__(self, pos: SupportsIndex) -> Selector:
|
| 1215 |
pass
|
| 1216 |
|
| 1217 |
-
@
|
| 1218 |
def __getitem__(self, pos: slice) -> "Selectors":
|
| 1219 |
pass
|
| 1220 |
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import re
|
| 3 |
+
from inspect import signature
|
| 4 |
from difflib import SequenceMatcher
|
| 5 |
from urllib.parse import urljoin
|
| 6 |
|
|
|
|
| 17 |
|
| 18 |
from scrapling.core._types import (
|
| 19 |
Any,
|
|
|
|
| 20 |
Dict,
|
|
|
|
|
|
|
| 21 |
List,
|
|
|
|
|
|
|
|
|
|
| 22 |
Tuple,
|
| 23 |
Union,
|
| 24 |
+
Pattern,
|
| 25 |
+
Callable,
|
| 26 |
+
Optional,
|
| 27 |
+
Iterable,
|
| 28 |
+
overload,
|
| 29 |
+
Generator,
|
| 30 |
+
SupportsIndex,
|
| 31 |
)
|
| 32 |
from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
|
| 33 |
from scrapling.core.mixins import SelectorsGeneration
|
|
|
|
| 248 |
|
| 249 |
def __handle_elements(
|
| 250 |
self, result: List[Union[HtmlElement, _ElementUnicodeResult]]
|
| 251 |
+
) -> Union["Selectors", "TextHandlers"]:
|
| 252 |
"""Used internally in all functions to convert results to type (Selectors|TextHandlers) in bulk when possible"""
|
| 253 |
if not len(
|
| 254 |
result
|
|
|
|
| 761 |
patterns.add(arg)
|
| 762 |
|
| 763 |
elif callable(arg):
|
| 764 |
+
if len(signature(arg).parameters) > 0:
|
| 765 |
functions.append(arg)
|
| 766 |
else:
|
| 767 |
raise TypeError(
|
|
|
|
| 914 |
return round((score / checks) * 100, 2)
|
| 915 |
|
| 916 |
@staticmethod
|
| 917 |
+
def __calculate_dict_diff(dict1: Dict, dict2: Dict) -> float:
|
| 918 |
"""Used internally to calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
|
| 919 |
score = (
|
| 920 |
SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
|
|
|
|
| 1210 |
|
| 1211 |
__slots__ = ()
|
| 1212 |
|
| 1213 |
+
@overload
|
| 1214 |
def __getitem__(self, pos: SupportsIndex) -> Selector:
|
| 1215 |
pass
|
| 1216 |
|
| 1217 |
+
@overload
|
| 1218 |
def __getitem__(self, pos: slice) -> "Selectors":
|
| 1219 |
pass
|
| 1220 |
|