Karim shoair committed on
Commit ·
e22b4a7
1
Parent(s): c32f33c
perf: Speed up `clean` functions
Browse files
scrapling/core/custom_types.py
CHANGED
|
@@ -18,11 +18,12 @@ from scrapling.core._types import (
|
|
| 18 |
Generator,
|
| 19 |
SupportsIndex,
|
| 20 |
)
|
| 21 |
-
from scrapling.core.utils import _is_iterable, flatten
|
| 22 |
from scrapling.core._html_utils import _replace_entities
|
| 23 |
|
| 24 |
# Define type variable for AttributeHandler value type
|
| 25 |
_TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
class TextHandler(str):
|
|
@@ -118,9 +119,8 @@ class TextHandler(str):
|
|
| 118 |
|
| 119 |
def clean(self) -> Union[str, "TextHandler"]:
|
| 120 |
"""Return a new version of the string after removing all white spaces and consecutive spaces"""
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
return self.__class__(sub(" +", " ", data).strip())
|
| 124 |
|
| 125 |
# For easy copy-paste from Scrapy/parsel code when needed :)
|
| 126 |
def get(self, default=None):
|
|
|
|
| 18 |
Generator,
|
| 19 |
SupportsIndex,
|
| 20 |
)
|
| 21 |
+
from scrapling.core.utils import _is_iterable, flatten, __CONSECUTIVE_SPACES_REGEX__
|
| 22 |
from scrapling.core._html_utils import _replace_entities
|
| 23 |
|
| 24 |
# Define type variable for AttributeHandler value type
|
| 25 |
_TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
|
| 26 |
+
__CLEANING_TABLE__ = str.maketrans("\t\r\n", " ")
|
| 27 |
|
| 28 |
|
| 29 |
class TextHandler(str):
|
|
|
|
| 119 |
|
| 120 |
def clean(self) -> Union[str, "TextHandler"]:
|
| 121 |
"""Return a new version of the string after removing all white spaces and consecutive spaces"""
|
| 122 |
+
data = self.translate(__CLEANING_TABLE__)
|
| 123 |
+
return self.__class__(__CONSECUTIVE_SPACES_REGEX__.sub(" ", data).strip())
|
|
|
|
| 124 |
|
| 125 |
# For easy copy-paste from Scrapy/parsel code when needed :)
|
| 126 |
def get(self, default=None):
|
scrapling/core/utils.py
CHANGED
|
@@ -14,6 +14,9 @@ html_forbidden = {
|
|
| 14 |
html.HtmlComment,
|
| 15 |
}
|
| 16 |
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
@lru_cache(1, typed=True)
|
| 19 |
def setup_logger():
|
|
@@ -135,6 +138,5 @@ class _StorageTools:
|
|
| 135 |
|
| 136 |
@lru_cache(128, typed=True)
|
| 137 |
def clean_spaces(string):
|
| 138 |
-
string = string.
|
| 139 |
-
|
| 140 |
-
return re.sub(" +", " ", string)
|
|
|
|
| 14 |
html.HtmlComment,
|
| 15 |
}
|
| 16 |
|
| 17 |
+
# Translation table applied by clean_spaces: a tab becomes one space, while "\n" and "\r" map to None (deleted).
__CLEANING_TABLE__ = str.maketrans({"\t": " ", "\n": None, "\r": None})
|
| 18 |
+
# Pre-compiled pattern matching a run of one or more space characters; compiled once at import time
# so callers can collapse each run into a single space via .sub(" ", ...).
__CONSECUTIVE_SPACES_REGEX__ = re.compile(r" +")
|
| 19 |
+
|
| 20 |
|
| 21 |
@lru_cache(1, typed=True)
|
| 22 |
def setup_logger():
|
|
|
|
| 138 |
|
| 139 |
@lru_cache(128, typed=True)
|
| 140 |
def clean_spaces(string):
|
| 141 |
+
string = string.translate(__CLEANING_TABLE__)
|
| 142 |
+
return __CONSECUTIVE_SPACES_REGEX__.sub(" ", string)
|
|
|