Karim shoair committed on
Commit
e22b4a7
·
1 Parent(s): c32f33c

perf: Speed up `clean` functions

Browse files
scrapling/core/custom_types.py CHANGED
@@ -18,11 +18,12 @@ from scrapling.core._types import (
18
  Generator,
19
  SupportsIndex,
20
  )
21
- from scrapling.core.utils import _is_iterable, flatten
22
  from scrapling.core._html_utils import _replace_entities
23
 
24
  # Define type variable for AttributeHandler value type
25
  _TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
 
26
 
27
 
28
  class TextHandler(str):
@@ -118,9 +119,8 @@ class TextHandler(str):
118
 
119
  def clean(self) -> Union[str, "TextHandler"]:
120
  """Return a new version of the string after removing all white spaces and consecutive spaces"""
121
- trans_table = str.maketrans("\t\r\n", "   ")
122
- data = self.translate(trans_table)
123
- return self.__class__(sub(" +", " ", data).strip())
124
 
125
  # For easy copy-paste from Scrapy/parsel code when needed :)
126
  def get(self, default=None):
 
18
  Generator,
19
  SupportsIndex,
20
  )
21
+ from scrapling.core.utils import _is_iterable, flatten, __CONSECUTIVE_SPACES_REGEX__
22
  from scrapling.core._html_utils import _replace_entities
23
 
24
  # Define type variable for AttributeHandler value type
25
  _TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
26
+ __CLEANING_TABLE__ = str.maketrans("\t\r\n", "   ")
27
 
28
 
29
  class TextHandler(str):
 
119
 
120
  def clean(self) -> Union[str, "TextHandler"]:
121
  """Return a new version of the string after removing all white spaces and consecutive spaces"""
122
+ data = self.translate(__CLEANING_TABLE__)
123
+ return self.__class__(__CONSECUTIVE_SPACES_REGEX__.sub(" ", data).strip())
 
124
 
125
  # For easy copy-paste from Scrapy/parsel code when needed :)
126
  def get(self, default=None):
scrapling/core/utils.py CHANGED
@@ -14,6 +14,9 @@ html_forbidden = {
14
  html.HtmlComment,
15
  }
16
 
 
 
 
17
 
18
  @lru_cache(1, typed=True)
19
  def setup_logger():
@@ -135,6 +138,5 @@ class _StorageTools:
135
 
136
  @lru_cache(128, typed=True)
137
  def clean_spaces(string):
138
- string = string.replace("\t", " ")
139
- string = re.sub("[\n|\r]", "", string)
140
- return re.sub(" +", " ", string)
 
14
  html.HtmlComment,
15
  }
16
 
17
+ __CLEANING_TABLE__ = str.maketrans({"\t": " ", "\n": None, "\r": None})
18
+ __CONSECUTIVE_SPACES_REGEX__ = re.compile(r" +")
19
+
20
 
21
  @lru_cache(1, typed=True)
22
  def setup_logger():
 
138
 
139
  @lru_cache(128, typed=True)
140
  def clean_spaces(string):
141
+ string = string.translate(__CLEANING_TABLE__)
142
+ return __CONSECUTIVE_SPACES_REGEX__.sub(" ", string)