Karim shoair committed on
Commit ·
f88c43a
1
Parent(s): ae6e6c8
feat(TextHandler): Add argument to `clean` method to remove html entities
Browse files
scrapling/core/custom_types.py
CHANGED
|
@@ -103,9 +103,11 @@ class TextHandler(str):
|
|
| 103 |
"""Return a sorted version of the string"""
|
| 104 |
return self.__class__("".join(sorted(self, reverse=reverse)))
|
| 105 |
|
| 106 |
-
def clean(self) -> Union[str, "TextHandler"]:
|
| 107 |
"""Return a new version of the string after removing all white spaces and consecutive spaces"""
|
| 108 |
data = self.translate(__CLEANING_TABLE__)
|
|
|
|
|
|
|
| 109 |
return self.__class__(__CONSECUTIVE_SPACES_REGEX__.sub(" ", data).strip())
|
| 110 |
|
| 111 |
# For easy copy-paste from Scrapy/parsel code when needed :)
|
|
|
|
| 103 |
"""Return a sorted version of the string"""
|
| 104 |
return self.__class__("".join(sorted(self, reverse=reverse)))
|
| 105 |
|
| 106 |
+
def clean(self, remove_entities=False) -> Union[str, "TextHandler"]:
|
| 107 |
"""Return a new version of the string after removing all white spaces and consecutive spaces"""
|
| 108 |
data = self.translate(__CLEANING_TABLE__)
|
| 109 |
+
if remove_entities:
|
| 110 |
+
data = _replace_entities(data)
|
| 111 |
return self.__class__(__CONSECUTIVE_SPACES_REGEX__.sub(" ", data).strip())
|
| 112 |
|
| 113 |
# For easy copy-paste from Scrapy/parsel code when needed :)
|