Karim shoair committed on
Commit ·
c36c9c3
1
Parent(s): deb22eb
Adding `keep_cdata` argument for `Adaptor` and `Response` classes
Browse files
- README.md +1 -1
- scrapling/engines/toolbelt/custom.py +3 -1
- scrapling/parser.py +7 -3
README.md
CHANGED
|
@@ -219,7 +219,7 @@ You might be slightly confused by now so let me clear things up. All fetcher-typ
|
|
| 219 |
```python
|
| 220 |
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
| 221 |
```
|
| 222 |
-
All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
|
| 223 |
|
| 224 |
If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
|
| 225 |
```python
|
|
|
|
| 219 |
```python
|
| 220 |
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
| 221 |
```
|
| 222 |
+
All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
|
| 223 |
|
| 224 |
If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
|
| 225 |
```python
|
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -105,7 +105,7 @@ class BaseFetcher:
|
|
| 105 |
def __init__(
|
| 106 |
self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
|
| 107 |
storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
|
| 108 |
-
automatch_domain: Optional[str] = None,
|
| 109 |
):
|
| 110 |
"""Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
|
| 111 |
are detected and passed automatically from the Fetcher based on the response for accessibility.
|
|
@@ -113,6 +113,7 @@ class BaseFetcher:
|
|
| 113 |
:param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
|
| 114 |
libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
|
| 115 |
:param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
|
|
|
|
| 116 |
:param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
|
| 117 |
priority over all auto-match related arguments/functions in the class.
|
| 118 |
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
|
@@ -127,6 +128,7 @@ class BaseFetcher:
|
|
| 127 |
self.adaptor_arguments = dict(
|
| 128 |
huge_tree=huge_tree,
|
| 129 |
keep_comments=keep_comments,
|
|
|
|
| 130 |
auto_match=auto_match,
|
| 131 |
storage=storage,
|
| 132 |
storage_args=storage_args,
|
|
|
|
| 105 |
def __init__(
|
| 106 |
self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
|
| 107 |
storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
|
| 108 |
+
automatch_domain: Optional[str] = None, keep_cdata: Optional[bool] = False,
|
| 109 |
):
|
| 110 |
"""Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
|
| 111 |
are detected and passed automatically from the Fetcher based on the response for accessibility.
|
|
|
|
| 113 |
:param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
|
| 114 |
libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
|
| 115 |
:param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
|
| 116 |
+
:param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
|
| 117 |
:param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
|
| 118 |
priority over all auto-match related arguments/functions in the class.
|
| 119 |
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
|
|
|
| 128 |
self.adaptor_arguments = dict(
|
| 129 |
huge_tree=huge_tree,
|
| 130 |
keep_comments=keep_comments,
|
| 131 |
+
keep_cdata=keep_cdata,
|
| 132 |
auto_match=auto_match,
|
| 133 |
storage=storage,
|
| 134 |
storage_args=storage_args,
|
scrapling/parser.py
CHANGED
|
@@ -25,6 +25,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 25 |
__slots__ = (
|
| 26 |
'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
|
| 27 |
'__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
|
|
|
|
| 28 |
)
|
| 29 |
|
| 30 |
def __init__(
|
|
@@ -36,6 +37,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 36 |
huge_tree: bool = True,
|
| 37 |
root: Optional[html.HtmlElement] = None,
|
| 38 |
keep_comments: Optional[bool] = False,
|
|
|
|
| 39 |
auto_match: Optional[bool] = True,
|
| 40 |
storage: Any = SQLiteStorageSystem,
|
| 41 |
storage_args: Optional[Dict] = None,
|
|
@@ -59,6 +61,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 59 |
:param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
|
| 60 |
Don't use it unless you know what you are doing!
|
| 61 |
:param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
|
|
|
|
| 62 |
:param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
|
| 63 |
priority over all auto-match related arguments/functions in the class.
|
| 64 |
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
|
@@ -84,8 +87,8 @@ class Adaptor(SelectorsGeneration):
|
|
| 84 |
|
| 85 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 86 |
parser = html.HTMLParser(
|
| 87 |
-
recover=True, remove_blank_text=True, remove_comments=(
|
| 88 |
-
compact=True, huge_tree=huge_tree, default_doctype=True
|
| 89 |
)
|
| 90 |
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 91 |
if is_jsonable(text or body.decode()):
|
|
@@ -119,6 +122,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 119 |
self._storage = storage(**storage_args)
|
| 120 |
|
| 121 |
self.__keep_comments = keep_comments
|
|
|
|
| 122 |
self.__huge_tree_enabled = huge_tree
|
| 123 |
self.encoding = encoding
|
| 124 |
self.url = url
|
|
@@ -156,7 +160,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 156 |
root=element,
|
| 157 |
text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
|
| 158 |
url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
|
| 159 |
-
keep_comments=
|
| 160 |
huge_tree=self.__huge_tree_enabled, debug=self.__debug,
|
| 161 |
**self.__response_data
|
| 162 |
)
|
|
|
|
| 25 |
__slots__ = (
|
| 26 |
'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
|
| 27 |
'__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
|
| 28 |
+
'__keep_cdata',
|
| 29 |
)
|
| 30 |
|
| 31 |
def __init__(
|
|
|
|
| 37 |
huge_tree: bool = True,
|
| 38 |
root: Optional[html.HtmlElement] = None,
|
| 39 |
keep_comments: Optional[bool] = False,
|
| 40 |
+
keep_cdata: Optional[bool] = False,
|
| 41 |
auto_match: Optional[bool] = True,
|
| 42 |
storage: Any = SQLiteStorageSystem,
|
| 43 |
storage_args: Optional[Dict] = None,
|
|
|
|
| 61 |
:param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
|
| 62 |
Don't use it unless you know what you are doing!
|
| 63 |
:param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
|
| 64 |
+
:param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
|
| 65 |
:param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
|
| 66 |
priority over all auto-match related arguments/functions in the class.
|
| 67 |
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
|
|
|
| 87 |
|
| 88 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 89 |
parser = html.HTMLParser(
|
| 90 |
+
recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding,
|
| 91 |
+
compact=True, huge_tree=huge_tree, default_doctype=True, strip_cdata=(not keep_cdata),
|
| 92 |
)
|
| 93 |
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 94 |
if is_jsonable(text or body.decode()):
|
|
|
|
| 122 |
self._storage = storage(**storage_args)
|
| 123 |
|
| 124 |
self.__keep_comments = keep_comments
|
| 125 |
+
self.__keep_cdata = keep_cdata
|
| 126 |
self.__huge_tree_enabled = huge_tree
|
| 127 |
self.encoding = encoding
|
| 128 |
self.url = url
|
|
|
|
| 160 |
root=element,
|
| 161 |
text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
|
| 162 |
url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
|
| 163 |
+
keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
|
| 164 |
huge_tree=self.__huge_tree_enabled, debug=self.__debug,
|
| 165 |
**self.__response_data
|
| 166 |
)
|