Karim shoair committed on
Commit
c36c9c3
·
1 Parent(s): deb22eb

Adding `keep_cdata` argument for `Adaptor` and `Response` classes

Browse files
README.md CHANGED
@@ -219,7 +219,7 @@ You might be slightly confused by now so let me clear things up. All fetcher-typ
219
  ```python
220
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
221
  ```
222
- All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
223
 
224
  If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
225
  ```python
 
219
  ```python
220
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
221
  ```
222
+ All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
223
 
224
  If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
225
  ```python
scrapling/engines/toolbelt/custom.py CHANGED
@@ -105,7 +105,7 @@ class BaseFetcher:
105
  def __init__(
106
  self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
107
  storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
108
- automatch_domain: Optional[str] = None,
109
  ):
110
  """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
111
  are detected and passed automatically from the Fetcher based on the response for accessibility.
@@ -113,6 +113,7 @@ class BaseFetcher:
113
  :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
114
  libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
115
  :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
 
116
  :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
117
  priority over all auto-match related arguments/functions in the class.
118
  :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
@@ -127,6 +128,7 @@ class BaseFetcher:
127
  self.adaptor_arguments = dict(
128
  huge_tree=huge_tree,
129
  keep_comments=keep_comments,
 
130
  auto_match=auto_match,
131
  storage=storage,
132
  storage_args=storage_args,
 
105
  def __init__(
106
  self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
107
  storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
108
+ automatch_domain: Optional[str] = None, keep_cdata: Optional[bool] = False,
109
  ):
110
  """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
111
  are detected and passed automatically from the Fetcher based on the response for accessibility.
 
113
  :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
114
  libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
115
  :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
116
+ :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
117
  :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
118
  priority over all auto-match related arguments/functions in the class.
119
  :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
 
128
  self.adaptor_arguments = dict(
129
  huge_tree=huge_tree,
130
  keep_comments=keep_comments,
131
+ keep_cdata=keep_cdata,
132
  auto_match=auto_match,
133
  storage=storage,
134
  storage_args=storage_args,
scrapling/parser.py CHANGED
@@ -25,6 +25,7 @@ class Adaptor(SelectorsGeneration):
25
  __slots__ = (
26
  'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
27
  '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
 
28
  )
29
 
30
  def __init__(
@@ -36,6 +37,7 @@ class Adaptor(SelectorsGeneration):
36
  huge_tree: bool = True,
37
  root: Optional[html.HtmlElement] = None,
38
  keep_comments: Optional[bool] = False,
 
39
  auto_match: Optional[bool] = True,
40
  storage: Any = SQLiteStorageSystem,
41
  storage_args: Optional[Dict] = None,
@@ -59,6 +61,7 @@ class Adaptor(SelectorsGeneration):
59
  :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
60
  Don't use it unless you know what you are doing!
61
  :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
 
62
  :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
63
  priority over all auto-match related arguments/functions in the class.
64
  :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
@@ -84,8 +87,8 @@ class Adaptor(SelectorsGeneration):
84
 
85
  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
86
  parser = html.HTMLParser(
87
- recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
88
- compact=True, huge_tree=huge_tree, default_doctype=True
89
  )
90
  self._root = etree.fromstring(body, parser=parser, base_url=url)
91
  if is_jsonable(text or body.decode()):
@@ -119,6 +122,7 @@ class Adaptor(SelectorsGeneration):
119
  self._storage = storage(**storage_args)
120
 
121
  self.__keep_comments = keep_comments
 
122
  self.__huge_tree_enabled = huge_tree
123
  self.encoding = encoding
124
  self.url = url
@@ -156,7 +160,7 @@ class Adaptor(SelectorsGeneration):
156
  root=element,
157
  text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
158
  url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
159
- keep_comments=True, # if the comments are already removed in initialization, no need to try to delete them in sub-elements
160
  huge_tree=self.__huge_tree_enabled, debug=self.__debug,
161
  **self.__response_data
162
  )
 
25
  __slots__ = (
26
  'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
27
  '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
28
+ '__keep_cdata',
29
  )
30
 
31
  def __init__(
 
37
  huge_tree: bool = True,
38
  root: Optional[html.HtmlElement] = None,
39
  keep_comments: Optional[bool] = False,
40
+ keep_cdata: Optional[bool] = False,
41
  auto_match: Optional[bool] = True,
42
  storage: Any = SQLiteStorageSystem,
43
  storage_args: Optional[Dict] = None,
 
61
  :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
62
  Don't use it unless you know what you are doing!
63
  :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
64
+ :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
65
  :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
66
  priority over all auto-match related arguments/functions in the class.
67
  :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
 
87
 
88
  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
89
  parser = html.HTMLParser(
90
+ recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding,
91
+ compact=True, huge_tree=huge_tree, default_doctype=True, strip_cdata=(not keep_cdata),
92
  )
93
  self._root = etree.fromstring(body, parser=parser, base_url=url)
94
  if is_jsonable(text or body.decode()):
 
122
  self._storage = storage(**storage_args)
123
 
124
  self.__keep_comments = keep_comments
125
+ self.__keep_cdata = keep_cdata
126
  self.__huge_tree_enabled = huge_tree
127
  self.encoding = encoding
128
  self.url = url
 
160
  root=element,
161
  text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
162
  url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
163
+ keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
164
  huge_tree=self.__huge_tree_enabled, debug=self.__debug,
165
  **self.__response_data
166
  )