Spaces:

lenson78
/

Scrapling

Paused

App Files Files Community

Karim shoair committed on Dec 10, 2024

Commit

c36c9c3

1 Parent(s): deb22eb

Adding `keep_cdata` argument for `Adaptor` and `Response` classes

Browse files

Files changed (3) hide show

README.md +1 -1
scrapling/engines/toolbelt/custom.py +3 -1
scrapling/parser.py +7 -3

README.md CHANGED Viewed

@@ -219,7 +219,7 @@ You might be slightly confused by now so let me clear things up. All fetcher-typ
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
-All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
 If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
 ```python

 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
+All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
 If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
 ```python

scrapling/engines/toolbelt/custom.py CHANGED Viewed

@@ -105,7 +105,7 @@ class BaseFetcher:
     def __init__(
             self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
             storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
-            automatch_domain: Optional[str] = None,
     ):
         """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
         are detected and passed automatically from the Fetcher based on the response for accessibility.
@@ -113,6 +113,7 @@ class BaseFetcher:
         :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
             libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
         :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
         :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
             priority over all auto-match related arguments/functions in the class.
         :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
@@ -127,6 +128,7 @@ class BaseFetcher:
         self.adaptor_arguments = dict(
             huge_tree=huge_tree,
             keep_comments=keep_comments,
             auto_match=auto_match,
             storage=storage,
             storage_args=storage_args,

     def __init__(
             self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
             storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
+            automatch_domain: Optional[str] = None, keep_cdata: Optional[bool] = False,
     ):
         """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
         are detected and passed automatically from the Fetcher based on the response for accessibility.
         :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
             libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
         :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
+        :param keep_cdata: Whether to keep CDATA sections while parsing the HTML body. Disabled by default (CDATA is stripped) for cleaner HTML.
         :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
             priority over all auto-match related arguments/functions in the class.
         :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
         self.adaptor_arguments = dict(
             huge_tree=huge_tree,
             keep_comments=keep_comments,
+            keep_cdata=keep_cdata,
             auto_match=auto_match,
             storage=storage,
             storage_args=storage_args,

scrapling/parser.py CHANGED Viewed

@@ -25,6 +25,7 @@ class Adaptor(SelectorsGeneration):
     __slots__ = (
         'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
         '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
     )
     def __init__(
@@ -36,6 +37,7 @@ class Adaptor(SelectorsGeneration):
             huge_tree: bool = True,
             root: Optional[html.HtmlElement] = None,
             keep_comments: Optional[bool] = False,
             auto_match: Optional[bool] = True,
             storage: Any = SQLiteStorageSystem,
             storage_args: Optional[Dict] = None,
@@ -59,6 +61,7 @@ class Adaptor(SelectorsGeneration):
         :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
             Don't use it unless you know what you are doing!
         :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
         :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
             priority over all auto-match related arguments/functions in the class.
         :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
@@ -84,8 +87,8 @@ class Adaptor(SelectorsGeneration):
             # https://lxml.de/api/lxml.etree.HTMLParser-class.html
             parser = html.HTMLParser(
-                recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
-                compact=True, huge_tree=huge_tree, default_doctype=True
             )
             self._root = etree.fromstring(body, parser=parser, base_url=url)
             if is_jsonable(text or body.decode()):
@@ -119,6 +122,7 @@ class Adaptor(SelectorsGeneration):
             self._storage = storage(**storage_args)
         self.__keep_comments = keep_comments
         self.__huge_tree_enabled = huge_tree
         self.encoding = encoding
         self.url = url
@@ -156,7 +160,7 @@ class Adaptor(SelectorsGeneration):
                     root=element,
                     text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
                     url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
-                    keep_comments=True,  # if the comments are already removed in initialization, no need to try to delete them in sub-elements
                     huge_tree=self.__huge_tree_enabled, debug=self.__debug,
                     **self.__response_data
                 )

     __slots__ = (
         'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
         '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
+        '__keep_cdata',
     )
     def __init__(
             huge_tree: bool = True,
             root: Optional[html.HtmlElement] = None,
             keep_comments: Optional[bool] = False,
+            keep_cdata: Optional[bool] = False,
             auto_match: Optional[bool] = True,
             storage: Any = SQLiteStorageSystem,
             storage_args: Optional[Dict] = None,
         :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
             Don't use it unless you know what you are doing!
         :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
+        :param keep_cdata: Whether to keep CDATA sections while parsing the HTML body. Disabled by default (CDATA is stripped) for cleaner HTML.
         :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
             priority over all auto-match related arguments/functions in the class.
         :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
             # https://lxml.de/api/lxml.etree.HTMLParser-class.html
             parser = html.HTMLParser(
+                recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding,
+                compact=True, huge_tree=huge_tree, default_doctype=True, strip_cdata=(not keep_cdata),
             )
             self._root = etree.fromstring(body, parser=parser, base_url=url)
             if is_jsonable(text or body.decode()):
             self._storage = storage(**storage_args)
         self.__keep_comments = keep_comments
+        self.__keep_cdata = keep_cdata
         self.__huge_tree_enabled = huge_tree
         self.encoding = encoding
         self.url = url
                     root=element,
                     text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
                     url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
+                    keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
                     huge_tree=self.__huge_tree_enabled, debug=self.__debug,
                     **self.__response_data
                 )