Karim shoair committed on
Commit ·
3db9c55
1
Parent(s): bb70f02
refactor: replaces Selector input (text/body) with 1 argument called `content`
Browse files- benchmarks.py +1 -1
- scrapling/engines/toolbelt/convertor.py +5 -10
- scrapling/engines/toolbelt/custom.py +5 -5
- scrapling/parser.py +18 -22
- tests/parser/test_general.py +1 -4
benchmarks.py
CHANGED
|
@@ -80,7 +80,7 @@ def test_scrapling():
|
|
| 80 |
|
| 81 |
@benchmark
|
| 82 |
def test_parsel():
|
| 83 |
-
return Selector(
|
| 84 |
|
| 85 |
|
| 86 |
@benchmark
|
|
|
|
| 80 |
|
| 81 |
@benchmark
|
| 82 |
def test_parsel():
|
| 83 |
+
return Selector(content=large_html).css(".item::text").extract()
|
| 84 |
|
| 85 |
|
| 86 |
@benchmark
|
scrapling/engines/toolbelt/convertor.py
CHANGED
|
@@ -34,8 +34,7 @@ class ResponseFactory:
|
|
| 34 |
Response(
|
| 35 |
url=current_request.url,
|
| 36 |
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 37 |
-
|
| 38 |
-
body=b"",
|
| 39 |
status=current_response.status if current_response else 301,
|
| 40 |
reason=(
|
| 41 |
current_response.status_text
|
|
@@ -112,8 +111,7 @@ class ResponseFactory:
|
|
| 112 |
|
| 113 |
return Response(
|
| 114 |
url=page.url,
|
| 115 |
-
|
| 116 |
-
body=page_content.encode("utf-8"),
|
| 117 |
status=final_response.status,
|
| 118 |
reason=status_text,
|
| 119 |
encoding=encoding,
|
|
@@ -141,8 +139,7 @@ class ResponseFactory:
|
|
| 141 |
Response(
|
| 142 |
url=current_request.url,
|
| 143 |
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 144 |
-
|
| 145 |
-
body=b"",
|
| 146 |
status=current_response.status if current_response else 301,
|
| 147 |
reason=(
|
| 148 |
current_response.status_text
|
|
@@ -221,8 +218,7 @@ class ResponseFactory:
|
|
| 221 |
|
| 222 |
return Response(
|
| 223 |
url=page.url,
|
| 224 |
-
|
| 225 |
-
body=page_content.encode("utf-8"),
|
| 226 |
status=final_response.status,
|
| 227 |
reason=status_text,
|
| 228 |
encoding=encoding,
|
|
@@ -243,8 +239,7 @@ class ResponseFactory:
|
|
| 243 |
"""
|
| 244 |
return Response(
|
| 245 |
url=response.url,
|
| 246 |
-
|
| 247 |
-
body=response.content
|
| 248 |
if type(response.content) is bytes
|
| 249 |
else response.content.encode(),
|
| 250 |
status=response.status_code,
|
|
|
|
| 34 |
Response(
|
| 35 |
url=current_request.url,
|
| 36 |
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 37 |
+
content="",
|
|
|
|
| 38 |
status=current_response.status if current_response else 301,
|
| 39 |
reason=(
|
| 40 |
current_response.status_text
|
|
|
|
| 111 |
|
| 112 |
return Response(
|
| 113 |
url=page.url,
|
| 114 |
+
content=page_content,
|
|
|
|
| 115 |
status=final_response.status,
|
| 116 |
reason=status_text,
|
| 117 |
encoding=encoding,
|
|
|
|
| 139 |
Response(
|
| 140 |
url=current_request.url,
|
| 141 |
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 142 |
+
content="",
|
|
|
|
| 143 |
status=current_response.status if current_response else 301,
|
| 144 |
reason=(
|
| 145 |
current_response.status_text
|
|
|
|
| 218 |
|
| 219 |
return Response(
|
| 220 |
url=page.url,
|
| 221 |
+
content=page_content,
|
|
|
|
| 222 |
status=final_response.status,
|
| 223 |
reason=status_text,
|
| 224 |
encoding=encoding,
|
|
|
|
| 239 |
"""
|
| 240 |
return Response(
|
| 241 |
url=response.url,
|
| 242 |
+
content=response.content
|
|
|
|
| 243 |
if type(response.content) is bytes
|
| 244 |
else response.content.encode(),
|
| 245 |
status=response.status_code,
|
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -103,8 +103,7 @@ class Response(Selector):
|
|
| 103 |
def __init__(
|
| 104 |
self,
|
| 105 |
url: str,
|
| 106 |
-
|
| 107 |
-
body: bytes,
|
| 108 |
status: int,
|
| 109 |
reason: str,
|
| 110 |
cookies: Union[Tuple[Dict[str, str], ...], Dict[str, str]],
|
|
@@ -122,10 +121,11 @@ class Response(Selector):
|
|
| 122 |
self.headers = headers
|
| 123 |
self.request_headers = request_headers
|
| 124 |
self.history = history or []
|
| 125 |
-
encoding = ResponseEncoding.get_value(
|
|
|
|
|
|
|
| 126 |
super().__init__(
|
| 127 |
-
|
| 128 |
-
body=body,
|
| 129 |
url=adaptive_domain or url,
|
| 130 |
encoding=encoding,
|
| 131 |
**selector_config,
|
|
|
|
| 103 |
def __init__(
|
| 104 |
self,
|
| 105 |
url: str,
|
| 106 |
+
content: str | bytes,
|
|
|
|
| 107 |
status: int,
|
| 108 |
reason: str,
|
| 109 |
cookies: Union[Tuple[Dict[str, str], ...], Dict[str, str]],
|
|
|
|
| 121 |
self.headers = headers
|
| 122 |
self.request_headers = request_headers
|
| 123 |
self.history = history or []
|
| 124 |
+
encoding = ResponseEncoding.get_value(
|
| 125 |
+
encoding, content.decode("utf-8") if isinstance(content, bytes) else content
|
| 126 |
+
)
|
| 127 |
super().__init__(
|
| 128 |
+
content=content,
|
|
|
|
| 129 |
url=adaptive_domain or url,
|
| 130 |
encoding=encoding,
|
| 131 |
**selector_config,
|
scrapling/parser.py
CHANGED
|
@@ -50,9 +50,8 @@ class Selector(SelectorsGeneration):
|
|
| 50 |
|
| 51 |
def __init__(
|
| 52 |
self,
|
| 53 |
-
|
| 54 |
url: Optional[str] = None,
|
| 55 |
-
body: bytes = b"",
|
| 56 |
encoding: str = "utf8",
|
| 57 |
huge_tree: bool = True,
|
| 58 |
root: Optional[html.HtmlElement] = None,
|
|
@@ -72,9 +71,8 @@ class Selector(SelectorsGeneration):
|
|
| 72 |
not possible. You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.
|
| 73 |
It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
|
| 74 |
|
| 75 |
-
:param
|
| 76 |
:param url: It allows storing a URL with the HTML data for retrieving later.
|
| 77 |
-
:param body: HTML body as an ``bytes`` object. It can be used instead of the ``text`` argument.
|
| 78 |
:param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
|
| 79 |
:param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
|
| 80 |
the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
|
|
@@ -88,27 +86,23 @@ class Selector(SelectorsGeneration):
|
|
| 88 |
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
| 89 |
If empty, default values will be used.
|
| 90 |
"""
|
| 91 |
-
if root is None and
|
| 92 |
raise ValueError(
|
| 93 |
-
"Selector class needs
|
| 94 |
)
|
| 95 |
|
| 96 |
self.__text = ""
|
| 97 |
if root is None:
|
| 98 |
-
if
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
)
|
| 103 |
-
|
| 104 |
-
body = body.replace(b"\x00", b"").strip()
|
| 105 |
else:
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
)
|
| 110 |
-
|
| 111 |
-
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 112 |
|
| 113 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 114 |
parser = html.HTMLParser(
|
|
@@ -122,8 +116,10 @@ class Selector(SelectorsGeneration):
|
|
| 122 |
strip_cdata=(not keep_cdata),
|
| 123 |
)
|
| 124 |
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
| 127 |
|
| 128 |
else:
|
| 129 |
# All HTML types inherit from HtmlMixin so this to check for all at once
|
|
@@ -930,7 +926,7 @@ class Selector(SelectorsGeneration):
|
|
| 930 |
) -> None:
|
| 931 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 932 |
|
| 933 |
-
:param element: The element itself that we want to save to storage, it can be
|
| 934 |
:param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
|
| 935 |
the docs for more info.
|
| 936 |
"""
|
|
|
|
| 50 |
|
| 51 |
def __init__(
|
| 52 |
self,
|
| 53 |
+
content: Optional[Union[str, bytes]] = None,
|
| 54 |
url: Optional[str] = None,
|
|
|
|
| 55 |
encoding: str = "utf8",
|
| 56 |
huge_tree: bool = True,
|
| 57 |
root: Optional[html.HtmlElement] = None,
|
|
|
|
| 71 |
not possible. You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.
|
| 72 |
It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
|
| 73 |
|
| 74 |
+
:param content: HTML content as either string or bytes.
|
| 75 |
:param url: It allows storing a URL with the HTML data for retrieving later.
|
|
|
|
| 76 |
:param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
|
| 77 |
:param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
|
| 78 |
the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
|
|
|
|
| 86 |
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
| 87 |
If empty, default values will be used.
|
| 88 |
"""
|
| 89 |
+
if root is None and content is None:
|
| 90 |
raise ValueError(
|
| 91 |
+
"Selector class needs HTML content, or root arguments to work"
|
| 92 |
)
|
| 93 |
|
| 94 |
self.__text = ""
|
| 95 |
if root is None:
|
| 96 |
+
if isinstance(content, bytes):
|
| 97 |
+
body = content.replace(b"\x00", b"").strip()
|
| 98 |
+
elif isinstance(content, str):
|
| 99 |
+
body = (
|
| 100 |
+
content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 101 |
+
)
|
|
|
|
| 102 |
else:
|
| 103 |
+
raise TypeError(
|
| 104 |
+
f"content argument must be str or bytes, got {type(content)}"
|
| 105 |
+
)
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 108 |
parser = html.HTMLParser(
|
|
|
|
| 116 |
strip_cdata=(not keep_cdata),
|
| 117 |
)
|
| 118 |
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 119 |
+
|
| 120 |
+
jsonable_text = content if isinstance(content, str) else body.decode()
|
| 121 |
+
if is_jsonable(jsonable_text):
|
| 122 |
+
self.__text = TextHandler(jsonable_text)
|
| 123 |
|
| 124 |
else:
|
| 125 |
# All HTML types inherit from HtmlMixin so this to check for all at once
|
|
|
|
| 926 |
) -> None:
|
| 927 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 928 |
|
| 929 |
+
:param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
|
| 930 |
:param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
|
| 931 |
the docs for more info.
|
| 932 |
"""
|
tests/parser/test_general.py
CHANGED
|
@@ -173,10 +173,7 @@ class TestErrorHandling:
|
|
| 173 |
_ = Selector(root="ayo", adaptive=False)
|
| 174 |
|
| 175 |
with pytest.raises(TypeError):
|
| 176 |
-
_ = Selector(
|
| 177 |
-
|
| 178 |
-
with pytest.raises(TypeError):
|
| 179 |
-
_ = Selector(body=1, adaptive=False)
|
| 180 |
|
| 181 |
def test_invalid_storage(self, page, html_content):
|
| 182 |
"""Test invalid storage parameter"""
|
|
|
|
| 173 |
_ = Selector(root="ayo", adaptive=False)
|
| 174 |
|
| 175 |
with pytest.raises(TypeError):
|
| 176 |
+
_ = Selector(content=1, adaptive=False)
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
def test_invalid_storage(self, page, html_content):
|
| 179 |
"""Test invalid storage parameter"""
|