Spaces:

lenson78
/

Scrapling

Paused

App Files Files Community

Karim shoair committed on Sep 14, 2025

Commit

450d5ca

1 Parent(s): 299ae74

fix: Fixes for multiple encoding issues (#80 & #81)

Browse files

Files changed (4) hide show

scrapling/core/shell.py +2 -2
scrapling/engines/toolbelt/convertor.py +95 -60
scrapling/engines/toolbelt/custom.py +0 -78
scrapling/parser.py +4 -4

scrapling/core/shell.py CHANGED Viewed

@@ -317,7 +317,7 @@ def show_page_in_browser(page: Selector):  # pragma: no cover
     try:
         fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
-        with open(fd, "w", encoding="utf-8") as f:
             f.write(page.body)
         open_in_browser(f"file://{fname}")
@@ -556,7 +556,7 @@ class Convertor:
         elif not filename.endswith((".md", ".html", ".txt")):
             raise ValueError("Unknown file type: filename must end with '.md', '.html', or '.txt'")
         else:
-            with open(filename, "w", encoding="utf-8") as f:
                 extension = filename.split(".")[-1]
                 f.write(
                     "".join(

     try:
         fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
+        with open(fd, "w", encoding=page.encoding) as f:
             f.write(page.body)
         open_in_browser(f"file://{fname}")
         elif not filename.endswith((".md", ".html", ".txt")):
             raise ValueError("Unknown file type: filename must end with '.md', '.html', or '.txt'")
         else:
+            with open(filename, "w", encoding=page.encoding) as f:
                 extension = filename.split(".")[-1]
                 f.write(
                     "".join(

scrapling/engines/toolbelt/convertor.py CHANGED Viewed

@@ -1,10 +1,15 @@
 from curl_cffi.requests import Response as CurlResponse
 from playwright.sync_api import Page as SyncPage, Response as SyncResponse
 from playwright.async_api import Page as AsyncPage, Response as AsyncResponse
 from scrapling.core.utils import log
-from scrapling.core._types import Dict, Optional
 from .custom import Response, StatusText
 class ResponseFactory:
@@ -17,6 +22,18 @@ class ResponseFactory:
     response objects, and managing encoding, headers, cookies, and other attributes.
     """
     @classmethod
     def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
         """Process response history to build a list of `Response` objects"""
@@ -30,18 +47,23 @@ class ResponseFactory:
                     history.insert(
                         0,
                         Response(
-                            url=current_request.url,
-                            # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
-                            content="",
-                            status=current_response.status if current_response else 301,
-                            reason=(current_response.status_text or StatusText.get(current_response.status))
-                            if current_response
-                            else StatusText.get(301),
-                            encoding=current_response.headers.get("content-type", "") or "utf-8",
-                            cookies=tuple(),
-                            headers=current_response.all_headers() if current_response else {},
-                            request_headers=current_request.all_headers(),
-                            **parser_arguments,
                         ),
                     )
                 except Exception as e:  # pragma: no cover
@@ -85,8 +107,9 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")
-        # This will be parsed inside `Response`
-        encoding = final_response.headers.get("content-type", "") or "utf-8"  # default encoding
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
@@ -98,16 +121,18 @@ class ResponseFactory:
             page_content = ""
         return Response(
-            url=page.url,
-            content=page_content,
-            status=final_response.status,
-            reason=status_text,
-            encoding=encoding,
-            cookies=tuple(dict(cookie) for cookie in page.context.cookies()),
-            headers=first_response.all_headers(),
-            request_headers=first_response.request.all_headers(),
-            history=history,
-            **parser_arguments,
         )
     @classmethod
@@ -125,18 +150,23 @@ class ResponseFactory:
                     history.insert(
                         0,
                         Response(
-                            url=current_request.url,
-                            # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
-                            content="",
-                            status=current_response.status if current_response else 301,
-                            reason=(current_response.status_text or StatusText.get(current_response.status))
-                            if current_response
-                            else StatusText.get(301),
-                            encoding=current_response.headers.get("content-type", "") or "utf-8",
-                            cookies=tuple(),
-                            headers=await current_response.all_headers() if current_response else {},
-                            request_headers=await current_request.all_headers(),
-                            **parser_arguments,
                         ),
                     )
                 except Exception as e:  # pragma: no cover
@@ -180,8 +210,9 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")
-        # This will be parsed inside `Response`
-        encoding = final_response.headers.get("content-type", "") or "utf-8"  # default encoding
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
@@ -193,16 +224,18 @@ class ResponseFactory:
             page_content = ""
         return Response(
-            url=page.url,
-            content=page_content,
-            status=final_response.status,
-            reason=status_text,
-            encoding=encoding,
-            cookies=tuple(dict(cookie) for cookie in await page.context.cookies()),
-            headers=await first_response.all_headers(),
-            request_headers=await first_response.request.all_headers(),
-            history=history,
-            **parser_arguments,
         )
     @staticmethod
@@ -214,15 +247,17 @@ class ResponseFactory:
         :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         return Response(
-            url=response.url,
-            content=response.content if isinstance(response.content, bytes) else response.content.encode(),
-            status=response.status_code,
-            reason=response.reason,
-            encoding=response.encoding or "utf-8",
-            cookies=dict(response.cookies),
-            headers=dict(response.headers),
-            request_headers=dict(response.request.headers),
-            method=response.request.method,
-            history=response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
-            **parser_arguments,
         )

+from functools import lru_cache
+from re import compile as re_compile
 from curl_cffi.requests import Response as CurlResponse
 from playwright.sync_api import Page as SyncPage, Response as SyncResponse
 from playwright.async_api import Page as AsyncPage, Response as AsyncResponse
 from scrapling.core.utils import log
 from .custom import Response, StatusText
+from scrapling.core._types import Dict, Optional
+__CHARSET_RE__ = re_compile(r"charset=([\w-]+)")
 class ResponseFactory:
     response objects, and managing encoding, headers, cookies, and other attributes.
     """
+    @classmethod
+    @lru_cache(maxsize=16)
+    def __extract_browser_encoding(cls, content_type: str | None) -> Optional[str]:
+        """Extract browser encoding from headers.
+        Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8"
+        """
+        if content_type:
+            # Because Playwright can't do that by themselves like all libraries for some reason :3
+            match = __CHARSET_RE__.search(content_type)
+            return match.group(1) if match else None
+        return None
     @classmethod
     def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
         """Process response history to build a list of `Response` objects"""
                     history.insert(
                         0,
                         Response(
+                            **{
+                                "url": current_request.url,
+                                # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                                "content": "",
+                                "status": current_response.status if current_response else 301,
+                                "reason": (current_response.status_text or StatusText.get(current_response.status))
+                                if current_response
+                                else StatusText.get(301),
+                                "encoding": cls.__extract_browser_encoding(
+                                    current_response.headers.get("content-type", "")
+                                )
+                                or "utf-8",
+                                "cookies": tuple(),
+                                "headers": current_response.all_headers() if current_response else {},
+                                "request_headers": current_request.all_headers(),
+                                **parser_arguments,
+                            }
                         ),
                     )
                 except Exception as e:  # pragma: no cover
         if not final_response:
             raise ValueError("Failed to get a response from the page")
+        encoding = (
+            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
+        )  # default encoding
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
             page_content = ""
         return Response(
+            **{
+                "url": page.url,
+                "content": page_content,
+                "status": final_response.status,
+                "reason": status_text,
+                "encoding": encoding,
+                "cookies": tuple(dict(cookie) for cookie in page.context.cookies()),
+                "headers": first_response.all_headers(),
+                "request_headers": first_response.request.all_headers(),
+                "history": history,
+                **parser_arguments,
+            }
         )
     @classmethod
                     history.insert(
                         0,
                         Response(
+                            **{
+                                "url": current_request.url,
+                                # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                                "content": "",
+                                "status": current_response.status if current_response else 301,
+                                "reason": (current_response.status_text or StatusText.get(current_response.status))
+                                if current_response
+                                else StatusText.get(301),
+                                "encoding": cls.__extract_browser_encoding(
+                                    current_response.headers.get("content-type", "")
+                                )
+                                or "utf-8",
+                                "cookies": tuple(),
+                                "headers": await current_response.all_headers() if current_response else {},
+                                "request_headers": await current_request.all_headers(),
+                                **parser_arguments,
+                            }
                         ),
                     )
                 except Exception as e:  # pragma: no cover
         if not final_response:
             raise ValueError("Failed to get a response from the page")
+        encoding = (
+            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
+        )  # default encoding
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
             page_content = ""
         return Response(
+            **{
+                "url": page.url,
+                "content": page_content,
+                "status": final_response.status,
+                "reason": status_text,
+                "encoding": encoding,
+                "cookies": tuple(dict(cookie) for cookie in await page.context.cookies()),
+                "headers": await first_response.all_headers(),
+                "request_headers": await first_response.request.all_headers(),
+                "history": history,
+                **parser_arguments,
+            }
         )
     @staticmethod
         :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         return Response(
+            **{
+                "url": response.url,
+                "content": response.content,
+                "status": response.status_code,
+                "reason": response.reason,
+                "encoding": response.encoding or "utf-8",
+                "cookies": dict(response.cookies),
+                "headers": dict(response.headers),
+                "request_headers": dict(response.request.headers),
+                "method": response.request.method,
+                "history": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
+                **parser_arguments,
+            }
         )

scrapling/engines/toolbelt/custom.py CHANGED Viewed

@@ -17,81 +17,6 @@ from scrapling.core.custom_types import MappingProxyType
 from scrapling.parser import Selector, SQLiteStorageSystem
-class ResponseEncoding:
-    __DEFAULT_ENCODING = "utf-8"
-    __ISO_8859_1_CONTENT_TYPES = {
-        "text/plain",
-        "text/html",
-        "text/css",
-        "text/javascript",
-    }
-    @classmethod
-    @lru_cache(maxsize=128)
-    def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
-        """Parse content type and parameters from a content-type header value.
-            Uses `email.message.Message` for robust header parsing according to RFC 2045.
-        :param header_value: Raw content-type header string
-        :return: Tuple of (content_type, parameters_dict)
-        """
-        # Create a Message object and set the Content-Type header then get the content type and parameters
-        msg = Message()
-        msg["content-type"] = header_value
-        content_type = msg.get_content_type()
-        params = dict(msg.get_params(failobj=[]))
-        # Remove the content-type from params if present somehow
-        params.pop("content-type", None)
-        return content_type, params
-    @classmethod
-    @lru_cache(maxsize=128)
-    def get_value(cls, content_type: Optional[str], text: Optional[str] = "test") -> str:
-        """Determine the appropriate character encoding from a content-type header.
-        The encoding is determined by these rules in order:
-            1. If no content-type is provided, use UTF-8
-            2. If charset parameter is present, use that encoding
-            3. If content-type is `text/*`, use ISO-8859-1 per HTTP/1.1 spec
-            4. If content-type is application/json, use UTF-8 per RFC 4627
-            5. Default to UTF-8 if nothing else matches
-        :param content_type: Content-Type header value or None
-        :param text: A text to test the encoding on it
-        :return: String naming the character encoding
-        """
-        if not content_type:
-            return cls.__DEFAULT_ENCODING
-        try:
-            encoding = None
-            content_type, params = cls.__parse_content_type(content_type)
-            # First check for explicit charset parameter
-            if "charset" in params:
-                encoding = params["charset"].strip("'\"")
-            # Apply content-type specific rules
-            elif content_type in cls.__ISO_8859_1_CONTENT_TYPES:
-                encoding = "ISO-8859-1"
-            elif content_type == "application/json":
-                encoding = cls.__DEFAULT_ENCODING
-            if encoding:
-                _ = text.encode(encoding)  # Validate encoding and validate it can encode the given text
-                return encoding
-            return cls.__DEFAULT_ENCODING
-        except (ValueError, LookupError, UnicodeEncodeError):
-            return cls.__DEFAULT_ENCODING
 class Response(Selector):
     """This class is returned by all engines as a way to unify response type between different libraries."""
@@ -116,9 +41,6 @@ class Response(Selector):
         self.headers = headers
         self.request_headers = request_headers
         self.history = history or []
-        encoding = ResponseEncoding.get_value(
-            encoding, content.decode("utf-8") if isinstance(content, bytes) else content
-        )
         super().__init__(
             content=content,
             url=adaptive_domain or url,

 from scrapling.parser import Selector, SQLiteStorageSystem
 class Response(Selector):
     """This class is returned by all engines as a way to unify response type between different libraries."""
         self.headers = headers
         self.request_headers = request_headers
         self.history = history or []
         super().__init__(
             content=content,
             url=adaptive_domain or url,

scrapling/parser.py CHANGED Viewed

@@ -74,7 +74,7 @@ class Selector(SelectorsGeneration):
         self,
         content: Optional[str | bytes] = None,
         url: Optional[str] = None,
-        encoding: str = "utf8",
         huge_tree: bool = True,
         root: Optional[HtmlElement] = None,
         keep_comments: Optional[bool] = False,
@@ -116,7 +116,7 @@ class Selector(SelectorsGeneration):
             if isinstance(content, str):
                 body = content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
             elif isinstance(content, bytes):
-                body = content.replace(b"\x00", b"").strip()
             else:
                 raise TypeError(f"content argument must be str or bytes, got {type(content)}")
@@ -340,7 +340,7 @@ class Selector(SelectorsGeneration):
     @property
     def html_content(self) -> TextHandler:
         """Return the inner HTML code of the element"""
-        return TextHandler(tostring(self._root, encoding="unicode", method="html", with_tail=False))
     body = html_content
@@ -349,7 +349,7 @@ class Selector(SelectorsGeneration):
         return TextHandler(
             tostring(
                 self._root,
-                encoding="unicode",
                 pretty_print=True,
                 method="html",
                 with_tail=False,

         self,
         content: Optional[str | bytes] = None,
         url: Optional[str] = None,
+        encoding: str = "utf-8",
         huge_tree: bool = True,
         root: Optional[HtmlElement] = None,
         keep_comments: Optional[bool] = False,
             if isinstance(content, str):
                 body = content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
             elif isinstance(content, bytes):
+                body = content.replace(b"\x00", b"")
             else:
                 raise TypeError(f"content argument must be str or bytes, got {type(content)}")
     @property
     def html_content(self) -> TextHandler:
         """Return the inner HTML code of the element"""
+        return TextHandler(tostring(self._root, encoding=self.encoding, method="html", with_tail=False))
     body = html_content
         return TextHandler(
             tostring(
                 self._root,
+                encoding=self.encoding,
                 pretty_print=True,
                 method="html",
                 with_tail=False,