Karim shoair committed on
Commit ·
450d5ca
1
Parent(s): 299ae74
fix: Fixes for multiple encoding issues (#80 & #81 )
Browse files
- scrapling/core/shell.py +2 -2
- scrapling/engines/toolbelt/convertor.py +95 -60
- scrapling/engines/toolbelt/custom.py +0 -78
- scrapling/parser.py +4 -4
scrapling/core/shell.py
CHANGED
|
@@ -317,7 +317,7 @@ def show_page_in_browser(page: Selector): # pragma: no cover
|
|
| 317 |
|
| 318 |
try:
|
| 319 |
fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
|
| 320 |
-
with open(fd, "w", encoding=
|
| 321 |
f.write(page.body)
|
| 322 |
|
| 323 |
open_in_browser(f"file://{fname}")
|
|
@@ -556,7 +556,7 @@ class Convertor:
|
|
| 556 |
elif not filename.endswith((".md", ".html", ".txt")):
|
| 557 |
raise ValueError("Unknown file type: filename must end with '.md', '.html', or '.txt'")
|
| 558 |
else:
|
| 559 |
-
with open(filename, "w", encoding=
|
| 560 |
extension = filename.split(".")[-1]
|
| 561 |
f.write(
|
| 562 |
"".join(
|
|
|
|
| 317 |
|
| 318 |
try:
|
| 319 |
fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
|
| 320 |
+
with open(fd, "w", encoding=page.encoding) as f:
|
| 321 |
f.write(page.body)
|
| 322 |
|
| 323 |
open_in_browser(f"file://{fname}")
|
|
|
|
| 556 |
elif not filename.endswith((".md", ".html", ".txt")):
|
| 557 |
raise ValueError("Unknown file type: filename must end with '.md', '.html', or '.txt'")
|
| 558 |
else:
|
| 559 |
+
with open(filename, "w", encoding=page.encoding) as f:
|
| 560 |
extension = filename.split(".")[-1]
|
| 561 |
f.write(
|
| 562 |
"".join(
|
scrapling/engines/toolbelt/convertor.py
CHANGED
|
@@ -1,10 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from curl_cffi.requests import Response as CurlResponse
|
| 2 |
from playwright.sync_api import Page as SyncPage, Response as SyncResponse
|
| 3 |
from playwright.async_api import Page as AsyncPage, Response as AsyncResponse
|
| 4 |
|
| 5 |
from scrapling.core.utils import log
|
| 6 |
-
from scrapling.core._types import Dict, Optional
|
| 7 |
from .custom import Response, StatusText
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
class ResponseFactory:
|
|
@@ -17,6 +22,18 @@ class ResponseFactory:
|
|
| 17 |
response objects, and managing encoding, headers, cookies, and other attributes.
|
| 18 |
"""
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
@classmethod
|
| 21 |
def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
|
| 22 |
"""Process response history to build a list of `Response` objects"""
|
|
@@ -30,18 +47,23 @@ class ResponseFactory:
|
|
| 30 |
history.insert(
|
| 31 |
0,
|
| 32 |
Response(
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
),
|
| 46 |
)
|
| 47 |
except Exception as e: # pragma: no cover
|
|
@@ -85,8 +107,9 @@ class ResponseFactory:
|
|
| 85 |
if not final_response:
|
| 86 |
raise ValueError("Failed to get a response from the page")
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
|
|
|
| 90 |
# PlayWright API sometimes give empty status text for some reason!
|
| 91 |
status_text = final_response.status_text or StatusText.get(final_response.status)
|
| 92 |
|
|
@@ -98,16 +121,18 @@ class ResponseFactory:
|
|
| 98 |
page_content = ""
|
| 99 |
|
| 100 |
return Response(
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
| 111 |
)
|
| 112 |
|
| 113 |
@classmethod
|
|
@@ -125,18 +150,23 @@ class ResponseFactory:
|
|
| 125 |
history.insert(
|
| 126 |
0,
|
| 127 |
Response(
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
),
|
| 141 |
)
|
| 142 |
except Exception as e: # pragma: no cover
|
|
@@ -180,8 +210,9 @@ class ResponseFactory:
|
|
| 180 |
if not final_response:
|
| 181 |
raise ValueError("Failed to get a response from the page")
|
| 182 |
|
| 183 |
-
|
| 184 |
-
|
|
|
|
| 185 |
# PlayWright API sometimes give empty status text for some reason!
|
| 186 |
status_text = final_response.status_text or StatusText.get(final_response.status)
|
| 187 |
|
|
@@ -193,16 +224,18 @@ class ResponseFactory:
|
|
| 193 |
page_content = ""
|
| 194 |
|
| 195 |
return Response(
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
|
|
|
|
|
|
| 206 |
)
|
| 207 |
|
| 208 |
@staticmethod
|
|
@@ -214,15 +247,17 @@ class ResponseFactory:
|
|
| 214 |
:return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 215 |
"""
|
| 216 |
return Response(
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
|
|
|
|
|
|
| 228 |
)
|
|
|
|
| 1 |
+
from functools import lru_cache
|
| 2 |
+
from re import compile as re_compile
|
| 3 |
+
|
| 4 |
from curl_cffi.requests import Response as CurlResponse
|
| 5 |
from playwright.sync_api import Page as SyncPage, Response as SyncResponse
|
| 6 |
from playwright.async_api import Page as AsyncPage, Response as AsyncResponse
|
| 7 |
|
| 8 |
from scrapling.core.utils import log
|
|
|
|
| 9 |
from .custom import Response, StatusText
|
| 10 |
+
from scrapling.core._types import Dict, Optional
|
| 11 |
+
|
| 12 |
+
__CHARSET_RE__ = re_compile(r"charset=([\w-]+)")
|
| 13 |
|
| 14 |
|
| 15 |
class ResponseFactory:
|
|
|
|
| 22 |
response objects, and managing encoding, headers, cookies, and other attributes.
|
| 23 |
"""
|
| 24 |
|
| 25 |
+
@classmethod
|
| 26 |
+
@lru_cache(maxsize=16)
|
| 27 |
+
def __extract_browser_encoding(cls, content_type: str | None) -> Optional[str]:
|
| 28 |
+
"""Extract browser encoding from headers.
|
| 29 |
+
Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8"
|
| 30 |
+
"""
|
| 31 |
+
if content_type:
|
| 32 |
+
# Because Playwright can't do that by themselves like all libraries for some reason :3
|
| 33 |
+
match = __CHARSET_RE__.search(content_type)
|
| 34 |
+
return match.group(1) if match else None
|
| 35 |
+
return None
|
| 36 |
+
|
| 37 |
@classmethod
|
| 38 |
def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
|
| 39 |
"""Process response history to build a list of `Response` objects"""
|
|
|
|
| 47 |
history.insert(
|
| 48 |
0,
|
| 49 |
Response(
|
| 50 |
+
**{
|
| 51 |
+
"url": current_request.url,
|
| 52 |
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 53 |
+
"content": "",
|
| 54 |
+
"status": current_response.status if current_response else 301,
|
| 55 |
+
"reason": (current_response.status_text or StatusText.get(current_response.status))
|
| 56 |
+
if current_response
|
| 57 |
+
else StatusText.get(301),
|
| 58 |
+
"encoding": cls.__extract_browser_encoding(
|
| 59 |
+
current_response.headers.get("content-type", "")
|
| 60 |
+
)
|
| 61 |
+
or "utf-8",
|
| 62 |
+
"cookies": tuple(),
|
| 63 |
+
"headers": current_response.all_headers() if current_response else {},
|
| 64 |
+
"request_headers": current_request.all_headers(),
|
| 65 |
+
**parser_arguments,
|
| 66 |
+
}
|
| 67 |
),
|
| 68 |
)
|
| 69 |
except Exception as e: # pragma: no cover
|
|
|
|
| 107 |
if not final_response:
|
| 108 |
raise ValueError("Failed to get a response from the page")
|
| 109 |
|
| 110 |
+
encoding = (
|
| 111 |
+
cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
|
| 112 |
+
) # default encoding
|
| 113 |
# PlayWright API sometimes give empty status text for some reason!
|
| 114 |
status_text = final_response.status_text or StatusText.get(final_response.status)
|
| 115 |
|
|
|
|
| 121 |
page_content = ""
|
| 122 |
|
| 123 |
return Response(
|
| 124 |
+
**{
|
| 125 |
+
"url": page.url,
|
| 126 |
+
"content": page_content,
|
| 127 |
+
"status": final_response.status,
|
| 128 |
+
"reason": status_text,
|
| 129 |
+
"encoding": encoding,
|
| 130 |
+
"cookies": tuple(dict(cookie) for cookie in page.context.cookies()),
|
| 131 |
+
"headers": first_response.all_headers(),
|
| 132 |
+
"request_headers": first_response.request.all_headers(),
|
| 133 |
+
"history": history,
|
| 134 |
+
**parser_arguments,
|
| 135 |
+
}
|
| 136 |
)
|
| 137 |
|
| 138 |
@classmethod
|
|
|
|
| 150 |
history.insert(
|
| 151 |
0,
|
| 152 |
Response(
|
| 153 |
+
**{
|
| 154 |
+
"url": current_request.url,
|
| 155 |
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 156 |
+
"content": "",
|
| 157 |
+
"status": current_response.status if current_response else 301,
|
| 158 |
+
"reason": (current_response.status_text or StatusText.get(current_response.status))
|
| 159 |
+
if current_response
|
| 160 |
+
else StatusText.get(301),
|
| 161 |
+
"encoding": cls.__extract_browser_encoding(
|
| 162 |
+
current_response.headers.get("content-type", "")
|
| 163 |
+
)
|
| 164 |
+
or "utf-8",
|
| 165 |
+
"cookies": tuple(),
|
| 166 |
+
"headers": await current_response.all_headers() if current_response else {},
|
| 167 |
+
"request_headers": await current_request.all_headers(),
|
| 168 |
+
**parser_arguments,
|
| 169 |
+
}
|
| 170 |
),
|
| 171 |
)
|
| 172 |
except Exception as e: # pragma: no cover
|
|
|
|
| 210 |
if not final_response:
|
| 211 |
raise ValueError("Failed to get a response from the page")
|
| 212 |
|
| 213 |
+
encoding = (
|
| 214 |
+
cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
|
| 215 |
+
) # default encoding
|
| 216 |
# PlayWright API sometimes give empty status text for some reason!
|
| 217 |
status_text = final_response.status_text or StatusText.get(final_response.status)
|
| 218 |
|
|
|
|
| 224 |
page_content = ""
|
| 225 |
|
| 226 |
return Response(
|
| 227 |
+
**{
|
| 228 |
+
"url": page.url,
|
| 229 |
+
"content": page_content,
|
| 230 |
+
"status": final_response.status,
|
| 231 |
+
"reason": status_text,
|
| 232 |
+
"encoding": encoding,
|
| 233 |
+
"cookies": tuple(dict(cookie) for cookie in await page.context.cookies()),
|
| 234 |
+
"headers": await first_response.all_headers(),
|
| 235 |
+
"request_headers": await first_response.request.all_headers(),
|
| 236 |
+
"history": history,
|
| 237 |
+
**parser_arguments,
|
| 238 |
+
}
|
| 239 |
)
|
| 240 |
|
| 241 |
@staticmethod
|
|
|
|
| 247 |
:return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 248 |
"""
|
| 249 |
return Response(
|
| 250 |
+
**{
|
| 251 |
+
"url": response.url,
|
| 252 |
+
"content": response.content,
|
| 253 |
+
"status": response.status_code,
|
| 254 |
+
"reason": response.reason,
|
| 255 |
+
"encoding": response.encoding or "utf-8",
|
| 256 |
+
"cookies": dict(response.cookies),
|
| 257 |
+
"headers": dict(response.headers),
|
| 258 |
+
"request_headers": dict(response.request.headers),
|
| 259 |
+
"method": response.request.method,
|
| 260 |
+
"history": response.history, # https://github.com/lexiforest/curl_cffi/issues/82
|
| 261 |
+
**parser_arguments,
|
| 262 |
+
}
|
| 263 |
)
|
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -17,81 +17,6 @@ from scrapling.core.custom_types import MappingProxyType
|
|
| 17 |
from scrapling.parser import Selector, SQLiteStorageSystem
|
| 18 |
|
| 19 |
|
| 20 |
-
class ResponseEncoding:
|
| 21 |
-
__DEFAULT_ENCODING = "utf-8"
|
| 22 |
-
__ISO_8859_1_CONTENT_TYPES = {
|
| 23 |
-
"text/plain",
|
| 24 |
-
"text/html",
|
| 25 |
-
"text/css",
|
| 26 |
-
"text/javascript",
|
| 27 |
-
}
|
| 28 |
-
|
| 29 |
-
@classmethod
|
| 30 |
-
@lru_cache(maxsize=128)
|
| 31 |
-
def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
|
| 32 |
-
"""Parse content type and parameters from a content-type header value.
|
| 33 |
-
|
| 34 |
-
Uses `email.message.Message` for robust header parsing according to RFC 2045.
|
| 35 |
-
|
| 36 |
-
:param header_value: Raw content-type header string
|
| 37 |
-
:return: Tuple of (content_type, parameters_dict)
|
| 38 |
-
"""
|
| 39 |
-
# Create a Message object and set the Content-Type header then get the content type and parameters
|
| 40 |
-
msg = Message()
|
| 41 |
-
msg["content-type"] = header_value
|
| 42 |
-
|
| 43 |
-
content_type = msg.get_content_type()
|
| 44 |
-
params = dict(msg.get_params(failobj=[]))
|
| 45 |
-
|
| 46 |
-
# Remove the content-type from params if present somehow
|
| 47 |
-
params.pop("content-type", None)
|
| 48 |
-
|
| 49 |
-
return content_type, params
|
| 50 |
-
|
| 51 |
-
@classmethod
|
| 52 |
-
@lru_cache(maxsize=128)
|
| 53 |
-
def get_value(cls, content_type: Optional[str], text: Optional[str] = "test") -> str:
|
| 54 |
-
"""Determine the appropriate character encoding from a content-type header.
|
| 55 |
-
|
| 56 |
-
The encoding is determined by these rules in order:
|
| 57 |
-
1. If no content-type is provided, use UTF-8
|
| 58 |
-
2. If charset parameter is present, use that encoding
|
| 59 |
-
3. If content-type is `text/*`, use ISO-8859-1 per HTTP/1.1 spec
|
| 60 |
-
4. If content-type is application/json, use UTF-8 per RFC 4627
|
| 61 |
-
5. Default to UTF-8 if nothing else matches
|
| 62 |
-
|
| 63 |
-
:param content_type: Content-Type header value or None
|
| 64 |
-
:param text: A text to test the encoding on it
|
| 65 |
-
:return: String naming the character encoding
|
| 66 |
-
"""
|
| 67 |
-
if not content_type:
|
| 68 |
-
return cls.__DEFAULT_ENCODING
|
| 69 |
-
|
| 70 |
-
try:
|
| 71 |
-
encoding = None
|
| 72 |
-
content_type, params = cls.__parse_content_type(content_type)
|
| 73 |
-
|
| 74 |
-
# First check for explicit charset parameter
|
| 75 |
-
if "charset" in params:
|
| 76 |
-
encoding = params["charset"].strip("'\"")
|
| 77 |
-
|
| 78 |
-
# Apply content-type specific rules
|
| 79 |
-
elif content_type in cls.__ISO_8859_1_CONTENT_TYPES:
|
| 80 |
-
encoding = "ISO-8859-1"
|
| 81 |
-
|
| 82 |
-
elif content_type == "application/json":
|
| 83 |
-
encoding = cls.__DEFAULT_ENCODING
|
| 84 |
-
|
| 85 |
-
if encoding:
|
| 86 |
-
_ = text.encode(encoding) # Validate encoding and validate it can encode the given text
|
| 87 |
-
return encoding
|
| 88 |
-
|
| 89 |
-
return cls.__DEFAULT_ENCODING
|
| 90 |
-
|
| 91 |
-
except (ValueError, LookupError, UnicodeEncodeError):
|
| 92 |
-
return cls.__DEFAULT_ENCODING
|
| 93 |
-
|
| 94 |
-
|
| 95 |
class Response(Selector):
|
| 96 |
"""This class is returned by all engines as a way to unify response type between different libraries."""
|
| 97 |
|
|
@@ -116,9 +41,6 @@ class Response(Selector):
|
|
| 116 |
self.headers = headers
|
| 117 |
self.request_headers = request_headers
|
| 118 |
self.history = history or []
|
| 119 |
-
encoding = ResponseEncoding.get_value(
|
| 120 |
-
encoding, content.decode("utf-8") if isinstance(content, bytes) else content
|
| 121 |
-
)
|
| 122 |
super().__init__(
|
| 123 |
content=content,
|
| 124 |
url=adaptive_domain or url,
|
|
|
|
| 17 |
from scrapling.parser import Selector, SQLiteStorageSystem
|
| 18 |
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
class Response(Selector):
|
| 21 |
"""This class is returned by all engines as a way to unify response type between different libraries."""
|
| 22 |
|
|
|
|
| 41 |
self.headers = headers
|
| 42 |
self.request_headers = request_headers
|
| 43 |
self.history = history or []
|
|
|
|
|
|
|
|
|
|
| 44 |
super().__init__(
|
| 45 |
content=content,
|
| 46 |
url=adaptive_domain or url,
|
scrapling/parser.py
CHANGED
|
@@ -74,7 +74,7 @@ class Selector(SelectorsGeneration):
|
|
| 74 |
self,
|
| 75 |
content: Optional[str | bytes] = None,
|
| 76 |
url: Optional[str] = None,
|
| 77 |
-
encoding: str = "
|
| 78 |
huge_tree: bool = True,
|
| 79 |
root: Optional[HtmlElement] = None,
|
| 80 |
keep_comments: Optional[bool] = False,
|
|
@@ -116,7 +116,7 @@ class Selector(SelectorsGeneration):
|
|
| 116 |
if isinstance(content, str):
|
| 117 |
body = content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 118 |
elif isinstance(content, bytes):
|
| 119 |
-
body = content.replace(b"\x00", b"")
|
| 120 |
else:
|
| 121 |
raise TypeError(f"content argument must be str or bytes, got {type(content)}")
|
| 122 |
|
|
@@ -340,7 +340,7 @@ class Selector(SelectorsGeneration):
|
|
| 340 |
@property
|
| 341 |
def html_content(self) -> TextHandler:
|
| 342 |
"""Return the inner HTML code of the element"""
|
| 343 |
-
return TextHandler(tostring(self._root, encoding=
|
| 344 |
|
| 345 |
body = html_content
|
| 346 |
|
|
@@ -349,7 +349,7 @@ class Selector(SelectorsGeneration):
|
|
| 349 |
return TextHandler(
|
| 350 |
tostring(
|
| 351 |
self._root,
|
| 352 |
-
encoding=
|
| 353 |
pretty_print=True,
|
| 354 |
method="html",
|
| 355 |
with_tail=False,
|
|
|
|
| 74 |
self,
|
| 75 |
content: Optional[str | bytes] = None,
|
| 76 |
url: Optional[str] = None,
|
| 77 |
+
encoding: str = "utf-8",
|
| 78 |
huge_tree: bool = True,
|
| 79 |
root: Optional[HtmlElement] = None,
|
| 80 |
keep_comments: Optional[bool] = False,
|
|
|
|
| 116 |
if isinstance(content, str):
|
| 117 |
body = content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 118 |
elif isinstance(content, bytes):
|
| 119 |
+
body = content.replace(b"\x00", b"")
|
| 120 |
else:
|
| 121 |
raise TypeError(f"content argument must be str or bytes, got {type(content)}")
|
| 122 |
|
|
|
|
| 340 |
@property
|
| 341 |
def html_content(self) -> TextHandler:
|
| 342 |
"""Return the inner HTML code of the element"""
|
| 343 |
+
return TextHandler(tostring(self._root, encoding=self.encoding, method="html", with_tail=False))
|
| 344 |
|
| 345 |
body = html_content
|
| 346 |
|
|
|
|
| 349 |
return TextHandler(
|
| 350 |
tostring(
|
| 351 |
self._root,
|
| 352 |
+
encoding=self.encoding,
|
| 353 |
pretty_print=True,
|
| 354 |
method="html",
|
| 355 |
with_tail=False,
|