Karim shoair committed on
Commit
450d5ca
·
1 Parent(s): 299ae74

fix: Fixes for multiple encoding issues (#80 & #81)

Browse files
scrapling/core/shell.py CHANGED
@@ -317,7 +317,7 @@ def show_page_in_browser(page: Selector): # pragma: no cover
317
 
318
  try:
319
  fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
320
- with open(fd, "w", encoding="utf-8") as f:
321
  f.write(page.body)
322
 
323
  open_in_browser(f"file://{fname}")
@@ -556,7 +556,7 @@ class Convertor:
556
  elif not filename.endswith((".md", ".html", ".txt")):
557
  raise ValueError("Unknown file type: filename must end with '.md', '.html', or '.txt'")
558
  else:
559
- with open(filename, "w", encoding="utf-8") as f:
560
  extension = filename.split(".")[-1]
561
  f.write(
562
  "".join(
 
317
 
318
  try:
319
  fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
320
+ with open(fd, "w", encoding=page.encoding) as f:
321
  f.write(page.body)
322
 
323
  open_in_browser(f"file://{fname}")
 
556
  elif not filename.endswith((".md", ".html", ".txt")):
557
  raise ValueError("Unknown file type: filename must end with '.md', '.html', or '.txt'")
558
  else:
559
+ with open(filename, "w", encoding=page.encoding) as f:
560
  extension = filename.split(".")[-1]
561
  f.write(
562
  "".join(
scrapling/engines/toolbelt/convertor.py CHANGED
@@ -1,10 +1,15 @@
 
 
 
1
  from curl_cffi.requests import Response as CurlResponse
2
  from playwright.sync_api import Page as SyncPage, Response as SyncResponse
3
  from playwright.async_api import Page as AsyncPage, Response as AsyncResponse
4
 
5
  from scrapling.core.utils import log
6
- from scrapling.core._types import Dict, Optional
7
  from .custom import Response, StatusText
 
 
 
8
 
9
 
10
  class ResponseFactory:
@@ -17,6 +22,18 @@ class ResponseFactory:
17
  response objects, and managing encoding, headers, cookies, and other attributes.
18
  """
19
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  @classmethod
21
  def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
22
  """Process response history to build a list of `Response` objects"""
@@ -30,18 +47,23 @@ class ResponseFactory:
30
  history.insert(
31
  0,
32
  Response(
33
- url=current_request.url,
34
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
35
- content="",
36
- status=current_response.status if current_response else 301,
37
- reason=(current_response.status_text or StatusText.get(current_response.status))
38
- if current_response
39
- else StatusText.get(301),
40
- encoding=current_response.headers.get("content-type", "") or "utf-8",
41
- cookies=tuple(),
42
- headers=current_response.all_headers() if current_response else {},
43
- request_headers=current_request.all_headers(),
44
- **parser_arguments,
 
 
 
 
 
45
  ),
46
  )
47
  except Exception as e: # pragma: no cover
@@ -85,8 +107,9 @@ class ResponseFactory:
85
  if not final_response:
86
  raise ValueError("Failed to get a response from the page")
87
 
88
- # This will be parsed inside `Response`
89
- encoding = final_response.headers.get("content-type", "") or "utf-8" # default encoding
 
90
  # PlayWright API sometimes give empty status text for some reason!
91
  status_text = final_response.status_text or StatusText.get(final_response.status)
92
 
@@ -98,16 +121,18 @@ class ResponseFactory:
98
  page_content = ""
99
 
100
  return Response(
101
- url=page.url,
102
- content=page_content,
103
- status=final_response.status,
104
- reason=status_text,
105
- encoding=encoding,
106
- cookies=tuple(dict(cookie) for cookie in page.context.cookies()),
107
- headers=first_response.all_headers(),
108
- request_headers=first_response.request.all_headers(),
109
- history=history,
110
- **parser_arguments,
 
 
111
  )
112
 
113
  @classmethod
@@ -125,18 +150,23 @@ class ResponseFactory:
125
  history.insert(
126
  0,
127
  Response(
128
- url=current_request.url,
129
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
130
- content="",
131
- status=current_response.status if current_response else 301,
132
- reason=(current_response.status_text or StatusText.get(current_response.status))
133
- if current_response
134
- else StatusText.get(301),
135
- encoding=current_response.headers.get("content-type", "") or "utf-8",
136
- cookies=tuple(),
137
- headers=await current_response.all_headers() if current_response else {},
138
- request_headers=await current_request.all_headers(),
139
- **parser_arguments,
 
 
 
 
 
140
  ),
141
  )
142
  except Exception as e: # pragma: no cover
@@ -180,8 +210,9 @@ class ResponseFactory:
180
  if not final_response:
181
  raise ValueError("Failed to get a response from the page")
182
 
183
- # This will be parsed inside `Response`
184
- encoding = final_response.headers.get("content-type", "") or "utf-8" # default encoding
 
185
  # PlayWright API sometimes give empty status text for some reason!
186
  status_text = final_response.status_text or StatusText.get(final_response.status)
187
 
@@ -193,16 +224,18 @@ class ResponseFactory:
193
  page_content = ""
194
 
195
  return Response(
196
- url=page.url,
197
- content=page_content,
198
- status=final_response.status,
199
- reason=status_text,
200
- encoding=encoding,
201
- cookies=tuple(dict(cookie) for cookie in await page.context.cookies()),
202
- headers=await first_response.all_headers(),
203
- request_headers=await first_response.request.all_headers(),
204
- history=history,
205
- **parser_arguments,
 
 
206
  )
207
 
208
  @staticmethod
@@ -214,15 +247,17 @@ class ResponseFactory:
214
  :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
215
  """
216
  return Response(
217
- url=response.url,
218
- content=response.content if isinstance(response.content, bytes) else response.content.encode(),
219
- status=response.status_code,
220
- reason=response.reason,
221
- encoding=response.encoding or "utf-8",
222
- cookies=dict(response.cookies),
223
- headers=dict(response.headers),
224
- request_headers=dict(response.request.headers),
225
- method=response.request.method,
226
- history=response.history, # https://github.com/lexiforest/curl_cffi/issues/82
227
- **parser_arguments,
 
 
228
  )
 
1
+ from functools import lru_cache
2
+ from re import compile as re_compile
3
+
4
  from curl_cffi.requests import Response as CurlResponse
5
  from playwright.sync_api import Page as SyncPage, Response as SyncResponse
6
  from playwright.async_api import Page as AsyncPage, Response as AsyncResponse
7
 
8
  from scrapling.core.utils import log
 
9
  from .custom import Response, StatusText
10
+ from scrapling.core._types import Dict, Optional
11
+
12
+ __CHARSET_RE__ = re_compile(r"charset=([\w-]+)")
13
 
14
 
15
  class ResponseFactory:
 
22
  response objects, and managing encoding, headers, cookies, and other attributes.
23
  """
24
 
25
+ @classmethod
26
+ @lru_cache(maxsize=16)
27
+ def __extract_browser_encoding(cls, content_type: str | None) -> Optional[str]:
28
+ """Extract the charset name from a Content-Type header value, or None if absent.
29
+ Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8"
30
+ """
31
+ if content_type:
32
+ # Playwright does not expose the response charset itself, so parse it out of the raw header value
33
+ match = __CHARSET_RE__.search(content_type)
34
+ return match.group(1) if match else None
35
+ return None
36
+
37
  @classmethod
38
  def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
39
  """Process response history to build a list of `Response` objects"""
 
47
  history.insert(
48
  0,
49
  Response(
50
+ **{
51
+ "url": current_request.url,
52
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
53
+ "content": "",
54
+ "status": current_response.status if current_response else 301,
55
+ "reason": (current_response.status_text or StatusText.get(current_response.status))
56
+ if current_response
57
+ else StatusText.get(301),
58
+ "encoding": cls.__extract_browser_encoding(
59
+ current_response.headers.get("content-type", "")
60
+ )
61
+ or "utf-8",
62
+ "cookies": tuple(),
63
+ "headers": current_response.all_headers() if current_response else {},
64
+ "request_headers": current_request.all_headers(),
65
+ **parser_arguments,
66
+ }
67
  ),
68
  )
69
  except Exception as e: # pragma: no cover
 
107
  if not final_response:
108
  raise ValueError("Failed to get a response from the page")
109
 
110
+ encoding = (
111
+ cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
112
+ ) # default encoding
113
  # PlayWright API sometimes give empty status text for some reason!
114
  status_text = final_response.status_text or StatusText.get(final_response.status)
115
 
 
121
  page_content = ""
122
 
123
  return Response(
124
+ **{
125
+ "url": page.url,
126
+ "content": page_content,
127
+ "status": final_response.status,
128
+ "reason": status_text,
129
+ "encoding": encoding,
130
+ "cookies": tuple(dict(cookie) for cookie in page.context.cookies()),
131
+ "headers": first_response.all_headers(),
132
+ "request_headers": first_response.request.all_headers(),
133
+ "history": history,
134
+ **parser_arguments,
135
+ }
136
  )
137
 
138
  @classmethod
 
150
  history.insert(
151
  0,
152
  Response(
153
+ **{
154
+ "url": current_request.url,
155
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
156
+ "content": "",
157
+ "status": current_response.status if current_response else 301,
158
+ "reason": (current_response.status_text or StatusText.get(current_response.status))
159
+ if current_response
160
+ else StatusText.get(301),
161
+ "encoding": cls.__extract_browser_encoding(
162
+ current_response.headers.get("content-type", "")
163
+ )
164
+ or "utf-8",
165
+ "cookies": tuple(),
166
+ "headers": await current_response.all_headers() if current_response else {},
167
+ "request_headers": await current_request.all_headers(),
168
+ **parser_arguments,
169
+ }
170
  ),
171
  )
172
  except Exception as e: # pragma: no cover
 
210
  if not final_response:
211
  raise ValueError("Failed to get a response from the page")
212
 
213
+ encoding = (
214
+ cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
215
+ ) # default encoding
216
  # PlayWright API sometimes give empty status text for some reason!
217
  status_text = final_response.status_text or StatusText.get(final_response.status)
218
 
 
224
  page_content = ""
225
 
226
  return Response(
227
+ **{
228
+ "url": page.url,
229
+ "content": page_content,
230
+ "status": final_response.status,
231
+ "reason": status_text,
232
+ "encoding": encoding,
233
+ "cookies": tuple(dict(cookie) for cookie in await page.context.cookies()),
234
+ "headers": await first_response.all_headers(),
235
+ "request_headers": await first_response.request.all_headers(),
236
+ "history": history,
237
+ **parser_arguments,
238
+ }
239
  )
240
 
241
  @staticmethod
 
247
  :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
248
  """
249
  return Response(
250
+ **{
251
+ "url": response.url,
252
+ "content": response.content,
253
+ "status": response.status_code,
254
+ "reason": response.reason,
255
+ "encoding": response.encoding or "utf-8",
256
+ "cookies": dict(response.cookies),
257
+ "headers": dict(response.headers),
258
+ "request_headers": dict(response.request.headers),
259
+ "method": response.request.method,
260
+ "history": response.history, # https://github.com/lexiforest/curl_cffi/issues/82
261
+ **parser_arguments,
262
+ }
263
  )
scrapling/engines/toolbelt/custom.py CHANGED
@@ -17,81 +17,6 @@ from scrapling.core.custom_types import MappingProxyType
17
  from scrapling.parser import Selector, SQLiteStorageSystem
18
 
19
 
20
- class ResponseEncoding:
21
- __DEFAULT_ENCODING = "utf-8"
22
- __ISO_8859_1_CONTENT_TYPES = {
23
- "text/plain",
24
- "text/html",
25
- "text/css",
26
- "text/javascript",
27
- }
28
-
29
- @classmethod
30
- @lru_cache(maxsize=128)
31
- def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
32
- """Parse content type and parameters from a content-type header value.
33
-
34
- Uses `email.message.Message` for robust header parsing according to RFC 2045.
35
-
36
- :param header_value: Raw content-type header string
37
- :return: Tuple of (content_type, parameters_dict)
38
- """
39
- # Create a Message object and set the Content-Type header then get the content type and parameters
40
- msg = Message()
41
- msg["content-type"] = header_value
42
-
43
- content_type = msg.get_content_type()
44
- params = dict(msg.get_params(failobj=[]))
45
-
46
- # Remove the content-type from params if present somehow
47
- params.pop("content-type", None)
48
-
49
- return content_type, params
50
-
51
- @classmethod
52
- @lru_cache(maxsize=128)
53
- def get_value(cls, content_type: Optional[str], text: Optional[str] = "test") -> str:
54
- """Determine the appropriate character encoding from a content-type header.
55
-
56
- The encoding is determined by these rules in order:
57
- 1. If no content-type is provided, use UTF-8
58
- 2. If charset parameter is present, use that encoding
59
- 3. If content-type is `text/*`, use ISO-8859-1 per HTTP/1.1 spec
60
- 4. If content-type is application/json, use UTF-8 per RFC 4627
61
- 5. Default to UTF-8 if nothing else matches
62
-
63
- :param content_type: Content-Type header value or None
64
- :param text: A text to test the encoding on it
65
- :return: String naming the character encoding
66
- """
67
- if not content_type:
68
- return cls.__DEFAULT_ENCODING
69
-
70
- try:
71
- encoding = None
72
- content_type, params = cls.__parse_content_type(content_type)
73
-
74
- # First check for explicit charset parameter
75
- if "charset" in params:
76
- encoding = params["charset"].strip("'\"")
77
-
78
- # Apply content-type specific rules
79
- elif content_type in cls.__ISO_8859_1_CONTENT_TYPES:
80
- encoding = "ISO-8859-1"
81
-
82
- elif content_type == "application/json":
83
- encoding = cls.__DEFAULT_ENCODING
84
-
85
- if encoding:
86
- _ = text.encode(encoding) # Validate encoding and validate it can encode the given text
87
- return encoding
88
-
89
- return cls.__DEFAULT_ENCODING
90
-
91
- except (ValueError, LookupError, UnicodeEncodeError):
92
- return cls.__DEFAULT_ENCODING
93
-
94
-
95
  class Response(Selector):
96
  """This class is returned by all engines as a way to unify response type between different libraries."""
97
 
@@ -116,9 +41,6 @@ class Response(Selector):
116
  self.headers = headers
117
  self.request_headers = request_headers
118
  self.history = history or []
119
- encoding = ResponseEncoding.get_value(
120
- encoding, content.decode("utf-8") if isinstance(content, bytes) else content
121
- )
122
  super().__init__(
123
  content=content,
124
  url=adaptive_domain or url,
 
17
  from scrapling.parser import Selector, SQLiteStorageSystem
18
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  class Response(Selector):
21
  """This class is returned by all engines as a way to unify response type between different libraries."""
22
 
 
41
  self.headers = headers
42
  self.request_headers = request_headers
43
  self.history = history or []
 
 
 
44
  super().__init__(
45
  content=content,
46
  url=adaptive_domain or url,
scrapling/parser.py CHANGED
@@ -74,7 +74,7 @@ class Selector(SelectorsGeneration):
74
  self,
75
  content: Optional[str | bytes] = None,
76
  url: Optional[str] = None,
77
- encoding: str = "utf8",
78
  huge_tree: bool = True,
79
  root: Optional[HtmlElement] = None,
80
  keep_comments: Optional[bool] = False,
@@ -116,7 +116,7 @@ class Selector(SelectorsGeneration):
116
  if isinstance(content, str):
117
  body = content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
118
  elif isinstance(content, bytes):
119
- body = content.replace(b"\x00", b"").strip()
120
  else:
121
  raise TypeError(f"content argument must be str or bytes, got {type(content)}")
122
 
@@ -340,7 +340,7 @@ class Selector(SelectorsGeneration):
340
  @property
341
  def html_content(self) -> TextHandler:
342
  """Return the inner HTML code of the element"""
343
- return TextHandler(tostring(self._root, encoding="unicode", method="html", with_tail=False))
344
 
345
  body = html_content
346
 
@@ -349,7 +349,7 @@ class Selector(SelectorsGeneration):
349
  return TextHandler(
350
  tostring(
351
  self._root,
352
- encoding="unicode",
353
  pretty_print=True,
354
  method="html",
355
  with_tail=False,
 
74
  self,
75
  content: Optional[str | bytes] = None,
76
  url: Optional[str] = None,
77
+ encoding: str = "utf-8",
78
  huge_tree: bool = True,
79
  root: Optional[HtmlElement] = None,
80
  keep_comments: Optional[bool] = False,
 
116
  if isinstance(content, str):
117
  body = content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
118
  elif isinstance(content, bytes):
119
+ body = content.replace(b"\x00", b"")
120
  else:
121
  raise TypeError(f"content argument must be str or bytes, got {type(content)}")
122
 
 
340
  @property
341
  def html_content(self) -> TextHandler:
342
  """Return the inner HTML code of the element"""
343
+ return TextHandler(tostring(self._root, encoding=self.encoding, method="html", with_tail=False))
344
 
345
  body = html_content
346
 
 
349
  return TextHandler(
350
  tostring(
351
  self._root,
352
+ encoding=self.encoding,
353
  pretty_print=True,
354
  method="html",
355
  with_tail=False,