Karim shoair committed on
Commit
3db9c55
·
1 Parent(s): bb70f02

refactor: replace Selector input (text/body) with 1 argument called `content`

Browse files
benchmarks.py CHANGED
@@ -80,7 +80,7 @@ def test_scrapling():
80
 
81
  @benchmark
82
  def test_parsel():
83
- return Selector(text=large_html).css(".item::text").extract()
84
 
85
 
86
  @benchmark
 
80
 
81
  @benchmark
82
  def test_parsel():
83
+ return Selector(content=large_html).css(".item::text").extract()
84
 
85
 
86
  @benchmark
scrapling/engines/toolbelt/convertor.py CHANGED
@@ -34,8 +34,7 @@ class ResponseFactory:
34
  Response(
35
  url=current_request.url,
36
  # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
37
- text="",
38
- body=b"",
39
  status=current_response.status if current_response else 301,
40
  reason=(
41
  current_response.status_text
@@ -112,8 +111,7 @@ class ResponseFactory:
112
 
113
  return Response(
114
  url=page.url,
115
- text=page_content,
116
- body=page_content.encode("utf-8"),
117
  status=final_response.status,
118
  reason=status_text,
119
  encoding=encoding,
@@ -141,8 +139,7 @@ class ResponseFactory:
141
  Response(
142
  url=current_request.url,
143
  # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
144
- text="",
145
- body=b"",
146
  status=current_response.status if current_response else 301,
147
  reason=(
148
  current_response.status_text
@@ -221,8 +218,7 @@ class ResponseFactory:
221
 
222
  return Response(
223
  url=page.url,
224
- text=page_content,
225
- body=page_content.encode("utf-8"),
226
  status=final_response.status,
227
  reason=status_text,
228
  encoding=encoding,
@@ -243,8 +239,7 @@ class ResponseFactory:
243
  """
244
  return Response(
245
  url=response.url,
246
- text=response.text,
247
- body=response.content
248
  if type(response.content) is bytes
249
  else response.content.encode(),
250
  status=response.status_code,
 
34
  Response(
35
  url=current_request.url,
36
  # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
37
+ content="",
 
38
  status=current_response.status if current_response else 301,
39
  reason=(
40
  current_response.status_text
 
111
 
112
  return Response(
113
  url=page.url,
114
+ content=page_content,
 
115
  status=final_response.status,
116
  reason=status_text,
117
  encoding=encoding,
 
139
  Response(
140
  url=current_request.url,
141
  # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
142
+ content="",
 
143
  status=current_response.status if current_response else 301,
144
  reason=(
145
  current_response.status_text
 
218
 
219
  return Response(
220
  url=page.url,
221
+ content=page_content,
 
222
  status=final_response.status,
223
  reason=status_text,
224
  encoding=encoding,
 
239
  """
240
  return Response(
241
  url=response.url,
242
+ content=response.content
 
243
  if type(response.content) is bytes
244
  else response.content.encode(),
245
  status=response.status_code,
scrapling/engines/toolbelt/custom.py CHANGED
@@ -103,8 +103,7 @@ class Response(Selector):
103
  def __init__(
104
  self,
105
  url: str,
106
- text: str,
107
- body: bytes,
108
  status: int,
109
  reason: str,
110
  cookies: Union[Tuple[Dict[str, str], ...], Dict[str, str]],
@@ -122,10 +121,11 @@ class Response(Selector):
122
  self.headers = headers
123
  self.request_headers = request_headers
124
  self.history = history or []
125
- encoding = ResponseEncoding.get_value(encoding, text)
 
 
126
  super().__init__(
127
- text=text,
128
- body=body,
129
  url=adaptive_domain or url,
130
  encoding=encoding,
131
  **selector_config,
 
103
  def __init__(
104
  self,
105
  url: str,
106
+ content: str | bytes,
 
107
  status: int,
108
  reason: str,
109
  cookies: Union[Tuple[Dict[str, str], ...], Dict[str, str]],
 
121
  self.headers = headers
122
  self.request_headers = request_headers
123
  self.history = history or []
124
+ encoding = ResponseEncoding.get_value(
125
+ encoding, content.decode("utf-8") if isinstance(content, bytes) else content
126
+ )
127
  super().__init__(
128
+ content=content,
 
129
  url=adaptive_domain or url,
130
  encoding=encoding,
131
  **selector_config,
scrapling/parser.py CHANGED
@@ -50,9 +50,8 @@ class Selector(SelectorsGeneration):
50
 
51
  def __init__(
52
  self,
53
- text: Optional[str] = None,
54
  url: Optional[str] = None,
55
- body: bytes = b"",
56
  encoding: str = "utf8",
57
  huge_tree: bool = True,
58
  root: Optional[html.HtmlElement] = None,
@@ -72,9 +71,8 @@ class Selector(SelectorsGeneration):
72
  not possible. You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.
73
  It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
74
 
75
- :param text: HTML body passed as text.
76
  :param url: It allows storing a URL with the HTML data for retrieving later.
77
- :param body: HTML body as an ``bytes`` object. It can be used instead of the ``text`` argument.
78
  :param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
79
  :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
80
  the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
@@ -88,27 +86,23 @@ class Selector(SelectorsGeneration):
88
  :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
89
  If empty, default values will be used.
90
  """
91
- if root is None and not body and text is None:
92
  raise ValueError(
93
- "Selector class needs text, body, or root arguments to work"
94
  )
95
 
96
  self.__text = ""
97
  if root is None:
98
- if text is None:
99
- if not body or not isinstance(body, bytes):
100
- raise TypeError(
101
- f"body argument must be valid and of type bytes, got {body.__class__}"
102
- )
103
-
104
- body = body.replace(b"\x00", b"").strip()
105
  else:
106
- if not isinstance(text, str):
107
- raise TypeError(
108
- f"text argument must be of type str, got {text.__class__}"
109
- )
110
-
111
- body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
112
 
113
  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
114
  parser = html.HTMLParser(
@@ -122,8 +116,10 @@ class Selector(SelectorsGeneration):
122
  strip_cdata=(not keep_cdata),
123
  )
124
  self._root = etree.fromstring(body, parser=parser, base_url=url)
125
- if is_jsonable(text or body.decode()):
126
- self.__text = TextHandler(text or body.decode())
 
 
127
 
128
  else:
129
  # All HTML types inherit from HtmlMixin so this to check for all at once
@@ -930,7 +926,7 @@ class Selector(SelectorsGeneration):
930
  ) -> None:
931
  """Saves the element's unique properties to the storage for retrieval and relocation later
932
 
933
- :param element: The element itself that we want to save to storage, it can be an ` Selector ` or pure ` HtmlElement `
934
  :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
935
  the docs for more info.
936
  """
 
50
 
51
  def __init__(
52
  self,
53
+ content: Optional[Union[str, bytes]] = None,
54
  url: Optional[str] = None,
 
55
  encoding: str = "utf8",
56
  huge_tree: bool = True,
57
  root: Optional[html.HtmlElement] = None,
 
71
  not possible. You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.
72
  It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
73
 
74
+ :param content: HTML content as either string or bytes.
75
  :param url: It allows storing a URL with the HTML data for retrieving later.
 
76
  :param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
77
  :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
78
  the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
 
86
  :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
87
  If empty, default values will be used.
88
  """
89
+ if root is None and content is None:
90
  raise ValueError(
91
+ "Selector class needs HTML content, or root arguments to work"
92
  )
93
 
94
  self.__text = ""
95
  if root is None:
96
+ if isinstance(content, bytes):
97
+ body = content.replace(b"\x00", b"").strip()
98
+ elif isinstance(content, str):
99
+ body = (
100
+ content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
101
+ )
 
102
  else:
103
+ raise TypeError(
104
+ f"content argument must be str or bytes, got {type(content)}"
105
+ )
 
 
 
106
 
107
  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
108
  parser = html.HTMLParser(
 
116
  strip_cdata=(not keep_cdata),
117
  )
118
  self._root = etree.fromstring(body, parser=parser, base_url=url)
119
+
120
+ jsonable_text = content if isinstance(content, str) else body.decode()
121
+ if is_jsonable(jsonable_text):
122
+ self.__text = TextHandler(jsonable_text)
123
 
124
  else:
125
  # All HTML types inherit from HtmlMixin so this to check for all at once
 
926
  ) -> None:
927
  """Saves the element's unique properties to the storage for retrieval and relocation later
928
 
929
+ :param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
930
  :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
931
  the docs for more info.
932
  """
tests/parser/test_general.py CHANGED
@@ -173,10 +173,7 @@ class TestErrorHandling:
173
  _ = Selector(root="ayo", adaptive=False)
174
 
175
  with pytest.raises(TypeError):
176
- _ = Selector(text=1, adaptive=False)
177
-
178
- with pytest.raises(TypeError):
179
- _ = Selector(body=1, adaptive=False)
180
 
181
  def test_invalid_storage(self, page, html_content):
182
  """Test invalid storage parameter"""
 
173
  _ = Selector(root="ayo", adaptive=False)
174
 
175
  with pytest.raises(TypeError):
176
+ _ = Selector(content=1, adaptive=False)
 
 
 
177
 
178
  def test_invalid_storage(self, page, html_content):
179
  """Test invalid storage parameter"""