Karim shoair committed on
Commit ·
3db9c55
1
Parent(s): bb70f02
refactor: replaces Selector input (text/body) with 1 argument called `content`
Browse files- benchmarks.py +1 -1
- scrapling/engines/toolbelt/convertor.py +5 -10
- scrapling/engines/toolbelt/custom.py +5 -5
- scrapling/parser.py +18 -22
- tests/parser/test_general.py +1 -4
benchmarks.py
CHANGED
|
@@ -80,7 +80,7 @@ def test_scrapling():
|
|
| 80 |
|
| 81 |
@benchmark
|
| 82 |
def test_parsel():
|
| 83 |
-
return Selector(
|
| 84 |
|
| 85 |
|
| 86 |
@benchmark
|
|
|
|
| 80 |
|
| 81 |
@benchmark
|
| 82 |
def test_parsel():
|
| 83 |
+
return Selector(content=large_html).css(".item::text").extract()
|
| 84 |
|
| 85 |
|
| 86 |
@benchmark
|
scrapling/engines/toolbelt/convertor.py
CHANGED
|
@@ -34,8 +34,7 @@ class ResponseFactory:
|
|
| 34 |
Response(
|
| 35 |
url=current_request.url,
|
| 36 |
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 37 |
-
|
| 38 |
-
body=b"",
|
| 39 |
status=current_response.status if current_response else 301,
|
| 40 |
reason=(
|
| 41 |
current_response.status_text
|
|
@@ -112,8 +111,7 @@ class ResponseFactory:
|
|
| 112 |
|
| 113 |
return Response(
|
| 114 |
url=page.url,
|
| 115 |
-
|
| 116 |
-
body=page_content.encode("utf-8"),
|
| 117 |
status=final_response.status,
|
| 118 |
reason=status_text,
|
| 119 |
encoding=encoding,
|
|
@@ -141,8 +139,7 @@ class ResponseFactory:
|
|
| 141 |
Response(
|
| 142 |
url=current_request.url,
|
| 143 |
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 144 |
-
|
| 145 |
-
body=b"",
|
| 146 |
status=current_response.status if current_response else 301,
|
| 147 |
reason=(
|
| 148 |
current_response.status_text
|
|
@@ -221,8 +218,7 @@ class ResponseFactory:
|
|
| 221 |
|
| 222 |
return Response(
|
| 223 |
url=page.url,
|
| 224 |
-
|
| 225 |
-
body=page_content.encode("utf-8"),
|
| 226 |
status=final_response.status,
|
| 227 |
reason=status_text,
|
| 228 |
encoding=encoding,
|
|
@@ -243,8 +239,7 @@ class ResponseFactory:
|
|
| 243 |
"""
|
| 244 |
return Response(
|
| 245 |
url=response.url,
|
| 246 |
-
|
| 247 |
-
body=response.content
|
| 248 |
if type(response.content) is bytes
|
| 249 |
else response.content.encode(),
|
| 250 |
status=response.status_code,
|
|
|
|
| 34 |
Response(
|
| 35 |
url=current_request.url,
|
| 36 |
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 37 |
+
content="",
|
|
|
|
| 38 |
status=current_response.status if current_response else 301,
|
| 39 |
reason=(
|
| 40 |
current_response.status_text
|
|
|
|
| 111 |
|
| 112 |
return Response(
|
| 113 |
url=page.url,
|
| 114 |
+
content=page_content,
|
|
|
|
| 115 |
status=final_response.status,
|
| 116 |
reason=status_text,
|
| 117 |
encoding=encoding,
|
|
|
|
| 139 |
Response(
|
| 140 |
url=current_request.url,
|
| 141 |
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 142 |
+
content="",
|
|
|
|
| 143 |
status=current_response.status if current_response else 301,
|
| 144 |
reason=(
|
| 145 |
current_response.status_text
|
|
|
|
| 218 |
|
| 219 |
return Response(
|
| 220 |
url=page.url,
|
| 221 |
+
content=page_content,
|
|
|
|
| 222 |
status=final_response.status,
|
| 223 |
reason=status_text,
|
| 224 |
encoding=encoding,
|
|
|
|
| 239 |
"""
|
| 240 |
return Response(
|
| 241 |
url=response.url,
|
| 242 |
+
content=response.content
|
|
|
|
| 243 |
if type(response.content) is bytes
|
| 244 |
else response.content.encode(),
|
| 245 |
status=response.status_code,
|
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -103,8 +103,7 @@ class Response(Selector):
|
|
| 103 |
def __init__(
|
| 104 |
self,
|
| 105 |
url: str,
|
| 106 |
-
|
| 107 |
-
body: bytes,
|
| 108 |
status: int,
|
| 109 |
reason: str,
|
| 110 |
cookies: Union[Tuple[Dict[str, str], ...], Dict[str, str]],
|
|
@@ -122,10 +121,11 @@ class Response(Selector):
|
|
| 122 |
self.headers = headers
|
| 123 |
self.request_headers = request_headers
|
| 124 |
self.history = history or []
|
| 125 |
-
encoding = ResponseEncoding.get_value(
|
|
|
|
|
|
|
| 126 |
super().__init__(
|
| 127 |
-
|
| 128 |
-
body=body,
|
| 129 |
url=adaptive_domain or url,
|
| 130 |
encoding=encoding,
|
| 131 |
**selector_config,
|
|
|
|
| 103 |
def __init__(
|
| 104 |
self,
|
| 105 |
url: str,
|
| 106 |
+
content: str | bytes,
|
|
|
|
| 107 |
status: int,
|
| 108 |
reason: str,
|
| 109 |
cookies: Union[Tuple[Dict[str, str], ...], Dict[str, str]],
|
|
|
|
| 121 |
self.headers = headers
|
| 122 |
self.request_headers = request_headers
|
| 123 |
self.history = history or []
|
| 124 |
+
encoding = ResponseEncoding.get_value(
|
| 125 |
+
encoding, content.decode("utf-8") if isinstance(content, bytes) else content
|
| 126 |
+
)
|
| 127 |
super().__init__(
|
| 128 |
+
content=content,
|
|
|
|
| 129 |
url=adaptive_domain or url,
|
| 130 |
encoding=encoding,
|
| 131 |
**selector_config,
|
scrapling/parser.py
CHANGED
|
@@ -50,9 +50,8 @@ class Selector(SelectorsGeneration):
|
|
| 50 |
|
| 51 |
def __init__(
|
| 52 |
self,
|
| 53 |
-
|
| 54 |
url: Optional[str] = None,
|
| 55 |
-
body: bytes = b"",
|
| 56 |
encoding: str = "utf8",
|
| 57 |
huge_tree: bool = True,
|
| 58 |
root: Optional[html.HtmlElement] = None,
|
|
@@ -72,9 +71,8 @@ class Selector(SelectorsGeneration):
|
|
| 72 |
not possible. You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.
|
| 73 |
It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
|
| 74 |
|
| 75 |
-
:param
|
| 76 |
:param url: It allows storing a URL with the HTML data for retrieving later.
|
| 77 |
-
:param body: HTML body as an ``bytes`` object. It can be used instead of the ``text`` argument.
|
| 78 |
:param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
|
| 79 |
:param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
|
| 80 |
the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
|
|
@@ -88,27 +86,23 @@ class Selector(SelectorsGeneration):
|
|
| 88 |
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
| 89 |
If empty, default values will be used.
|
| 90 |
"""
|
| 91 |
-
if root is None and
|
| 92 |
raise ValueError(
|
| 93 |
-
"Selector class needs
|
| 94 |
)
|
| 95 |
|
| 96 |
self.__text = ""
|
| 97 |
if root is None:
|
| 98 |
-
if
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
)
|
| 103 |
-
|
| 104 |
-
body = body.replace(b"\x00", b"").strip()
|
| 105 |
else:
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
)
|
| 110 |
-
|
| 111 |
-
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 112 |
|
| 113 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 114 |
parser = html.HTMLParser(
|
|
@@ -122,8 +116,10 @@ class Selector(SelectorsGeneration):
|
|
| 122 |
strip_cdata=(not keep_cdata),
|
| 123 |
)
|
| 124 |
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
| 127 |
|
| 128 |
else:
|
| 129 |
# All HTML types inherit from HtmlMixin so this to check for all at once
|
|
@@ -930,7 +926,7 @@ class Selector(SelectorsGeneration):
|
|
| 930 |
) -> None:
|
| 931 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 932 |
|
| 933 |
-
:param element: The element itself that we want to save to storage, it can be
|
| 934 |
:param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
|
| 935 |
the docs for more info.
|
| 936 |
"""
|
|
|
|
| 50 |
|
| 51 |
def __init__(
|
| 52 |
self,
|
| 53 |
+
content: Optional[Union[str, bytes]] = None,
|
| 54 |
url: Optional[str] = None,
|
|
|
|
| 55 |
encoding: str = "utf8",
|
| 56 |
huge_tree: bool = True,
|
| 57 |
root: Optional[html.HtmlElement] = None,
|
|
|
|
| 71 |
not possible. You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.
|
| 72 |
It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
|
| 73 |
|
| 74 |
+
:param content: HTML content as either string or bytes.
|
| 75 |
:param url: It allows storing a URL with the HTML data for retrieving later.
|
|
|
|
| 76 |
:param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
|
| 77 |
:param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
|
| 78 |
the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
|
|
|
|
| 86 |
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
| 87 |
If empty, default values will be used.
|
| 88 |
"""
|
| 89 |
+
if root is None and content is None:
|
| 90 |
raise ValueError(
|
| 91 |
+
"Selector class needs HTML content, or root arguments to work"
|
| 92 |
)
|
| 93 |
|
| 94 |
self.__text = ""
|
| 95 |
if root is None:
|
| 96 |
+
if isinstance(content, bytes):
|
| 97 |
+
body = content.replace(b"\x00", b"").strip()
|
| 98 |
+
elif isinstance(content, str):
|
| 99 |
+
body = (
|
| 100 |
+
content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 101 |
+
)
|
|
|
|
| 102 |
else:
|
| 103 |
+
raise TypeError(
|
| 104 |
+
f"content argument must be str or bytes, got {type(content)}"
|
| 105 |
+
)
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 108 |
parser = html.HTMLParser(
|
|
|
|
| 116 |
strip_cdata=(not keep_cdata),
|
| 117 |
)
|
| 118 |
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 119 |
+
|
| 120 |
+
jsonable_text = content if isinstance(content, str) else body.decode()
|
| 121 |
+
if is_jsonable(jsonable_text):
|
| 122 |
+
self.__text = TextHandler(jsonable_text)
|
| 123 |
|
| 124 |
else:
|
| 125 |
# All HTML types inherit from HtmlMixin so this to check for all at once
|
|
|
|
| 926 |
) -> None:
|
| 927 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 928 |
|
| 929 |
+
:param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
|
| 930 |
:param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
|
| 931 |
the docs for more info.
|
| 932 |
"""
|
tests/parser/test_general.py
CHANGED
|
@@ -173,10 +173,7 @@ class TestErrorHandling:
|
|
| 173 |
_ = Selector(root="ayo", adaptive=False)
|
| 174 |
|
| 175 |
with pytest.raises(TypeError):
|
| 176 |
-
_ = Selector(
|
| 177 |
-
|
| 178 |
-
with pytest.raises(TypeError):
|
| 179 |
-
_ = Selector(body=1, adaptive=False)
|
| 180 |
|
| 181 |
def test_invalid_storage(self, page, html_content):
|
| 182 |
"""Test invalid storage parameter"""
|
|
|
|
| 173 |
_ = Selector(root="ayo", adaptive=False)
|
| 174 |
|
| 175 |
with pytest.raises(TypeError):
|
| 176 |
+
_ = Selector(content=1, adaptive=False)
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
def test_invalid_storage(self, page, html_content):
|
| 179 |
"""Test invalid storage parameter"""
|