Karim shoair committed on
Commit ·
1e4a507
1
Parent(s): c7dc6e3
Better logic to handle JSON responses
Browse files- scrapling/core/utils.py +13 -1
- scrapling/parser.py +7 -15
scrapling/core/utils.py
CHANGED
|
@@ -4,8 +4,9 @@ from itertools import chain
|
|
| 4 |
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
|
| 5 |
from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
| 6 |
|
| 7 |
-
from scrapling.core._types import Dict, Iterable, Any
|
| 8 |
|
|
|
|
| 9 |
from lxml import html
|
| 10 |
|
| 11 |
html_forbidden = {html.HtmlComment, }
|
|
@@ -18,6 +19,17 @@ logging.basicConfig(
|
|
| 18 |
)
|
| 19 |
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
@cache(None, typed=True)
|
| 22 |
def setup_basic_logging(level: str = 'debug'):
|
| 23 |
levels = {
|
|
|
|
| 4 |
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
|
| 5 |
from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
| 6 |
|
| 7 |
+
from scrapling.core._types import Dict, Iterable, Any, Union
|
| 8 |
|
| 9 |
+
import orjson
|
| 10 |
from lxml import html
|
| 11 |
|
| 12 |
html_forbidden = {html.HtmlComment, }
|
|
|
|
| 19 |
)
|
| 20 |
|
| 21 |
|
| 22 |
+
def is_jsonable(content: Union[bytes, str]) -> bool:
    """Return True if `content` is a valid JSON document, False otherwise.

    :param content: The response body to probe, as `str` or `bytes`.
        `orjson.loads` accepts UTF-8 `bytes` directly, so no manual
        decoding is needed.
    :return: Whether the content parsed successfully as JSON.
    """
    # isinstance (not `type(...) is`) so bytes subclasses are handled too.
    if isinstance(content, bytes):
        content = content.decode()

    try:
        orjson.loads(content)
        return True
    except orjson.JSONDecodeError:
        return False
|
| 31 |
+
|
| 32 |
+
|
| 33 |
@cache(None, typed=True)
|
| 34 |
def setup_basic_logging(level: str = 'debug'):
|
| 35 |
levels = {
|
scrapling/parser.py
CHANGED
|
@@ -7,10 +7,9 @@ from scrapling.core.translator import HTMLTranslator
|
|
| 7 |
from scrapling.core.mixins import SelectorsGeneration
|
| 8 |
from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
|
| 9 |
from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
| 10 |
-
from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
|
| 11 |
from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
|
| 12 |
from lxml import etree, html
|
| 13 |
-
from lxml.etree import XMLSyntaxError
|
| 14 |
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
|
| 15 |
|
| 16 |
|
|
@@ -75,19 +74,12 @@ class Adaptor(SelectorsGeneration):
|
|
| 75 |
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 76 |
|
| 77 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 85 |
-
except XMLSyntaxError:
|
| 86 |
-
parser = html.HTMLParser(
|
| 87 |
-
recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
|
| 88 |
-
compact=True, huge_tree=huge_tree, default_doctype=True
|
| 89 |
-
)
|
| 90 |
-
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 91 |
self.__text = TextHandler(text or body.decode())
|
| 92 |
|
| 93 |
else:
|
|
|
|
| 7 |
from scrapling.core.mixins import SelectorsGeneration
|
| 8 |
from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
|
| 9 |
from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
| 10 |
+
from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
|
| 11 |
from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
|
| 12 |
from lxml import etree, html
|
|
|
|
| 13 |
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
|
| 14 |
|
| 15 |
|
|
|
|
| 74 |
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 75 |
|
| 76 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 77 |
+
parser = html.HTMLParser(
|
| 78 |
+
recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
|
| 79 |
+
compact=True, huge_tree=huge_tree, default_doctype=True
|
| 80 |
+
)
|
| 81 |
+
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 82 |
+
if is_jsonable(text or body.decode()):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
self.__text = TextHandler(text or body.decode())
|
| 84 |
|
| 85 |
else:
|