Karim shoair committed on
Commit ·
1e4a507
1
Parent(s): c7dc6e3
Better logic to handle JSON responses
Browse files- scrapling/core/utils.py +13 -1
- scrapling/parser.py +7 -15
scrapling/core/utils.py
CHANGED
|
@@ -4,8 +4,9 @@ from itertools import chain
|
|
| 4 |
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
|
| 5 |
from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
| 6 |
|
| 7 |
-
from scrapling.core._types import Dict, Iterable, Any
|
| 8 |
|
|
|
|
| 9 |
from lxml import html
|
| 10 |
|
| 11 |
html_forbidden = {html.HtmlComment, }
|
|
@@ -18,6 +19,17 @@ logging.basicConfig(
|
|
| 18 |
)
|
| 19 |
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
@cache(None, typed=True)
|
| 22 |
def setup_basic_logging(level: str = 'debug'):
|
| 23 |
levels = {
|
|
|
|
| 4 |
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
|
| 5 |
from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
| 6 |
|
| 7 |
+
from scrapling.core._types import Dict, Iterable, Any, Union
|
| 8 |
|
| 9 |
+
import orjson
|
| 10 |
from lxml import html
|
| 11 |
|
| 12 |
html_forbidden = {html.HtmlComment, }
|
|
|
|
| 19 |
)
|
| 20 |
|
| 21 |
|
| 22 |
+
def is_jsonable(content: Union[bytes, str]) -> bool:
    """Return True if `content` is a valid JSON document, False otherwise.

    :param content: The response body to probe, as `str` or `bytes`.
        `orjson.loads` accepts UTF-8 `bytes` directly, so no manual
        decoding is needed.
    :return: Whether the content parsed successfully as JSON.
    """
    # isinstance (not `type(...) is`) so bytes subclasses are handled too.
    if isinstance(content, bytes):
        content = content.decode()

    try:
        orjson.loads(content)
        return True
    except orjson.JSONDecodeError:
        return False
|
| 31 |
+
|
| 32 |
+
|
| 33 |
@cache(None, typed=True)
|
| 34 |
def setup_basic_logging(level: str = 'debug'):
|
| 35 |
levels = {
|
scrapling/parser.py
CHANGED
|
@@ -7,10 +7,9 @@ from scrapling.core.translator import HTMLTranslator
|
|
| 7 |
from scrapling.core.mixins import SelectorsGeneration
|
| 8 |
from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
|
| 9 |
from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
| 10 |
-
from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
|
| 11 |
from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
|
| 12 |
from lxml import etree, html
|
| 13 |
-
from lxml.etree import XMLSyntaxError
|
| 14 |
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
|
| 15 |
|
| 16 |
|
|
@@ -75,19 +74,12 @@ class Adaptor(SelectorsGeneration):
|
|
| 75 |
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 76 |
|
| 77 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 85 |
-
except XMLSyntaxError:
|
| 86 |
-
parser = html.HTMLParser(
|
| 87 |
-
recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
|
| 88 |
-
compact=True, huge_tree=huge_tree, default_doctype=True
|
| 89 |
-
)
|
| 90 |
-
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 91 |
self.__text = TextHandler(text or body.decode())
|
| 92 |
|
| 93 |
else:
|
|
|
|
| 7 |
from scrapling.core.mixins import SelectorsGeneration
|
| 8 |
from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
|
| 9 |
from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
| 10 |
+
from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
|
| 11 |
from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
|
| 12 |
from lxml import etree, html
|
|
|
|
| 13 |
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
|
| 14 |
|
| 15 |
|
|
|
|
| 74 |
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 75 |
|
| 76 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 77 |
+
parser = html.HTMLParser(
|
| 78 |
+
recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
|
| 79 |
+
compact=True, huge_tree=huge_tree, default_doctype=True
|
| 80 |
+
)
|
| 81 |
+
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 82 |
+
if is_jsonable(text or body.decode()):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
self.__text = TextHandler(text or body.decode())
|
| 84 |
|
| 85 |
else:
|