Karim shoair committed on
Commit
1e4a507
·
1 Parent(s): c7dc6e3

Better logic to handle json responses

Browse files
Files changed (2) hide show
  1. scrapling/core/utils.py +13 -1
  2. scrapling/parser.py +7 -15
scrapling/core/utils.py CHANGED
@@ -4,8 +4,9 @@ from itertools import chain
4
  # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
5
  from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
6
 
7
- from scrapling.core._types import Dict, Iterable, Any
8
 
 
9
  from lxml import html
10
 
11
  html_forbidden = {html.HtmlComment, }
@@ -18,6 +19,17 @@ logging.basicConfig(
18
  )
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
21
  @cache(None, typed=True)
22
  def setup_basic_logging(level: str = 'debug'):
23
  levels = {
 
4
  # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
5
  from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
6
 
7
+ from scrapling.core._types import Dict, Iterable, Any, Union
8
 
9
+ import orjson
10
  from lxml import html
11
 
12
  html_forbidden = {html.HtmlComment, }
 
19
  )
20
 
21
 
22
+ def is_jsonable(content: Union[bytes, str]) -> bool:
23
+ if type(content) is bytes:
24
+ content = content.decode()
25
+
26
+ try:
27
+ _ = orjson.loads(content)
28
+ return True
29
+ except orjson.JSONDecodeError:
30
+ return False
31
+
32
+
33
  @cache(None, typed=True)
34
  def setup_basic_logging(level: str = 'debug'):
35
  levels = {
scrapling/parser.py CHANGED
@@ -7,10 +7,9 @@ from scrapling.core.translator import HTMLTranslator
7
  from scrapling.core.mixins import SelectorsGeneration
8
  from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
9
  from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
10
- from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
11
  from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
12
  from lxml import etree, html
13
- from lxml.etree import XMLSyntaxError
14
  from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
15
 
16
 
@@ -75,19 +74,12 @@ class Adaptor(SelectorsGeneration):
75
  body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
76
 
77
  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
78
- try:
79
- # Test with recover set to False first so if this is a text body like a json response, we get error
80
- parser = html.HTMLParser(
81
- recover=False, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
82
- compact=True, huge_tree=huge_tree, default_doctype=True
83
- )
84
- self._root = etree.fromstring(body, parser=parser, base_url=url)
85
- except XMLSyntaxError:
86
- parser = html.HTMLParser(
87
- recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
88
- compact=True, huge_tree=huge_tree, default_doctype=True
89
- )
90
- self._root = etree.fromstring(body, parser=parser, base_url=url)
91
  self.__text = TextHandler(text or body.decode())
92
 
93
  else:
 
7
  from scrapling.core.mixins import SelectorsGeneration
8
  from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
9
  from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
10
+ from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
11
  from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
12
  from lxml import etree, html
 
13
  from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
14
 
15
 
 
74
  body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
75
 
76
  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
77
+ parser = html.HTMLParser(
78
+ recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
79
+ compact=True, huge_tree=huge_tree, default_doctype=True
80
+ )
81
+ self._root = etree.fromstring(body, parser=parser, base_url=url)
82
+ if is_jsonable(text or body.decode()):
 
 
 
 
 
 
 
83
  self.__text = TextHandler(text or body.decode())
84
 
85
  else: