Karim shoair committed on
Commit
ec56f6e
·
1 Parent(s): 514d2f3

test: adding new tests and updating existing ones

Browse files
tests/fetchers/sync/test_camoufox.py CHANGED
@@ -37,7 +37,7 @@ class TestStealthyFetcher:
37
  assert cookies == {"test": "value"}
38
 
39
  def test_automation(self, fetcher):
40
- """Test if automation break the code or not"""
41
 
42
  def scroll_page(page):
43
  page.mouse.wheel(10, 0)
@@ -59,6 +59,7 @@ class TestStealthyFetcher:
59
  {
60
  "network_idle": True,
61
  "wait": 10,
 
62
  "cookies": [],
63
  "google_search": True,
64
  "extra_headers": {"ayo": ""},
 
37
  assert cookies == {"test": "value"}
38
 
39
  def test_automation(self, fetcher):
40
+ """Test if automation breaks the code or not"""
41
 
42
  def scroll_page(page):
43
  page.mouse.wheel(10, 0)
 
59
  {
60
  "network_idle": True,
61
  "wait": 10,
62
+ "timeout": 30_000,
63
  "cookies": [],
64
  "google_search": True,
65
  "extra_headers": {"ayo": ""},
tests/parser/test_general.py CHANGED
@@ -1,10 +1,12 @@
1
  import pickle
2
  import time
 
3
 
4
  import pytest
5
  from cssselect import SelectorError, SelectorSyntaxError
6
 
7
  from scrapling import Selector
 
8
 
9
 
10
  @pytest.fixture
 
1
  import pickle
2
  import time
3
+ import logging
4
 
5
  import pytest
6
  from cssselect import SelectorError, SelectorSyntaxError
7
 
8
  from scrapling import Selector
9
+ logging.getLogger("scrapling").setLevel(logging.DEBUG)
10
 
11
 
12
  @pytest.fixture
tests/parser/test_html_utils.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from scrapling.core._html_utils import to_unicode, _replace_entities, name2codepoint
4
+
5
+
6
+ class TestToUnicode:
7
+ def test_string_input(self):
8
+ """Test to_unicode with string input"""
9
+ text = "hello world"
10
+ assert to_unicode(text) == "hello world"
11
+
12
+ def test_bytes_input_default_encoding(self):
13
+ """Test to_unicode with `bytes` input using default UTF-8"""
14
+ text = b"hello world"
15
+ assert to_unicode(text) == "hello world"
16
+
17
+ def test_bytes_input_custom_encoding(self):
18
+ """Test to_unicode with custom encoding"""
19
+ text = "café".encode('latin-1')
20
+ assert to_unicode(text, encoding='latin-1') == "café"
21
+
22
+ def test_bytes_input_with_errors(self):
23
+ """Test to_unicode with error handling"""
24
+ # Invalid UTF-8 bytes
25
+ text = b'\xff\xfe'
26
+ assert to_unicode(text, errors='ignore') == ""
27
+ assert to_unicode(text, errors='replace') == "��"
28
+
29
+ def test_invalid_input_type(self):
30
+ """Test to_unicode with an invalid input type"""
31
+ with pytest.raises(TypeError, match="to_unicode must receive bytes or str"):
32
+ to_unicode(123)
33
+
34
+ def test_none_encoding_defaults_to_utf8(self):
35
+ """Test that None encoding defaults to UTF-8"""
36
+ text = "café".encode('utf-8')
37
+ assert to_unicode(text, encoding=None) == "café"
38
+
39
+
40
+ class TestReplaceEntities:
41
+ def test_named_entities(self):
42
+ """Test replacement of named HTML entities"""
43
+ text = "&amp; &lt; &gt; &quot; &nbsp;"
44
+ result = _replace_entities(text)
45
+ assert result == "& < > \" \xa0"
46
+
47
+ def test_decimal_entities(self):
48
+ """Test replacement of decimal numeric entities"""
49
+ text = "&#38; &#60; &#62;"
50
+ result = _replace_entities(text)
51
+ assert result == "& < >"
52
+
53
+ def test_hexadecimal_entities(self):
54
+ """Test replacement of hexadecimal numeric entities"""
55
+ text = "&#x26; &#x3C; &#x3E;"
56
+ result = _replace_entities(text)
57
+ assert result == "& < >"
58
+
59
+ def test_mixed_entities(self):
60
+ """Test replacement of mixed entity types"""
61
+ text = "Price: &pound;100 &#8364;50 &#x24;25"
62
+ result = _replace_entities(text)
63
+ assert result == "Price: £100 €50 $25"
64
+
65
+ def test_keep_entities(self):
66
+ """Test keeping specific entities"""
67
+ text = "&amp; &lt; &gt;"
68
+ result = _replace_entities(text, keep=['amp', 'lt'])
69
+ assert result == "&amp; &lt; >"
70
+
71
+ def test_windows_1252_range(self):
72
+ """Test handling of Windows-1252 range characters"""
73
+ text = "&#128; &#130; &#159;" # Windows-1252 range
74
+ result = _replace_entities(text)
75
+ # These should be decoded using cp1252
76
+ assert "€" in result # 128 -> Euro sign
77
+
78
+ def test_remove_illegal_entities_true(self):
79
+ """Test removing illegal entities with remove_illegal=True"""
80
+ text = "&unknown; &#999999;"
81
+ result = _replace_entities(text, remove_illegal=True)
82
+ # The function may convert large numbers to Unicode characters or leave them as-is
83
+ assert "&unknown;" not in result # Unknown entities should be removed or converted
84
+
85
+ def test_remove_illegal_entities_false(self):
86
+ """Test keeping illegal entities with remove_illegal=False"""
87
+ text = "&unknown; &#999999;"
88
+ result = _replace_entities(text, remove_illegal=False)
89
+ # Unknown entities should be preserved when remove_illegal=False
90
+ assert "&unknown;" in result
91
+ # Large numeric entities may be converted to Unicode characters
92
+
93
+ def test_bytes_input(self):
94
+ """Test with bytes input"""
95
+ text = b"&amp; &lt; &gt;"
96
+ result = _replace_entities(text)
97
+ assert result == "& < >"
98
+
99
+ def test_custom_encoding(self):
100
+ """Test with custom encoding"""
101
+ text = "&eacute;".encode('latin-1')
102
+ result = _replace_entities(text, encoding='latin-1')
103
+ assert result == "é"
104
+
105
+ def test_entities_without_semicolon(self):
106
+ """Test entities without semicolon"""
107
+ text = "&amp &lt &gt"
108
+ result = _replace_entities(text, remove_illegal=True)
109
+ # Should handle entities without a semicolon
110
+ assert len(result) <= len(text)
111
+
112
+ def test_case_insensitive_named_entities(self):
113
+ """Test case-insensitive named-entity handling"""
114
+ text = "&AMP; &Lt; &GT;"
115
+ result = _replace_entities(text)
116
+ assert result == "& < >"
117
+
118
+ def test_edge_cases(self):
119
+ """Test edge cases"""
120
+ # Empty string
121
+ assert _replace_entities("") == ""
122
+
123
+ # No entities
124
+ assert _replace_entities("plain text") == "plain text"
125
+
126
+ # Invalid numeric entity
127
+ text = "&#-1;"
128
+ result = _replace_entities(text, remove_illegal=True)
129
+ # Invalid entities may be left as-is or removed depending on implementation
130
+ assert len(result) >= 0 # Ensure no exception is raised
131
+
132
+
133
+ class TestName2Codepoint:
134
+ def test_common_entities_exist(self):
135
+ """Test that common HTML entities exist in mapping"""
136
+ common_entities = ['amp', 'lt', 'gt', 'quot', 'nbsp', 'copy', 'reg']
137
+ for entity in common_entities:
138
+ assert entity in name2codepoint
139
+
140
+ def test_greek_letters_exist(self):
141
+ """Test that Greek letter entities exist"""
142
+ greek_letters = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
143
+ for letter in greek_letters:
144
+ assert letter in name2codepoint
145
+
146
+ def test_mathematical_symbols_exist(self):
147
+ """Test that mathematical symbol entities exist"""
148
+ math_symbols = ['sum', 'prod', 'int', 'infin', 'plusmn']
149
+ for symbol in math_symbols:
150
+ assert symbol in name2codepoint
151
+
152
+ def test_currency_symbols_exist(self):
153
+ """Test that currency symbol entities exist"""
154
+ currencies = ['pound', 'yen', 'euro', 'cent']
155
+ for currency in currencies:
156
+ assert currency in name2codepoint
157
+
158
+ def test_codepoint_values(self):
159
+ """Test specific codepoint values"""
160
+ assert name2codepoint['amp'] == 0x0026 # &
161
+ assert name2codepoint['lt'] == 0x003C # <
162
+ assert name2codepoint['gt'] == 0x003E # >
163
+ assert name2codepoint['nbsp'] == 0x00A0 # non-breaking space
164
+ assert name2codepoint['copy'] == 0x00A9 # ©
165
+
166
+
167
+ class TestIntegration:
168
+ def test_real_world_html(self):
169
+ """Test with real-world HTML content"""
170
+ html = """
171
+ &lt;div class=&quot;content&quot;&gt;
172
+ &copy; 2024 Company &amp; Associates
173
+ Price: &pound;99.99 (&euro;89.99)
174
+ Math: &alpha; + &beta; = &gamma;
175
+ &lt;/div&gt;
176
+ """
177
+ result = _replace_entities(html)
178
+
179
+ assert '<div class="content">' in result
180
+ assert '© 2024 Company & Associates' in result
181
+ assert 'Price: £99.99 (€89.99)' in result
182
+ assert 'Math: α + β = γ' in result
183
+
184
+ def test_performance_with_large_text(self):
185
+ """Test performance with large text containing many entities"""
186
+ # Create large text with repeated entities
187
+ text = ("&amp; &lt; &gt; &quot; " * 1000)
188
+ result = _replace_entities(text)
189
+
190
+ # Should complete without issues and have correct content
191
+ assert result.count("&") == 1000
192
+ assert result.count("<") == 1000
193
+ assert result.count(">") == 1000
194
+ assert result.count('"') == 1000