import pytest
import json
from scrapling import Selector
from scrapling.core.custom_types import AttributesHandler
class TestAttributesHandler:
    """Tests for the mapping returned by an element's ``attrib``.

    Every test parses a small HTML snippet with ``Selector``, picks an
    element with ``css(...)`` and checks the behaviour of its ``attrib``
    object: dict-style access, iteration, JSON helpers, value search,
    comparison and handling of unusual markup.
    """

    @pytest.fixture
    def sample_html(self):
        """Return the HTML document shared by most tests.

        The document holds a ``#main`` div with many attributes (JSON
        payloads, empty values, URLs, event handlers), an ``<input>`` with
        boolean attributes and an ``<img>``.
        """
        # The markup is kept flush-left so the literal stays exactly as is.
        return """
<html>
<body>
<div id="main"
class="container active"
data-config='{"theme": "dark", "version": 2.5}'
data-items='[1, 2, 3, 4, 5]'
data-invalid-json='{"broken: json}'
title="Main Container"
style="color: red; background: blue;"
data-empty=""
data-number="42"
data-bool="true"
data-url="https://example.com/page?param=value"
custom-attr="custom-value"
data-nested='{"user": {"name": "John", "age": 30}}'
data-encoded="<div>HTML</div>"
onclick="handleClick()"
data-null="null"
data-undefined="undefined">
Content
</div>
<input type="text"
name="username"
value="test@example.com"
placeholder="Enter email"
required
disabled>
<img src="/images/photo.jpg"
alt="Photo"
width="100"
height="100"
loading="lazy">
</body>
</html>
"""

    @pytest.fixture
    def attributes(self, sample_html):
        """Return the ``attrib`` object of the ``#main`` element."""
        page = Selector(sample_html)
        element = page.css("#main")[0]
        return element.attrib

    def test_basic_attribute_access(self, attributes):
        """Test basic attribute access"""
        # Dict-like access
        assert attributes["id"] == "main"
        assert attributes["class"] == "container active"
        assert attributes["title"] == "Main Container"

        # Key existence
        assert "id" in attributes
        assert "nonexistent" not in attributes

        # Get with default
        assert attributes.get("id") == "main"
        assert attributes.get("nonexistent") is None
        assert attributes.get("nonexistent", "default") == "default"

    def test_iteration_methods(self, attributes):
        """Test iteration over attributes"""
        # Keys
        keys = list(attributes.keys())
        assert "id" in keys
        assert "class" in keys
        assert "data-config" in keys

        # Values
        values = list(attributes.values())
        assert "main" in values
        assert "container active" in values

        # Items
        items = dict(attributes.items())
        assert items["id"] == "main"
        assert items["class"] == "container active"

        # Length
        assert len(attributes) > 0

    def test_json_parsing(self, attributes):
        """Test JSON parsing from attributes"""
        # Valid JSON object
        config = attributes["data-config"].json()
        assert config["theme"] == "dark"
        assert config["version"] == 2.5

        # Valid JSON array
        items = attributes["data-items"].json()
        assert items == [1, 2, 3, 4, 5]

        # Nested JSON
        nested = attributes["data-nested"].json()
        assert nested["user"]["name"] == "John"
        assert nested["user"]["age"] == 30

        # JSON null
        assert attributes["data-null"].json() is None

    def test_json_error_handling(self, attributes):
        """Test JSON parsing error handling"""
        # Invalid JSON must raise: pytest.raises fails the test if .json()
        # returns normally (including returning None).
        # NOTE(review): AttributeError is accepted alongside JSONDecodeError;
        # confirm which one the implementation is actually expected to raise.
        with pytest.raises((json.JSONDecodeError, AttributeError)):
            attributes["data-invalid-json"].json()

        # Non-existent attribute: the lookup itself fails before .json()
        with pytest.raises(KeyError):
            attributes["nonexistent"].json()

    def test_json_string_property(self, attributes):
        """Test json_string property"""
        # Should return JSON representation of all attributes, as bytes
        json_string = attributes.json_string
        assert isinstance(json_string, bytes)

        # Parse it back
        parsed = json.loads(json_string)
        assert parsed["id"] == "main"
        assert parsed["class"] == "container active"

    def test_search_values(self, attributes):
        """Test search_values method"""
        # Exact match
        results = list(attributes.search_values("main", partial=False))
        assert len(results) == 1
        assert "id" in results[0]

        # Partial match
        results = list(attributes.search_values("container", partial=True))
        assert len(results) >= 1
        found_keys = set()
        for result in results:
            found_keys.update(result.keys())
        assert "class" in found_keys or "title" in found_keys

        # Case sensitivity
        results = list(attributes.search_values("MAIN", partial=False))
        assert len(results) == 0  # Should be case-sensitive by default

        # Multiple matches
        results = list(attributes.search_values("2", partial=True))
        assert len(results) > 1  # Should find multiple attributes

        # No matches
        results = list(attributes.search_values("nonexistent", partial=False))
        assert len(results) == 0

    def test_special_attribute_types(self, sample_html):
        """Test handling of special attribute types"""
        page = Selector(sample_html)

        # Boolean attributes
        input_elem = page.css("input")[0]
        assert "required" in input_elem.attrib
        assert "disabled" in input_elem.attrib

        # Empty attributes
        main_elem = page.css("#main")[0]
        assert main_elem.attrib["data-empty"] == ""

        # Numeric and boolean-looking values are still plain strings
        assert main_elem.attrib["data-number"] == "42"
        assert main_elem.attrib["data-bool"] == "true"

    def test_attribute_modification(self, sample_html):
        """Test attribute assignment, accepting mutable or read-only behaviour.

        If item assignment succeeds, the new value must be readable back and
        the original is restored. If it raises ``TypeError`` or
        ``AttributeError``, the original value must be untouched.
        """
        page = Selector(sample_html)
        element = page.css("#main")[0]
        attrs = element.attrib
        original_id = attrs["id"]

        # Only the assignment is guarded, so a failing assertion below can
        # never be mistaken for the "read-only" outcome.
        try:
            attrs["id"] = "new-id"
        except (TypeError, AttributeError):
            # If modification is not allowed (read-only)
            assert attrs["id"] == original_id
        else:
            # If modification is allowed
            assert attrs["id"] == "new-id"
            # Reset
            attrs["id"] = original_id

    def test_string_representation(self, attributes):
        """Test string representations"""
        # __str__
        str_repr = str(attributes)
        assert isinstance(str_repr, str)
        assert "id" in str_repr or "main" in str_repr

        # __repr__
        repr_str = repr(attributes)
        assert isinstance(repr_str, str)

    def test_edge_cases(self, sample_html):
        """Test edge cases and special scenarios"""
        page = Selector(sample_html)

        # Element with no attributes
        page_with_no_attrs = Selector("<div>Content</div>")
        elem = page_with_no_attrs.css("div")[0]
        assert len(elem.attrib) == 0
        assert list(elem.attrib.keys()) == []
        assert elem.attrib.get("any") is None

        # Element with markup inside an attribute value
        main_elem = page.css("#main")[0]
        encoded = main_elem.attrib["data-encoded"]
        assert "<" in encoded  # Value is expected to contain a literal "<"

        # Style attribute is exposed as its raw text
        style = main_elem.attrib["style"]
        assert "color: red" in style
        assert "background: blue" in style

    def test_url_attribute(self, attributes):
        """Test URL attributes"""
        url = attributes["data-url"]
        assert url == "https://example.com/page?param=value"
        # Could test URL joining if AttributesHandler supports it
        # based on the parent element's base URL

    def test_comparison_operations(self, sample_html):
        """Test comparison operations if supported"""
        page = Selector(sample_html)
        elem1 = page.css("#main")[0]
        elem2 = page.css("input")[0]

        # Different elements should have different attributes
        assert elem1.attrib != elem2.attrib

        # The same element should have equal attributes
        elem1_again = page.css("#main")[0]
        assert elem1.attrib == elem1_again.attrib

    def test_complex_search_patterns(self, attributes):
        """Test that JSON-looking attribute values can be found and parsed.

        A value counts as JSON when it starts with ``{`` or ``[`` and
        ``json.loads`` accepts it. Values that look like JSON but fail to
        parse (``data-invalid-json``) are skipped.
        """
        json_attrs = []
        for key, value in attributes.items():
            if not (isinstance(value, str) and value.startswith(("{", "["))):
                continue
            try:
                json.loads(value)
            except json.JSONDecodeError:
                # Looks like JSON but is not: leave it out of the result
                continue
            json_attrs.append(key)

        assert "data-config" in json_attrs
        assert "data-items" in json_attrs
        assert "data-nested" in json_attrs

    def test_attribute_filtering(self, attributes):
        """Test filtering attributes by patterns"""
        # Get all data-* attributes
        data_attrs = {k: v for k, v in attributes.items() if k.startswith("data-")}
        assert len(data_attrs) > 5
        assert "data-config" in data_attrs
        assert "data-items" in data_attrs

        # Get all event handler attributes
        event_attrs = {k: v for k, v in attributes.items() if k.startswith("on")}
        assert "onclick" in event_attrs

    def test_performance_with_many_attributes(self):
        """Test elements having many attributes.

        No timing is measured here; this only checks that counting and
        searching give correct results with 101 attributes.
        """
        # Create an element with many attributes
        attrs_list = [f'data-attr{i}="value{i}"' for i in range(100)]
        html = f'<div id="test" {" ".join(attrs_list)}>Content</div>'
        page = Selector(html)
        element = page.css("#test")[0]
        attribs = element.attrib

        assert len(attribs) == 101  # id + 100 data attributes

        # Exact search must hit only "value50", not "value5" or "value500"
        results = list(attribs.search_values("value50", partial=False))
        assert len(results) == 1

    def test_unicode_attributes(self):
        """Test handling of Unicode in attributes"""
        # The markup is kept flush-left so the literal stays exactly as is.
        html = """
<div id="unicode-test"
data-emoji="😀🎉"
data-chinese="你好世界"
data-arabic="مرحبا بالعالم"
data-special="café naïve">
</div>
"""
        page = Selector(html)
        attrs = page.css("#unicode-test")[0].attrib
        assert attrs["data-emoji"] == "😀🎉"
        assert attrs["data-chinese"] == "你好世界"
        assert attrs["data-arabic"] == "مرحبا بالعالم"
        assert attrs["data-special"] == "café naïve"

        # Search with Unicode
        results = list(attrs.search_values("你好", partial=True))
        assert len(results) == 1

    def test_malformed_attributes(self):
        """Test handling of malformed attributes.

        Each snippet is parsed on a best-effort basis: a snippet that cannot
        be parsed at all is skipped. When a ``div`` is found, its ``attrib``
        must be an ``AttributesHandler``.
        """
        # Various malformed HTML scenarios
        test_cases = [
            '<div id="test" class=>Content</div>',  # Empty attribute value
            '<div id="test" class>Content</div>',  # No attribute value
            '<div id="test" data-"invalid"="value">Content</div>',  # Invalid attribute name
            '<div id=test class=no-quotes>Content</div>',  # Unquoted values
        ]
        for html in test_cases:
            # Only parsing is best-effort. The assertion stays outside the
            # try block so that a failure is reported instead of swallowed.
            try:
                page = Selector(html)
            except Exception:
                # Some malformed HTML might not parse at all
                continue

            divs = page.css("div")
            if divs:
                # Should handle gracefully without crashing
                assert isinstance(divs[0].attrib, AttributesHandler)
|