"""Advanced tests for Selector, Selectors, TextHandler and TextHandlers."""

import re
from unittest.mock import Mock

import pytest

from scrapling import Selector, Selectors
from scrapling.core.custom_types import TextHandler, TextHandlers
from scrapling.core.storage import SQLiteStorageSystem

# NOTE(review): the HTML fixtures in this module were rebuilt after their
# markup had been stripped; tags and attributes were chosen so that they match
# the surviving text and satisfy the assertions below — confirm against the
# upstream fixtures.


class TestSelectorAdvancedFeatures:
    """Test advanced Selector features like adaptive matching"""

    def test_adaptive_initialization_with_storage(self):
        """Test adaptive initialization with custom storage"""
        html = "<html><body><p>Test</p></body></html>"

        # Use the actual SQLiteStorageSystem for this test
        selector = Selector(
            content=html,
            adaptive=True,
            storage=SQLiteStorageSystem,
            storage_args={"storage_file": ":memory:", "url": "https://example.com"},
        )

        # Name-mangled access: the flag is a double-underscore attribute of Selector
        assert selector._Selector__adaptive_enabled is True
        assert selector._storage is not None

    def test_adaptive_initialization_with_default_storage_args(self):
        """Test adaptive initialization with default storage args"""
        html = "<html><body><p>Test</p></body></html>"
        url = "https://example.com"

        # Test that adaptive mode uses default storage when no explicit args provided
        selector = Selector(
            content=html,
            url=url,
            adaptive=True,
        )

        # Should create storage with default args
        assert selector._storage is not None

    def test_adaptive_with_existing_storage(self):
        """Test adaptive initialization with existing storage object"""
        html = "<html><body><p>Test</p></body></html>"

        mock_storage = Mock()

        selector = Selector(
            content=html,
            adaptive=True,
            _storage=mock_storage,
        )

        # The pre-built storage object must be used as-is, not replaced
        assert selector._storage is mock_storage


class TestAdvancedSelectors:
    """Test advanced selector functionality"""

    @pytest.fixture
    def complex_html(self):
        """Return a document with paragraphs, a comment, CDATA, nested spans and a table."""
        return """
        <html>
            <body>
                <div class="container" data-test='{"key": "value"}'>
                    <p>First paragraph</p>
                    <!-- Comment -->
                    <p>Second paragraph</p>
                    <![CDATA[Some CDATA content]]>
                    <div class="nested">
                        <span id="special">Special content</span>
                        <span>Regular content</span>
                    </div>
                    <table>
                        <tr><td>Cell 1</td><td>Cell 2</td></tr>
                        <tr><td>Cell 3</td><td>Cell 4</td></tr>
                    </table>
                </div>
            </body>
        </html>
        """

    def test_comment_and_cdata_handling(self, complex_html):
        """Test handling of comments and CDATA"""
        # With comments/CDATA kept
        page = Selector(
            complex_html,
            keep_comments=True,
            keep_cdata=True,
        )
        content = page.body
        assert "Comment" in content
        assert "CDATA" in content

        # Without comments/CDATA
        page = Selector(
            complex_html,
            keep_comments=False,
            keep_cdata=False,
        )
        content = page.html_content
        assert "Comment" not in content

    def test_advanced_xpath_variables(self, complex_html):
        """Test XPath with variables"""
        page = Selector(complex_html)

        # Using XPath variables
        cells = page.xpath(
            "//td[text()=$cell_text]",
            cell_text="Cell 1",
        )
        assert len(cells) == 1
        assert cells[0].text == "Cell 1"

    def test_pseudo_elements(self, complex_html):
        """Test CSS pseudo-elements"""
        page = Selector(complex_html)

        # ::text pseudo-element
        texts = page.css("p::text")
        assert len(texts) == 2
        assert isinstance(texts[0], Selector)
        assert isinstance(texts[0].get(), TextHandler)

        # ::attr() pseudo-element
        attrs = page.css("div::attr(class)")
        assert "container" in attrs.getall()

    def test_complex_attribute_operations(self, complex_html):
        """Test complex attribute handling"""
        page = Selector(complex_html)
        container = page.css(".container")[0]

        # JSON in attributes
        data = container.attrib["data-test"].json()
        assert data["key"] == "value"

        # Attribute searching
        matches = list(container.attrib.search_values("container"))
        assert len(matches) == 1

    def test_url_joining(self):
        """Test URL joining functionality"""
        page = Selector("<html></html>", url="https://example.com/page")

        # Relative URL
        assert page.urljoin("../other") == "https://example.com/other"
        assert page.urljoin("/absolute") == "https://example.com/absolute"
        assert page.urljoin("relative") == "https://example.com/relative"

    def test_find_operations_edge_cases(self, complex_html):
        """Test edge cases in find operations"""
        page = Selector(complex_html)

        # Multiple argument types; only checks that the mixed call does not raise
        _ = page.find_all(
            "span",
            ["div"],
            {"class": "nested"},
            lambda e: e.text != "",
        )

        # Regex pattern matching
        pattern = re.compile(r"Cell \d+")
        cells = page.find_all(pattern)
        assert len(cells) == 4

    def test_text_operations_edge_cases(self, complex_html):
        """Test text operation edge cases"""
        page = Selector(complex_html)

        # get_all_text with a custom separator
        text = page.get_all_text(separator=" | ", strip=True)
        assert " | " in text

        # Ignore specific tags
        text = page.get_all_text(ignore_tags=("table",))
        assert "Cell" not in text

        # With empty values
        text = page.get_all_text(valid_values=False)
        assert text != ""


class TestTextHandlerAdvanced:
    """Test advanced TextHandler functionality"""

    def test_text_handler_operations(self):
        """Test various TextHandler operations"""
        text = TextHandler(" Hello World ")

        # All string methods should return TextHandler
        assert isinstance(text.strip(), TextHandler)
        assert isinstance(text.upper(), TextHandler)
        assert isinstance(text.lower(), TextHandler)
        assert isinstance(text.replace("World", "Python"), TextHandler)

        # Custom methods
        assert text.clean() == "Hello World"

        # Sorting
        text2 = TextHandler("dcba")
        assert text2.sort() == "abcd"

    def test_text_handler_regex(self):
        """Test regex operations on TextHandler"""
        text = TextHandler("Price: $10.99, Sale: $8.99")

        # Basic regex
        prices = text.re(r"\$[\d.]+")
        assert len(prices) == 2
        assert prices[0] == "$10.99"

        # Case insensitive
        text2 = TextHandler("HELLO hello HeLLo")
        matches = text2.re(r"hello", case_sensitive=False)
        assert len(matches) == 3

        # Clean match
        text3 = TextHandler(" He l lo ")
        matches = text3.re(r"He l lo", clean_match=True, case_sensitive=False)
        assert len(matches) == 1

    def test_text_handlers_operations(self):
        """Test TextHandlers list operations"""
        handlers = TextHandlers(
            [
                TextHandler("First"),
                TextHandler("Second"),
                TextHandler("Third"),
            ]
        )

        # Slicing should return TextHandlers
        assert isinstance(handlers[0:2], TextHandlers)

        # Get methods
        assert handlers.get() == "First"
        assert handlers.get("default") == "First"
        assert TextHandlers([]).get("default") == "default"


class TestSelectorsAdvanced:
    """Test advanced Selectors functionality"""

    def test_selectors_filtering(self):
        """Test filtering operations on Selectors"""
        html = """
        <div>
            <p class="highlight">Important</p>
            <p>Regular</p>
            <p class="highlight">Also important</p>
        </div>
        """
        page = Selector(html)
        paragraphs = page.css("p")

        # Filter by class
        highlighted = paragraphs.filter(lambda p: p.has_class("highlight"))
        assert len(highlighted) == 2

        # Search for a specific element
        found = paragraphs.search(lambda p: p.text == "Regular")
        assert found is not None
        assert found.text == "Regular"

    def test_selectors_properties(self):
        """Test Selectors properties"""
        html = "<div><p>1</p><p>2</p><p>3</p></div>"
        page = Selector(html)
        paragraphs = page.css("p")

        assert paragraphs.first.text == "1"
        assert paragraphs.last.text == "3"
        assert paragraphs.length == 3