Spaces:

AUXteam
/

Scraper_hub

Running

File size: 8,782 Bytes

94ec243

import re
import pytest
from unittest.mock import Mock

from scrapling import Selector, Selectors
from scrapling.core.custom_types import TextHandler, TextHandlers
from scrapling.core.storage import SQLiteStorageSystem


class TestSelectorAdvancedFeatures:
    """Exercise the adaptive-mode construction paths of ``Selector``."""

    def test_adaptive_initialization_with_storage(self):
        """Adaptive mode accepts an explicit storage class plus its arguments."""
        markup = "<html><body><p>Test</p></body></html>"
        # The real SQLiteStorageSystem is used here rather than a stand-in.
        storage_options = {"storage_file": ":memory:", "url": "https://example.com"}

        page = Selector(
            content=markup,
            adaptive=True,
            storage=SQLiteStorageSystem,
            storage_args=storage_options,
        )

        # The adaptive flag is read through its name-mangled private attribute.
        assert page._Selector__adaptive_enabled is True
        assert page._storage is not None

    def test_adaptive_initialization_with_default_storage_args(self):
        """Adaptive mode still ends up with a storage object when only a URL is given."""
        markup = "<html><body><p>Test</p></body></html>"

        # No storage / storage_args passed: the Selector has to fall back on its own.
        page = Selector(content=markup, url="https://example.com", adaptive=True)

        assert page._storage is not None

    def test_adaptive_with_existing_storage(self):
        """A storage object handed in via ``_storage`` is kept as-is."""
        storage_double = Mock()

        page = Selector(
            content="<html><body><p>Test</p></body></html>",
            adaptive=True,
            _storage=storage_double,
        )

        # Identity, not equality: the very same object must be attached.
        assert page._storage is storage_double


class TestAdvancedSelectors:
    """Test advanced selector functionality.

    Every test except ``test_url_joining`` parses the shared ``complex_html``
    fixture, which contains a comment, a CDATA section, nested spans, a JSON
    attribute value and a 2x2 table.
    """

    @pytest.fixture
    def complex_html(self):
        """Return the HTML document shared by the tests in this class."""
        return """
        <html>
            <body>
                <div class="container" data-test='{"key": "value"}'>
                    <p>First paragraph</p>
                    <!-- Comment -->
                    <p>Second paragraph</p>
                    <![CDATA[Some CDATA content]]>
                    <div class="nested">
                        <span id="special">Special content</span>
                        <span>Regular content</span>
                    </div>
                    <table>
                        <tr><td>Cell 1</td><td>Cell 2</td></tr>
                        <tr><td>Cell 3</td><td>Cell 4</td></tr>
                    </table>
                </div>
            </body>
        </html>
        """

    def test_comment_and_cdata_handling(self, complex_html):
        """Check that comment/CDATA text is present or absent per the parser flags."""
        # Flags on: both marker strings are expected in ``page.body``.
        page = Selector(
            complex_html,
            keep_comments=True,
            keep_cdata=True
        )
        content = page.body
        assert "Comment" in content
        assert "CDATA" in content

        # Flags off: only the comment text is checked, against ``html_content``.
        # NOTE(review): the two halves read different properties (``body`` vs
        # ``html_content``) and the CDATA text is never asserted absent here;
        # confirm that asymmetry is intentional.
        page = Selector(
            complex_html,
            keep_comments=False,
            keep_cdata=False
        )
        content = page.html_content
        assert "Comment" not in content

    def test_advanced_xpath_variables(self, complex_html):
        """XPath ``$variables`` are bound from keyword arguments."""
        page = Selector(complex_html)

        # ``$cell_text`` in the expression is supplied by the ``cell_text`` kwarg.
        cells = page.xpath(
            "//td[text()=$cell_text]",
            cell_text="Cell 1"
        )
        assert len(cells) == 1
        assert cells[0].text == "Cell 1"

    def test_pseudo_elements(self, complex_html):
        """The ``::text`` and ``::attr()`` CSS pseudo-elements are supported."""
        page = Selector(complex_html)

        # ::text on the two <p> elements yields two results, each a Selector
        # whose ``get()`` returns a TextHandler.
        texts = page.css("p::text")
        assert len(texts) == 2
        assert isinstance(texts[0], Selector)
        assert isinstance(texts[0].get(), TextHandler)

        # ::attr(class) collects the class attribute of the matched <div>s.
        attrs = page.css("div::attr(class)")
        assert "container" in attrs.getall()

    def test_complex_attribute_operations(self, complex_html):
        """Attribute values can be decoded as JSON and searched by value."""
        page = Selector(complex_html)
        container = page.css(".container")[0]

        # The data-test attribute holds a JSON object literal.
        data = container.attrib["data-test"].json()
        assert data["key"] == "value"

        # Exactly one attribute of the container matches the value "container".
        matches = list(container.attrib.search_values("container"))
        assert len(matches) == 1

    def test_url_joining(self):
        """``urljoin`` resolves references against the Selector's own URL."""
        page = Selector("<html></html>", url="https://example.com/page")

        # Parent-relative, root-relative and sibling-relative references.
        assert page.urljoin("../other") == "https://example.com/other"
        assert page.urljoin("/absolute") == "https://example.com/absolute"
        assert page.urljoin("relative") == "https://example.com/relative"

    def test_find_operations_edge_cases(self, complex_html):
        """``find_all`` accepts mixed filter types and compiled regex patterns."""
        page = Selector(complex_html)

        # A tag name, a list of tag names, an attribute dict and a predicate in
        # one call. Previously the result was thrown away, so this half of the
        # test could only fail by raising; the container type is now pinned too.
        mixed = page.find_all(
            "span",
            ["div"],
            {"class": "nested"},
            lambda e: e.text != ""
        )
        assert isinstance(mixed, Selectors)

        # A compiled pattern should match the four table cells.
        pattern = re.compile(r"Cell \d+")
        cells = page.find_all(pattern)
        assert len(cells) == 4

    def test_text_operations_edge_cases(self, complex_html):
        """``get_all_text`` honours separator, ignore_tags and valid_values."""
        page = Selector(complex_html)

        # A custom separator must show up between the collected text pieces.
        text = page.get_all_text(separator=" | ", strip=True)
        assert " | " in text

        # Ignoring <table> removes every "Cell N" string from the output.
        text = page.get_all_text(ignore_tags=("table",))
        assert "Cell" not in text

        # valid_values=False must still produce non-empty output.
        text = page.get_all_text(valid_values=False)
        assert text != ""


class TestTextHandlerAdvanced:
    """Cover ``TextHandler`` / ``TextHandlers`` behaviour beyond plain ``str``."""

    def test_text_handler_operations(self):
        """String methods keep the TextHandler type; custom helpers work."""
        padded = TextHandler("  Hello World  ")

        # Each inherited str method must hand back a TextHandler, not a str.
        stripped = padded.strip()
        assert isinstance(stripped, TextHandler)
        uppercased = padded.upper()
        assert isinstance(uppercased, TextHandler)
        lowercased = padded.lower()
        assert isinstance(lowercased, TextHandler)
        replaced = padded.replace("World", "Python")
        assert isinstance(replaced, TextHandler)

        # clean() is expected to drop the surrounding whitespace.
        assert padded.clean() == "Hello World"

        # sort() is expected to order the characters.
        scrambled = TextHandler("dcba")
        assert scrambled.sort() == "abcd"

    def test_text_handler_regex(self):
        """``re`` supports plain, case-insensitive and cleaned matching."""
        # Plain pattern: both dollar amounts are found, in order.
        price_line = TextHandler("Price: $10.99, Sale: $8.99")
        amounts = price_line.re(r"\$[\d.]+")
        assert len(amounts) == 2
        assert amounts[0] == "$10.99"

        # case_sensitive=False matches all three spellings.
        shouting = TextHandler("HELLO hello HeLLo")
        greetings = shouting.re(r"hello", case_sensitive=False)
        assert len(greetings) == 3

        # clean_match=True lets the single-spaced pattern match the padded text.
        spaced_out = TextHandler(" He  l  lo  ")
        cleaned_hits = spaced_out.re(r"He l lo", clean_match=True, case_sensitive=False)
        assert len(cleaned_hits) == 1

    def test_text_handlers_operations(self):
        """``TextHandlers`` keeps its type when sliced and supports ``get``."""
        words = TextHandlers(
            [TextHandler(word) for word in ("First", "Second", "Third")]
        )

        # A slice is still a TextHandlers, not a bare list.
        head = words[0:2]
        assert isinstance(head, TextHandlers)

        # get() yields the first item; the default only applies when empty.
        assert words.get() == "First"
        assert words.get("default") == "First"
        nothing = TextHandlers([])
        assert nothing.get("default") == "default"


class TestSelectorsAdvanced:
    """Cover the collection helpers exposed by ``Selectors``."""

    def test_selectors_filtering(self):
        """``filter`` keeps every match; ``search`` returns a single one."""
        html = """
        <div>
            <p class="highlight">Important</p>
            <p>Regular</p>
            <p class="highlight">Also important</p>
        </div>
        """

        def is_highlighted(node):
            return node.has_class("highlight")

        def is_regular(node):
            return node.text == "Regular"

        all_paragraphs = Selector(html).css("p")

        # Two of the three paragraphs carry the "highlight" class.
        assert len(all_paragraphs.filter(is_highlighted)) == 2

        # search() must hand back the one paragraph whose text is "Regular".
        hit = all_paragraphs.search(is_regular)
        assert hit is not None
        assert hit.text == "Regular"

    def test_selectors_properties(self):
        """``first``, ``last`` and ``length`` describe the collection."""
        document = Selector("<div><p>1</p><p>2</p><p>3</p></div>")
        items = document.css("p")

        assert items.first.text == "1"
        assert items.last.text == "3"
        assert items.length == 3