"""Parsing, selection and navigation tests for ``scrapling.Selector``.

All fixture-based tests run against one small "Complex Web Page" document
(see ``html_content``).
"""

import logging
import pickle
import time

import pytest
from cssselect import SelectorError, SelectorSyntaxError

from scrapling import Selector

logging.getLogger("scrapling").setLevel(logging.DEBUG)


@pytest.fixture
def html_content():
    """Return the HTML document that the fixture-based tests parse."""
    # NOTE(review): the markup was rebuilt from the selectors and assertions
    # used in this module (ids, classes, data-* attributes, stock texts, the
    # JSON script and the `schema` attribute) -- confirm against the upstream
    # fixture before relying on details that no test pins down.
    return """
    <html>
    <head>
        <title>Complex Web Page</title>
    </head>
    <body>
        <main>
            <section id="products" schema='{"jsonable": "data"}'>
                <h2>Products</h2>
                <div class="product-list">
                    <article class="product" data-id="1">
                        <h3>Product 1</h3>
                        <p class="description">This is product 1</p>
                        <span class="price">$10.99</span>
                        <div class="stock">In stock: 5</div>
                    </article>
                    <article class="product" data-id="2">
                        <h3>Product 2</h3>
                        <p class="description">This is product 2</p>
                        <span class="price">$20.99</span>
                        <div class="stock">In stock: 3</div>
                    </article>
                    <article class="product" data-id="3">
                        <h3>Product 3</h3>
                        <p class="description">This is product 3</p>
                        <span class="price">$15.99</span>
                        <div class="stock">Out of stock</div>
                    </article>
                </div>
            </section>
            <section id="reviews">
                <h2>Customer Reviews</h2>
                <div class="review-list">
                    <div class="review" data-rating="5">
                        <p class="review-text">Great product!</p>
                        <span class="reviewer">John Doe</span>
                    </div>
                    <div class="review" data-rating="4">
                        <p class="review-text">Good value for money.</p>
                        <span class="reviewer">Jane Smith</span>
                    </div>
                </div>
            </section>
        </main>
        <script id="page-data" type="application/json">
            {"lastUpdated": "2024-09-22T10:30:00Z", "totalProducts": 3}
        </script>
    </body>
    </html>
    """


@pytest.fixture
def page(html_content):
    """Return a ``Selector`` built from ``html_content`` with ``adaptive=False``."""
    return Selector(html_content, adaptive=False)


# CSS Selector Tests
class TestCSSSelectors:
    def test_basic_product_selection(self, page):
        """Test selecting all product elements"""
        elements = page.css("main #products .product-list article.product")
        assert len(elements) == 3

    def test_in_stock_product_selection(self, page):
        """Test selecting in-stock products"""
        in_stock_products = page.css(
            'main #products .product-list article.product:not(:contains("Out of stock"))'
        )
        assert len(in_stock_products) == 2


# XPath Selector Tests
class TestXPathSelectors:
    def test_high_rating_reviews(self, page):
        """Test selecting reviews with high ratings"""
        reviews = page.xpath(
            '//section[@id="reviews"]//div[contains(@class, "review") and @data-rating >= 4]'
        )
        assert len(reviews) == 2

    def test_high_priced_products(self, page):
        """Test selecting products above a certain price"""
        # The predicate strips the "$" prefix and any thousands separators
        # before comparing the price numerically.
        high_priced_products = page.xpath(
            '//article[contains(@class, "product")]'
            '[number(translate(substring-after(.//span[@class="price"], "$"), ",", "")) > 15]'
        )
        assert len(high_priced_products) == 2


# Text Matching Tests
class TestTextMatching:
    def test_regex_multiple_matches(self, page):
        """Test finding multiple matches with regex"""
        stock_info = page.find_by_regex(r"In stock: \d+", first_match=False)
        assert len(stock_info) == 2

    def test_regex_first_match(self, page):
        """Test finding the first match with regex"""
        stock_info = page.find_by_regex(
            r"In stock: \d+", first_match=True, case_sensitive=True
        )
        assert stock_info.text == "In stock: 5"

    def test_partial_text_match(self, page):
        """Test finding elements with partial text match"""
        stock_info = page.find_by_text(r"In stock:", partial=True, first_match=False)
        assert len(stock_info) == 2

    def test_exact_text_match(self, page):
        """Test finding elements with exact text match"""
        out_of_stock = page.find_by_text(
            "Out of stock", partial=False, first_match=False
        )
        assert len(out_of_stock) == 1


# Similar Elements Tests
class TestSimilarElements:
    def test_finding_similar_products(self, page):
        """Test finding similar product elements"""
        first_product = page.css(".product").first
        similar_products = first_product.find_similar()
        # The fixture has three products; the other two are expected back.
        assert len(similar_products) == 2

    def test_finding_similar_reviews(self, page):
        """Test finding similar review elements with additional filtering"""
        first_review = page.find("div", class_="review")
        similar_high_rated_reviews = [
            review
            for review in first_review.find_similar()
            if int(review.attrib.get("data-rating", 0)) >= 4
        ]
        assert len(similar_high_rated_reviews) == 1


# Error Handling Tests
class TestErrorHandling:
    def test_invalid_selector_initialization(self):
        """Test various invalid Selector initializations"""
        # No arguments
        with pytest.raises(ValueError):
            _ = Selector(adaptive=False)

        # Content of an unsupported type
        with pytest.raises(TypeError):
            _ = Selector(content=1, adaptive=False)

    def test_invalid_storage(self, page, html_content):
        """Test invalid storage parameter"""
        with pytest.raises(ValueError):
            _ = Selector(html_content, storage=object, adaptive=True)

    def test_bad_selectors(self, page):
        """Test handling of invalid selectors"""
        with pytest.raises((SelectorError, SelectorSyntaxError)):
            page.css("4 ayo")

        with pytest.raises((SelectorError, SelectorSyntaxError)):
            page.xpath("4 ayo")


# Pickling and Object Representation Tests
class TestPicklingAndRepresentation:
    def test_unpickleable_objects(self, page):
        """Test that Selector objects cannot be pickled"""
        table = page.css(".product-list")[0]
        with pytest.raises(TypeError):
            pickle.dumps(table)

    def test_string_representations(self, page):
        """Test custom string representations of objects"""
        table = page.css(".product-list")[0]
        # The dunders are called directly so each custom implementation is
        # exercised and its return type checked.
        assert isinstance(table.__str__(), str)
        assert isinstance(table.__repr__(), str)
        assert isinstance(table.attrib.__str__(), str)
        assert isinstance(table.attrib.__repr__(), str)


# Navigation and Traversal Tests
class TestElementNavigation:
    def test_basic_navigation_properties(self, page):
        """Test basic navigation properties of elements"""
        table = page.css(".product-list")[0]
        assert table.path is not None
        assert table.html_content != ""
        assert table.prettify() != ""

    def test_parent_and_sibling_navigation(self, page):
        """Test parent and sibling navigation"""
        table = page.css(".product-list")[0]
        parent = table.parent
        assert parent["id"] == "products"

        parent_siblings = parent.siblings
        assert len(parent_siblings) == 1

    def test_child_navigation(self, page):
        """Test child navigation"""
        table = page.css(".product-list")[0]
        children = table.children
        assert len(children) == 3

    def test_next_and_previous_navigation(self, page):
        """Test next and previous element navigation"""
        child = page.css(".product-list")[0].find({"data-id": "1"})
        next_element = child.next
        assert next_element.attrib["data-id"] == "2"

        prev_element = next_element.previous
        assert prev_element.tag == child.tag

    def test_ancestor_finding(self, page):
        """Test finding ancestors of elements"""
        all_prices = page.css(".price")
        products_with_prices = [
            price.find_ancestor(lambda p: p.has_class("product"))
            for price in all_prices
        ]
        assert len(products_with_prices) == 3


# JSON and Attribute Tests
class TestJSONAndAttributes:
    def test_json_conversion(self, page):
        """Test converting content to JSON"""
        script_content = page.css("#page-data::text")[0].get()
        # `.sort()` is expected to hand back a str-compatible object.
        assert isinstance(script_content.sort(), str)
        page_data = script_content.json()
        assert page_data["totalProducts"] == 3
        assert "lastUpdated" in page_data

    def test_attribute_operations(self, page):
        """Test various attribute-related operations"""
        # Product ID extraction
        products = page.css(".product")
        product_ids = [product.attrib["data-id"] for product in products]
        assert product_ids == ["1", "2", "3"]
        assert "data-id" in products[0]

        # Review rating calculations
        reviews = page.css(".review")
        review_ratings = [int(review.attrib["data-rating"]) for review in reviews]
        assert sum(review_ratings) / len(review_ratings) == 4.5

        # Attribute searching
        key_value = list(products[0].attrib.search_values("1", partial=False))
        assert list(key_value[0].keys()) == ["data-id"]

        key_value = list(products[0].attrib.search_values("1", partial=True))
        assert list(key_value[0].keys()) == ["data-id"]

        # JSON attribute conversion
        attr_json = page.css("#products").first.attrib["schema"].json()
        assert attr_json == {"jsonable": "data"}
        assert isinstance(page.css("#products")[0].attrib.json_string, bytes)


# Performance Test
def test_large_html_parsing_performance():
    """Test parsing and selecting performance on large HTML"""
    # NOTE(review): tags rebuilt from the `.item` selector below; 5000 nested
    # <div class="item"> elements -- confirm against the upstream test.
    large_html = (
        "<html><body>"
        + '<div class="item">' * 5000
        + "</div>" * 5000
        + "</body></html>"
    )

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    parsed = Selector(large_html, adaptive=False)
    elements = parsed.css(".item")
    end_time = time.perf_counter()

    # assert len(elements) == 5000  # GitHub actions don't like this line
    # Converting 5000 elements to a class and doing operations on them will take time
    # Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
    assert (
        end_time - start_time < 0.5
    )  # Locally I test on 0.1 but on GitHub actions with browsers and threading sometimes closing adds fractions of seconds


# Selector Generation Test
def test_selectors_generation(page):
    """Try to create selectors for all elements in the page"""

    def _traverse(element: Selector):
        # Every node must yield string selectors in all four flavours.
        assert isinstance(element.generate_css_selector, str)
        assert isinstance(element.generate_full_css_selector, str)
        assert isinstance(element.generate_xpath_selector, str)
        assert isinstance(element.generate_full_xpath_selector, str)
        for branch in element.children:
            _traverse(branch)

    _traverse(page)


# Miscellaneous Tests
def test_getting_all_text(page):
    """Test getting all text from the page"""
    assert page.get_all_text() != ""


def test_regex_on_text(page):
    """Test regex operations on text"""
    element = page.css('[data-id="1"] .price')[0]
    match = element.re_first(r"[\.\d]+")
    assert match == "10.99"

    # "$10.99" contains two digit runs: "10" and "99".
    match = element.text.re(r"(\d+)", replace_entities=False)
    assert len(match) == 2