Spaces:

lenson78
/

Scrapling

Paused

App Files Files Community

Scrapling / tests / parser / test_parser_advanced.py

mph

Move test for get_all_text to test_parser_advanced.py

14f7490 25 days ago

raw

history blame contribute delete

9.56 kB

	import re
	import pytest
	from unittest.mock import Mock

	from scrapling import Selector, Selectors
	from scrapling.core.custom_types import TextHandler, TextHandlers
	from scrapling.core.storage import SQLiteStorageSystem


	class TestSelectorAdvancedFeatures:
	    """Checks for how ``Selector`` sets up storage when adaptive mode is on."""

	    def test_adaptive_initialization_with_storage(self):
	        """Adaptive mode accepts an explicit storage class plus its arguments."""
	        # The real SQLiteStorageSystem class is passed here (not a mock),
	        # configured with storage_file=":memory:".
	        sqlite_options = {"storage_file": ":memory:", "url": "https://example.com"}
	        page = Selector(
	            content="<html><body><p>Test</p></body></html>",
	            adaptive=True,
	            storage=SQLiteStorageSystem,
	            storage_args=sqlite_options,
	        )

	        assert page._Selector__adaptive_enabled is True
	        assert page._storage is not None

	    def test_adaptive_initialization_with_default_storage_args(self):
	        """Adaptive mode ends up with a storage object when only a URL is supplied."""
	        page = Selector(
	            content="<html><body><p>Test</p></body></html>",
	            url="https://example.com",
	            adaptive=True,
	        )

	        # Neither storage nor storage_args was passed, yet a storage must exist.
	        assert page._storage is not None

	    def test_adaptive_with_existing_storage(self):
	        """A ready-made storage object given via ``_storage`` is kept as-is."""
	        prebuilt_storage = Mock()
	        page = Selector(
	            content="<html><body><p>Test</p></body></html>",
	            adaptive=True,
	            _storage=prebuilt_storage,
	        )

	        # Identity check: the very same object must be exposed, not a copy.
	        assert page._storage is prebuilt_storage


	class TestAdvancedSelectors:
	    """Test advanced selector functionality.

	    Covers comment/CDATA retention, XPath variables, CSS pseudo-elements,
	    attribute helpers, URL joining, mixed ``find_all`` arguments and the
	    ``get_all_text`` options. Most tests share the ``complex_html`` fixture.
	    """

	    @pytest.fixture
	    def complex_html(self):
	        """Return a page with a comment, a CDATA section, nested spans and a 2x2 table."""
	        return """
	<html>
	<body>
	<div class="container" data-test='{"key": "value"}'>
	<p>First paragraph</p>
	<!-- Comment -->
	<p>Second paragraph</p>
	<![CDATA[Some CDATA content]]>
	<div class="nested">
	<span id="special">Special content</span>
	<span>Regular content</span>
	</div>
	<table>
	<tr><td>Cell 1</td><td>Cell 2</td></tr>
	<tr><td>Cell 3</td><td>Cell 4</td></tr>
	</table>
	</div>
	</body>
	</html>
	"""

	    def test_comment_and_cdata_handling(self, complex_html):
	        """Test handling of comments and CDATA via ``keep_comments``/``keep_cdata``."""
	        # With comments/CDATA kept
	        page = Selector(
	            complex_html,
	            keep_comments=True,
	            keep_cdata=True
	        )
	        content = page.body
	        assert "Comment" in content
	        assert "CDATA" in content

	        # Without comments/CDATA.
	        # Only the comment's absence is asserted; CDATA text is not checked here.
	        page = Selector(
	            complex_html,
	            keep_comments=False,
	            keep_cdata=False
	        )
	        content = page.html_content
	        assert "Comment" not in content

	    def test_advanced_xpath_variables(self, complex_html):
	        """Test XPath with variables passed as keyword arguments."""
	        page = Selector(complex_html)

	        # $cell_text in the expression is bound by the cell_text keyword
	        cells = page.xpath(
	            "//td[text()=$cell_text]",
	            cell_text="Cell 1"
	        )
	        assert len(cells) == 1
	        assert cells[0].text == "Cell 1"

	    def test_pseudo_elements(self, complex_html):
	        """Test the ``::text`` and ``::attr()`` CSS pseudo-elements."""
	        page = Selector(complex_html)

	        # ::text pseudo-element: one result per <p> in the fixture
	        texts = page.css("p::text")
	        assert len(texts) == 2
	        assert isinstance(texts[0], Selector)
	        assert isinstance(texts[0].get(), TextHandler)

	        # ::attr() pseudo-element
	        attrs = page.css("div::attr(class)")
	        assert "container" in attrs.getall()

	    def test_complex_attribute_operations(self, complex_html):
	        """Test JSON decoding of an attribute value and searching attribute values."""
	        page = Selector(complex_html)
	        container = page.css(".container")[0]

	        # JSON in attributes: data-test holds '{"key": "value"}'
	        data = container.attrib["data-test"].json()
	        assert data["key"] == "value"

	        # Attribute searching
	        matches = list(container.attrib.search_values("container"))
	        assert len(matches) == 1

	    def test_url_joining(self):
	        """Test URL joining against the page URL."""
	        page = Selector("<html></html>", url="https://example.com/page")

	        # Parent-relative, root-relative and plain relative references
	        assert page.urljoin("../other") == "https://example.com/other"
	        assert page.urljoin("/absolute") == "https://example.com/absolute"
	        assert page.urljoin("relative") == "https://example.com/relative"

	    def test_find_operations_edge_cases(self, complex_html):
	        """Test edge cases in find operations."""
	        page = Selector(complex_html)

	        # Multiple argument types in one call. The result is deliberately
	        # discarded: nothing is asserted about it, the call just has to succeed.
	        _ = page.find_all(
	            "span",
	            ["div"],
	            {"class": "nested"},
	            lambda e: e.text != ""
	        )

	        # Regex pattern matching: the fixture has four "Cell N" table cells
	        pattern = re.compile(r"Cell \d+")
	        cells = page.find_all(pattern)
	        assert len(cells) == 4

	    def test_text_operations_edge_cases(self, complex_html):
	        """Test ``get_all_text`` with a separator, ignored tags and ``valid_values``."""
	        page = Selector(complex_html)

	        # get_all_text with a custom separator.
	        # A plain pipe is used: a backslash-escaped pipe is not a valid Python
	        # escape sequence and raises SyntaxWarning on Python 3.12+.
	        text = page.get_all_text(separator=" | ", strip=True)
	        assert " | " in text

	        # Ignore specific tags
	        text = page.get_all_text(ignore_tags=("table",))
	        assert "Cell" not in text

	        # With empty values
	        text = page.get_all_text(valid_values=False)
	        assert text != ""

	    def test_get_all_text_preserves_interleaved_text_nodes(self):
	        """Test get_all_text preserves interleaved text nodes.

	        Text sitting directly inside ``<main>`` between child elements must
	        come out in document order, and the ``<script>``/``<style>`` contents
	        ("ignored") must not appear in the result.
	        """
	        html = """
	<html>
	<body>
	<main>
	string1
	<b>string2</b>
	string3
	<div>
	<span>string4</span>
	</div>
	string5
	<script>ignored</script>
	string6
	<style>ignored</style>
	string7
	</main>
	</body>
	</html>
	"""

	        page = Selector(html, adaptive=False)
	        node = page.css("main")[0]

	        assert node.get_all_text("\n", strip=True) == "string1\nstring2\nstring3\nstring4\nstring5\nstring6\nstring7"


	class TestTextHandlerAdvanced:
	    """Checks for ``TextHandler`` and ``TextHandlers`` beyond plain ``str`` behaviour."""

	    def test_text_handler_operations(self):
	        """String methods keep the TextHandler type; clean() and sort() give the expected text."""
	        padded = TextHandler(" Hello World ")

	        # Every one of these str methods must hand back a TextHandler.
	        derived_values = (
	            padded.strip(),
	            padded.upper(),
	            padded.lower(),
	            padded.replace("World", "Python"),
	        )
	        for derived in derived_values:
	            assert isinstance(derived, TextHandler)

	        # Helper methods that TextHandler adds on top of str
	        cleaned = padded.clean()
	        assert cleaned == "Hello World"

	        sorted_text = TextHandler("dcba").sort()
	        assert sorted_text == "abcd"

	    def test_text_handler_regex(self):
	        """re() works with a plain pattern, with case_sensitive=False and with clean_match=True."""
	        # Plain pattern: two dollar amounts, the first being "$10.99"
	        price_hits = TextHandler("Price: $10.99, Sale: $8.99").re(r"\$[\d.]+")
	        assert len(price_hits) == 2
	        assert price_hits[0] == "$10.99"

	        # Case-insensitive search finds all three spellings
	        greeting_hits = TextHandler("HELLO hello HeLLo").re(r"hello", case_sensitive=False)
	        assert len(greeting_hits) == 3

	        # clean_match together with case-insensitive search
	        spaced_hits = TextHandler(" He l lo ").re(r"He l lo", clean_match=True, case_sensitive=False)
	        assert len(spaced_hits) == 1

	    def test_text_handlers_operations(self):
	        """Slicing keeps the TextHandlers type; get() uses its default only when empty."""
	        words = ("First", "Second", "Third")
	        handlers = TextHandlers([TextHandler(word) for word in words])

	        # A slice must still be a TextHandlers, not a bare list
	        first_two = handlers[0:2]
	        assert isinstance(first_two, TextHandlers)

	        # get() returns the first item and ignores the default when items exist
	        assert handlers.get() == "First"
	        assert handlers.get("default") == "First"

	        # With nothing inside, the supplied default comes back
	        empty_handlers = TextHandlers([])
	        assert empty_handlers.get("default") == "default"


	class TestSelectorsAdvanced:
	    """Checks for the ``Selectors`` collection returned by CSS queries."""

	    def test_selectors_filtering(self):
	        """filter() keeps the matching elements and search() returns a matching one."""
	        html = """
	<div>
	<p class="highlight">Important</p>
	<p>Regular</p>
	<p class="highlight">Also important</p>
	</div>
	"""
	        paragraphs = Selector(html).css("p")

	        def is_highlighted(p):
	            return p.has_class("highlight")

	        def is_regular(p):
	            return p.text == "Regular"

	        # Two of the three paragraphs carry the "highlight" class
	        highlighted = paragraphs.filter(is_highlighted)
	        assert len(highlighted) == 2

	        # search() must return the paragraph whose text is "Regular"
	        regular = paragraphs.search(is_regular)
	        assert regular is not None
	        assert regular.text == "Regular"

	    def test_selectors_properties(self):
	        """The first, last and length properties reflect the matched elements."""
	        paragraphs = Selector("<div><p>1</p><p>2</p><p>3</p></div>").css("p")

	        assert paragraphs.first.text == "1"
	        assert paragraphs.last.text == "3"
	        assert paragraphs.length == 3