Spaces:

NeerajCodz
/

scrapeRL

Sleeping

App Files Files Community

scrapeRL / backend /tests /test_sites /test_registry.py

NeerajCodz

feat: intelligent search-based navigation for trending/popular content

0735175 2 months ago

raw

history blame contribute delete

3.04 kB

	"""Comprehensive tests for all built-in site templates."""

	from fastapi.testclient import TestClient

	from app.api.routes.scrape import _create_intelligent_navigation_plan
	from app.sites.registry import list_site_templates, match_site_template
	from app.sites.templates import SITE_TEMPLATES


	def test_site_template_catalog_is_large_and_unique() -> None:
	    """Check the catalog holds at least 50 templates and no ``site_id`` repeats."""

	    template_count = len(SITE_TEMPLATES)
	    assert template_count >= 50

	    # A list keeps duplicates; collapsing it to a set must not shrink it.
	    all_ids = [entry.site_id for entry in SITE_TEMPLATES]
	    distinct_ids = set(all_ids)
	    assert len(all_ids) == len(distinct_ids)


	def test_every_template_matches_its_primary_domain() -> None:
	    """Check that matching on a template's first domain returns that same template.

	    For each entry in ``SITE_TEMPLATES`` the first item of ``domains`` is
	    turned into an ``https://`` URL and handed to ``match_site_template``
	    together with a generic instruction string.
	    """

	    instruction = "extract structured data"
	    for template in SITE_TEMPLATES:
	        primary_domain = template.domains[0]
	        candidate_urls = [f"https://{primary_domain}"]
	        hit = match_site_template(instruction, candidate_urls)
	        assert hit is not None, f"No match for {template.site_id} ({primary_domain})"
	        assert hit.site_id == template.site_id


	def test_every_template_generates_site_aware_navigation_plan() -> None:
	    """Check that the navigation plan built for each template names that template.

	    The plan created from a template's first domain must expose the
	    template's ``site_id`` under ``site_template_id`` and its ``name`` under
	    ``site_template_name``.
	    """

	    instruction = "collect entries"
	    for template in SITE_TEMPLATES:
	        primary_domain = template.domains[0]
	        start_urls = [f"https://{primary_domain}"]
	        navigation_plan = _create_intelligent_navigation_plan(instruction, start_urls)
	        assert navigation_plan.get("site_template_id") == template.site_id
	        assert navigation_plan.get("site_template_name") == template.name


	def test_every_template_available_through_api(client: TestClient) -> None:
	    """Check that ``GET /api/sites/{site_id}`` answers 200 for every template.

	    The JSON body must echo the requested ``site_id``, list at least one
	    domain, and contain a ``navigation_steps`` key.
	    """

	    for template in SITE_TEMPLATES:
	        reply = client.get(f"/api/sites/{template.site_id}")
	        assert reply.status_code == 200, f"API retrieval failed for {template.site_id}"

	        body = reply.json()
	        assert body["site_id"] == template.site_id
	        assert len(body["domains"]) >= 1
	        assert "navigation_steps" in body


	def test_registry_serialization_covers_all_templates() -> None:
	    """Serialized registry output should include every template exactly once.

	    Two properties are checked on the output of ``list_site_templates``:
	    the set of serialized ``site_id`` values equals the set of IDs in
	    ``SITE_TEMPLATES``, and no ``site_id`` appears more than once.
	    """

	    # Materialize once so the result can be both iterated and counted.
	    serialized = list(list_site_templates())
	    serialized_ids = [item["site_id"] for item in serialized]
	    template_ids = {template.site_id for template in SITE_TEMPLATES}

	    assert set(serialized_ids) == template_ids
	    # The set comparison above cannot see repeated entries, so the
	    # "exactly once" half of the contract needs its own check.
	    assert len(serialized_ids) == len(set(serialized_ids)), (
	        "Serialized registry contains duplicate site_id entries"
	    )


	def test_alias_matching_avoids_single_char_false_positive() -> None:
	    """Check that a one-letter alias is not matched inside a longer word.

	    The instruction contains 'extract' (which embeds an 'x') but names no
	    site, so ``match_site_template`` is expected to return ``None``.
	    """

	    instruction = "Find and extract top scraping frameworks with details"
	    assets = ["open source scraping frameworks comparison"]

	    result = match_site_template(instruction, assets)

	    assert result is None


	def test_alias_matching_still_supports_explicit_x_reference() -> None:
	    """Check that a standalone 'x' in the instruction resolves to the ``x`` template."""

	    instruction = "Get top posts from x today"
	    assets = ["social trend query"]

	    result = match_site_template(instruction, assets)

	    assert result is not None
	    assert result.site_id == "x"