scrapeRL / backend /tests /test_sites /test_registry.py
NeerajCodz's picture
feat: intelligent search-based navigation for trending/popular content
0735175
"""Comprehensive tests for all built-in site templates."""
from fastapi.testclient import TestClient
from app.api.routes.scrape import _create_intelligent_navigation_plan
from app.sites.registry import list_site_templates, match_site_template
from app.sites.templates import SITE_TEMPLATES
def test_site_template_catalog_is_large_and_unique() -> None:
    """Check the catalog has at least 50 templates and no repeated site_id."""
    assert len(SITE_TEMPLATES) >= 50

    # Compare the raw id count against the de-duplicated count.
    all_ids = [entry.site_id for entry in SITE_TEMPLATES]
    distinct_ids = set(all_ids)
    assert len(all_ids) == len(distinct_ids)
def test_every_template_matches_its_primary_domain() -> None:
    """Matching on a template's first listed domain must resolve to that template."""
    instruction = "extract structured data"
    for template in SITE_TEMPLATES:
        primary_domain = template.domains[0]
        candidate_urls = [f"https://{primary_domain}"]
        resolved = match_site_template(instruction, candidate_urls)

        assert resolved is not None, f"No match for {template.site_id} ({primary_domain})"
        assert resolved.site_id == template.site_id
def test_every_template_generates_site_aware_navigation_plan() -> None:
    """The navigation plan built for each template's first domain must carry its id and name."""
    instruction = "collect entries"
    for template in SITE_TEMPLATES:
        primary_domain = template.domains[0]
        start_urls = [f"https://{primary_domain}"]
        nav_plan = _create_intelligent_navigation_plan(instruction, start_urls)

        # The plan is read with .get(), so a missing key yields None and fails the comparison.
        assert nav_plan.get("site_template_id") == template.site_id
        assert nav_plan.get("site_template_name") == template.name
def test_every_template_available_through_api(client: TestClient) -> None:
    """GET /api/sites/{site_id} must return 200 and a well-formed body for every template."""
    for template in SITE_TEMPLATES:
        endpoint = f"/api/sites/{template.site_id}"
        reply = client.get(endpoint)
        assert reply.status_code == 200, f"API retrieval failed for {template.site_id}"

        body = reply.json()
        assert body["site_id"] == template.site_id
        assert len(body["domains"]) >= 1
        assert "navigation_steps" in body
def test_registry_serialization_covers_all_templates() -> None:
    """Serialized registry output should include every template exactly once.

    Two properties are checked on the result of ``list_site_templates()``:

    * no ``site_id`` appears more than once in the serialized items, and
    * the set of serialized ids equals the set of ids in ``SITE_TEMPLATES``.
    """
    # Keep the ids in a list: collapsing straight into a set would silently
    # hide a template that was serialized twice.
    serialized_ids = [item["site_id"] for item in list_site_templates()]
    template_ids = {template.site_id for template in SITE_TEMPLATES}

    duplicates = {
        site_id for site_id in serialized_ids if serialized_ids.count(site_id) > 1
    }
    assert not duplicates, f"Duplicate site_id values in serialized registry: {duplicates}"
    assert set(serialized_ids) == template_ids
def test_alias_matching_avoids_single_char_false_positive() -> None:
    """A one-character alias (e.g. 'x') must not match inside a longer word such as 'extract'."""
    instruction = "Find and extract top scraping frameworks with details"
    sources = ["open source scraping frameworks comparison"]

    # Neither argument names a known site, so no template should be returned.
    assert match_site_template(instruction, sources) is None
def test_alias_matching_still_supports_explicit_x_reference() -> None:
    """A standalone 'x' in the instruction must still resolve to the template with site_id 'x'."""
    instruction = "Get top posts from x today"
    sources = ["social trend query"]

    resolved = match_site_template(instruction, sources)
    assert resolved is not None
    assert resolved.site_id == "x"