Spaces:
Sleeping
Sleeping
| """Comprehensive tests for all built-in site templates.""" | |
| from fastapi.testclient import TestClient | |
| from app.api.routes.scrape import _create_intelligent_navigation_plan | |
| from app.sites.registry import list_site_templates, match_site_template | |
| from app.sites.templates import SITE_TEMPLATES | |
def test_site_template_catalog_is_large_and_unique() -> None:
    """Check that the template catalog is large and free of duplicate ids.

    The catalog must hold at least 50 templates, and no ``site_id`` may
    appear more than once. On failure the assertion message names the
    offending ids instead of only showing two mismatched counts.
    """
    # Function-scope import keeps this test self-contained.
    from collections import Counter

    assert len(SITE_TEMPLATES) >= 50, (
        f"Expected at least 50 site templates, found {len(SITE_TEMPLATES)}"
    )

    id_counts = Counter(template.site_id for template in SITE_TEMPLATES)
    duplicates = [site_id for site_id, count in id_counts.items() if count > 1]
    assert not duplicates, f"Duplicate site_id values in catalog: {duplicates}"
def test_every_template_matches_its_primary_domain() -> None:
    """Each template must be found when matching a URL on its first domain.

    For every entry in ``SITE_TEMPLATES`` an ``https://`` URL is built from
    ``domains[0]`` and passed to ``match_site_template``; the result must be
    a template with the same ``site_id``.
    """
    for template in SITE_TEMPLATES:
        primary_domain = template.domains[0]
        urls = [f"https://{primary_domain}"]

        found = match_site_template("extract structured data", urls)

        # Check for a missing match first so the message names the template.
        assert found is not None, f"No match for {template.site_id} ({primary_domain})"
        assert found.site_id == template.site_id
def test_every_template_generates_site_aware_navigation_plan() -> None:
    """The navigation plan for a known domain must carry template metadata.

    For every entry in ``SITE_TEMPLATES`` a plan is created for an
    ``https://`` URL on ``domains[0]``; the plan's ``site_template_id`` and
    ``site_template_name`` must equal the template's ``site_id`` and ``name``.
    """
    for template in SITE_TEMPLATES:
        primary_domain = template.domains[0]
        target_urls = [f"https://{primary_domain}"]
        nav_plan = _create_intelligent_navigation_plan("collect entries", target_urls)

        plan_template_id = nav_plan.get("site_template_id")
        assert plan_template_id == template.site_id

        plan_template_name = nav_plan.get("site_template_name")
        assert plan_template_name == template.name
def test_every_template_available_through_api(client: TestClient) -> None:
    """Every template must be served by ``GET /api/sites/{site_id}``.

    The response must have status 200, and its JSON body must echo the
    requested ``site_id``, list at least one entry under ``domains`` and
    contain a ``navigation_steps`` key.
    """
    for template in SITE_TEMPLATES:
        endpoint = f"/api/sites/{template.site_id}"
        resp = client.get(endpoint)
        assert resp.status_code == 200, f"API retrieval failed for {template.site_id}"

        body = resp.json()
        assert body["site_id"] == template.site_id
        assert len(body["domains"]) >= 1
        assert "navigation_steps" in body
def test_registry_serialization_covers_all_templates() -> None:
    """Serialized registry output should include every template exactly once.

    Two things are checked: the set of serialized ``site_id`` values equals
    the set of ids in ``SITE_TEMPLATES`` (nothing missing, nothing extra), and
    no ``site_id`` is repeated in the serialized output. Comparing sets alone
    would silently collapse repeated entries, so the repeat check is explicit.
    """
    # Function-scope import keeps this test self-contained.
    from collections import Counter

    serialized_ids = [item["site_id"] for item in list_site_templates()]
    template_ids = {template.site_id for template in SITE_TEMPLATES}

    assert set(serialized_ids) == template_ids

    repeated = [sid for sid, count in Counter(serialized_ids).items() if count > 1]
    assert not repeated, f"Serialized registry repeats site_id values: {repeated}"
def test_alias_matching_avoids_single_char_false_positive() -> None:
    """A one-letter alias must not match inside a longer word.

    The instruction contains words such as 'extract' that include the letter
    'x' but makes no reference to any site, so no template should match.
    """
    instruction = "Find and extract top scraping frameworks with details"
    sources = ["open source scraping frameworks comparison"]

    result = match_site_template(instruction, sources)

    assert result is None
def test_alias_matching_still_supports_explicit_x_reference() -> None:
    """A standalone 'x' in the instruction must still select the X template."""
    instruction = "Get top posts from x today"
    sources = ["social trend query"]

    result = match_site_template(instruction, sources)

    # Check for a missing match first so the failure is a clean assertion.
    assert result is not None
    assert result.site_id == "x"