Spaces:
Sleeping
Sleeping
Commit ·
7f50e50
1
Parent(s): e13f862
test: add full site template matrix coverage
Browse files- add registry tests that validate every template domain maps correctly
- add API tests for /api/sites list/get/match endpoints
- add site template matrix report under docs/test
- validate 56 template catalog with comprehensive assertions
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
backend/tests/test_api/test_sites.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for site template API endpoints."""
|
| 2 |
+
|
| 3 |
+
from fastapi.testclient import TestClient
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class TestSitesAPI:
    """Exercise the list, detail, and match routes under /api/sites.

    Every test receives a ``client`` fixture; it is not defined in this
    file (presumably supplied by a shared conftest - confirm).
    """

    @staticmethod
    def _post_match(client: TestClient, instructions: str, assets: list):
        """POST an instructions/assets pair to /api/sites/match and return the response."""
        body = {"instructions": instructions, "assets": assets}
        return client.post("/api/sites/match", json=body)

    def test_list_sites_returns_minimum_templates(self, client: TestClient) -> None:
        """GET /api/sites returns 200 with at least 30 templates, including well-known IDs."""
        resp = client.get("/api/sites")
        assert resp.status_code == 200

        body = resp.json()
        # Both top-level keys must be present before their values are inspected.
        for key in ("sites", "count"):
            assert key in body
        assert body["count"] >= 30
        assert len(body["sites"]) >= 30

        listed_ids = {entry["site_id"] for entry in body["sites"]}
        for expected_id in ("reddit", "github", "youtube"):
            assert expected_id in listed_ids

    def test_get_specific_site_template(self, client: TestClient) -> None:
        """GET /api/sites/reddit returns the reddit template with domains and navigation steps."""
        resp = client.get("/api/sites/reddit")
        assert resp.status_code == 200

        template = resp.json()
        assert template["site_id"] == "reddit"
        assert "reddit.com" in template["domains"]
        assert "navigation_steps" in template
        assert len(template["navigation_steps"]) > 0

    def test_get_unknown_site_template_404(self, client: TestClient) -> None:
        """Requesting a site ID that is not in the catalog yields HTTP 404."""
        resp = client.get("/api/sites/not-a-real-site")
        assert resp.status_code == 404

    def test_match_site_by_asset_domain(self, client: TestClient) -> None:
        """A reddit.com asset URL is matched to the reddit template."""
        resp = self._post_match(
            client,
            "get trending communities",
            ["https://reddit.com"],
        )
        assert resp.status_code == 200

        result = resp.json()
        assert result["matched"] is True
        assert result["site"]["site_id"] == "reddit"

    def test_match_site_by_instruction_alias(self, client: TestClient) -> None:
        """With no assets, a site name in the instructions still selects the youtube template."""
        resp = self._post_match(client, "scrape latest youtube videos", [])
        assert resp.status_code == 200

        result = resp.json()
        assert result["matched"] is True
        assert result["site"]["site_id"] == "youtube"

    def test_match_site_returns_false_for_unknown(self, client: TestClient) -> None:
        """An unrecognized domain and instruction give matched=False and a null site."""
        resp = self._post_match(
            client,
            "scrape intranet dashboard",
            ["https://internal.local.example"],
        )
        assert resp.status_code == 200

        result = resp.json()
        assert result["matched"] is False
        assert result["site"] is None
|
backend/tests/test_sites/test_registry.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Comprehensive tests for all built-in site templates."""
|
| 2 |
+
|
| 3 |
+
from fastapi.testclient import TestClient
|
| 4 |
+
|
| 5 |
+
from app.api.routes.scrape import _create_intelligent_navigation_plan
|
| 6 |
+
from app.sites.registry import list_site_templates, match_site_template
|
| 7 |
+
from app.sites.templates import SITE_TEMPLATES
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def test_site_template_catalog_is_large_and_unique() -> None:
    """Require at least 50 catalog entries and no repeated ``site_id`` among them."""
    template_count = len(SITE_TEMPLATES)
    assert template_count >= 50

    # Collapsing the IDs into a set drops repeats, so equal sizes mean all IDs are distinct.
    distinct_ids = {entry.site_id for entry in SITE_TEMPLATES}
    assert len(distinct_ids) == template_count
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_every_template_matches_its_primary_domain() -> None:
    """Check that each template is resolved from an https URL on its first listed domain."""
    for entry in SITE_TEMPLATES:
        domain = entry.domains[0]
        asset_url = f"https://{domain}"
        result = match_site_template("extract structured data", [asset_url])

        assert result is not None, f"No match for {entry.site_id} ({domain})"
        # The match must be this template, not another one claiming the same domain.
        assert result.site_id == entry.site_id
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def test_every_template_generates_site_aware_navigation_plan() -> None:
    """Check that the navigation plan for each template's first domain carries its ID and name."""
    for entry in SITE_TEMPLATES:
        asset_url = f"https://{entry.domains[0]}"
        nav_plan = _create_intelligent_navigation_plan("collect entries", [asset_url])

        # .get() yields None for a missing key, which fails the comparison instead of raising.
        assert nav_plan.get("site_template_id") == entry.site_id
        assert nav_plan.get("site_template_name") == entry.name
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def test_every_template_available_through_api(client: TestClient) -> None:
    """Fetch every catalog entry via /api/sites/{site_id} and sanity-check the payload.

    ``client`` is a fixture not defined in this file (presumably provided
    by a shared conftest - confirm).
    """
    for entry in SITE_TEMPLATES:
        expected_id = entry.site_id
        resp = client.get(f"/api/sites/{expected_id}")
        assert resp.status_code == 200, f"API retrieval failed for {expected_id}"

        body = resp.json()
        assert body["site_id"] == expected_id
        assert len(body["domains"]) >= 1
        assert "navigation_steps" in body
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def test_registry_serialization_covers_all_templates() -> None:
    """Serialized registry output should include every template exactly once.

    Two properties are checked on the result of ``list_site_templates()``:

    * no ``site_id`` appears more than once in the serialized items, and
    * the set of serialized ``site_id`` values equals the set of
      ``site_id`` values in ``SITE_TEMPLATES``.
    """
    serialized_ids = [item["site_id"] for item in list_site_templates()]
    template_ids = {template.site_id for template in SITE_TEMPLATES}

    # Comparing sets alone would silently accept repeated entries, so the
    # "exactly once" half of the contract is verified separately.
    seen = set()
    duplicates = set()
    for site_id in serialized_ids:
        if site_id in seen:
            duplicates.add(site_id)
        seen.add(site_id)

    assert not duplicates, f"Duplicate site_id entries in serialized registry: {duplicates}"
    assert seen == template_ids
|
docs/test/site_template_matrix_report.md
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Site Template Matrix Test Report
|
| 2 |
+
|
| 3 |
+
**Date:** 2026-04-05
|
| 4 |
+
**Scope:** Backend site-template registry, agent integration, and full template coverage tests
|
| 5 |
+
|
| 6 |
+
## Summary
|
| 7 |
+
|
| 8 |
+
- Built-in templates expanded to **56 sites**
|
| 9 |
+
- Agents now load template context during planning/navigation
|
| 10 |
+
- New API surface added: `/api/sites`, `/api/sites/{site_id}`, `/api/sites/match`
|
| 11 |
+
- Full template test suite added and passing
|
| 12 |
+
|
| 13 |
+
## Automated Tests
|
| 14 |
+
|
| 15 |
+
Command:
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
cd backend
|
| 19 |
+
python -m pytest tests/test_sites/test_registry.py tests/test_api/test_sites.py -q
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
Result:
|
| 23 |
+
|
| 24 |
+
- **11 passed**
|
| 25 |
+
- Coverage includes:
|
| 26 |
+
- catalog size and uniqueness
|
| 27 |
+
- domain matching for every template
|
| 28 |
+
- navigation-plan site-template propagation for every template
|
| 29 |
+
- API retrieval for every template
|
| 30 |
+
- registry serialization completeness
|
| 31 |
+
|
| 32 |
+
## Runtime Validation
|
| 33 |
+
|
| 34 |
+
### 1. Template catalog endpoint
|
| 35 |
+
|
| 36 |
+
- `GET /api/sites`
|
| 37 |
+
- Result: `count = 56`
|
| 38 |
+
|
| 39 |
+
### 2. Template match endpoint
|
| 40 |
+
|
| 41 |
+
- `POST /api/sites/match` with `https://reddit.com`
|
| 42 |
+
- Result: `matched = true`, `site_id = reddit`
|
| 43 |
+
|
| 44 |
+
### 3. Agent template self-reference
|
| 45 |
+
|
| 46 |
+
Reddit scrape stream validation confirmed:
|
| 47 |
+
|
| 48 |
+
- `site_template` step emitted by navigator
|
| 49 |
+
- `planner_python.extracted_data.site_template_id = reddit`
|
| 50 |
+
- `navigator_python.extracted_data.site_template_id = reddit`
|
| 51 |
+
|
| 52 |
+
### 4. Strategy integration checks
|
| 53 |
+
|
| 54 |
+
- Reddit request → `navigation_strategy = reddit_trending`
|
| 55 |
+
- GitHub trending request → `navigation_strategy = github_trending`
|
| 56 |
+
- Generic known domains (e.g., YouTube) → `site_template_id` populated, strategy-aware exploration
|
| 57 |
+
|
| 58 |
+
## Folder Structure Additions
|
| 59 |
+
|
| 60 |
+
```text
|
| 61 |
+
backend/app/sites/
|
| 62 |
+
__init__.py
|
| 63 |
+
models.py
|
| 64 |
+
templates.py
|
| 65 |
+
registry.py
|
| 66 |
+
|
| 67 |
+
backend/tests/test_sites/
|
| 68 |
+
test_registry.py
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
## Notes
|
| 72 |
+
|
| 73 |
+
- Reddit direct endpoints are network-blocked in this environment; the scraper uses a fallback strategy while still preserving the template-aware agent flow.
|
| 74 |
+
- Template-aware events are now visible in the execution trace for debugging and orchestration transparency.
|