NeerajCodz Copilot committed on
Commit
7f50e50
·
1 Parent(s): e13f862

test: add full site template matrix coverage

Browse files

- add registry tests that validate every template domain maps correctly
- add API tests for /api/sites list/get/match endpoints
- add site template matrix report under docs/test
- validate the 56-template catalog with comprehensive assertions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

backend/tests/test_api/test_sites.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for site template API endpoints."""
2
+
3
+ from fastapi.testclient import TestClient
4
+
5
+
6
+ class TestSitesAPI:
7
+ """Validate /api/sites template routes."""
8
+
9
+ def test_list_sites_returns_minimum_templates(self, client: TestClient) -> None:
10
+ """List endpoint should expose a rich inbuilt catalog."""
11
+
12
+ response = client.get("/api/sites")
13
+ assert response.status_code == 200
14
+
15
+ data = response.json()
16
+ assert "sites" in data
17
+ assert "count" in data
18
+ assert data["count"] >= 30
19
+ assert len(data["sites"]) >= 30
20
+
21
+ site_ids = {site["site_id"] for site in data["sites"]}
22
+ assert "reddit" in site_ids
23
+ assert "github" in site_ids
24
+ assert "youtube" in site_ids
25
+
26
+ def test_get_specific_site_template(self, client: TestClient) -> None:
27
+ """Fetch one known site template."""
28
+
29
+ response = client.get("/api/sites/reddit")
30
+ assert response.status_code == 200
31
+ data = response.json()
32
+ assert data["site_id"] == "reddit"
33
+ assert "reddit.com" in data["domains"]
34
+ assert "navigation_steps" in data
35
+ assert len(data["navigation_steps"]) > 0
36
+
37
+ def test_get_unknown_site_template_404(self, client: TestClient) -> None:
38
+ """Unknown site IDs should return 404."""
39
+
40
+ response = client.get("/api/sites/not-a-real-site")
41
+ assert response.status_code == 404
42
+
43
+ def test_match_site_by_asset_domain(self, client: TestClient) -> None:
44
+ """Domain matching should pick correct template."""
45
+
46
+ response = client.post(
47
+ "/api/sites/match",
48
+ json={
49
+ "instructions": "get trending communities",
50
+ "assets": ["https://reddit.com"],
51
+ },
52
+ )
53
+ assert response.status_code == 200
54
+ payload = response.json()
55
+ assert payload["matched"] is True
56
+ assert payload["site"]["site_id"] == "reddit"
57
+
58
+ def test_match_site_by_instruction_alias(self, client: TestClient) -> None:
59
+ """Alias matching should work even when URL is missing."""
60
+
61
+ response = client.post(
62
+ "/api/sites/match",
63
+ json={
64
+ "instructions": "scrape latest youtube videos",
65
+ "assets": [],
66
+ },
67
+ )
68
+ assert response.status_code == 200
69
+ payload = response.json()
70
+ assert payload["matched"] is True
71
+ assert payload["site"]["site_id"] == "youtube"
72
+
73
+ def test_match_site_returns_false_for_unknown(self, client: TestClient) -> None:
74
+ """Matcher should return matched=false when no template fits."""
75
+
76
+ response = client.post(
77
+ "/api/sites/match",
78
+ json={
79
+ "instructions": "scrape intranet dashboard",
80
+ "assets": ["https://internal.local.example"],
81
+ },
82
+ )
83
+ assert response.status_code == 200
84
+ payload = response.json()
85
+ assert payload["matched"] is False
86
+ assert payload["site"] is None
backend/tests/test_sites/test_registry.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Comprehensive tests for all built-in site templates."""
2
+
3
+ from fastapi.testclient import TestClient
4
+
5
+ from app.api.routes.scrape import _create_intelligent_navigation_plan
6
+ from app.sites.registry import list_site_templates, match_site_template
7
+ from app.sites.templates import SITE_TEMPLATES
8
+
9
+
10
+ def test_site_template_catalog_is_large_and_unique() -> None:
11
+ """Catalog should contain many unique templates for production coverage."""
12
+
13
+ assert len(SITE_TEMPLATES) >= 50
14
+ ids = [template.site_id for template in SITE_TEMPLATES]
15
+ assert len(ids) == len(set(ids))
16
+
17
+
18
+ def test_every_template_matches_its_primary_domain() -> None:
19
+ """Every template must be discoverable from its primary domain."""
20
+
21
+ for template in SITE_TEMPLATES:
22
+ primary_domain = template.domains[0]
23
+ matched = match_site_template("extract structured data", [f"https://{primary_domain}"])
24
+ assert matched is not None, f"No match for {template.site_id} ({primary_domain})"
25
+ assert matched.site_id == template.site_id
26
+
27
+
28
+ def test_every_template_generates_site_aware_navigation_plan() -> None:
29
+ """Planner should attach site template metadata for all known domains."""
30
+
31
+ for template in SITE_TEMPLATES:
32
+ primary_domain = template.domains[0]
33
+ plan = _create_intelligent_navigation_plan("collect entries", [f"https://{primary_domain}"])
34
+ assert plan.get("site_template_id") == template.site_id
35
+ assert plan.get("site_template_name") == template.name
36
+
37
+
38
+ def test_every_template_available_through_api(client: TestClient) -> None:
39
+ """Each template should be retrievable via /api/sites/{site_id}."""
40
+
41
+ for template in SITE_TEMPLATES:
42
+ response = client.get(f"/api/sites/{template.site_id}")
43
+ assert response.status_code == 200, f"API retrieval failed for {template.site_id}"
44
+ payload = response.json()
45
+ assert payload["site_id"] == template.site_id
46
+ assert len(payload["domains"]) >= 1
47
+ assert "navigation_steps" in payload
48
+
49
+
50
+ def test_registry_serialization_covers_all_templates() -> None:
51
+ """Serialized registry output should include every template exactly once."""
52
+
53
+ serialized = list_site_templates()
54
+ serialized_ids = {item["site_id"] for item in serialized}
55
+ template_ids = {template.site_id for template in SITE_TEMPLATES}
56
+ assert serialized_ids == template_ids
docs/test/site_template_matrix_report.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Site Template Matrix Test Report
2
+
3
+ **Date:** 2026-04-05
4
+ **Scope:** Backend site-template registry, agent integration, and full template coverage tests
5
+
6
+ ## Summary
7
+
8
+ - Inbuilt templates expanded to **56 sites**
9
+ - Agents now load template context during planning/navigation
10
+ - New API surface added: `/api/sites`, `/api/sites/{site_id}`, `/api/sites/match`
11
+ - Full template test suite added and passing
12
+
13
+ ## Automated Tests
14
+
15
+ Command:
16
+
17
+ ```bash
18
+ cd backend
19
+ python -m pytest tests/test_sites/test_registry.py tests/test_api/test_sites.py -q
20
+ ```
21
+
22
+ Result:
23
+
24
+ - **11 passed**
25
+ - Coverage includes:
26
+ - catalog size and uniqueness
27
+ - domain matching for every template
28
+ - navigation-plan site-template propagation for every template
29
+ - API retrieval for every template
30
+ - registry serialization completeness
31
+
32
+ ## Runtime Validation
33
+
34
+ ### 1. Template catalog endpoint
35
+
36
+ - `GET /api/sites`
37
+ - Result: `count = 56`
38
+
39
+ ### 2. Template match endpoint
40
+
41
+ - `POST /api/sites/match` with `https://reddit.com`
42
+ - Result: `matched = true`, `site_id = reddit`
43
+
44
+ ### 3. Agent template self-reference
45
+
46
+ Reddit scrape stream validation confirmed:
47
+
48
+ - `site_template` step emitted by navigator
49
+ - `planner_python.extracted_data.site_template_id = reddit`
50
+ - `navigator_python.extracted_data.site_template_id = reddit`
51
+
52
+ ### 4. Strategy integration checks
53
+
54
+ - Reddit request → `navigation_strategy = reddit_trending`
55
+ - GitHub trending request → `navigation_strategy = github_trending`
56
+ - Generic known domains (e.g., YouTube) → `site_template_id` populated, strategy-aware exploration
57
+
58
+ ## Folder Structure Additions
59
+
60
+ ```text
61
+ backend/app/sites/
62
+ __init__.py
63
+ models.py
64
+ templates.py
65
+ registry.py
66
+
67
+ backend/tests/test_sites/
68
+ test_registry.py
69
+ ```
70
+
71
+ ## Notes
72
+
73
+ - Reddit direct endpoints are network-blocked in this environment; the scraper uses a fallback strategy while still preserving the template-aware agent flow.
74
+ - Template-aware events are now visible in the execution trace for debugging and orchestration transparency.