NeerajCodz Copilot committed on
Commit
e13f862
·
1 Parent(s): 6452b60

feat: add site template registry and agent integration

Browse files

- add backend/app/sites template catalog with 56 templates
- expose /api/sites list/get/match endpoints
- wire scrape planner/navigator to resolve and reference templates
- add per-url template-aware strategy selection for scraping

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

backend/app/api/routes/__init__.py CHANGED
@@ -1,5 +1,5 @@
1
  """API routes package."""
2
 
3
- from app.api.routes import agents, episode, health, memory, tasks, tools
4
 
5
- __all__ = ["agents", "episode", "health", "memory", "tasks", "tools"]
 
1
  """API routes package."""
2
 
3
+ from app.api.routes import agents, episode, health, memory, sites, tasks, tools
4
 
5
+ __all__ = ["agents", "episode", "health", "memory", "sites", "tasks", "tools"]
backend/app/api/routes/scrape.py CHANGED
@@ -16,7 +16,9 @@ from datetime import datetime, timezone
16
  from enum import Enum
17
  from pathlib import Path
18
  from typing import Any, AsyncGenerator
 
19
  from urllib.parse import quote_plus, urlparse
 
20
 
21
  from bs4 import BeautifulSoup
22
  from fastapi import APIRouter, BackgroundTasks, HTTPException
@@ -41,6 +43,7 @@ from app.plugins.python_sandbox import (
41
  )
42
  from app.search.engine import SearchEngineRouter
43
  from app.search.providers.duckduckgo import DuckDuckGoProvider
 
44
 
45
  logger = logging.getLogger(__name__)
46
  router = APIRouter(prefix="/scrape", tags=["Scraping"])
@@ -153,6 +156,13 @@ def get_session(session_id: str) -> dict[str, Any] | None:
153
  return _active_sessions.get(session_id)
154
 
155
 
 
 
 
 
 
 
 
156
  def _resolve_enabled_plugins(
157
  requested_plugins: list[str],
158
  ) -> tuple[list[str], list[str]]:
@@ -163,12 +173,18 @@ def _resolve_enabled_plugins(
163
 
164
  available: set[str] = {
165
  plugin["id"]
166
- for category in PLUGIN_REGISTRY.values()
 
167
  for plugin in category
168
  if plugin.get("installed")
169
  }
170
- enabled = [plugin_id for plugin_id in requested_plugins if plugin_id in available]
171
- missing = [plugin_id for plugin_id in requested_plugins if plugin_id not in available]
 
 
 
 
 
172
  return enabled, missing
173
 
174
 
@@ -368,30 +384,60 @@ def _extract_fields_for_complexity(complexity: TaskComplexity) -> list[str]:
368
  return fields
369
 
370
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  def _create_intelligent_navigation_plan(instructions: str, assets: list[str]) -> dict[str, Any]:
372
  """Create an intelligent navigation plan based on user instructions."""
373
 
374
  instructions_lower = instructions.lower()
375
- asset_url = assets[0] if assets else ""
376
-
377
- # GitHub trending repositories detection
378
- if "trending" in instructions_lower and "repo" in instructions_lower and "github" in asset_url:
379
- return {
380
- "strategy": "github_trending",
381
- "target_urls": [
382
- "https://github.com/trending",
383
- "https://github.com/trending?since=daily",
384
- "https://github.com/trending?since=weekly"
385
- ],
386
- "navigation_steps": [
387
- "Navigate to GitHub trending page",
388
- "Extract trending repository information",
389
- "Follow pagination if available",
390
- "Collect repository data: name, stars, forks, description"
391
- ],
392
- "extraction_goal": "trending_repositories",
393
- "output_fields": ["username", "repo_name", "stars", "forks", "description"]
394
- }
 
 
 
 
395
 
396
  # News articles detection
397
  elif any(word in instructions_lower for word in ["news", "article", "headline"]):
@@ -422,7 +468,10 @@ def _create_intelligent_navigation_plan(instructions: str, assets: list[str]) ->
422
  return {
423
  "strategy": "single_page",
424
  "navigation_steps": ["Extract content from provided URL"],
425
- "extraction_goal": "basic_extraction"
 
 
 
426
  }
427
 
428
 
@@ -471,6 +520,45 @@ async def _search_urls_with_mcp(query: str, max_results: int = 6) -> list[str]:
471
  await router.shutdown()
472
 
473
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  async def _resolve_assets(
475
  assets: list[str],
476
  enabled_plugins: list[str],
@@ -587,6 +675,28 @@ def _build_gold_dataset_rows(
587
  return ordered
588
 
589
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
590
  async def _store_url_memory(
591
  session_id: str,
592
  url: str,
@@ -776,7 +886,14 @@ async def scrape_url_intelligently(
776
  session, session_id, env, request, navigation_plan, step_num, total_reward
777
  ):
778
  yield event
779
-
 
 
 
 
 
 
 
780
  # General exploration strategy
781
  elif navigation_plan["strategy"] == "intelligent_exploration":
782
  async for event in _scrape_with_exploration(
@@ -984,6 +1101,445 @@ async def _scrape_github_trending(
984
  )
985
 
986
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
987
  async def _scrape_single_page(
988
  session: dict[str, Any],
989
  session_id: str,
@@ -1086,6 +1642,7 @@ async def _scrape_single_page(
1086
  step_num += 1
1087
  extracted_count = len([f for f in fields_to_extract if f in extracted])
1088
  verification_score = extracted_count / len(fields_to_extract) if fields_to_extract else 0.0
 
1089
 
1090
  yield _record_step(
1091
  session,
@@ -1108,8 +1665,8 @@ async def _scrape_single_page(
1108
  parameters={"success": True},
1109
  reasoning="Extraction complete",
1110
  )
1111
- _, reward, _, _, _, _ = await env.step(done_action)
1112
- total_reward += reward
1113
 
1114
  yield _record_step(
1115
  session,
@@ -1119,8 +1676,8 @@ async def _scrape_single_page(
1119
  url=url,
1120
  status="completed",
1121
  message=f"Completed scraping {url}",
1122
- reward=total_reward,
1123
- extracted_data=extracted,
1124
  timestamp=_now_iso(),
1125
  ),
1126
  )
@@ -1196,7 +1753,10 @@ async def scrape_stream(
1196
  "enabled": enabled_plugins,
1197
  "missing": missing_plugins,
1198
  "navigation_strategy": navigation_plan["strategy"],
1199
- "extraction_goal": navigation_plan["extraction_goal"]
 
 
 
1200
  },
1201
  timestamp=_now_iso(),
1202
  ),
@@ -1225,6 +1785,11 @@ async def scrape_stream(
1225
  await manager.broadcast(discovery_event, session_id)
1226
  yield _sse_event(discovery_event)
1227
 
 
 
 
 
 
1228
  if request.enable_memory:
1229
  try:
1230
  await memory_manager.store(
@@ -1270,6 +1835,7 @@ async def scrape_stream(
1270
  "assets": resolved_assets,
1271
  "instructions": request.instructions,
1272
  "output_instructions": request.output_instructions,
 
1273
  },
1274
  timestamp=_now_iso(),
1275
  ),
@@ -1284,12 +1850,15 @@ async def scrape_stream(
1284
  "output_instructions": request.output_instructions,
1285
  "resolved_assets": resolved_assets,
1286
  "selected_agents": request.selected_agents,
 
1287
  }
1288
  planner_code = (
1289
  "result = {"
1290
  "'phase': payload.get('phase'), "
1291
  "'asset_count': len(payload.get('resolved_assets') or []), "
1292
- "'selected_agents': payload.get('selected_agents') or []"
 
 
1293
  "}"
1294
  )
1295
  try:
@@ -1327,6 +1896,31 @@ async def scrape_stream(
1327
 
1328
  for idx, url in enumerate(resolved_assets):
1329
  session["current_url_index"] = idx
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1330
  navigator_event = _record_step(
1331
  session,
1332
  ScrapeStep(
@@ -1334,8 +1928,15 @@ async def scrape_stream(
1334
  action="navigator",
1335
  url=url,
1336
  status="running",
1337
- message=f"Navigator selected source {idx + 1}/{len(resolved_assets)}",
 
 
 
1338
  reward=0.05, # Small reward for navigator selection
 
 
 
 
1339
  timestamp=_now_iso(),
1340
  ),
1341
  )
@@ -1348,12 +1949,16 @@ async def scrape_stream(
1348
  "url": url,
1349
  "index": idx,
1350
  "total": len(resolved_assets),
 
 
1351
  }
1352
  navigator_code = (
1353
  "result = {"
1354
  "'phase': payload.get('phase'), "
1355
  "'selected_url': payload.get('url'), "
1356
- "'progress': f\"{payload.get('index', 0) + 1}/{payload.get('total', 0)}\""
 
 
1357
  "}"
1358
  )
1359
  try:
@@ -1402,7 +2007,7 @@ async def scrape_stream(
1402
  request,
1403
  memory_manager,
1404
  enabled_plugins,
1405
- navigation_plan,
1406
  ):
1407
  await manager.broadcast(update, session_id)
1408
  yield _sse_event(update)
@@ -1454,7 +2059,10 @@ async def scrape_stream(
1454
  else:
1455
  session["errors"].append("No monthly gold rows were extracted from resolved sources.")
1456
 
1457
- if any(plugin_id in enabled_plugins for plugin_id in python_plugin_ids):
 
 
 
1458
  extracted_payload = session["extracted_data"]
1459
  dataset_rows: list[dict[str, Any]] = []
1460
  source_links: list[str] = []
 
16
  from enum import Enum
17
  from pathlib import Path
18
  from typing import Any, AsyncGenerator
19
+ from urllib.error import HTTPError, URLError
20
  from urllib.parse import quote_plus, urlparse
21
+ from urllib.request import Request, urlopen
22
 
23
  from bs4 import BeautifulSoup
24
  from fastapi import APIRouter, BackgroundTasks, HTTPException
 
43
  )
44
  from app.search.engine import SearchEngineRouter
45
  from app.search.providers.duckduckgo import DuckDuckGoProvider
46
+ from app.sites import match_site_template, serialize_site_template
47
 
48
  logger = logging.getLogger(__name__)
49
  router = APIRouter(prefix="/scrape", tags=["Scraping"])
 
156
  return _active_sessions.get(session_id)
157
 
158
 
159
+ def _is_agent_plugin_id(plugin_id: str) -> bool:
160
+ """Check if a plugin id actually belongs to an agent/skill."""
161
+
162
+ lowered = plugin_id.lower()
163
+ return lowered.startswith("skill-") or lowered == "web_scraper"
164
+
165
+
166
  def _resolve_enabled_plugins(
167
  requested_plugins: list[str],
168
  ) -> tuple[list[str], list[str]]:
 
173
 
174
  available: set[str] = {
175
  plugin["id"]
176
+ for category_name, category in PLUGIN_REGISTRY.items()
177
+ if category_name != "skills"
178
  for plugin in category
179
  if plugin.get("installed")
180
  }
181
+ unique_requested = list(dict.fromkeys(requested_plugins))
182
+ enabled = [plugin_id for plugin_id in unique_requested if plugin_id in available]
183
+ missing = [
184
+ plugin_id
185
+ for plugin_id in unique_requested
186
+ if plugin_id not in available and not _is_agent_plugin_id(plugin_id)
187
+ ]
188
  return enabled, missing
189
 
190
 
 
384
  return fields
385
 
386
 
387
+ def _plan_from_site_template(
388
+ site_template: Any,
389
+ strategy_override: str | None = None,
390
+ extraction_goal_override: str | None = None,
391
+ ) -> dict[str, Any]:
392
+ """Build a navigation plan from a matched site template."""
393
+
394
+ target_urls = list(site_template.target_urls) if site_template.target_urls else []
395
+ if not target_urls and site_template.domains:
396
+ target_urls = [f"https://{site_template.domains[0]}"]
397
+
398
+ return {
399
+ "strategy": strategy_override or "intelligent_exploration",
400
+ "target_urls": target_urls,
401
+ "navigation_steps": list(site_template.navigation_steps) or [
402
+ "Navigate to site and identify relevant sections",
403
+ "Extract structured fields aligned with instructions",
404
+ ],
405
+ "extraction_goal": extraction_goal_override or site_template.extraction_goal,
406
+ "output_fields": list(site_template.output_fields),
407
+ "site_template_id": site_template.site_id,
408
+ "site_template_name": site_template.name,
409
+ "site_template_domains": list(site_template.domains),
410
+ }
411
+
412
+
413
  def _create_intelligent_navigation_plan(instructions: str, assets: list[str]) -> dict[str, Any]:
414
  """Create an intelligent navigation plan based on user instructions."""
415
 
416
  instructions_lower = instructions.lower()
417
+ site_template = match_site_template(instructions, assets)
418
+
419
+ # Site-specific strategy overrides
420
+ if site_template and site_template.site_id == "github":
421
+ if "trending" in instructions_lower and "repo" in instructions_lower:
422
+ return _plan_from_site_template(
423
+ site_template,
424
+ strategy_override="github_trending",
425
+ extraction_goal_override="trending_repositories",
426
+ )
427
+
428
+ if site_template and site_template.site_id == "reddit":
429
+ if any(
430
+ token in instructions_lower
431
+ for token in ("trending", "popular", "community", "communities", "subreddit", "subreddits")
432
+ ):
433
+ return _plan_from_site_template(
434
+ site_template,
435
+ strategy_override="reddit_trending",
436
+ extraction_goal_override="trending_communities",
437
+ )
438
+
439
+ if site_template:
440
+ return _plan_from_site_template(site_template)
441
 
442
  # News articles detection
443
  elif any(word in instructions_lower for word in ["news", "article", "headline"]):
 
468
  return {
469
  "strategy": "single_page",
470
  "navigation_steps": ["Extract content from provided URL"],
471
+ "extraction_goal": "basic_extraction",
472
+ "site_template_id": None,
473
+ "site_template_name": None,
474
+ "site_template_domains": [],
475
  }
476
 
477
 
 
520
  await router.shutdown()
521
 
522
 
523
async def _discover_reddit_communities_via_search(limit: int = 25) -> list[dict[str, Any]]:
    """Collect subreddit rows by mining search results for ``reddit.com/r/<name>`` URLs.

    Three fixed queries are issued through ``_search_urls_with_mcp``. Each
    distinct subreddit name (case-insensitive, excluding a few listing names
    such as ``popular`` and ``all``) becomes one row with zeroed counters.
    Collection stops as soon as ``limit`` rows have been gathered.
    """

    subreddit_pattern = re.compile(r"reddit\.com/r/([A-Za-z0-9_]+)/?", flags=re.IGNORECASE)
    search_queries = (
        "site:reddit.com/r popular communities",
        "reddit popular subreddits list",
        "best reddit communities technology",
    )
    # Listing/aggregate names that are not real communities.
    skipped_names = {"popular", "all", "announcements", "new", "top", "best"}
    known_names: set[str] = set()
    rows: list[dict[str, Any]] = []

    for search_query in search_queries:
        for result_url in await _search_urls_with_mcp(search_query, max_results=18):
            found = subreddit_pattern.search(result_url)
            if found is None:
                continue

            subreddit_name = found.group(1)
            dedupe_key = subreddit_name.lower()
            if dedupe_key in skipped_names or dedupe_key in known_names:
                continue
            known_names.add(dedupe_key)

            label = f"r/{subreddit_name}"
            rows.append(
                {
                    "subreddit": label,
                    "title": label,
                    "subscribers": 0,
                    "active_users": 0,
                    "url": f"https://www.reddit.com/r/{subreddit_name}/",
                    "description": "Discovered via search fallback",
                }
            )
            if len(rows) >= limit:
                return rows

    return rows
560
+
561
+
562
  async def _resolve_assets(
563
  assets: list[str],
564
  enabled_plugins: list[str],
 
675
  return ordered
676
 
677
 
678
def _should_run_python_sandbox(request: ScrapeRequest, extracted_data: dict[str, Any]) -> bool:
    """Report whether the sandbox analysis step is worth running.

    Returns True when the request carries ``python_code``, when
    ``extracted_data["rows"]`` is a non-empty list, or when any dict-valued
    entry of ``extracted_data`` holds a non-empty ``"data"`` or ``"tables"``
    list. Returns False otherwise, including for empty or non-dict input.
    """

    def _is_filled_list(candidate: Any) -> bool:
        """True only for list instances that contain at least one item."""
        return isinstance(candidate, list) and len(candidate) > 0

    if request.python_code:
        return True
    if not (isinstance(extracted_data, dict) and extracted_data):
        return False
    if _is_filled_list(extracted_data.get("rows")):
        return True

    return any(
        _is_filled_list(entry.get("data")) or _is_filled_list(entry.get("tables"))
        for entry in extracted_data.values()
        if isinstance(entry, dict)
    )
698
+
699
+
700
  async def _store_url_memory(
701
  session_id: str,
702
  url: str,
 
886
  session, session_id, env, request, navigation_plan, step_num, total_reward
887
  ):
888
  yield event
889
+
890
+ # Reddit popular/trending communities strategy
891
+ elif navigation_plan["strategy"] == "reddit_trending":
892
+ async for event in _scrape_reddit_trending(
893
+ session, session_id, env, request, url, step_num, total_reward
894
+ ):
895
+ yield event
896
+
897
  # General exploration strategy
898
  elif navigation_plan["strategy"] == "intelligent_exploration":
899
  async for event in _scrape_with_exploration(
 
1101
  )
1102
 
1103
 
1104
+ def _to_int(value: Any) -> int:
1105
+ """Convert a value to int safely."""
1106
+
1107
+ if value is None:
1108
+ return 0
1109
+ if isinstance(value, bool):
1110
+ return int(value)
1111
+ if isinstance(value, (int, float)):
1112
+ return int(value)
1113
+ digits = re.sub(r"[^\d]", "", str(value))
1114
+ if not digits:
1115
+ return 0
1116
+ try:
1117
+ return int(digits)
1118
+ except ValueError:
1119
+ return 0
1120
+
1121
+
1122
+ def _is_reddit_challenge_page(page_html: str) -> bool:
1123
+ """Check if Reddit returned a bot-verification challenge page."""
1124
+
1125
+ lowered = page_html.lower()
1126
+ challenge_markers = [
1127
+ "please wait for verification",
1128
+ "js_challenge",
1129
+ "captcha",
1130
+ "verify you are human",
1131
+ "checking your browser",
1132
+ ]
1133
+ return any(marker in lowered for marker in challenge_markers)
1134
+
1135
+
1136
+ def _extract_reddit_communities_from_payload(
1137
+ payload: dict[str, Any],
1138
+ limit: int = 25,
1139
+ ) -> list[dict[str, Any]]:
1140
+ """Extract subreddit rows from Reddit JSON payload."""
1141
+
1142
+ communities: list[dict[str, Any]] = []
1143
+ seen: set[str] = set()
1144
+
1145
+ children = payload.get("data", {}).get("children", [])
1146
+ if not isinstance(children, list):
1147
+ return communities
1148
+
1149
+ for child in children:
1150
+ if not isinstance(child, dict):
1151
+ continue
1152
+ data = child.get("data", {})
1153
+ if not isinstance(data, dict):
1154
+ continue
1155
+
1156
+ name = str(
1157
+ data.get("display_name")
1158
+ or str(data.get("display_name_prefixed", "")).replace("r/", "")
1159
+ ).strip()
1160
+ if not name:
1161
+ continue
1162
+ normalized = name.lower()
1163
+ if normalized in seen:
1164
+ continue
1165
+ seen.add(normalized)
1166
+
1167
+ permalink = str(data.get("url") or f"/r/{name}/")
1168
+ community_url = permalink if permalink.startswith("http") else f"https://www.reddit.com{permalink}"
1169
+
1170
+ communities.append(
1171
+ {
1172
+ "subreddit": f"r/{name}",
1173
+ "title": str(data.get("title") or data.get("public_description") or ""),
1174
+ "subscribers": _to_int(data.get("subscribers")),
1175
+ "active_users": _to_int(
1176
+ data.get("active_user_count") or data.get("accounts_active")
1177
+ ),
1178
+ "url": community_url,
1179
+ "description": str(data.get("public_description") or ""),
1180
+ }
1181
+ )
1182
+ if len(communities) >= limit:
1183
+ break
1184
+
1185
+ communities.sort(key=lambda row: row.get("subscribers", 0), reverse=True)
1186
+ return communities[:limit]
1187
+
1188
+
1189
def _extract_reddit_communities_from_html(
    page_html: str,
    limit: int = 25,
) -> list[dict[str, Any]]:
    """Build subreddit rows from the ``/r/<name>`` links found in an HTML page.

    Every anchor with an ``href`` is inspected. The first link seen for each
    subreddit name (case-insensitive, skipping ``popular`` and ``all``)
    produces one row whose title is the anchor text and whose counters are
    zero. Scanning stops once ``limit`` rows exist.
    """

    subreddit_link = re.compile(r"/r/([A-Za-z0-9_]+)")
    rows: list[dict[str, Any]] = []
    known_names: set[str] = set()

    for link in parse_html(page_html).find_all("a", href=True):
        target = str(link.get("href", ""))
        found = subreddit_link.search(target)
        if found is None:
            continue

        subreddit_name = found.group(1)
        dedupe_key = subreddit_name.lower()
        if dedupe_key in {"popular", "all"} or dedupe_key in known_names:
            continue
        known_names.add(dedupe_key)

        # Absolute links are kept verbatim; relative ones are rebuilt on the
        # canonical Reddit host.
        if target.startswith("http"):
            resolved_url = target
        else:
            resolved_url = f"https://www.reddit.com/r/{subreddit_name}/"

        rows.append(
            {
                "subreddit": f"r/{subreddit_name}",
                "title": link.get_text(strip=True),
                "subscribers": 0,
                "active_users": 0,
                "url": resolved_url,
                "description": "",
            }
        )
        if len(rows) >= limit:
            break

    return rows
1229
+
1230
+
1231
def _fetch_reddit_communities(limit: int = 25) -> tuple[list[dict[str, Any]], str]:
    """Fetch popular Reddit communities from public JSON endpoints.

    The endpoints are tried in order; the first one that yields at least one
    community row wins. This function blocks on network I/O.

    Args:
        limit: Row limit sent to each endpoint and applied to the parsed rows.

    Returns:
        ``(communities, endpoint)`` on success, or ``([], last_error)`` when
        every endpoint failed, where ``last_error`` describes the most recent
        failure (empty string if none was recorded).
    """

    # Local import: the http.client exception base is not otherwise needed
    # by this block.
    from http.client import HTTPException

    endpoints = [
        f"https://www.reddit.com/subreddits/popular.json?limit={limit}",
        f"https://www.reddit.com/subreddits/default.json?limit={limit}",
        f"https://old.reddit.com/subreddits/popular/.json?limit={limit}",
    ]
    headers = {
        "User-Agent": "ScrapeRLBot/1.0 (+https://github.com/NeerajCodz/scrapeRL)",
        "Accept": "application/json",
    }
    last_error = ""

    for endpoint in endpoints:
        try:
            request = Request(endpoint, headers=headers)
            with urlopen(request, timeout=20) as response:
                status_code = int(getattr(response, "status", 200))
                if status_code >= 400:
                    last_error = f"{endpoint} returned status {status_code}"
                    continue
                raw_payload = response.read().decode("utf-8", errors="replace")

            parsed = json.loads(raw_payload)
            # A listing is a JSON object whose "data" (when present) is an
            # object too; anything else is treated as a failed endpoint.
            if not isinstance(parsed, dict) or not isinstance(parsed.get("data", {}), dict):
                last_error = f"{endpoint} returned an unexpected JSON shape"
                continue

            communities = _extract_reddit_communities_from_payload(parsed, limit=limit)
            if communities:
                return communities, endpoint
            last_error = f"{endpoint} returned no community rows"
        except (OSError, HTTPException, ValueError) as exc:
            # OSError covers URLError/HTTPError, timeouts and connection
            # resets. HTTPException covers protocol failures raised while
            # reading the response (e.g. IncompleteRead). ValueError covers
            # JSON decoding. Any of these moves on to the next endpoint.
            last_error = f"{endpoint}: {exc}"
            continue

    return [], last_error
1265
+
1266
+
1267
+ def _fallback_reddit_communities_static(limit: int = 25) -> list[dict[str, Any]]:
1268
+ """Fallback list used when Reddit blocks direct/API access."""
1269
+
1270
+ names = [
1271
+ "AskReddit",
1272
+ "funny",
1273
+ "gaming",
1274
+ "worldnews",
1275
+ "todayilearned",
1276
+ "science",
1277
+ "movies",
1278
+ "technology",
1279
+ "pics",
1280
+ "news",
1281
+ "aww",
1282
+ "sports",
1283
+ "Music",
1284
+ "books",
1285
+ "food",
1286
+ "dataisbeautiful",
1287
+ "MachineLearning",
1288
+ "programming",
1289
+ "python",
1290
+ "javascript",
1291
+ "learnprogramming",
1292
+ "wallstreetbets",
1293
+ "explainlikeimfive",
1294
+ "history",
1295
+ "space",
1296
+ ]
1297
+ communities: list[dict[str, Any]] = []
1298
+ for name in names[:limit]:
1299
+ communities.append(
1300
+ {
1301
+ "subreddit": f"r/{name}",
1302
+ "title": f"r/{name}",
1303
+ "subscribers": 0,
1304
+ "active_users": 0,
1305
+ "url": f"https://www.reddit.com/r/{name}/",
1306
+ "description": "Fallback popular community list (direct Reddit access blocked)",
1307
+ }
1308
+ )
1309
+ return communities
1310
+
1311
+
1312
async def _scrape_reddit_trending(
    session: dict[str, Any],
    session_id: str,
    env,
    request: ScrapeRequest,
    url: str,
    step_num: int,
    total_reward: float,
) -> AsyncGenerator[dict[str, Any], None]:
    """Stream the steps of a Reddit "trending communities" scrape.

    The Reddit home page is loaded through ``env``. Communities are then
    gathered from the first source that yields rows: Reddit JSON endpoints,
    links in the loaded HTML, then search results. If fewer than ten rows
    were found, the list is padded from the static community list. Results
    are stored on ``session`` (CSV layout when the request asks for CSV) and
    written as a JSON artifact.

    Yields:
        The event returned by ``_record_step`` for each navigate, extract,
        verify and complete step.
    """

    reddit_home = "https://www.reddit.com/"
    row_limit = 25
    good_coverage = 10

    def _emit(action: str, step_url: str, status: str, message: str, **details: Any):
        """Record the next numbered step on the session and return its event."""
        nonlocal step_num
        step_num += 1
        return _record_step(
            session,
            ScrapeStep(
                step_number=step_num,
                action=action,
                url=step_url,
                status=status,
                message=message,
                timestamp=_now_iso(),
                **details,
            ),
        )

    # --- Navigation -------------------------------------------------------
    yield _emit("navigate", reddit_home, "running", "Navigating to Reddit...")

    nav_obs, nav_reward, _, _, _, nav_info = await env.step(
        Action(
            action_type=ActionType.NAVIGATE,
            parameters={"url": reddit_home},
            reasoning="Navigate to Reddit and collect trending communities",
        )
    )
    total_reward += nav_reward

    nav_loaded = bool(nav_obs.page_html)
    if nav_loaded:
        nav_status, nav_message = "completed", f"Navigated to {reddit_home}"
    else:
        nav_status, nav_message = "failed", "Navigation failed"
    yield _emit(
        "navigate",
        reddit_home,
        nav_status,
        nav_message,
        reward=nav_reward,
        duration_ms=nav_info.get("step_duration_ms", 0),
    )
    if not nav_loaded:
        session["errors"].append("Failed to load Reddit landing page")
        return

    # --- Extraction with layered fallbacks ---------------------------------
    landing_html = nav_obs.page_html or ""
    challenge_detected = _is_reddit_challenge_page(landing_html)
    if challenge_detected:
        progress_note = "Reddit challenge detected, switching to Reddit JSON endpoints..."
    else:
        progress_note = "Extracting trending communities..."
    yield _emit("extract", url, "running", progress_note, reward=0.1)

    # The JSON fetch uses blocking urllib calls, so it runs in a worker thread.
    communities, source_used = await asyncio.to_thread(_fetch_reddit_communities, row_limit)
    if not communities:
        from_html = _extract_reddit_communities_from_html(landing_html, row_limit)
        if from_html:
            communities, source_used = from_html, "reddit_html_fallback"
    if not communities:
        from_search = await _discover_reddit_communities_via_search(limit=row_limit)
        if from_search:
            communities, source_used = from_search, "duckduckgo_search_fallback"
    if len(communities) < good_coverage:
        # Pad sparse results with the static list, skipping names already present.
        static_rows = _fallback_reddit_communities_static(limit=row_limit)
        known_names = {row.get("subreddit", "").lower() for row in communities}
        padded = False
        for static_row in static_rows:
            static_key = str(static_row.get("subreddit", "")).lower()
            if static_key in known_names:
                continue
            communities.append(static_row)
            known_names.add(static_key)
            padded = True
            if len(communities) >= row_limit:
                break
        if communities and padded:
            if source_used == "duckduckgo_search_fallback":
                source_used = "search_plus_static_fallback"
            else:
                source_used = "static_popular_fallback"

    presence_bonus = 1.0 if communities else 0.0
    extraction_reward = min(6.0, len(communities) * 0.25 + presence_bonus)
    total_reward += extraction_reward

    if communities:
        extract_status = "completed"
        extract_message = f"Extracted {len(communities)} trending communities from {source_used}"
    else:
        extract_status = "failed"
        extract_message = "Failed to extract trending communities from Reddit"
    yield _emit(
        "extract",
        url,
        extract_status,
        extract_message,
        reward=extraction_reward,
        extracted_data={
            "count": len(communities),
            "source": source_used,
            "challenge_detected": challenge_detected,
            "preview": communities[:3],
        },
    )

    if not communities:
        # When nothing was found, source_used holds the last fetch error (if any).
        failure_detail = (
            f"Reddit extraction failed: {source_used}"
            if source_used
            else "Reddit extraction failed: no community data found"
        )
        session["errors"].append(failure_detail)
        session["total_reward"] += total_reward
        yield _emit(
            "complete",
            url,
            "failed",
            "Completed Reddit scrape with no community rows",
            reward=0.0,
            extracted_data={"total_reward": total_reward, "row_count": 0},
        )
        return

    # --- Verification -------------------------------------------------------
    covered = len(communities) >= good_coverage
    verification_score = 1.0 if covered else 0.5
    total_reward += verification_score
    yield _emit(
        "verify",
        url,
        "completed",
        f"Verifier checked community coverage ({len(communities)} rows)",
        reward=verification_score,
        extracted_data={
            "row_count": len(communities),
            "coverage": "good" if covered else "partial",
        },
    )

    # --- Persist results on the session --------------------------------------
    summary = {
        "row_count": len(communities),
        "source": source_used,
        "challenge_detected": challenge_detected,
    }
    if request.output_format == OutputFormat.CSV:
        columns = ["subreddit", "title", "subscribers", "active_users", "url", "description"]
        csv_output = _rows_to_csv(communities, preferred_headers=columns)
        # CSV mode replaces the whole extracted_data mapping with one table.
        session["extracted_data"] = {
            "rows": communities,
            "columns": columns,
            "csv_output": csv_output,
            **summary,
        }
        session["final_output"] = csv_output
    else:
        session["extracted_data"][url] = {"trending_communities": communities, **summary}

    _write_session_json_artifact(
        session,
        "reddit_trending_communities.json",
        {
            "source": source_used,
            "challenge_detected": challenge_detected,
            "row_count": len(communities),
            "rows": communities,
        },
    )

    # --- Completion ----------------------------------------------------------
    _, done_reward, _, _, _, _ = await env.step(
        Action(
            action_type=ActionType.DONE,
            parameters={"success": True},
            reasoning="Reddit community extraction complete",
        )
    )
    total_reward += done_reward
    session["total_reward"] += total_reward

    yield _emit(
        "complete",
        url,
        "completed",
        f"Completed Reddit trending scrape with {len(communities)} communities",
        reward=done_reward,
        extracted_data={"total_reward": total_reward, "row_count": len(communities)},
    )
1541
+
1542
+
1543
  async def _scrape_single_page(
1544
  session: dict[str, Any],
1545
  session_id: str,
 
1642
  step_num += 1
1643
  extracted_count = len([f for f in fields_to_extract if f in extracted])
1644
  verification_score = extracted_count / len(fields_to_extract) if fields_to_extract else 0.0
1645
+ total_reward += verification_score
1646
 
1647
  yield _record_step(
1648
  session,
 
1665
  parameters={"success": True},
1666
  reasoning="Extraction complete",
1667
  )
1668
+ _, done_reward, _, _, _, _ = await env.step(done_action)
1669
+ total_reward += done_reward
1670
 
1671
  yield _record_step(
1672
  session,
 
1676
  url=url,
1677
  status="completed",
1678
  message=f"Completed scraping {url}",
1679
+ reward=done_reward,
1680
+ extracted_data={**extracted, "total_reward": total_reward},
1681
  timestamp=_now_iso(),
1682
  ),
1683
  )
 
1753
  "enabled": enabled_plugins,
1754
  "missing": missing_plugins,
1755
  "navigation_strategy": navigation_plan["strategy"],
1756
+ "extraction_goal": navigation_plan["extraction_goal"],
1757
+ "site_template_id": navigation_plan.get("site_template_id"),
1758
+ "site_template_name": navigation_plan.get("site_template_name"),
1759
+ "site_template_domains": navigation_plan.get("site_template_domains", []),
1760
  },
1761
  timestamp=_now_iso(),
1762
  ),
 
1785
  await manager.broadcast(discovery_event, session_id)
1786
  yield _sse_event(discovery_event)
1787
 
1788
+ planner_site_template = match_site_template(request.instructions, resolved_assets)
1789
+ planner_template_payload = (
1790
+ serialize_site_template(planner_site_template) if planner_site_template else None
1791
+ )
1792
+
1793
  if request.enable_memory:
1794
  try:
1795
  await memory_manager.store(
 
1835
  "assets": resolved_assets,
1836
  "instructions": request.instructions,
1837
  "output_instructions": request.output_instructions,
1838
+ "site_template": planner_template_payload,
1839
  },
1840
  timestamp=_now_iso(),
1841
  ),
 
1850
  "output_instructions": request.output_instructions,
1851
  "resolved_assets": resolved_assets,
1852
  "selected_agents": request.selected_agents,
1853
+ "site_template": planner_template_payload,
1854
  }
1855
  planner_code = (
1856
  "result = {"
1857
  "'phase': payload.get('phase'), "
1858
  "'asset_count': len(payload.get('resolved_assets') or []), "
1859
+ "'selected_agents': payload.get('selected_agents') or [], "
1860
+ "'site_template_id': (payload.get('site_template') or {}).get('site_id'), "
1861
+ "'site_strategy': (payload.get('site_template') or {}).get('default_strategy')"
1862
  "}"
1863
  )
1864
  try:
 
1896
 
1897
  for idx, url in enumerate(resolved_assets):
1898
  session["current_url_index"] = idx
1899
+ url_navigation_plan = _create_intelligent_navigation_plan(request.instructions, [url])
1900
+ url_site_template = match_site_template(request.instructions, [url])
1901
+ url_template_payload = serialize_site_template(url_site_template) if url_site_template else None
1902
+
1903
+ if url_template_payload:
1904
+ site_template_event = _record_step(
1905
+ session,
1906
+ ScrapeStep(
1907
+ step_number=len(session["steps"]) + 1,
1908
+ action="site_template",
1909
+ url=url,
1910
+ status="completed",
1911
+ message=f"Navigator loaded site template: {url_template_payload['name']}",
1912
+ reward=0.05,
1913
+ extracted_data={
1914
+ "site_id": url_template_payload["site_id"],
1915
+ "strategy": url_navigation_plan["strategy"],
1916
+ "domains": url_template_payload["domains"],
1917
+ },
1918
+ timestamp=_now_iso(),
1919
+ ),
1920
+ )
1921
+ await manager.broadcast(site_template_event, session_id)
1922
+ yield _sse_event(site_template_event)
1923
+
1924
  navigator_event = _record_step(
1925
  session,
1926
  ScrapeStep(
 
1928
  action="navigator",
1929
  url=url,
1930
  status="running",
1931
+ message=(
1932
+ f"Navigator selected source {idx + 1}/{len(resolved_assets)} "
1933
+ f"({url_navigation_plan['strategy']})"
1934
+ ),
1935
  reward=0.05, # Small reward for navigator selection
1936
+ extracted_data={
1937
+ "site_template_id": url_navigation_plan.get("site_template_id"),
1938
+ "site_template_name": url_navigation_plan.get("site_template_name"),
1939
+ },
1940
  timestamp=_now_iso(),
1941
  ),
1942
  )
 
1949
  "url": url,
1950
  "index": idx,
1951
  "total": len(resolved_assets),
1952
+ "site_template": url_template_payload,
1953
+ "navigation_strategy": url_navigation_plan["strategy"],
1954
  }
1955
  navigator_code = (
1956
  "result = {"
1957
  "'phase': payload.get('phase'), "
1958
  "'selected_url': payload.get('url'), "
1959
+ "'progress': f\"{payload.get('index', 0) + 1}/{payload.get('total', 0)}\", "
1960
+ "'site_template_id': (payload.get('site_template') or {}).get('site_id'), "
1961
+ "'strategy': payload.get('navigation_strategy')"
1962
  "}"
1963
  )
1964
  try:
 
2007
  request,
2008
  memory_manager,
2009
  enabled_plugins,
2010
+ url_navigation_plan,
2011
  ):
2012
  await manager.broadcast(update, session_id)
2013
  yield _sse_event(update)
 
2059
  else:
2060
  session["errors"].append("No monthly gold rows were extracted from resolved sources.")
2061
 
2062
+ if (
2063
+ any(plugin_id in enabled_plugins for plugin_id in python_plugin_ids)
2064
+ and _should_run_python_sandbox(request, session["extracted_data"])
2065
+ ):
2066
  extracted_payload = session["extracted_data"]
2067
  dataset_rows: list[dict[str, Any]] = []
2068
  source_links: list[str] = []
backend/app/api/routes/sites.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Site template API routes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from fastapi import APIRouter, HTTPException, status
8
+ from pydantic import BaseModel, Field
9
+
10
+ from app.sites import (
11
+ get_site_template,
12
+ list_site_templates,
13
+ match_site_template,
14
+ serialize_site_template,
15
+ )
16
+
17
+ router = APIRouter(prefix="/sites", tags=["sites"])
18
+
19
+
20
class SiteMatchRequest(BaseModel):
    """Request body accepted by the template-matching endpoint.

    Both fields are optional: a missing ``instructions`` becomes an empty
    string and a missing ``assets`` becomes an empty list.
    """

    # Free-text task description forwarded to the template matcher.
    instructions: str = Field("", description="Task instructions")
    # Task assets/URLs forwarded to the template matcher.
    assets: list[str] = Field(description="Task assets/URLs", default_factory=list)
25
+
26
+
27
@router.get(
    "",
    status_code=status.HTTP_200_OK,
    summary="List inbuilt site templates",
    description="Return all site templates available for agent planning",
)
async def list_sites() -> dict[str, Any]:
    """Return every registered site template along with the total count."""

    catalog = list_site_templates()
    response: dict[str, Any] = {"count": len(catalog)}
    response["sites"] = catalog
    return response
38
+
39
+
40
@router.get(
    "/{site_id}",
    status_code=status.HTTP_200_OK,
    summary="Get one site template",
    description="Return one template by site_id",
)
async def get_site(site_id: str) -> dict[str, Any]:
    """Return the serialized template registered under ``site_id``.

    Args:
        site_id: Identifier of the template to look up.

    Returns:
        The template as produced by ``serialize_site_template``.

    Raises:
        HTTPException: With status 404 when no template has that id.
    """

    template = get_site_template(site_id)
    if template is None:
        # Use the named constant, matching the status_code style of the
        # route decorators in this module.
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Site template '{site_id}' not found",
        )

    return serialize_site_template(template)
54
+
55
+
56
@router.post(
    "/match",
    status_code=status.HTTP_200_OK,
    summary="Match a template for task input",
    description="Find the best matching site template from instructions/assets",
)
async def match_site(payload: SiteMatchRequest) -> dict[str, Any]:
    """Look up the template that fits the submitted instructions and assets.

    The response always carries a ``matched`` flag; ``site`` holds the
    serialized template on success and ``None`` otherwise.
    """

    found = match_site_template(payload.instructions, payload.assets)
    if found:
        return {"matched": True, "site": serialize_site_template(found)}
    return {"matched": False, "site": None}
backend/app/main.py CHANGED
@@ -11,7 +11,7 @@ from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.responses import FileResponse, HTMLResponse
12
  from fastapi.staticfiles import StaticFiles
13
 
14
- from app.api.routes import agents, episode, health, memory, plugins, tasks, tools, scrape
15
  from app.api.routes import settings as settings_routes
16
  from app.config import get_settings
17
  from app.memory.manager import MemoryManager
@@ -133,6 +133,7 @@ def create_app() -> FastAPI:
133
  app.include_router(memory.router, prefix=api_prefix, tags=["Memory"])
134
  app.include_router(settings_routes.router, prefix=api_prefix, tags=["Settings"])
135
  app.include_router(plugins.router, prefix=api_prefix, tags=["Plugins"])
 
136
  app.include_router(scrape.router, prefix=api_prefix, tags=["Scraping"])
137
 
138
  # Import and include providers router
 
11
  from fastapi.responses import FileResponse, HTMLResponse
12
  from fastapi.staticfiles import StaticFiles
13
 
14
+ from app.api.routes import agents, episode, health, memory, plugins, scrape, sites, tasks, tools
15
  from app.api.routes import settings as settings_routes
16
  from app.config import get_settings
17
  from app.memory.manager import MemoryManager
 
133
  app.include_router(memory.router, prefix=api_prefix, tags=["Memory"])
134
  app.include_router(settings_routes.router, prefix=api_prefix, tags=["Settings"])
135
  app.include_router(plugins.router, prefix=api_prefix, tags=["Plugins"])
136
+ app.include_router(sites.router, prefix=api_prefix, tags=["Sites"])
137
  app.include_router(scrape.router, prefix=api_prefix, tags=["Scraping"])
138
 
139
  # Import and include providers router
backend/app/sites/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Site template registry for domain-aware scraping behavior."""
2
+
3
+ from app.sites.models import SiteTemplate
4
+ from app.sites.registry import (
5
+ get_site_template,
6
+ list_site_templates,
7
+ match_site_template,
8
+ serialize_site_template,
9
+ )
10
+
11
+ __all__ = [
12
+ "SiteTemplate",
13
+ "get_site_template",
14
+ "list_site_templates",
15
+ "match_site_template",
16
+ "serialize_site_template",
17
+ ]
backend/app/sites/models.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data models for built-in site templates."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+
7
+
8
@dataclass(frozen=True)
class SiteTemplate:
    """Immutable record describing one built-in site template.

    Only ``site_id``, ``name`` and ``domains`` are required; every other
    field has a default. Instances are frozen, so attributes cannot be
    reassigned after construction.
    """

    # --- identity -----------------------------------------------------
    # Key under which the template is registered and looked up.
    site_id: str
    # Human-readable site name.
    name: str
    # Host names that belong to this site.
    domains: tuple[str, ...]
    # Extra keywords that may refer to this site in task instructions.
    aliases: tuple[str, ...] = field(default_factory=tuple)

    # --- scraping guidance ---------------------------------------------
    # Strategy label used when the template does not name a specific one.
    default_strategy: str = "intelligent_exploration"
    # Label for the kind of data to extract.
    extraction_goal: str = "structured_extraction"
    # Ordered, human-readable navigation hints.
    navigation_steps: tuple[str, ...] = field(default_factory=tuple)
    # Names of the fields expected in each output row.
    output_fields: tuple[str, ...] = field(default_factory=tuple)
    # Suggested entry-point URLs.
    target_urls: tuple[str, ...] = field(default_factory=tuple)
    # Short free-text summary of what the template covers.
    description: str = ""
backend/app/sites/registry.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Template registry and matching helpers for known sites."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+ from urllib.parse import urlparse
7
+
8
+ from app.sites.models import SiteTemplate
9
+ from app.sites.templates import SITE_TEMPLATES
10
+
11
# Index of the built-in templates keyed by site_id, built once at import time
# so lookups by id do not have to scan SITE_TEMPLATES.
_SITE_BY_ID: dict[str, SiteTemplate] = dict(
    (entry.site_id, entry) for entry in SITE_TEMPLATES
)
12
+
13
+
14
def serialize_site_template(template: SiteTemplate) -> dict[str, Any]:
    """Convert a template into a plain dict for API responses and events.

    Tuple-valued fields are copied into new lists; scalar fields are passed
    through unchanged. Keys are emitted in a fixed order.
    """

    payload: dict[str, Any] = {}
    payload["site_id"] = template.site_id
    payload["name"] = template.name
    payload["domains"] = [*template.domains]
    payload["aliases"] = [*template.aliases]
    payload["default_strategy"] = template.default_strategy
    payload["extraction_goal"] = template.extraction_goal
    payload["navigation_steps"] = [*template.navigation_steps]
    payload["output_fields"] = [*template.output_fields]
    payload["target_urls"] = [*template.target_urls]
    payload["description"] = template.description
    return payload
29
+
30
+
31
def list_site_templates() -> list[dict[str, Any]]:
    """Serialize every built-in template, preserving registry order."""

    return list(map(serialize_site_template, SITE_TEMPLATES))
35
+
36
+
37
def get_site_template(site_id: str) -> SiteTemplate | None:
    """Look up a template by its site_id; return None when it is unknown."""

    try:
        return _SITE_BY_ID[site_id]
    except KeyError:
        return None
41
+
42
+
43
+ def _normalize_domain(value: str) -> str:
44
+ """Normalize a domain string."""
45
+
46
+ lowered = value.lower().strip()
47
+ if lowered.startswith("www."):
48
+ return lowered[4:]
49
+ return lowered
50
+
51
+
52
+ def _extract_domains_from_assets(assets: list[str]) -> list[str]:
53
+ """Extract normalized domains from URL assets."""
54
+
55
+ domains: list[str] = []
56
+ for asset in assets:
57
+ parsed = urlparse(asset.strip())
58
+ if parsed.scheme not in {"http", "https"} or not parsed.netloc:
59
+ continue
60
+ domain = _normalize_domain(parsed.netloc)
61
+ if domain not in domains:
62
+ domains.append(domain)
63
+ return domains
64
+
65
+
66
def match_site_template(instructions: str, assets: list[str]) -> SiteTemplate | None:
    """Pick the built-in template that best fits a task.

    Matching runs in three tiers and returns on the first hit; within a tier
    templates are tried in ``SITE_TEMPLATES`` order:

    1. Domain: the host of an http(s) asset equals, or is a subdomain of,
       one of the template's domains.
    2. Identity: the template's ``site_id`` or ``name`` is mentioned in the
       instructions.
    3. Alias: one of the template's ``aliases`` is mentioned in the
       instructions.

    Mentions are case-insensitive whole-word/phrase matches, optionally
    followed by a plural "s"/"es". Plain substring matching is deliberately
    avoided: it made short aliases such as "x" or "hn" fire on words like
    "extract" or "technology". Checking identities before aliases keeps a
    generic alias of an earlier template (e.g. "products") from shadowing a
    site that is named explicitly later in the registry.

    Args:
        instructions: Free-text task description.
        assets: Task assets; only http/https URLs contribute domains.

    Returns:
        The matching template, or ``None`` when nothing matches.
    """

    import re  # local import keeps this block self-contained

    # Tier 1: domain match (asset order first, then registry order).
    for domain in _extract_domains_from_assets(assets):
        for template in SITE_TEMPLATES:
            for candidate in template.domains:
                known = _normalize_domain(candidate)
                if domain == known or domain.endswith(f".{known}"):
                    return template

    text = instructions.lower()
    if not text.strip():
        return None

    def mentioned(tokens) -> bool:
        """Return True when any non-blank token occurs in text as a whole word."""
        escaped = [re.escape(token) for token in tokens if token.strip()]
        if not escaped:
            # An empty alternation would match everywhere.
            return False
        pattern = r"(?<!\w)(?:" + "|".join(escaped) + r")(?:s|es)?(?!\w)"
        return re.search(pattern, text) is not None

    # Tier 2: the site is named explicitly.
    for template in SITE_TEMPLATES:
        if mentioned((template.site_id.lower(), template.name.lower())):
            return template

    # Tier 3: generic alias keywords.
    for template in SITE_TEMPLATES:
        if mentioned(alias.lower() for alias in template.aliases):
            return template

    return None
backend/app/sites/templates.py ADDED
@@ -0,0 +1,651 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Built-in site templates (30+ domains) for agent guidance."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from app.sites.models import SiteTemplate
6
+
7
# Shared navigation recipes. Templates reference one of these tuples as their
# ``navigation_steps`` instead of repeating the same three hints.

# Used by feed/trending style templates.
SOCIAL_STEPS: tuple[str, ...] = (
    "Navigate to discover/trending sections",
    "Collect entity cards and ranking metadata",
    "Normalize output into a structured list",
)

# Used by headline/article style templates.
NEWS_STEPS: tuple[str, ...] = (
    "Navigate to front page or section hubs",
    "Extract headline cards with links and timestamps",
    "Optionally follow article links for summaries",
)

# Used by index/catalog style templates.
DOC_STEPS: tuple[str, ...] = (
    "Navigate to docs or index pages",
    "Extract headings, navigation links, and metadata",
    "Return concise structured documentation map",
)
22
+
23
+
24
+ SITE_TEMPLATES: tuple[SiteTemplate, ...] = (
25
+ SiteTemplate(
26
+ site_id="github",
27
+ name="GitHub",
28
+ domains=("github.com",),
29
+ aliases=("github", "repo", "repositories"),
30
+ default_strategy="github_repository_extraction",
31
+ extraction_goal="repositories",
32
+ navigation_steps=(
33
+ "Navigate to Explore/Trending/Search pages",
34
+ "Extract repository metadata and links",
35
+ "Format repository rows for csv/json output",
36
+ ),
37
+ output_fields=("username", "repo_name", "stars", "forks", "url"),
38
+ target_urls=("https://github.com/explore", "https://github.com/trending"),
39
+ description="Code repositories, projects, and trend pages",
40
+ ),
41
+ SiteTemplate(
42
+ site_id="reddit",
43
+ name="Reddit",
44
+ domains=("reddit.com", "old.reddit.com"),
45
+ aliases=("reddit", "subreddit", "communities"),
46
+ default_strategy="reddit_community_extraction",
47
+ extraction_goal="communities",
48
+ navigation_steps=(
49
+ "Navigate to community discovery/popular endpoints",
50
+ "Handle anti-bot challenge fallback if required",
51
+ "Return normalized subreddit rows",
52
+ ),
53
+ output_fields=("subreddit", "title", "subscribers", "active_users", "url"),
54
+ target_urls=("https://www.reddit.com/", "https://www.reddit.com/subreddits/popular"),
55
+ description="Communities, posts, and subreddit metadata",
56
+ ),
57
+ SiteTemplate(
58
+ site_id="x",
59
+ name="X (Twitter)",
60
+ domains=("x.com", "twitter.com"),
61
+ aliases=("x", "twitter", "tweets"),
62
+ extraction_goal="posts",
63
+ navigation_steps=SOCIAL_STEPS,
64
+ output_fields=("author", "post_text", "likes", "replies", "url"),
65
+ target_urls=("https://x.com/explore",),
66
+ description="Short-form social posts and trends",
67
+ ),
68
+ SiteTemplate(
69
+ site_id="youtube",
70
+ name="YouTube",
71
+ domains=("youtube.com", "youtu.be"),
72
+ aliases=("youtube", "videos", "channels"),
73
+ extraction_goal="videos",
74
+ navigation_steps=SOCIAL_STEPS,
75
+ output_fields=("title", "channel", "views", "published", "url"),
76
+ target_urls=("https://www.youtube.com/feed/trending",),
77
+ description="Video listings, channels, and trend feeds",
78
+ ),
79
+ SiteTemplate(
80
+ site_id="instagram",
81
+ name="Instagram",
82
+ domains=("instagram.com",),
83
+ aliases=("instagram", "reels", "posts"),
84
+ extraction_goal="social_posts",
85
+ navigation_steps=SOCIAL_STEPS,
86
+ output_fields=("author", "caption", "likes", "comments", "url"),
87
+ target_urls=("https://www.instagram.com/explore/",),
88
+ description="Photo/video social feed extraction",
89
+ ),
90
+ SiteTemplate(
91
+ site_id="facebook",
92
+ name="Facebook",
93
+ domains=("facebook.com", "fb.com"),
94
+ aliases=("facebook", "pages", "groups"),
95
+ extraction_goal="social_posts",
96
+ navigation_steps=SOCIAL_STEPS,
97
+ output_fields=("page", "post_text", "reactions", "comments", "url"),
98
+ target_urls=("https://www.facebook.com/watch/",),
99
+ description="Pages, groups, and social content",
100
+ ),
101
+ SiteTemplate(
102
+ site_id="linkedin",
103
+ name="LinkedIn",
104
+ domains=("linkedin.com",),
105
+ aliases=("linkedin", "jobs", "companies"),
106
+ extraction_goal="professional_content",
107
+ navigation_steps=SOCIAL_STEPS,
108
+ output_fields=("title", "company", "location", "engagement", "url"),
109
+ target_urls=("https://www.linkedin.com/feed/",),
110
+ description="Professional posts, companies, and jobs",
111
+ ),
112
+ SiteTemplate(
113
+ site_id="tiktok",
114
+ name="TikTok",
115
+ domains=("tiktok.com",),
116
+ aliases=("tiktok", "shorts", "videos"),
117
+ extraction_goal="videos",
118
+ navigation_steps=SOCIAL_STEPS,
119
+ output_fields=("creator", "caption", "likes", "comments", "url"),
120
+ target_urls=("https://www.tiktok.com/trending",),
121
+ description="Short video trend discovery",
122
+ ),
123
+ SiteTemplate(
124
+ site_id="medium",
125
+ name="Medium",
126
+ domains=("medium.com",),
127
+ aliases=("medium", "blogs", "articles"),
128
+ extraction_goal="articles",
129
+ navigation_steps=NEWS_STEPS,
130
+ output_fields=("title", "author", "claps", "reading_time", "url"),
131
+ target_urls=("https://medium.com/tag/technology",),
132
+ description="Article/blog extraction",
133
+ ),
134
+ SiteTemplate(
135
+ site_id="devto",
136
+ name="DEV Community",
137
+ domains=("dev.to",),
138
+ aliases=("devto", "dev.to", "developer posts"),
139
+ extraction_goal="articles",
140
+ navigation_steps=NEWS_STEPS,
141
+ output_fields=("title", "author", "reactions", "comments", "url"),
142
+ target_urls=("https://dev.to/top/week",),
143
+ description="Developer articles and posts",
144
+ ),
145
+ SiteTemplate(
146
+ site_id="stackoverflow",
147
+ name="Stack Overflow",
148
+ domains=("stackoverflow.com",),
149
+ aliases=("stackoverflow", "questions", "answers"),
150
+ extraction_goal="questions",
151
+ navigation_steps=DOC_STEPS,
152
+ output_fields=("title", "votes", "answers", "tags", "url"),
153
+ target_urls=("https://stackoverflow.com/questions",),
154
+ description="Q&A extraction",
155
+ ),
156
+ SiteTemplate(
157
+ site_id="kaggle",
158
+ name="Kaggle",
159
+ domains=("kaggle.com",),
160
+ aliases=("kaggle", "datasets", "competitions"),
161
+ extraction_goal="datasets",
162
+ navigation_steps=DOC_STEPS,
163
+ output_fields=("dataset_name", "author", "votes", "updated", "url"),
164
+ target_urls=("https://www.kaggle.com/datasets",),
165
+ description="Dataset and competition listings",
166
+ ),
167
+ SiteTemplate(
168
+ site_id="huggingface",
169
+ name="Hugging Face",
170
+ domains=("huggingface.co",),
171
+ aliases=("huggingface", "models", "spaces"),
172
+ extraction_goal="models",
173
+ navigation_steps=DOC_STEPS,
174
+ output_fields=("model_id", "downloads", "likes", "task", "url"),
175
+ target_urls=("https://huggingface.co/models",),
176
+ description="Model and dataset hubs",
177
+ ),
178
+ SiteTemplate(
179
+ site_id="arxiv",
180
+ name="arXiv",
181
+ domains=("arxiv.org",),
182
+ aliases=("arxiv", "papers", "preprints"),
183
+ extraction_goal="papers",
184
+ navigation_steps=DOC_STEPS,
185
+ output_fields=("title", "authors", "category", "published", "url"),
186
+ target_urls=("https://arxiv.org/list/cs/new",),
187
+ description="Research paper listings",
188
+ ),
189
+ SiteTemplate(
190
+ site_id="wikipedia",
191
+ name="Wikipedia",
192
+ domains=("wikipedia.org",),
193
+ aliases=("wikipedia", "wiki", "encyclopedia"),
194
+ extraction_goal="reference_content",
195
+ navigation_steps=DOC_STEPS,
196
+ output_fields=("title", "summary", "sections", "references", "url"),
197
+ target_urls=("https://en.wikipedia.org/wiki/Main_Page",),
198
+ description="Reference and encyclopedia pages",
199
+ ),
200
+ SiteTemplate(
201
+ site_id="pypi",
202
+ name="PyPI",
203
+ domains=("pypi.org",),
204
+ aliases=("pypi", "python packages"),
205
+ extraction_goal="packages",
206
+ navigation_steps=DOC_STEPS,
207
+ output_fields=("package", "version", "downloads", "license", "url"),
208
+ target_urls=("https://pypi.org/search/",),
209
+ description="Python package metadata",
210
+ ),
211
+ SiteTemplate(
212
+ site_id="npm",
213
+ name="npm",
214
+ domains=("npmjs.com",),
215
+ aliases=("npm", "node packages"),
216
+ extraction_goal="packages",
217
+ navigation_steps=DOC_STEPS,
218
+ output_fields=("package", "version", "weekly_downloads", "maintainers", "url"),
219
+ target_urls=("https://www.npmjs.com/search",),
220
+ description="Node package metadata",
221
+ ),
222
+ SiteTemplate(
223
+ site_id="producthunt",
224
+ name="Product Hunt",
225
+ domains=("producthunt.com",),
226
+ aliases=("product hunt", "launches", "products"),
227
+ extraction_goal="products",
228
+ navigation_steps=SOCIAL_STEPS,
229
+ output_fields=("product", "tagline", "votes", "category", "url"),
230
+ target_urls=("https://www.producthunt.com/",),
231
+ description="New product launch listings",
232
+ ),
233
+ SiteTemplate(
234
+ site_id="hackernews",
235
+ name="Hacker News",
236
+ domains=("news.ycombinator.com",),
237
+ aliases=("hackernews", "hn", "top stories"),
238
+ extraction_goal="stories",
239
+ navigation_steps=NEWS_STEPS,
240
+ output_fields=("title", "points", "comments", "author", "url"),
241
+ target_urls=("https://news.ycombinator.com/",),
242
+ description="Tech news headlines",
243
+ ),
244
+ SiteTemplate(
245
+ site_id="substack",
246
+ name="Substack",
247
+ domains=("substack.com",),
248
+ aliases=("substack", "newsletters"),
249
+ extraction_goal="newsletter_posts",
250
+ navigation_steps=NEWS_STEPS,
251
+ output_fields=("title", "author", "publication", "published", "url"),
252
+ target_urls=("https://substack.com/discover",),
253
+ description="Newsletter and long-form posts",
254
+ ),
255
+ SiteTemplate(
256
+ site_id="quora",
257
+ name="Quora",
258
+ domains=("quora.com",),
259
+ aliases=("quora", "questions"),
260
+ extraction_goal="questions",
261
+ navigation_steps=DOC_STEPS,
262
+ output_fields=("question", "answer_count", "followers", "topic", "url"),
263
+ target_urls=("https://www.quora.com/",),
264
+ description="Question and answer listings",
265
+ ),
266
+ SiteTemplate(
267
+ site_id="pinterest",
268
+ name="Pinterest",
269
+ domains=("pinterest.com",),
270
+ aliases=("pinterest", "pins", "boards"),
271
+ extraction_goal="pins",
272
+ navigation_steps=SOCIAL_STEPS,
273
+ output_fields=("title", "board", "saves", "author", "url"),
274
+ target_urls=("https://www.pinterest.com/",),
275
+ description="Pins and board discovery",
276
+ ),
277
+ SiteTemplate(
278
+ site_id="imdb",
279
+ name="IMDb",
280
+ domains=("imdb.com",),
281
+ aliases=("imdb", "movies", "tv"),
282
+ extraction_goal="titles",
283
+ navigation_steps=NEWS_STEPS,
284
+ output_fields=("title", "year", "rating", "genres", "url"),
285
+ target_urls=("https://www.imdb.com/chart/",),
286
+ description="Movie and TV listings",
287
+ ),
288
+ SiteTemplate(
289
+ site_id="nytimes",
290
+ name="New York Times",
291
+ domains=("nytimes.com",),
292
+ aliases=("new york times", "nyt"),
293
+ extraction_goal="news_articles",
294
+ navigation_steps=NEWS_STEPS,
295
+ output_fields=("headline", "section", "author", "published", "url"),
296
+ target_urls=("https://www.nytimes.com/",),
297
+ description="General news articles",
298
+ ),
299
+ SiteTemplate(
300
+ site_id="bbc",
301
+ name="BBC",
302
+ domains=("bbc.com", "bbc.co.uk"),
303
+ aliases=("bbc", "bbc news"),
304
+ extraction_goal="news_articles",
305
+ navigation_steps=NEWS_STEPS,
306
+ output_fields=("headline", "section", "published", "url"),
307
+ target_urls=("https://www.bbc.com/news",),
308
+ description="Global news coverage",
309
+ ),
310
+ SiteTemplate(
311
+ site_id="cnn",
312
+ name="CNN",
313
+ domains=("cnn.com",),
314
+ aliases=("cnn", "cnn news"),
315
+ extraction_goal="news_articles",
316
+ navigation_steps=NEWS_STEPS,
317
+ output_fields=("headline", "section", "published", "url"),
318
+ target_urls=("https://www.cnn.com/",),
319
+ description="General news feed",
320
+ ),
321
+ SiteTemplate(
322
+ site_id="reuters",
323
+ name="Reuters",
324
+ domains=("reuters.com",),
325
+ aliases=("reuters",),
326
+ extraction_goal="news_articles",
327
+ navigation_steps=NEWS_STEPS,
328
+ output_fields=("headline", "category", "published", "url"),
329
+ target_urls=("https://www.reuters.com/world/",),
330
+ description="Wire-service news feed",
331
+ ),
332
+ SiteTemplate(
333
+ site_id="bloomberg",
334
+ name="Bloomberg",
335
+ domains=("bloomberg.com",),
336
+ aliases=("bloomberg", "markets"),
337
+ extraction_goal="market_news",
338
+ navigation_steps=NEWS_STEPS,
339
+ output_fields=("headline", "section", "published", "url"),
340
+ target_urls=("https://www.bloomberg.com/markets",),
341
+ description="Finance and market news",
342
+ ),
343
+ SiteTemplate(
344
+ site_id="coinmarketcap",
345
+ name="CoinMarketCap",
346
+ domains=("coinmarketcap.com",),
347
+ aliases=("coinmarketcap", "crypto prices"),
348
+ extraction_goal="crypto_assets",
349
+ navigation_steps=DOC_STEPS,
350
+ output_fields=("asset", "price", "market_cap", "volume_24h", "url"),
351
+ target_urls=("https://coinmarketcap.com/",),
352
+ description="Cryptocurrency market data",
353
+ ),
354
+ SiteTemplate(
355
+ site_id="coindesk",
356
+ name="CoinDesk",
357
+ domains=("coindesk.com",),
358
+ aliases=("coindesk", "crypto news"),
359
+ extraction_goal="crypto_news",
360
+ navigation_steps=NEWS_STEPS,
361
+ output_fields=("headline", "author", "published", "url"),
362
+ target_urls=("https://www.coindesk.com/",),
363
+ description="Cryptocurrency news",
364
+ ),
365
+ SiteTemplate(
366
+ site_id="investopedia",
367
+ name="Investopedia",
368
+ domains=("investopedia.com",),
369
+ aliases=("investopedia", "finance education"),
370
+ extraction_goal="financial_articles",
371
+ navigation_steps=DOC_STEPS,
372
+ output_fields=("title", "author", "updated", "topic", "url"),
373
+ target_urls=("https://www.investopedia.com/",),
374
+ description="Finance learning articles",
375
+ ),
376
+ SiteTemplate(
377
+ site_id="googlescholar",
378
+ name="Google Scholar",
379
+ domains=("scholar.google.com",),
380
+ aliases=("google scholar", "scholar"),
381
+ extraction_goal="scholarly_results",
382
+ navigation_steps=DOC_STEPS,
383
+ output_fields=("title", "authors", "year", "citations", "url"),
384
+ target_urls=("https://scholar.google.com/",),
385
+ description="Scholarly paper search results",
386
+ ),
387
+ SiteTemplate(
388
+ site_id="gitlab",
389
+ name="GitLab",
390
+ domains=("gitlab.com",),
391
+ aliases=("gitlab", "merge requests"),
392
+ extraction_goal="repositories",
393
+ navigation_steps=DOC_STEPS,
394
+ output_fields=("project", "stars", "forks", "last_activity", "url"),
395
+ target_urls=("https://gitlab.com/explore",),
396
+ description="Git repository projects and activity",
397
+ ),
398
+ SiteTemplate(
399
+ site_id="bitbucket",
400
+ name="Bitbucket",
401
+ domains=("bitbucket.org",),
402
+ aliases=("bitbucket", "repos"),
403
+ extraction_goal="repositories",
404
+ navigation_steps=DOC_STEPS,
405
+ output_fields=("project", "owner", "updated", "url"),
406
+ target_urls=("https://bitbucket.org/product",),
407
+ description="Repository and workspace metadata",
408
+ ),
409
+ SiteTemplate(
410
+ site_id="amazon",
411
+ name="Amazon",
412
+ domains=("amazon.com", "amazon.in", "amazon.co.uk"),
413
+ aliases=("amazon", "products", "shopping"),
414
+ extraction_goal="products",
415
+ navigation_steps=DOC_STEPS,
416
+ output_fields=("title", "price", "rating", "reviews", "url"),
417
+ target_urls=("https://www.amazon.com/gp/bestsellers",),
418
+ description="Ecommerce product listings",
419
+ ),
420
+ SiteTemplate(
421
+ site_id="ebay",
422
+ name="eBay",
423
+ domains=("ebay.com",),
424
+ aliases=("ebay", "auctions"),
425
+ extraction_goal="products",
426
+ navigation_steps=DOC_STEPS,
427
+ output_fields=("title", "price", "condition", "shipping", "url"),
428
+ target_urls=("https://www.ebay.com/deals",),
429
+ description="Auction and product cards",
430
+ ),
431
+ SiteTemplate(
432
+ site_id="walmart",
433
+ name="Walmart",
434
+ domains=("walmart.com",),
435
+ aliases=("walmart", "shopping"),
436
+ extraction_goal="products",
437
+ navigation_steps=DOC_STEPS,
438
+ output_fields=("title", "price", "rating", "availability", "url"),
439
+ target_urls=("https://www.walmart.com/shop/deals",),
440
+ description="Retail product listings",
441
+ ),
442
+ SiteTemplate(
443
+ site_id="etsy",
444
+ name="Etsy",
445
+ domains=("etsy.com",),
446
+ aliases=("etsy", "handmade"),
447
+ extraction_goal="products",
448
+ navigation_steps=DOC_STEPS,
449
+ output_fields=("title", "price", "shop", "rating", "url"),
450
+ target_urls=("https://www.etsy.com/c/jewelry",),
451
+ description="Marketplace products and shops",
452
+ ),
453
+ SiteTemplate(
454
+ site_id="aliexpress",
455
+ name="AliExpress",
456
+ domains=("aliexpress.com",),
457
+ aliases=("aliexpress", "marketplace"),
458
+ extraction_goal="products",
459
+ navigation_steps=DOC_STEPS,
460
+ output_fields=("title", "price", "orders", "shipping", "url"),
461
+ target_urls=("https://www.aliexpress.com/category/200003482/electronics.html",),
462
+ description="Marketplace product listings",
463
+ ),
464
+ SiteTemplate(
465
+ site_id="coursera",
466
+ name="Coursera",
467
+ domains=("coursera.org",),
468
+ aliases=("coursera", "courses"),
469
+ extraction_goal="courses",
470
+ navigation_steps=DOC_STEPS,
471
+ output_fields=("course", "provider", "rating", "level", "url"),
472
+ target_urls=("https://www.coursera.org/courses",),
473
+ description="Course catalog extraction",
474
+ ),
475
+ SiteTemplate(
476
+ site_id="udemy",
477
+ name="Udemy",
478
+ domains=("udemy.com",),
479
+ aliases=("udemy", "courses"),
480
+ extraction_goal="courses",
481
+ navigation_steps=DOC_STEPS,
482
+ output_fields=("course", "instructor", "rating", "price", "url"),
483
+ target_urls=("https://www.udemy.com/courses/development/",),
484
+ description="Course marketplace extraction",
485
+ ),
486
+ SiteTemplate(
487
+ site_id="edx",
488
+ name="edX",
489
+ domains=("edx.org",),
490
+ aliases=("edx", "courses"),
491
+ extraction_goal="courses",
492
+ navigation_steps=DOC_STEPS,
493
+ output_fields=("course", "institution", "duration", "level", "url"),
494
+ target_urls=("https://www.edx.org/search",),
495
+ description="Education course listings",
496
+ ),
497
+ SiteTemplate(
498
+ site_id="freecodecamp",
499
+ name="freeCodeCamp",
500
+ domains=("freecodecamp.org",),
501
+ aliases=("freecodecamp", "curriculum"),
502
+ extraction_goal="learning_resources",
503
+ navigation_steps=DOC_STEPS,
504
+ output_fields=("resource", "category", "difficulty", "url"),
505
+ target_urls=("https://www.freecodecamp.org/news/",),
506
+ description="Learning resources and tutorials",
507
+ ),
508
+ SiteTemplate(
509
+ site_id="paperswithcode",
510
+ name="Papers with Code",
511
+ domains=("paperswithcode.com",),
512
+ aliases=("paperswithcode", "benchmarks"),
513
+ extraction_goal="papers_and_models",
514
+ navigation_steps=DOC_STEPS,
515
+ output_fields=("paper", "task", "sota_metric", "code_link", "url"),
516
+ target_urls=("https://paperswithcode.com/sota",),
517
+ description="ML paper and benchmark extraction",
518
+ ),
519
+ SiteTemplate(
520
+ site_id="openreview",
521
+ name="OpenReview",
522
+ domains=("openreview.net",),
523
+ aliases=("openreview", "conference papers"),
524
+ extraction_goal="conference_papers",
525
+ navigation_steps=DOC_STEPS,
526
+ output_fields=("title", "authors", "venue", "rating", "url"),
527
+ target_urls=("https://openreview.net/group?id=ICLR.cc",),
528
+ description="Conference paper pages and metadata",
529
+ ),
530
+ SiteTemplate(
531
+ site_id="leetcode",
532
+ name="LeetCode",
533
+ domains=("leetcode.com",),
534
+ aliases=("leetcode", "problems"),
535
+ extraction_goal="coding_problems",
536
+ navigation_steps=DOC_STEPS,
537
+ output_fields=("problem", "difficulty", "acceptance", "tags", "url"),
538
+ target_urls=("https://leetcode.com/problemset/",),
539
+ description="Coding challenge listings",
540
+ ),
541
+ SiteTemplate(
542
+ site_id="geeksforgeeks",
543
+ name="GeeksforGeeks",
544
+ domains=("geeksforgeeks.org",),
545
+ aliases=("geeksforgeeks", "gfg"),
546
+ extraction_goal="tutorials",
547
+ navigation_steps=DOC_STEPS,
548
+ output_fields=("title", "topic", "difficulty", "url"),
549
+ target_urls=("https://www.geeksforgeeks.org/explore",),
550
+ description="Tutorial and practice resources",
551
+ ),
552
+ SiteTemplate(
553
+ site_id="indeed",
554
+ name="Indeed",
555
+ domains=("indeed.com",),
556
+ aliases=("indeed", "job listings"),
557
+ extraction_goal="jobs",
558
+ navigation_steps=DOC_STEPS,
559
+ output_fields=("title", "company", "location", "salary", "url"),
560
+ target_urls=("https://www.indeed.com/jobs",),
561
+ description="Job listing extraction",
562
+ ),
563
+ SiteTemplate(
564
+ site_id="glassdoor",
565
+ name="Glassdoor",
566
+ domains=("glassdoor.com",),
567
+ aliases=("glassdoor", "company reviews"),
568
+ extraction_goal="jobs_and_companies",
569
+ navigation_steps=DOC_STEPS,
570
+ output_fields=("title", "company", "rating", "location", "url"),
571
+ target_urls=("https://www.glassdoor.com/Job/index.htm",),
572
+ description="Jobs and company review listings",
573
+ ),
574
+ SiteTemplate(
575
+ site_id="twitch",
576
+ name="Twitch",
577
+ domains=("twitch.tv",),
578
+ aliases=("twitch", "streams"),
579
+ extraction_goal="live_streams",
580
+ navigation_steps=SOCIAL_STEPS,
581
+ output_fields=("streamer", "title", "viewers", "category", "url"),
582
+ target_urls=("https://www.twitch.tv/directory",),
583
+ description="Live stream directory extraction",
584
+ ),
585
+ SiteTemplate(
586
+ site_id="vimeo",
587
+ name="Vimeo",
588
+ domains=("vimeo.com",),
589
+ aliases=("vimeo", "videos"),
590
+ extraction_goal="videos",
591
+ navigation_steps=SOCIAL_STEPS,
592
+ output_fields=("title", "creator", "plays", "likes", "url"),
593
+ target_urls=("https://vimeo.com/channels",),
594
+ description="Video channel discovery",
595
+ ),
596
+ SiteTemplate(
597
+ site_id="spotify",
598
+ name="Spotify",
599
+ domains=("spotify.com", "open.spotify.com"),
600
+ aliases=("spotify", "playlists"),
601
+ extraction_goal="music_catalog",
602
+ navigation_steps=DOC_STEPS,
603
+ output_fields=("title", "type", "creator", "followers", "url"),
604
+ target_urls=("https://open.spotify.com/genre/0JQ5DAqbMKFEC4WFtoNRpw",),
605
+ description="Music and playlist metadata",
606
+ ),
607
+ SiteTemplate(
608
+ site_id="soundcloud",
609
+ name="SoundCloud",
610
+ domains=("soundcloud.com",),
611
+ aliases=("soundcloud", "tracks"),
612
+ extraction_goal="audio_tracks",
613
+ navigation_steps=SOCIAL_STEPS,
614
+ output_fields=("title", "artist", "plays", "likes", "url"),
615
+ target_urls=("https://soundcloud.com/discover",),
616
+ description="Audio track discovery",
617
+ ),
618
+ SiteTemplate(
619
+ site_id="airbnb",
620
+ name="Airbnb",
621
+ domains=("airbnb.com",),
622
+ aliases=("airbnb", "stays"),
623
+ extraction_goal="listings",
624
+ navigation_steps=DOC_STEPS,
625
+ output_fields=("title", "location", "price_per_night", "rating", "url"),
626
+ target_urls=("https://www.airbnb.com/s/homes",),
627
+ description="Accommodation listings",
628
+ ),
629
+ SiteTemplate(
630
+ site_id="booking",
631
+ name="Booking.com",
632
+ domains=("booking.com",),
633
+ aliases=("booking", "hotels"),
634
+ extraction_goal="hotel_listings",
635
+ navigation_steps=DOC_STEPS,
636
+ output_fields=("hotel", "location", "price", "rating", "url"),
637
+ target_urls=("https://www.booking.com/",),
638
+ description="Hotel search and listing extraction",
639
+ ),
640
+ SiteTemplate(
641
+ site_id="zillow",
642
+ name="Zillow",
643
+ domains=("zillow.com",),
644
+ aliases=("zillow", "real estate"),
645
+ extraction_goal="property_listings",
646
+ navigation_steps=DOC_STEPS,
647
+ output_fields=("address", "price", "beds", "baths", "url"),
648
+ target_urls=("https://www.zillow.com/homes/",),
649
+ description="Property listing extraction",
650
+ ),
651
+ )