Spaces:
Sleeping
Sleeping
Commit ·
e13f862
1
Parent(s): 6452b60
feat: add site template registry and agent integration
Browse files
- add backend/app/sites template catalog with 56 templates
- expose /api/sites list/get/match endpoints
- wire scrape planner/navigator to resolve and reference templates
- add per-url template-aware strategy selection for scraping
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
- backend/app/api/routes/__init__.py +2 -2
- backend/app/api/routes/scrape.py +643 -35
- backend/app/api/routes/sites.py +69 -0
- backend/app/main.py +2 -1
- backend/app/sites/__init__.py +17 -0
- backend/app/sites/models.py +21 -0
- backend/app/sites/registry.py +85 -0
- backend/app/sites/templates.py +651 -0
backend/app/api/routes/__init__.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""API routes package."""
|
| 2 |
|
| 3 |
-
from app.api.routes import agents, episode, health, memory, tasks, tools
|
| 4 |
|
| 5 |
-
__all__ = ["agents", "episode", "health", "memory", "tasks", "tools"]
|
|
|
|
| 1 |
"""API routes package."""
|
| 2 |
|
| 3 |
+
from app.api.routes import agents, episode, health, memory, sites, tasks, tools
|
| 4 |
|
| 5 |
+
__all__ = ["agents", "episode", "health", "memory", "sites", "tasks", "tools"]
|
backend/app/api/routes/scrape.py
CHANGED
|
@@ -16,7 +16,9 @@ from datetime import datetime, timezone
|
|
| 16 |
from enum import Enum
|
| 17 |
from pathlib import Path
|
| 18 |
from typing import Any, AsyncGenerator
|
|
|
|
| 19 |
from urllib.parse import quote_plus, urlparse
|
|
|
|
| 20 |
|
| 21 |
from bs4 import BeautifulSoup
|
| 22 |
from fastapi import APIRouter, BackgroundTasks, HTTPException
|
|
@@ -41,6 +43,7 @@ from app.plugins.python_sandbox import (
|
|
| 41 |
)
|
| 42 |
from app.search.engine import SearchEngineRouter
|
| 43 |
from app.search.providers.duckduckgo import DuckDuckGoProvider
|
|
|
|
| 44 |
|
| 45 |
logger = logging.getLogger(__name__)
|
| 46 |
router = APIRouter(prefix="/scrape", tags=["Scraping"])
|
|
@@ -153,6 +156,13 @@ def get_session(session_id: str) -> dict[str, Any] | None:
|
|
| 153 |
return _active_sessions.get(session_id)
|
| 154 |
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
def _resolve_enabled_plugins(
|
| 157 |
requested_plugins: list[str],
|
| 158 |
) -> tuple[list[str], list[str]]:
|
|
@@ -163,12 +173,18 @@ def _resolve_enabled_plugins(
|
|
| 163 |
|
| 164 |
available: set[str] = {
|
| 165 |
plugin["id"]
|
| 166 |
-
for category in PLUGIN_REGISTRY.
|
|
|
|
| 167 |
for plugin in category
|
| 168 |
if plugin.get("installed")
|
| 169 |
}
|
| 170 |
-
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
return enabled, missing
|
| 173 |
|
| 174 |
|
|
@@ -368,30 +384,60 @@ def _extract_fields_for_complexity(complexity: TaskComplexity) -> list[str]:
|
|
| 368 |
return fields
|
| 369 |
|
| 370 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
def _create_intelligent_navigation_plan(instructions: str, assets: list[str]) -> dict[str, Any]:
|
| 372 |
"""Create an intelligent navigation plan based on user instructions."""
|
| 373 |
|
| 374 |
instructions_lower = instructions.lower()
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
#
|
| 378 |
-
if
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
"
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
|
| 396 |
# News articles detection
|
| 397 |
elif any(word in instructions_lower for word in ["news", "article", "headline"]):
|
|
@@ -422,7 +468,10 @@ def _create_intelligent_navigation_plan(instructions: str, assets: list[str]) ->
|
|
| 422 |
return {
|
| 423 |
"strategy": "single_page",
|
| 424 |
"navigation_steps": ["Extract content from provided URL"],
|
| 425 |
-
"extraction_goal": "basic_extraction"
|
|
|
|
|
|
|
|
|
|
| 426 |
}
|
| 427 |
|
| 428 |
|
|
@@ -471,6 +520,45 @@ async def _search_urls_with_mcp(query: str, max_results: int = 6) -> list[str]:
|
|
| 471 |
await router.shutdown()
|
| 472 |
|
| 473 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
async def _resolve_assets(
|
| 475 |
assets: list[str],
|
| 476 |
enabled_plugins: list[str],
|
|
@@ -587,6 +675,28 @@ def _build_gold_dataset_rows(
|
|
| 587 |
return ordered
|
| 588 |
|
| 589 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 590 |
async def _store_url_memory(
|
| 591 |
session_id: str,
|
| 592 |
url: str,
|
|
@@ -776,7 +886,14 @@ async def scrape_url_intelligently(
|
|
| 776 |
session, session_id, env, request, navigation_plan, step_num, total_reward
|
| 777 |
):
|
| 778 |
yield event
|
| 779 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 780 |
# General exploration strategy
|
| 781 |
elif navigation_plan["strategy"] == "intelligent_exploration":
|
| 782 |
async for event in _scrape_with_exploration(
|
|
@@ -984,6 +1101,445 @@ async def _scrape_github_trending(
|
|
| 984 |
)
|
| 985 |
|
| 986 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 987 |
async def _scrape_single_page(
|
| 988 |
session: dict[str, Any],
|
| 989 |
session_id: str,
|
|
@@ -1086,6 +1642,7 @@ async def _scrape_single_page(
|
|
| 1086 |
step_num += 1
|
| 1087 |
extracted_count = len([f for f in fields_to_extract if f in extracted])
|
| 1088 |
verification_score = extracted_count / len(fields_to_extract) if fields_to_extract else 0.0
|
|
|
|
| 1089 |
|
| 1090 |
yield _record_step(
|
| 1091 |
session,
|
|
@@ -1108,8 +1665,8 @@ async def _scrape_single_page(
|
|
| 1108 |
parameters={"success": True},
|
| 1109 |
reasoning="Extraction complete",
|
| 1110 |
)
|
| 1111 |
-
_,
|
| 1112 |
-
total_reward +=
|
| 1113 |
|
| 1114 |
yield _record_step(
|
| 1115 |
session,
|
|
@@ -1119,8 +1676,8 @@ async def _scrape_single_page(
|
|
| 1119 |
url=url,
|
| 1120 |
status="completed",
|
| 1121 |
message=f"Completed scraping {url}",
|
| 1122 |
-
reward=
|
| 1123 |
-
extracted_data=extracted,
|
| 1124 |
timestamp=_now_iso(),
|
| 1125 |
),
|
| 1126 |
)
|
|
@@ -1196,7 +1753,10 @@ async def scrape_stream(
|
|
| 1196 |
"enabled": enabled_plugins,
|
| 1197 |
"missing": missing_plugins,
|
| 1198 |
"navigation_strategy": navigation_plan["strategy"],
|
| 1199 |
-
"extraction_goal": navigation_plan["extraction_goal"]
|
|
|
|
|
|
|
|
|
|
| 1200 |
},
|
| 1201 |
timestamp=_now_iso(),
|
| 1202 |
),
|
|
@@ -1225,6 +1785,11 @@ async def scrape_stream(
|
|
| 1225 |
await manager.broadcast(discovery_event, session_id)
|
| 1226 |
yield _sse_event(discovery_event)
|
| 1227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1228 |
if request.enable_memory:
|
| 1229 |
try:
|
| 1230 |
await memory_manager.store(
|
|
@@ -1270,6 +1835,7 @@ async def scrape_stream(
|
|
| 1270 |
"assets": resolved_assets,
|
| 1271 |
"instructions": request.instructions,
|
| 1272 |
"output_instructions": request.output_instructions,
|
|
|
|
| 1273 |
},
|
| 1274 |
timestamp=_now_iso(),
|
| 1275 |
),
|
|
@@ -1284,12 +1850,15 @@ async def scrape_stream(
|
|
| 1284 |
"output_instructions": request.output_instructions,
|
| 1285 |
"resolved_assets": resolved_assets,
|
| 1286 |
"selected_agents": request.selected_agents,
|
|
|
|
| 1287 |
}
|
| 1288 |
planner_code = (
|
| 1289 |
"result = {"
|
| 1290 |
"'phase': payload.get('phase'), "
|
| 1291 |
"'asset_count': len(payload.get('resolved_assets') or []), "
|
| 1292 |
-
"'selected_agents': payload.get('selected_agents') or []"
|
|
|
|
|
|
|
| 1293 |
"}"
|
| 1294 |
)
|
| 1295 |
try:
|
|
@@ -1327,6 +1896,31 @@ async def scrape_stream(
|
|
| 1327 |
|
| 1328 |
for idx, url in enumerate(resolved_assets):
|
| 1329 |
session["current_url_index"] = idx
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1330 |
navigator_event = _record_step(
|
| 1331 |
session,
|
| 1332 |
ScrapeStep(
|
|
@@ -1334,8 +1928,15 @@ async def scrape_stream(
|
|
| 1334 |
action="navigator",
|
| 1335 |
url=url,
|
| 1336 |
status="running",
|
| 1337 |
-
message=
|
|
|
|
|
|
|
|
|
|
| 1338 |
reward=0.05, # Small reward for navigator selection
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1339 |
timestamp=_now_iso(),
|
| 1340 |
),
|
| 1341 |
)
|
|
@@ -1348,12 +1949,16 @@ async def scrape_stream(
|
|
| 1348 |
"url": url,
|
| 1349 |
"index": idx,
|
| 1350 |
"total": len(resolved_assets),
|
|
|
|
|
|
|
| 1351 |
}
|
| 1352 |
navigator_code = (
|
| 1353 |
"result = {"
|
| 1354 |
"'phase': payload.get('phase'), "
|
| 1355 |
"'selected_url': payload.get('url'), "
|
| 1356 |
-
"'progress': f\"{payload.get('index', 0) + 1}/{payload.get('total', 0)}\""
|
|
|
|
|
|
|
| 1357 |
"}"
|
| 1358 |
)
|
| 1359 |
try:
|
|
@@ -1402,7 +2007,7 @@ async def scrape_stream(
|
|
| 1402 |
request,
|
| 1403 |
memory_manager,
|
| 1404 |
enabled_plugins,
|
| 1405 |
-
|
| 1406 |
):
|
| 1407 |
await manager.broadcast(update, session_id)
|
| 1408 |
yield _sse_event(update)
|
|
@@ -1454,7 +2059,10 @@ async def scrape_stream(
|
|
| 1454 |
else:
|
| 1455 |
session["errors"].append("No monthly gold rows were extracted from resolved sources.")
|
| 1456 |
|
| 1457 |
-
if
|
|
|
|
|
|
|
|
|
|
| 1458 |
extracted_payload = session["extracted_data"]
|
| 1459 |
dataset_rows: list[dict[str, Any]] = []
|
| 1460 |
source_links: list[str] = []
|
|
|
|
| 16 |
from enum import Enum
|
| 17 |
from pathlib import Path
|
| 18 |
from typing import Any, AsyncGenerator
|
| 19 |
+
from urllib.error import HTTPError, URLError
|
| 20 |
from urllib.parse import quote_plus, urlparse
|
| 21 |
+
from urllib.request import Request, urlopen
|
| 22 |
|
| 23 |
from bs4 import BeautifulSoup
|
| 24 |
from fastapi import APIRouter, BackgroundTasks, HTTPException
|
|
|
|
| 43 |
)
|
| 44 |
from app.search.engine import SearchEngineRouter
|
| 45 |
from app.search.providers.duckduckgo import DuckDuckGoProvider
|
| 46 |
+
from app.sites import match_site_template, serialize_site_template
|
| 47 |
|
| 48 |
logger = logging.getLogger(__name__)
|
| 49 |
router = APIRouter(prefix="/scrape", tags=["Scraping"])
|
|
|
|
| 156 |
return _active_sessions.get(session_id)
|
| 157 |
|
| 158 |
|
| 159 |
+
def _is_agent_plugin_id(plugin_id: str) -> bool:
    """Report whether ``plugin_id`` names an agent/skill rather than a plugin.

    The comparison is case-insensitive: an id qualifies when it is exactly
    ``web_scraper`` or when it begins with ``skill-``.
    """

    normalized = plugin_id.lower()
    if normalized == "web_scraper":
        return True
    return normalized.startswith("skill-")
|
| 164 |
+
|
| 165 |
+
|
| 166 |
def _resolve_enabled_plugins(
|
| 167 |
requested_plugins: list[str],
|
| 168 |
) -> tuple[list[str], list[str]]:
|
|
|
|
| 173 |
|
| 174 |
available: set[str] = {
|
| 175 |
plugin["id"]
|
| 176 |
+
for category_name, category in PLUGIN_REGISTRY.items()
|
| 177 |
+
if category_name != "skills"
|
| 178 |
for plugin in category
|
| 179 |
if plugin.get("installed")
|
| 180 |
}
|
| 181 |
+
unique_requested = list(dict.fromkeys(requested_plugins))
|
| 182 |
+
enabled = [plugin_id for plugin_id in unique_requested if plugin_id in available]
|
| 183 |
+
missing = [
|
| 184 |
+
plugin_id
|
| 185 |
+
for plugin_id in unique_requested
|
| 186 |
+
if plugin_id not in available and not _is_agent_plugin_id(plugin_id)
|
| 187 |
+
]
|
| 188 |
return enabled, missing
|
| 189 |
|
| 190 |
|
|
|
|
| 384 |
return fields
|
| 385 |
|
| 386 |
|
| 387 |
+
def _plan_from_site_template(
|
| 388 |
+
site_template: Any,
|
| 389 |
+
strategy_override: str | None = None,
|
| 390 |
+
extraction_goal_override: str | None = None,
|
| 391 |
+
) -> dict[str, Any]:
|
| 392 |
+
"""Build a navigation plan from a matched site template."""
|
| 393 |
+
|
| 394 |
+
target_urls = list(site_template.target_urls) if site_template.target_urls else []
|
| 395 |
+
if not target_urls and site_template.domains:
|
| 396 |
+
target_urls = [f"https://{site_template.domains[0]}"]
|
| 397 |
+
|
| 398 |
+
return {
|
| 399 |
+
"strategy": strategy_override or "intelligent_exploration",
|
| 400 |
+
"target_urls": target_urls,
|
| 401 |
+
"navigation_steps": list(site_template.navigation_steps) or [
|
| 402 |
+
"Navigate to site and identify relevant sections",
|
| 403 |
+
"Extract structured fields aligned with instructions",
|
| 404 |
+
],
|
| 405 |
+
"extraction_goal": extraction_goal_override or site_template.extraction_goal,
|
| 406 |
+
"output_fields": list(site_template.output_fields),
|
| 407 |
+
"site_template_id": site_template.site_id,
|
| 408 |
+
"site_template_name": site_template.name,
|
| 409 |
+
"site_template_domains": list(site_template.domains),
|
| 410 |
+
}
|
| 411 |
+
|
| 412 |
+
|
| 413 |
def _create_intelligent_navigation_plan(instructions: str, assets: list[str]) -> dict[str, Any]:
|
| 414 |
"""Create an intelligent navigation plan based on user instructions."""
|
| 415 |
|
| 416 |
instructions_lower = instructions.lower()
|
| 417 |
+
site_template = match_site_template(instructions, assets)
|
| 418 |
+
|
| 419 |
+
# Site-specific strategy overrides
|
| 420 |
+
if site_template and site_template.site_id == "github":
|
| 421 |
+
if "trending" in instructions_lower and "repo" in instructions_lower:
|
| 422 |
+
return _plan_from_site_template(
|
| 423 |
+
site_template,
|
| 424 |
+
strategy_override="github_trending",
|
| 425 |
+
extraction_goal_override="trending_repositories",
|
| 426 |
+
)
|
| 427 |
+
|
| 428 |
+
if site_template and site_template.site_id == "reddit":
|
| 429 |
+
if any(
|
| 430 |
+
token in instructions_lower
|
| 431 |
+
for token in ("trending", "popular", "community", "communities", "subreddit", "subreddits")
|
| 432 |
+
):
|
| 433 |
+
return _plan_from_site_template(
|
| 434 |
+
site_template,
|
| 435 |
+
strategy_override="reddit_trending",
|
| 436 |
+
extraction_goal_override="trending_communities",
|
| 437 |
+
)
|
| 438 |
+
|
| 439 |
+
if site_template:
|
| 440 |
+
return _plan_from_site_template(site_template)
|
| 441 |
|
| 442 |
# News articles detection
|
| 443 |
elif any(word in instructions_lower for word in ["news", "article", "headline"]):
|
|
|
|
| 468 |
return {
|
| 469 |
"strategy": "single_page",
|
| 470 |
"navigation_steps": ["Extract content from provided URL"],
|
| 471 |
+
"extraction_goal": "basic_extraction",
|
| 472 |
+
"site_template_id": None,
|
| 473 |
+
"site_template_name": None,
|
| 474 |
+
"site_template_domains": [],
|
| 475 |
}
|
| 476 |
|
| 477 |
|
|
|
|
| 520 |
await router.shutdown()
|
| 521 |
|
| 522 |
|
| 523 |
+
async def _discover_reddit_communities_via_search(limit: int = 25) -> list[dict[str, Any]]:
    """Find subreddit URLs through the search helper as a fallback source.

    Runs a fixed set of search queries in order, pulls subreddit names out of
    the returned URLs, skips listing pseudo-names and duplicates
    (case-insensitively), and stops as soon as ``limit`` rows are collected.
    Subscriber and active-user counts are not available from search results
    and are reported as ``0``.
    """

    subreddit_pattern = re.compile(r"reddit\.com/r/([A-Za-z0-9_]+)/?", flags=re.IGNORECASE)
    # Path segments after /r/ that are listings rather than communities.
    skip_names = {"popular", "all", "announcements", "new", "top", "best"}
    search_queries = (
        "site:reddit.com/r popular communities",
        "reddit popular subreddits list",
        "best reddit communities technology",
    )

    found: list[dict[str, Any]] = []
    known: set[str] = set()

    for query in search_queries:
        for candidate in await _search_urls_with_mcp(query, max_results=18):
            hit = subreddit_pattern.search(candidate)
            if hit is None:
                continue
            name = hit.group(1)
            key = name.lower()
            if key in skip_names or key in known:
                continue
            known.add(key)
            label = f"r/{name}"
            found.append(
                {
                    "subreddit": label,
                    "title": label,
                    "subscribers": 0,
                    "active_users": 0,
                    "url": f"https://www.reddit.com/r/{name}/",
                    "description": "Discovered via search fallback",
                }
            )
            if len(found) >= limit:
                return found

    return found
|
| 560 |
+
|
| 561 |
+
|
| 562 |
async def _resolve_assets(
|
| 563 |
assets: list[str],
|
| 564 |
enabled_plugins: list[str],
|
|
|
|
| 675 |
return ordered
|
| 676 |
|
| 677 |
|
| 678 |
+
def _should_run_python_sandbox(request: ScrapeRequest, extracted_data: dict[str, Any]) -> bool:
    """Tell whether the sandbox analysis step is worth running.

    Returns ``True`` when the request carries ``python_code``, or when the
    extracted data holds something tabular: a non-empty top-level ``"rows"``
    list, or any dict value with a non-empty ``"data"`` or ``"tables"`` list.
    Returns ``False`` for empty or non-dict extraction output.
    """

    if request.python_code:
        return True
    if not (isinstance(extracted_data, dict) and extracted_data):
        return False

    def _non_empty_list(candidate: Any) -> bool:
        return isinstance(candidate, list) and len(candidate) > 0

    if _non_empty_list(extracted_data.get("rows")):
        return True

    return any(
        isinstance(entry, dict)
        and (_non_empty_list(entry.get("data")) or _non_empty_list(entry.get("tables")))
        for entry in extracted_data.values()
    )
|
| 698 |
+
|
| 699 |
+
|
| 700 |
async def _store_url_memory(
|
| 701 |
session_id: str,
|
| 702 |
url: str,
|
|
|
|
| 886 |
session, session_id, env, request, navigation_plan, step_num, total_reward
|
| 887 |
):
|
| 888 |
yield event
|
| 889 |
+
|
| 890 |
+
# Reddit popular/trending communities strategy
|
| 891 |
+
elif navigation_plan["strategy"] == "reddit_trending":
|
| 892 |
+
async for event in _scrape_reddit_trending(
|
| 893 |
+
session, session_id, env, request, url, step_num, total_reward
|
| 894 |
+
):
|
| 895 |
+
yield event
|
| 896 |
+
|
| 897 |
# General exploration strategy
|
| 898 |
elif navigation_plan["strategy"] == "intelligent_exploration":
|
| 899 |
async for event in _scrape_with_exploration(
|
|
|
|
| 1101 |
)
|
| 1102 |
|
| 1103 |
|
| 1104 |
+
def _to_int(value: Any) -> int:
    """Convert a value to int safely, returning 0 when no number is found.

    Args:
        value: ``None``, a bool, a number, or anything whose ``str()`` form
            may contain digits (for example ``"1,234 members"``).

    Returns:
        ``0`` for ``None``; ``int(value)`` for bools, ints and floats
        (``0`` for NaN and infinities); otherwise the integer formed by the
        digit characters of ``str(value)``, or ``0`` when there are none.
        Non-numeric input is reduced to its digits only, so thousands
        separators are ignored and signs, decimal points and unit suffixes
        are not interpreted.
    """

    if value is None:
        return 0
    if isinstance(value, (bool, int, float)):
        try:
            return int(value)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError and int(+/-inf) raises OverflowError;
            # a "safe" conversion must not propagate either.
            return 0
    digits = re.sub(r"[^\d]", "", str(value))
    if not digits:
        return 0
    try:
        return int(digits)
    except ValueError:
        return 0
|
| 1120 |
+
|
| 1121 |
+
|
| 1122 |
+
def _is_reddit_challenge_page(page_html: str) -> bool:
    """Detect a bot-verification interstitial in the returned HTML.

    The page is lower-cased and searched for a small set of marker phrases;
    any single occurrence is enough to flag the page as a challenge.
    """

    haystack = page_html.lower()
    for marker in (
        "please wait for verification",
        "js_challenge",
        "captcha",
        "verify you are human",
        "checking your browser",
    ):
        if marker in haystack:
            return True
    return False
|
| 1134 |
+
|
| 1135 |
+
|
| 1136 |
+
def _extract_reddit_communities_from_payload(
    payload: dict[str, Any],
    limit: int = 25,
) -> list[dict[str, Any]]:
    """Extract subreddit rows from Reddit JSON payload.

    Args:
        payload: Decoded JSON expected to look like
            ``{"data": {"children": [{"data": {...}}, ...]}}``. Any other
            shape (a list, ``"data": null``, missing keys) yields no rows
            instead of raising.
        limit: Maximum number of rows to collect.

    Returns:
        Up to ``limit`` dicts with the keys ``subreddit``, ``title``,
        ``subscribers``, ``active_users``, ``url`` and ``description``.
        Collection stops once ``limit`` distinct names (case-insensitive)
        have been read in payload order; those rows are then sorted by
        subscriber count, largest first.
    """

    communities: list[dict[str, Any]] = []
    seen: set[str] = set()

    # Validate each level before calling .get(): decoded JSON may be a list
    # or carry null where an object is expected.
    listing = payload.get("data") if isinstance(payload, dict) else None
    children = listing.get("children") if isinstance(listing, dict) else None
    if not isinstance(children, list):
        return communities

    for child in children:
        if not isinstance(child, dict):
            continue
        data = child.get("data", {})
        if not isinstance(data, dict):
            continue

        # Fall back to the prefixed name; "or ''" keeps a null value from
        # turning into the literal string "None", and only the leading
        # "r/" is stripped.
        name = str(
            data.get("display_name")
            or str(data.get("display_name_prefixed") or "").removeprefix("r/")
        ).strip()
        if not name:
            continue
        normalized = name.lower()
        if normalized in seen:
            continue
        seen.add(normalized)

        permalink = str(data.get("url") or f"/r/{name}/")
        community_url = permalink if permalink.startswith("http") else f"https://www.reddit.com{permalink}"

        communities.append(
            {
                "subreddit": f"r/{name}",
                "title": str(data.get("title") or data.get("public_description") or ""),
                "subscribers": _to_int(data.get("subscribers")),
                "active_users": _to_int(
                    data.get("active_user_count") or data.get("accounts_active")
                ),
                "url": community_url,
                "description": str(data.get("public_description") or ""),
            }
        )
        if len(communities) >= limit:
            break

    communities.sort(key=lambda row: row.get("subscribers", 0), reverse=True)
    return communities[:limit]
|
| 1187 |
+
|
| 1188 |
+
|
| 1189 |
+
def _extract_reddit_communities_from_html(
    page_html: str,
    limit: int = 25,
) -> list[dict[str, Any]]:
    """Collect subreddit rows from anchor tags in a Reddit HTML page.

    Used when the JSON endpoint is unavailable. Every ``<a href>`` whose
    target contains ``/r/<name>`` contributes one row per distinct name
    (case-insensitive), except the ``popular`` and ``all`` listings.
    Subscriber and active-user counts cannot be read from links and are
    reported as ``0``. Scanning stops after ``limit`` rows.
    """

    rows: list[dict[str, Any]] = []
    known: set[str] = set()
    skipped = {"popular", "all"}

    for link in parse_html(page_html).find_all("a", href=True):
        target = str(link.get("href", ""))
        found = re.search(r"/r/([A-Za-z0-9_]+)", target)
        if found is None:
            continue

        name = found.group(1)
        key = name.lower()
        if key in skipped or key in known:
            continue
        known.add(key)

        # Absolute links are kept verbatim; relative ones are rebuilt as the
        # canonical community URL.
        if target.startswith("http"):
            link_url = target
        else:
            link_url = f"https://www.reddit.com/r/{name}/"

        rows.append(
            {
                "subreddit": f"r/{name}",
                "title": link.get_text(strip=True),
                "subscribers": 0,
                "active_users": 0,
                "url": link_url,
                "description": "",
            }
        )
        if len(rows) >= limit:
            break

    return rows
|
| 1229 |
+
|
| 1230 |
+
|
| 1231 |
+
def _fetch_reddit_communities(limit: int = 25) -> tuple[list[dict[str, Any]], str]:
    """Fetch trending/popular Reddit communities from public JSON endpoints.

    Tries a fixed list of endpoints in order with a blocking ``urlopen``
    call (20 second timeout each) and returns the first non-empty result.

    Args:
        limit: Number of communities requested from each endpoint and the
            maximum number of rows returned.

    Returns:
        ``(communities, endpoint)`` for the first endpoint that yields rows,
        otherwise ``([], last_error)`` where ``last_error`` describes the
        most recent failure (empty string if no endpoint was tried).
    """

    endpoints = [
        f"https://www.reddit.com/subreddits/popular.json?limit={limit}",
        f"https://www.reddit.com/subreddits/default.json?limit={limit}",
        f"https://old.reddit.com/subreddits/popular/.json?limit={limit}",
    ]
    headers = {
        "User-Agent": "ScrapeRLBot/1.0 (+https://github.com/NeerajCodz/scrapeRL)",
        "Accept": "application/json",
    }
    last_error = ""

    for endpoint in endpoints:
        try:
            request = Request(endpoint, headers=headers)
            with urlopen(request, timeout=20) as response:
                status_code = int(getattr(response, "status", 200))
                if status_code >= 400:
                    last_error = f"{endpoint} returned status {status_code}"
                    continue
                raw_payload = response.read().decode("utf-8", errors="replace")

            parsed = json.loads(raw_payload)
            # Valid JSON can still be the wrong shape (a list, an error
            # object, "data": null). Reject it here so one odd response
            # moves on to the next endpoint instead of raising out of the
            # fallback chain.
            listing = parsed.get("data") if isinstance(parsed, dict) else None
            if not isinstance(listing, dict):
                last_error = f"{endpoint} returned an unexpected JSON shape"
                continue

            communities = _extract_reddit_communities_from_payload(parsed, limit=limit)
            if communities:
                return communities, endpoint
            last_error = f"{endpoint} returned no community rows"
        except (
            HTTPError,
            URLError,
            TimeoutError,
            OSError,  # raw socket failures during read() are not wrapped in URLError
            json.JSONDecodeError,
            ValueError,
        ) as exc:
            last_error = f"{endpoint}: {exc}"
            continue

    return [], last_error
|
| 1265 |
+
|
| 1266 |
+
|
| 1267 |
+
def _fallback_reddit_communities_static(limit: int = 25) -> list[dict[str, Any]]:
    """Return a hard-coded list of well-known subreddits.

    Serves as the last resort when Reddit cannot be reached directly. The
    built-in names are sliced with ``[:limit]`` and each one is expanded
    into a row with zeroed subscriber/active-user counts.
    """

    known_names = (
        "AskReddit",
        "funny",
        "gaming",
        "worldnews",
        "todayilearned",
        "science",
        "movies",
        "technology",
        "pics",
        "news",
        "aww",
        "sports",
        "Music",
        "books",
        "food",
        "dataisbeautiful",
        "MachineLearning",
        "programming",
        "python",
        "javascript",
        "learnprogramming",
        "wallstreetbets",
        "explainlikeimfive",
        "history",
        "space",
    )
    return [
        {
            "subreddit": f"r/{entry}",
            "title": f"r/{entry}",
            "subscribers": 0,
            "active_users": 0,
            "url": f"https://www.reddit.com/r/{entry}/",
            "description": "Fallback popular community list (direct Reddit access blocked)",
        }
        for entry in known_names[:limit]
    ]
|
| 1310 |
+
|
| 1311 |
+
|
| 1312 |
+
async def _scrape_reddit_trending(
    session: dict[str, Any],
    session_id: str,
    env,
    request: ScrapeRequest,
    url: str,
    step_num: int,
    total_reward: float,
) -> AsyncGenerator[dict[str, Any], None]:
    """Scrape trending Reddit communities with anti-bot fallback.

    Async generator that yields whatever ``_record_step`` returns for each
    recorded step (navigate, extract, verify, complete).

    The function always navigates to the Reddit front page (``target_url``)
    regardless of ``url``; ``url`` only labels the extract/verify/complete
    steps and keys the non-CSV result in ``session["extracted_data"]``.

    Community rows come from a chain of sources, each tried only when the
    previous one produced nothing: ``_fetch_reddit_communities``, the landing
    page HTML, then ``_discover_reddit_communities_via_search``. If fewer than
    10 rows are available afterwards, the list is padded from
    ``_fallback_reddit_communities_static`` up to 25 rows.

    Side effects on ``session``: appends to ``session["errors"]``, adds to
    ``session["total_reward"]``, writes ``session["extracted_data"]`` (and
    ``session["final_output"]`` for CSV output) and calls
    ``_write_session_json_artifact``.

    Args:
        session: Mutable session state dict updated in place.
        session_id: Not referenced anywhere in this function body.
        env: Object whose awaitable ``step(action)`` returns a 6-tuple; only
            the first (observation), second (reward) and last (info) items
            are used here.
        request: Scrape request; only ``output_format`` is read.
        url: Caller's URL used for step labelling and result keying.
        step_num: Last used step number; incremented locally before every
            recorded step (the caller's counter is not updated).
        total_reward: Running reward carried in by the caller.
    """

    target_url = "https://www.reddit.com/"

    # Announce the navigation before performing it.
    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="navigate",
            url=target_url,
            status="running",
            message="Navigating to Reddit...",
            timestamp=_now_iso(),
        ),
    )

    navigate_action = Action(
        action_type=ActionType.NAVIGATE,
        parameters={"url": target_url},
        reasoning="Navigate to Reddit and collect trending communities",
    )
    nav_obs, nav_reward, _, _, _, nav_info = await env.step(navigate_action)
    total_reward += nav_reward

    # Navigation counts as successful only if some page HTML came back.
    nav_success = bool(nav_obs.page_html)
    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="navigate",
            url=target_url,
            status="completed" if nav_success else "failed",
            message=f"Navigated to {target_url}" if nav_success else "Navigation failed",
            reward=nav_reward,
            duration_ms=nav_info.get("step_duration_ms", 0),
            timestamp=_now_iso(),
        ),
    )
    if not nav_success:
        # NOTE(review): this early exit does not add total_reward to
        # session["total_reward"], unlike the other exit paths below.
        session["errors"].append("Failed to load Reddit landing page")
        return

    page_html = nav_obs.page_html or ""
    # The challenge flag only changes the progress message and is reported in
    # the results; the source chain below runs the same way either way.
    challenge_detected = _is_reddit_challenge_page(page_html)
    extraction_message = (
        "Reddit challenge detected, switching to Reddit JSON endpoints..."
        if challenge_detected
        else "Extracting trending communities..."
    )

    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="extract",
            url=url,
            status="running",
            message=extraction_message,
            reward=0.1,
            timestamp=_now_iso(),
        ),
    )

    # Source 1: _fetch_reddit_communities, run in a worker thread. It returns
    # (rows, label); presumably the label is an error description when rows
    # is empty - confirm against that helper.
    communities, source_used = await asyncio.to_thread(_fetch_reddit_communities, 25)
    if not communities:
        # Source 2: parse the landing page HTML already fetched above.
        html_fallback = _extract_reddit_communities_from_html(page_html, 25)
        if html_fallback:
            communities = html_fallback
            source_used = "reddit_html_fallback"
    if not communities:
        # Source 3: search-based discovery.
        search_fallback = await _discover_reddit_communities_via_search(limit=25)
        if search_fallback:
            communities = search_fallback
            source_used = "duckduckgo_search_fallback"
    if len(communities) < 10:
        # Pad thin results with static rows, skipping subreddits already
        # present (case-insensitive) and stopping at 25 rows in total.
        static_fallback = _fallback_reddit_communities_static(limit=25)
        existing = {row.get("subreddit", "").lower() for row in communities}
        appended_static = False
        for row in static_fallback:
            subreddit = str(row.get("subreddit", "")).lower()
            if subreddit in existing:
                continue
            communities.append(row)
            existing.add(subreddit)
            appended_static = True
            if len(communities) >= 25:
                break
        # Relabel the source once static rows were mixed in. Any label other
        # than the search one is replaced by "static_popular_fallback".
        if communities and appended_static and source_used == "duckduckgo_search_fallback":
            source_used = "search_plus_static_fallback"
        elif communities and appended_static:
            source_used = "static_popular_fallback"

    # 1.0 base (when any rows exist) plus 0.25 per row, capped at 6.0.
    extraction_reward = min(6.0, len(communities) * 0.25 + (1.0 if communities else 0.0))
    total_reward += extraction_reward

    step_num += 1
    extraction_status = "completed" if communities else "failed"
    extraction_done_message = (
        f"Extracted {len(communities)} trending communities from {source_used}"
        if communities
        else "Failed to extract trending communities from Reddit"
    )
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="extract",
            url=url,
            status=extraction_status,
            message=extraction_done_message,
            reward=extraction_reward,
            extracted_data={
                "count": len(communities),
                "source": source_used,
                "challenge_detected": challenge_detected,
                "preview": communities[:3],
            },
            timestamp=_now_iso(),
        ),
    )

    if not communities:
        # NOTE(review): given the static padding above, this branch looks
        # reachable only if the static helper returns no rows.
        if source_used:
            session["errors"].append(f"Reddit extraction failed: {source_used}")
        else:
            session["errors"].append("Reddit extraction failed: no community data found")
        session["total_reward"] += total_reward
        step_num += 1
        yield _record_step(
            session,
            ScrapeStep(
                step_number=step_num,
                action="complete",
                url=url,
                status="failed",
                message="Completed Reddit scrape with no community rows",
                reward=0.0,
                extracted_data={"total_reward": total_reward, "row_count": 0},
                timestamp=_now_iso(),
            ),
        )
        return

    # Coverage check: 10 or more rows is "good", fewer is "partial".
    verification_score = 1.0 if len(communities) >= 10 else 0.5
    total_reward += verification_score
    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="verify",
            url=url,
            status="completed",
            message=f"Verifier checked community coverage ({len(communities)} rows)",
            reward=verification_score,
            extracted_data={
                "row_count": len(communities),
                "coverage": "good" if len(communities) >= 10 else "partial",
            },
            timestamp=_now_iso(),
        ),
    )

    if request.output_format == OutputFormat.CSV:
        columns = ["subreddit", "title", "subscribers", "active_users", "url", "description"]
        csv_output = _rows_to_csv(communities, preferred_headers=columns)
        # CSV mode REPLACES the whole extracted_data mapping, whereas the
        # other branch adds a single entry keyed by url.
        session["extracted_data"] = {
            "rows": communities,
            "columns": columns,
            "csv_output": csv_output,
            "row_count": len(communities),
            "source": source_used,
            "challenge_detected": challenge_detected,
        }
        session["final_output"] = csv_output
    else:
        session["extracted_data"][url] = {
            "trending_communities": communities,
            "row_count": len(communities),
            "source": source_used,
            "challenge_detected": challenge_detected,
        }

    # Write the rows as a JSON artifact for the session.
    _write_session_json_artifact(
        session,
        "reddit_trending_communities.json",
        {
            "source": source_used,
            "challenge_detected": challenge_detected,
            "row_count": len(communities),
            "rows": communities,
        },
    )

    # Tell the environment the task is finished and collect its final reward.
    done_action = Action(
        action_type=ActionType.DONE,
        parameters={"success": True},
        reasoning="Reddit community extraction complete",
    )
    _, done_reward, _, _, _, _ = await env.step(done_action)
    total_reward += done_reward
    session["total_reward"] += total_reward

    step_num += 1
    yield _record_step(
        session,
        ScrapeStep(
            step_number=step_num,
            action="complete",
            url=url,
            status="completed",
            message=f"Completed Reddit trending scrape with {len(communities)} communities",
            reward=done_reward,
            extracted_data={"total_reward": total_reward, "row_count": len(communities)},
            timestamp=_now_iso(),
        ),
    )
|
| 1541 |
+
|
| 1542 |
+
|
| 1543 |
async def _scrape_single_page(
|
| 1544 |
session: dict[str, Any],
|
| 1545 |
session_id: str,
|
|
|
|
| 1642 |
step_num += 1
|
| 1643 |
extracted_count = len([f for f in fields_to_extract if f in extracted])
|
| 1644 |
verification_score = extracted_count / len(fields_to_extract) if fields_to_extract else 0.0
|
| 1645 |
+
total_reward += verification_score
|
| 1646 |
|
| 1647 |
yield _record_step(
|
| 1648 |
session,
|
|
|
|
| 1665 |
parameters={"success": True},
|
| 1666 |
reasoning="Extraction complete",
|
| 1667 |
)
|
| 1668 |
+
_, done_reward, _, _, _, _ = await env.step(done_action)
|
| 1669 |
+
total_reward += done_reward
|
| 1670 |
|
| 1671 |
yield _record_step(
|
| 1672 |
session,
|
|
|
|
| 1676 |
url=url,
|
| 1677 |
status="completed",
|
| 1678 |
message=f"Completed scraping {url}",
|
| 1679 |
+
reward=done_reward,
|
| 1680 |
+
extracted_data={**extracted, "total_reward": total_reward},
|
| 1681 |
timestamp=_now_iso(),
|
| 1682 |
),
|
| 1683 |
)
|
|
|
|
| 1753 |
"enabled": enabled_plugins,
|
| 1754 |
"missing": missing_plugins,
|
| 1755 |
"navigation_strategy": navigation_plan["strategy"],
|
| 1756 |
+
"extraction_goal": navigation_plan["extraction_goal"],
|
| 1757 |
+
"site_template_id": navigation_plan.get("site_template_id"),
|
| 1758 |
+
"site_template_name": navigation_plan.get("site_template_name"),
|
| 1759 |
+
"site_template_domains": navigation_plan.get("site_template_domains", []),
|
| 1760 |
},
|
| 1761 |
timestamp=_now_iso(),
|
| 1762 |
),
|
|
|
|
| 1785 |
await manager.broadcast(discovery_event, session_id)
|
| 1786 |
yield _sse_event(discovery_event)
|
| 1787 |
|
| 1788 |
+
planner_site_template = match_site_template(request.instructions, resolved_assets)
|
| 1789 |
+
planner_template_payload = (
|
| 1790 |
+
serialize_site_template(planner_site_template) if planner_site_template else None
|
| 1791 |
+
)
|
| 1792 |
+
|
| 1793 |
if request.enable_memory:
|
| 1794 |
try:
|
| 1795 |
await memory_manager.store(
|
|
|
|
| 1835 |
"assets": resolved_assets,
|
| 1836 |
"instructions": request.instructions,
|
| 1837 |
"output_instructions": request.output_instructions,
|
| 1838 |
+
"site_template": planner_template_payload,
|
| 1839 |
},
|
| 1840 |
timestamp=_now_iso(),
|
| 1841 |
),
|
|
|
|
| 1850 |
"output_instructions": request.output_instructions,
|
| 1851 |
"resolved_assets": resolved_assets,
|
| 1852 |
"selected_agents": request.selected_agents,
|
| 1853 |
+
"site_template": planner_template_payload,
|
| 1854 |
}
|
| 1855 |
planner_code = (
|
| 1856 |
"result = {"
|
| 1857 |
"'phase': payload.get('phase'), "
|
| 1858 |
"'asset_count': len(payload.get('resolved_assets') or []), "
|
| 1859 |
+
"'selected_agents': payload.get('selected_agents') or [], "
|
| 1860 |
+
"'site_template_id': (payload.get('site_template') or {}).get('site_id'), "
|
| 1861 |
+
"'site_strategy': (payload.get('site_template') or {}).get('default_strategy')"
|
| 1862 |
"}"
|
| 1863 |
)
|
| 1864 |
try:
|
|
|
|
| 1896 |
|
| 1897 |
for idx, url in enumerate(resolved_assets):
|
| 1898 |
session["current_url_index"] = idx
|
| 1899 |
+
url_navigation_plan = _create_intelligent_navigation_plan(request.instructions, [url])
|
| 1900 |
+
url_site_template = match_site_template(request.instructions, [url])
|
| 1901 |
+
url_template_payload = serialize_site_template(url_site_template) if url_site_template else None
|
| 1902 |
+
|
| 1903 |
+
if url_template_payload:
|
| 1904 |
+
site_template_event = _record_step(
|
| 1905 |
+
session,
|
| 1906 |
+
ScrapeStep(
|
| 1907 |
+
step_number=len(session["steps"]) + 1,
|
| 1908 |
+
action="site_template",
|
| 1909 |
+
url=url,
|
| 1910 |
+
status="completed",
|
| 1911 |
+
message=f"Navigator loaded site template: {url_template_payload['name']}",
|
| 1912 |
+
reward=0.05,
|
| 1913 |
+
extracted_data={
|
| 1914 |
+
"site_id": url_template_payload["site_id"],
|
| 1915 |
+
"strategy": url_navigation_plan["strategy"],
|
| 1916 |
+
"domains": url_template_payload["domains"],
|
| 1917 |
+
},
|
| 1918 |
+
timestamp=_now_iso(),
|
| 1919 |
+
),
|
| 1920 |
+
)
|
| 1921 |
+
await manager.broadcast(site_template_event, session_id)
|
| 1922 |
+
yield _sse_event(site_template_event)
|
| 1923 |
+
|
| 1924 |
navigator_event = _record_step(
|
| 1925 |
session,
|
| 1926 |
ScrapeStep(
|
|
|
|
| 1928 |
action="navigator",
|
| 1929 |
url=url,
|
| 1930 |
status="running",
|
| 1931 |
+
message=(
|
| 1932 |
+
f"Navigator selected source {idx + 1}/{len(resolved_assets)} "
|
| 1933 |
+
f"({url_navigation_plan['strategy']})"
|
| 1934 |
+
),
|
| 1935 |
reward=0.05, # Small reward for navigator selection
|
| 1936 |
+
extracted_data={
|
| 1937 |
+
"site_template_id": url_navigation_plan.get("site_template_id"),
|
| 1938 |
+
"site_template_name": url_navigation_plan.get("site_template_name"),
|
| 1939 |
+
},
|
| 1940 |
timestamp=_now_iso(),
|
| 1941 |
),
|
| 1942 |
)
|
|
|
|
| 1949 |
"url": url,
|
| 1950 |
"index": idx,
|
| 1951 |
"total": len(resolved_assets),
|
| 1952 |
+
"site_template": url_template_payload,
|
| 1953 |
+
"navigation_strategy": url_navigation_plan["strategy"],
|
| 1954 |
}
|
| 1955 |
navigator_code = (
|
| 1956 |
"result = {"
|
| 1957 |
"'phase': payload.get('phase'), "
|
| 1958 |
"'selected_url': payload.get('url'), "
|
| 1959 |
+
"'progress': f\"{payload.get('index', 0) + 1}/{payload.get('total', 0)}\", "
|
| 1960 |
+
"'site_template_id': (payload.get('site_template') or {}).get('site_id'), "
|
| 1961 |
+
"'strategy': payload.get('navigation_strategy')"
|
| 1962 |
"}"
|
| 1963 |
)
|
| 1964 |
try:
|
|
|
|
| 2007 |
request,
|
| 2008 |
memory_manager,
|
| 2009 |
enabled_plugins,
|
| 2010 |
+
url_navigation_plan,
|
| 2011 |
):
|
| 2012 |
await manager.broadcast(update, session_id)
|
| 2013 |
yield _sse_event(update)
|
|
|
|
| 2059 |
else:
|
| 2060 |
session["errors"].append("No monthly gold rows were extracted from resolved sources.")
|
| 2061 |
|
| 2062 |
+
if (
|
| 2063 |
+
any(plugin_id in enabled_plugins for plugin_id in python_plugin_ids)
|
| 2064 |
+
and _should_run_python_sandbox(request, session["extracted_data"])
|
| 2065 |
+
):
|
| 2066 |
extracted_payload = session["extracted_data"]
|
| 2067 |
dataset_rows: list[dict[str, Any]] = []
|
| 2068 |
source_links: list[str] = []
|
backend/app/api/routes/sites.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Site template API routes."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from fastapi import APIRouter, HTTPException, status
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
|
| 10 |
+
from app.sites import (
|
| 11 |
+
get_site_template,
|
| 12 |
+
list_site_templates,
|
| 13 |
+
match_site_template,
|
| 14 |
+
serialize_site_template,
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
router = APIRouter(prefix="/sites", tags=["sites"])
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class SiteMatchRequest(BaseModel):
    """Payload to match a site template.

    Request body accepted by the ``/match`` route of this router. Both fields
    are optional and default to empty values.
    """

    # Free-text task instructions; defaults to an empty string.
    instructions: str = Field(default="", description="Task instructions")
    # Task assets/URLs; defaults to a new empty list for each instance.
    assets: list[str] = Field(default_factory=list, description="Task assets/URLs")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@router.get(
    "",
    status_code=status.HTTP_200_OK,
    summary="List inbuilt site templates",
    description="Return all site templates available for agent planning",
)
async def list_sites() -> dict[str, Any]:
    """Return every registered site template together with the total count.

    Returns:
        A dict with ``count`` (number of templates) and ``sites`` (the
        serialized templates).
    """
    sites = list_site_templates()
    payload: dict[str, Any] = {"count": len(sites), "sites": sites}
    return payload
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@router.get(
    "/{site_id}",
    status_code=status.HTTP_200_OK,
    summary="Get one site template",
    description="Return one template by site_id",
)
async def get_site(site_id: str) -> dict[str, Any]:
    """Look up a single site template by its identifier.

    Args:
        site_id: Identifier taken from the URL path.

    Returns:
        The serialized template.

    Raises:
        HTTPException: With status 404 when no template has this id.
    """
    found = get_site_template(site_id)
    if found:
        return serialize_site_template(found)
    raise HTTPException(status_code=404, detail=f"Site template '{site_id}' not found")
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@router.post(
    "/match",
    status_code=status.HTTP_200_OK,
    summary="Match a template for task input",
    description="Find the best matching site template from instructions/assets",
)
async def match_site(payload: SiteMatchRequest) -> dict[str, Any]:
    """Resolve the site template that fits the given instructions and assets.

    Args:
        payload: Instructions text and asset URLs to match against.

    Returns:
        ``{"matched": True, "site": <serialized template>}`` on a hit,
        otherwise ``{"matched": False, "site": None}``.
    """
    best = match_site_template(payload.instructions, payload.assets)
    hit = bool(best)
    return {"matched": hit, "site": serialize_site_template(best) if hit else None}
|
backend/app/main.py
CHANGED
|
@@ -11,7 +11,7 @@ from fastapi.middleware.cors import CORSMiddleware
|
|
| 11 |
from fastapi.responses import FileResponse, HTMLResponse
|
| 12 |
from fastapi.staticfiles import StaticFiles
|
| 13 |
|
| 14 |
-
from app.api.routes import agents, episode, health, memory, plugins,
|
| 15 |
from app.api.routes import settings as settings_routes
|
| 16 |
from app.config import get_settings
|
| 17 |
from app.memory.manager import MemoryManager
|
|
@@ -133,6 +133,7 @@ def create_app() -> FastAPI:
|
|
| 133 |
app.include_router(memory.router, prefix=api_prefix, tags=["Memory"])
|
| 134 |
app.include_router(settings_routes.router, prefix=api_prefix, tags=["Settings"])
|
| 135 |
app.include_router(plugins.router, prefix=api_prefix, tags=["Plugins"])
|
|
|
|
| 136 |
app.include_router(scrape.router, prefix=api_prefix, tags=["Scraping"])
|
| 137 |
|
| 138 |
# Import and include providers router
|
|
|
|
| 11 |
from fastapi.responses import FileResponse, HTMLResponse
|
| 12 |
from fastapi.staticfiles import StaticFiles
|
| 13 |
|
| 14 |
+
from app.api.routes import agents, episode, health, memory, plugins, scrape, sites, tasks, tools
|
| 15 |
from app.api.routes import settings as settings_routes
|
| 16 |
from app.config import get_settings
|
| 17 |
from app.memory.manager import MemoryManager
|
|
|
|
| 133 |
app.include_router(memory.router, prefix=api_prefix, tags=["Memory"])
|
| 134 |
app.include_router(settings_routes.router, prefix=api_prefix, tags=["Settings"])
|
| 135 |
app.include_router(plugins.router, prefix=api_prefix, tags=["Plugins"])
|
| 136 |
+
app.include_router(sites.router, prefix=api_prefix, tags=["Sites"])
|
| 137 |
app.include_router(scrape.router, prefix=api_prefix, tags=["Scraping"])
|
| 138 |
|
| 139 |
# Import and include providers router
|
backend/app/sites/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Site template registry for domain-aware scraping behavior."""
|
| 2 |
+
|
| 3 |
+
from app.sites.models import SiteTemplate
|
| 4 |
+
from app.sites.registry import (
|
| 5 |
+
get_site_template,
|
| 6 |
+
list_site_templates,
|
| 7 |
+
match_site_template,
|
| 8 |
+
serialize_site_template,
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
__all__ = [
|
| 12 |
+
"SiteTemplate",
|
| 13 |
+
"get_site_template",
|
| 14 |
+
"list_site_templates",
|
| 15 |
+
"match_site_template",
|
| 16 |
+
"serialize_site_template",
|
| 17 |
+
]
|
backend/app/sites/models.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data models for built-in site templates."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@dataclass(frozen=True)
class SiteTemplate:
    """Inbuilt site template that agents can reference.

    Immutable (frozen) record. Every collection field is a tuple, and only
    ``site_id``, ``name`` and ``domains`` are required.
    """

    # Unique identifier; the registry indexes templates by this value.
    site_id: str
    # Human-readable site name; also checked against instructions text
    # during alias matching in the registry.
    name: str
    # Domains compared with asset URL hosts (exact or subdomain match).
    domains: tuple[str, ...]
    # Extra keywords looked up in the instructions when no domain matches.
    aliases: tuple[str, ...] = field(default_factory=tuple)
    # Strategy label for this site, e.g. "github_repository_extraction".
    default_strategy: str = "intelligent_exploration"
    # Label for what is being extracted, e.g. "repositories".
    extraction_goal: str = "structured_extraction"
    # Ordered, human-readable navigation hints.
    navigation_steps: tuple[str, ...] = field(default_factory=tuple)
    # Names of the fields expected in each output row.
    output_fields: tuple[str, ...] = field(default_factory=tuple)
    # Suggested starting URLs for the site.
    target_urls: tuple[str, ...] = field(default_factory=tuple)
    # Short free-text description of the site.
    description: str = ""
|
backend/app/sites/registry.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Template registry and matching helpers for known sites."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any
|
| 6 |
+
from urllib.parse import urlparse
|
| 7 |
+
|
| 8 |
+
from app.sites.models import SiteTemplate
|
| 9 |
+
from app.sites.templates import SITE_TEMPLATES
|
| 10 |
+
|
| 11 |
+
_SITE_BY_ID: dict[str, SiteTemplate] = {template.site_id: template for template in SITE_TEMPLATES}
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def serialize_site_template(template: SiteTemplate) -> dict[str, Any]:
    """Convert a site template into a plain dict for API and event payloads.

    Tuple-valued fields are copied into lists; scalar fields pass through
    unchanged. Key order follows the template's field order.
    """
    sequence_fields = {"domains", "aliases", "navigation_steps", "output_fields", "target_urls"}
    ordered_keys = (
        "site_id",
        "name",
        "domains",
        "aliases",
        "default_strategy",
        "extraction_goal",
        "navigation_steps",
        "output_fields",
        "target_urls",
        "description",
    )
    payload: dict[str, Any] = {}
    for key in ordered_keys:
        value = getattr(template, key)
        payload[key] = list(value) if key in sequence_fields else value
    return payload
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def list_site_templates() -> list[dict[str, Any]]:
    """Serialize every registered template, preserving registry order."""
    return list(map(serialize_site_template, SITE_TEMPLATES))
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def get_site_template(site_id: str) -> SiteTemplate | None:
    """Return the template registered under ``site_id``, or ``None`` if absent."""
    try:
        return _SITE_BY_ID[site_id]
    except KeyError:
        return None
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _normalize_domain(value: str) -> str:
|
| 44 |
+
"""Normalize a domain string."""
|
| 45 |
+
|
| 46 |
+
lowered = value.lower().strip()
|
| 47 |
+
if lowered.startswith("www."):
|
| 48 |
+
return lowered[4:]
|
| 49 |
+
return lowered
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _extract_domains_from_assets(assets: list[str]) -> list[str]:
|
| 53 |
+
"""Extract normalized domains from URL assets."""
|
| 54 |
+
|
| 55 |
+
domains: list[str] = []
|
| 56 |
+
for asset in assets:
|
| 57 |
+
parsed = urlparse(asset.strip())
|
| 58 |
+
if parsed.scheme not in {"http", "https"} or not parsed.netloc:
|
| 59 |
+
continue
|
| 60 |
+
domain = _normalize_domain(parsed.netloc)
|
| 61 |
+
if domain not in domains:
|
| 62 |
+
domains.append(domain)
|
| 63 |
+
return domains
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def match_site_template(instructions: str, assets: list[str]) -> SiteTemplate | None:
    """Match site template by URL domain first, then instruction aliases.

    Domain matching: the first asset domain that equals, or is a subdomain
    of, one of a template's domains wins. Asset order takes priority over
    template order.

    Alias fallback: used only when no domain matched. A template's name,
    ``site_id`` and aliases are searched in the lower-cased instructions as
    whole words (an optional trailing "s" is tolerated). A plain substring
    test is not used because short aliases would fire inside unrelated
    words, e.g. the alias "repo" inside "report".

    Args:
        instructions: Free-text task instructions.
        assets: Task assets; only http/https URLs contribute domains.

    Returns:
        The first matching template, or ``None`` when nothing matches.
    """

    import re  # local import keeps this block independent of the module header

    asset_domains = _extract_domains_from_assets(assets)

    # Domain-first matching
    if asset_domains:
        # Normalize every template's domains once instead of once per asset domain.
        normalized = [
            (template, [_normalize_domain(candidate) for candidate in template.domains])
            for template in SITE_TEMPLATES
        ]
        for domain in asset_domains:
            for template, candidates in normalized:
                if any(
                    domain == candidate or domain.endswith(f".{candidate}")
                    for candidate in candidates
                ):
                    return template

    # Alias fallback
    instructions_lower = instructions.lower()
    for template in SITE_TEMPLATES:
        for raw_token in (template.name, template.site_id, *template.aliases):
            token = raw_token.lower()
            if not token:
                continue
            # Boundaries are "not a letter or digit", so punctuation and
            # underscores around the token (URLs, snake_case) still match.
            pattern = rf"(?<![^\W_]){re.escape(token)}s?(?![^\W_])"
            if re.search(pattern, instructions_lower):
                return template

    return None
|
backend/app/sites/templates.py
ADDED
|
@@ -0,0 +1,651 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Built-in site templates (30+ domains) for agent guidance."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from app.sites.models import SiteTemplate
|
| 6 |
+
|
| 7 |
+
# Shared navigation steps reused by templates whose entry point is a
# discover/trending style page (passed as ``navigation_steps``).
SOCIAL_STEPS: tuple[str, ...] = (
    "Navigate to discover/trending sections",
    "Collect entity cards and ranking metadata",
    "Normalize output into a structured list",
)
|
| 12 |
+
# Shared navigation steps reused by templates that extract headline/article
# style cards (passed as ``navigation_steps``).
NEWS_STEPS: tuple[str, ...] = (
    "Navigate to front page or section hubs",
    "Extract headline cards with links and timestamps",
    "Optionally follow article links for summaries",
)
|
| 17 |
+
# Shared navigation steps reused by templates that walk index/listing pages
# and return a structured map of what they find (passed as
# ``navigation_steps``).
DOC_STEPS: tuple[str, ...] = (
    "Navigate to docs or index pages",
    "Extract headings, navigation links, and metadata",
    "Return concise structured documentation map",
)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Catalog of built-in site templates, one SiteTemplate per supported site.
# Only the GitHub and Reddit entries pass ``default_strategy`` explicitly and
# define their own navigation steps; every other entry reuses one of the
# shared step tuples (SOCIAL_STEPS / NEWS_STEPS / DOC_STEPS).
# NOTE(review): entries without ``default_strategy`` presumably fall back to a
# default declared on SiteTemplate -- confirm against app.sites.models.
SITE_TEMPLATES: tuple[SiteTemplate, ...] = (
    # --- Sites with a dedicated extraction strategy ---
    SiteTemplate(
        site_id="github",
        name="GitHub",
        domains=("github.com",),
        aliases=("github", "repo", "repositories"),
        default_strategy="github_repository_extraction",
        extraction_goal="repositories",
        navigation_steps=(
            "Navigate to Explore/Trending/Search pages",
            "Extract repository metadata and links",
            "Format repository rows for csv/json output",
        ),
        output_fields=("username", "repo_name", "stars", "forks", "url"),
        target_urls=("https://github.com/explore", "https://github.com/trending"),
        description="Code repositories, projects, and trend pages",
    ),
    SiteTemplate(
        site_id="reddit",
        name="Reddit",
        domains=("reddit.com", "old.reddit.com"),
        aliases=("reddit", "subreddit", "communities"),
        default_strategy="reddit_community_extraction",
        extraction_goal="communities",
        navigation_steps=(
            "Navigate to community discovery/popular endpoints",
            "Handle anti-bot challenge fallback if required",
            "Return normalized subreddit rows",
        ),
        output_fields=("subreddit", "title", "subscribers", "active_users", "url"),
        target_urls=("https://www.reddit.com/", "https://www.reddit.com/subreddits/popular"),
        description="Communities, posts, and subreddit metadata",
    ),
    # --- Social and video platforms ---
    SiteTemplate(
        site_id="x",
        name="X (Twitter)",
        domains=("x.com", "twitter.com"),
        aliases=("x", "twitter", "tweets"),
        extraction_goal="posts",
        navigation_steps=SOCIAL_STEPS,
        output_fields=("author", "post_text", "likes", "replies", "url"),
        target_urls=("https://x.com/explore",),
        description="Short-form social posts and trends",
    ),
    SiteTemplate(
        site_id="youtube",
        name="YouTube",
        domains=("youtube.com", "youtu.be"),
        aliases=("youtube", "videos", "channels"),
        extraction_goal="videos",
        navigation_steps=SOCIAL_STEPS,
        output_fields=("title", "channel", "views", "published", "url"),
        target_urls=("https://www.youtube.com/feed/trending",),
        description="Video listings, channels, and trend feeds",
    ),
    SiteTemplate(
        site_id="instagram",
        name="Instagram",
        domains=("instagram.com",),
        aliases=("instagram", "reels", "posts"),
        extraction_goal="social_posts",
        navigation_steps=SOCIAL_STEPS,
        output_fields=("author", "caption", "likes", "comments", "url"),
        target_urls=("https://www.instagram.com/explore/",),
        description="Photo/video social feed extraction",
    ),
    SiteTemplate(
        site_id="facebook",
        name="Facebook",
        domains=("facebook.com", "fb.com"),
        aliases=("facebook", "pages", "groups"),
        extraction_goal="social_posts",
        navigation_steps=SOCIAL_STEPS,
        output_fields=("page", "post_text", "reactions", "comments", "url"),
        target_urls=("https://www.facebook.com/watch/",),
        description="Pages, groups, and social content",
    ),
    SiteTemplate(
        site_id="linkedin",
        name="LinkedIn",
        domains=("linkedin.com",),
        aliases=("linkedin", "jobs", "companies"),
        extraction_goal="professional_content",
        navigation_steps=SOCIAL_STEPS,
        output_fields=("title", "company", "location", "engagement", "url"),
        target_urls=("https://www.linkedin.com/feed/",),
        description="Professional posts, companies, and jobs",
    ),
    SiteTemplate(
        site_id="tiktok",
        name="TikTok",
        domains=("tiktok.com",),
        aliases=("tiktok", "shorts", "videos"),
        extraction_goal="videos",
        navigation_steps=SOCIAL_STEPS,
        output_fields=("creator", "caption", "likes", "comments", "url"),
        target_urls=("https://www.tiktok.com/trending",),
        description="Short video trend discovery",
    ),
    # --- Blogs and developer articles ---
    SiteTemplate(
        site_id="medium",
        name="Medium",
        domains=("medium.com",),
        aliases=("medium", "blogs", "articles"),
        extraction_goal="articles",
        navigation_steps=NEWS_STEPS,
        output_fields=("title", "author", "claps", "reading_time", "url"),
        target_urls=("https://medium.com/tag/technology",),
        description="Article/blog extraction",
    ),
    SiteTemplate(
        site_id="devto",
        name="DEV Community",
        domains=("dev.to",),
        aliases=("devto", "dev.to", "developer posts"),
        extraction_goal="articles",
        navigation_steps=NEWS_STEPS,
        output_fields=("title", "author", "reactions", "comments", "url"),
        target_urls=("https://dev.to/top/week",),
        description="Developer articles and posts",
    ),
    # --- Q&A, data, research, reference, and package hubs ---
    SiteTemplate(
        site_id="stackoverflow",
        name="Stack Overflow",
        domains=("stackoverflow.com",),
        aliases=("stackoverflow", "questions", "answers"),
        extraction_goal="questions",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "votes", "answers", "tags", "url"),
        target_urls=("https://stackoverflow.com/questions",),
        description="Q&A extraction",
    ),
    SiteTemplate(
        site_id="kaggle",
        name="Kaggle",
        domains=("kaggle.com",),
        aliases=("kaggle", "datasets", "competitions"),
        extraction_goal="datasets",
        navigation_steps=DOC_STEPS,
        output_fields=("dataset_name", "author", "votes", "updated", "url"),
        target_urls=("https://www.kaggle.com/datasets",),
        description="Dataset and competition listings",
    ),
    SiteTemplate(
        site_id="huggingface",
        name="Hugging Face",
        domains=("huggingface.co",),
        aliases=("huggingface", "models", "spaces"),
        extraction_goal="models",
        navigation_steps=DOC_STEPS,
        output_fields=("model_id", "downloads", "likes", "task", "url"),
        target_urls=("https://huggingface.co/models",),
        description="Model and dataset hubs",
    ),
    SiteTemplate(
        site_id="arxiv",
        name="arXiv",
        domains=("arxiv.org",),
        aliases=("arxiv", "papers", "preprints"),
        extraction_goal="papers",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "authors", "category", "published", "url"),
        target_urls=("https://arxiv.org/list/cs/new",),
        description="Research paper listings",
    ),
    SiteTemplate(
        site_id="wikipedia",
        name="Wikipedia",
        domains=("wikipedia.org",),
        aliases=("wikipedia", "wiki", "encyclopedia"),
        extraction_goal="reference_content",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "summary", "sections", "references", "url"),
        target_urls=("https://en.wikipedia.org/wiki/Main_Page",),
        description="Reference and encyclopedia pages",
    ),
    SiteTemplate(
        site_id="pypi",
        name="PyPI",
        domains=("pypi.org",),
        aliases=("pypi", "python packages"),
        extraction_goal="packages",
        navigation_steps=DOC_STEPS,
        output_fields=("package", "version", "downloads", "license", "url"),
        target_urls=("https://pypi.org/search/",),
        description="Python package metadata",
    ),
    SiteTemplate(
        site_id="npm",
        name="npm",
        domains=("npmjs.com",),
        aliases=("npm", "node packages"),
        extraction_goal="packages",
        navigation_steps=DOC_STEPS,
        output_fields=("package", "version", "weekly_downloads", "maintainers", "url"),
        target_urls=("https://www.npmjs.com/search",),
        description="Node package metadata",
    ),
    # --- Launches, link aggregators, newsletters, and discovery sites ---
    SiteTemplate(
        site_id="producthunt",
        name="Product Hunt",
        domains=("producthunt.com",),
        aliases=("product hunt", "launches", "products"),
        extraction_goal="products",
        navigation_steps=SOCIAL_STEPS,
        output_fields=("product", "tagline", "votes", "category", "url"),
        target_urls=("https://www.producthunt.com/",),
        description="New product launch listings",
    ),
    SiteTemplate(
        site_id="hackernews",
        name="Hacker News",
        domains=("news.ycombinator.com",),
        aliases=("hackernews", "hn", "top stories"),
        extraction_goal="stories",
        navigation_steps=NEWS_STEPS,
        output_fields=("title", "points", "comments", "author", "url"),
        target_urls=("https://news.ycombinator.com/",),
        description="Tech news headlines",
    ),
    SiteTemplate(
        site_id="substack",
        name="Substack",
        domains=("substack.com",),
        aliases=("substack", "newsletters"),
        extraction_goal="newsletter_posts",
        navigation_steps=NEWS_STEPS,
        output_fields=("title", "author", "publication", "published", "url"),
        target_urls=("https://substack.com/discover",),
        description="Newsletter and long-form posts",
    ),
    SiteTemplate(
        site_id="quora",
        name="Quora",
        domains=("quora.com",),
        aliases=("quora", "questions"),
        extraction_goal="questions",
        navigation_steps=DOC_STEPS,
        output_fields=("question", "answer_count", "followers", "topic", "url"),
        target_urls=("https://www.quora.com/",),
        description="Question and answer listings",
    ),
    SiteTemplate(
        site_id="pinterest",
        name="Pinterest",
        domains=("pinterest.com",),
        aliases=("pinterest", "pins", "boards"),
        extraction_goal="pins",
        navigation_steps=SOCIAL_STEPS,
        output_fields=("title", "board", "saves", "author", "url"),
        target_urls=("https://www.pinterest.com/",),
        description="Pins and board discovery",
    ),
    SiteTemplate(
        site_id="imdb",
        name="IMDb",
        domains=("imdb.com",),
        aliases=("imdb", "movies", "tv"),
        extraction_goal="titles",
        navigation_steps=NEWS_STEPS,
        output_fields=("title", "year", "rating", "genres", "url"),
        target_urls=("https://www.imdb.com/chart/",),
        description="Movie and TV listings",
    ),
    # --- News outlets ---
    SiteTemplate(
        site_id="nytimes",
        name="New York Times",
        domains=("nytimes.com",),
        aliases=("new york times", "nyt"),
        extraction_goal="news_articles",
        navigation_steps=NEWS_STEPS,
        output_fields=("headline", "section", "author", "published", "url"),
        target_urls=("https://www.nytimes.com/",),
        description="General news articles",
    ),
    SiteTemplate(
        site_id="bbc",
        name="BBC",
        domains=("bbc.com", "bbc.co.uk"),
        aliases=("bbc", "bbc news"),
        extraction_goal="news_articles",
        navigation_steps=NEWS_STEPS,
        output_fields=("headline", "section", "published", "url"),
        target_urls=("https://www.bbc.com/news",),
        description="Global news coverage",
    ),
    SiteTemplate(
        site_id="cnn",
        name="CNN",
        domains=("cnn.com",),
        aliases=("cnn", "cnn news"),
        extraction_goal="news_articles",
        navigation_steps=NEWS_STEPS,
        output_fields=("headline", "section", "published", "url"),
        target_urls=("https://www.cnn.com/",),
        description="General news feed",
    ),
    SiteTemplate(
        site_id="reuters",
        name="Reuters",
        domains=("reuters.com",),
        aliases=("reuters",),
        extraction_goal="news_articles",
        navigation_steps=NEWS_STEPS,
        output_fields=("headline", "category", "published", "url"),
        target_urls=("https://www.reuters.com/world/",),
        description="Wire-service news feed",
    ),
    SiteTemplate(
        site_id="bloomberg",
        name="Bloomberg",
        domains=("bloomberg.com",),
        aliases=("bloomberg", "markets"),
        extraction_goal="market_news",
        navigation_steps=NEWS_STEPS,
        output_fields=("headline", "section", "published", "url"),
        target_urls=("https://www.bloomberg.com/markets",),
        description="Finance and market news",
    ),
    # --- Crypto and finance ---
    SiteTemplate(
        site_id="coinmarketcap",
        name="CoinMarketCap",
        domains=("coinmarketcap.com",),
        aliases=("coinmarketcap", "crypto prices"),
        extraction_goal="crypto_assets",
        navigation_steps=DOC_STEPS,
        output_fields=("asset", "price", "market_cap", "volume_24h", "url"),
        target_urls=("https://coinmarketcap.com/",),
        description="Cryptocurrency market data",
    ),
    SiteTemplate(
        site_id="coindesk",
        name="CoinDesk",
        domains=("coindesk.com",),
        aliases=("coindesk", "crypto news"),
        extraction_goal="crypto_news",
        navigation_steps=NEWS_STEPS,
        output_fields=("headline", "author", "published", "url"),
        target_urls=("https://www.coindesk.com/",),
        description="Cryptocurrency news",
    ),
    SiteTemplate(
        site_id="investopedia",
        name="Investopedia",
        domains=("investopedia.com",),
        aliases=("investopedia", "finance education"),
        extraction_goal="financial_articles",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "author", "updated", "topic", "url"),
        target_urls=("https://www.investopedia.com/",),
        description="Finance learning articles",
    ),
    # --- Scholarly search and other code hosts ---
    SiteTemplate(
        site_id="googlescholar",
        name="Google Scholar",
        domains=("scholar.google.com",),
        aliases=("google scholar", "scholar"),
        extraction_goal="scholarly_results",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "authors", "year", "citations", "url"),
        target_urls=("https://scholar.google.com/",),
        description="Scholarly paper search results",
    ),
    SiteTemplate(
        site_id="gitlab",
        name="GitLab",
        domains=("gitlab.com",),
        aliases=("gitlab", "merge requests"),
        extraction_goal="repositories",
        navigation_steps=DOC_STEPS,
        output_fields=("project", "stars", "forks", "last_activity", "url"),
        target_urls=("https://gitlab.com/explore",),
        description="Git repository projects and activity",
    ),
    SiteTemplate(
        site_id="bitbucket",
        name="Bitbucket",
        domains=("bitbucket.org",),
        aliases=("bitbucket", "repos"),
        extraction_goal="repositories",
        navigation_steps=DOC_STEPS,
        output_fields=("project", "owner", "updated", "url"),
        target_urls=("https://bitbucket.org/product",),
        description="Repository and workspace metadata",
    ),
    # --- Ecommerce and marketplaces ---
    SiteTemplate(
        site_id="amazon",
        name="Amazon",
        domains=("amazon.com", "amazon.in", "amazon.co.uk"),
        aliases=("amazon", "products", "shopping"),
        extraction_goal="products",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "price", "rating", "reviews", "url"),
        target_urls=("https://www.amazon.com/gp/bestsellers",),
        description="Ecommerce product listings",
    ),
    SiteTemplate(
        site_id="ebay",
        name="eBay",
        domains=("ebay.com",),
        aliases=("ebay", "auctions"),
        extraction_goal="products",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "price", "condition", "shipping", "url"),
        target_urls=("https://www.ebay.com/deals",),
        description="Auction and product cards",
    ),
    SiteTemplate(
        site_id="walmart",
        name="Walmart",
        domains=("walmart.com",),
        aliases=("walmart", "shopping"),
        extraction_goal="products",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "price", "rating", "availability", "url"),
        target_urls=("https://www.walmart.com/shop/deals",),
        description="Retail product listings",
    ),
    SiteTemplate(
        site_id="etsy",
        name="Etsy",
        domains=("etsy.com",),
        aliases=("etsy", "handmade"),
        extraction_goal="products",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "price", "shop", "rating", "url"),
        target_urls=("https://www.etsy.com/c/jewelry",),
        description="Marketplace products and shops",
    ),
    SiteTemplate(
        site_id="aliexpress",
        name="AliExpress",
        domains=("aliexpress.com",),
        aliases=("aliexpress", "marketplace"),
        extraction_goal="products",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "price", "orders", "shipping", "url"),
        target_urls=("https://www.aliexpress.com/category/200003482/electronics.html",),
        description="Marketplace product listings",
    ),
    # --- Courses and learning resources ---
    SiteTemplate(
        site_id="coursera",
        name="Coursera",
        domains=("coursera.org",),
        aliases=("coursera", "courses"),
        extraction_goal="courses",
        navigation_steps=DOC_STEPS,
        output_fields=("course", "provider", "rating", "level", "url"),
        target_urls=("https://www.coursera.org/courses",),
        description="Course catalog extraction",
    ),
    SiteTemplate(
        site_id="udemy",
        name="Udemy",
        domains=("udemy.com",),
        aliases=("udemy", "courses"),
        extraction_goal="courses",
        navigation_steps=DOC_STEPS,
        output_fields=("course", "instructor", "rating", "price", "url"),
        target_urls=("https://www.udemy.com/courses/development/",),
        description="Course marketplace extraction",
    ),
    SiteTemplate(
        site_id="edx",
        name="edX",
        domains=("edx.org",),
        aliases=("edx", "courses"),
        extraction_goal="courses",
        navigation_steps=DOC_STEPS,
        output_fields=("course", "institution", "duration", "level", "url"),
        target_urls=("https://www.edx.org/search",),
        description="Education course listings",
    ),
    SiteTemplate(
        site_id="freecodecamp",
        name="freeCodeCamp",
        domains=("freecodecamp.org",),
        aliases=("freecodecamp", "curriculum"),
        extraction_goal="learning_resources",
        navigation_steps=DOC_STEPS,
        output_fields=("resource", "category", "difficulty", "url"),
        target_urls=("https://www.freecodecamp.org/news/",),
        description="Learning resources and tutorials",
    ),
    # --- ML research and coding practice ---
    SiteTemplate(
        site_id="paperswithcode",
        name="Papers with Code",
        domains=("paperswithcode.com",),
        aliases=("paperswithcode", "benchmarks"),
        extraction_goal="papers_and_models",
        navigation_steps=DOC_STEPS,
        output_fields=("paper", "task", "sota_metric", "code_link", "url"),
        target_urls=("https://paperswithcode.com/sota",),
        description="ML paper and benchmark extraction",
    ),
    SiteTemplate(
        site_id="openreview",
        name="OpenReview",
        domains=("openreview.net",),
        aliases=("openreview", "conference papers"),
        extraction_goal="conference_papers",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "authors", "venue", "rating", "url"),
        target_urls=("https://openreview.net/group?id=ICLR.cc",),
        description="Conference paper pages and metadata",
    ),
    SiteTemplate(
        site_id="leetcode",
        name="LeetCode",
        domains=("leetcode.com",),
        aliases=("leetcode", "problems"),
        extraction_goal="coding_problems",
        navigation_steps=DOC_STEPS,
        output_fields=("problem", "difficulty", "acceptance", "tags", "url"),
        target_urls=("https://leetcode.com/problemset/",),
        description="Coding challenge listings",
    ),
    SiteTemplate(
        site_id="geeksforgeeks",
        name="GeeksforGeeks",
        domains=("geeksforgeeks.org",),
        aliases=("geeksforgeeks", "gfg"),
        extraction_goal="tutorials",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "topic", "difficulty", "url"),
        target_urls=("https://www.geeksforgeeks.org/explore",),
        description="Tutorial and practice resources",
    ),
    # --- Jobs ---
    SiteTemplate(
        site_id="indeed",
        name="Indeed",
        domains=("indeed.com",),
        aliases=("indeed", "job listings"),
        extraction_goal="jobs",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "company", "location", "salary", "url"),
        target_urls=("https://www.indeed.com/jobs",),
        description="Job listing extraction",
    ),
    SiteTemplate(
        site_id="glassdoor",
        name="Glassdoor",
        domains=("glassdoor.com",),
        aliases=("glassdoor", "company reviews"),
        extraction_goal="jobs_and_companies",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "company", "rating", "location", "url"),
        target_urls=("https://www.glassdoor.com/Job/index.htm",),
        description="Jobs and company review listings",
    ),
    # --- Streaming, video, and audio ---
    SiteTemplate(
        site_id="twitch",
        name="Twitch",
        domains=("twitch.tv",),
        aliases=("twitch", "streams"),
        extraction_goal="live_streams",
        navigation_steps=SOCIAL_STEPS,
        output_fields=("streamer", "title", "viewers", "category", "url"),
        target_urls=("https://www.twitch.tv/directory",),
        description="Live stream directory extraction",
    ),
    SiteTemplate(
        site_id="vimeo",
        name="Vimeo",
        domains=("vimeo.com",),
        aliases=("vimeo", "videos"),
        extraction_goal="videos",
        navigation_steps=SOCIAL_STEPS,
        output_fields=("title", "creator", "plays", "likes", "url"),
        target_urls=("https://vimeo.com/channels",),
        description="Video channel discovery",
    ),
    SiteTemplate(
        site_id="spotify",
        name="Spotify",
        domains=("spotify.com", "open.spotify.com"),
        aliases=("spotify", "playlists"),
        extraction_goal="music_catalog",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "type", "creator", "followers", "url"),
        target_urls=("https://open.spotify.com/genre/0JQ5DAqbMKFEC4WFtoNRpw",),
        description="Music and playlist metadata",
    ),
    SiteTemplate(
        site_id="soundcloud",
        name="SoundCloud",
        domains=("soundcloud.com",),
        aliases=("soundcloud", "tracks"),
        extraction_goal="audio_tracks",
        navigation_steps=SOCIAL_STEPS,
        output_fields=("title", "artist", "plays", "likes", "url"),
        target_urls=("https://soundcloud.com/discover",),
        description="Audio track discovery",
    ),
    # --- Travel and real estate ---
    SiteTemplate(
        site_id="airbnb",
        name="Airbnb",
        domains=("airbnb.com",),
        aliases=("airbnb", "stays"),
        extraction_goal="listings",
        navigation_steps=DOC_STEPS,
        output_fields=("title", "location", "price_per_night", "rating", "url"),
        target_urls=("https://www.airbnb.com/s/homes",),
        description="Accommodation listings",
    ),
    SiteTemplate(
        site_id="booking",
        name="Booking.com",
        domains=("booking.com",),
        aliases=("booking", "hotels"),
        extraction_goal="hotel_listings",
        navigation_steps=DOC_STEPS,
        output_fields=("hotel", "location", "price", "rating", "url"),
        target_urls=("https://www.booking.com/",),
        description="Hotel search and listing extraction",
    ),
    SiteTemplate(
        site_id="zillow",
        name="Zillow",
        domains=("zillow.com",),
        aliases=("zillow", "real estate"),
        extraction_goal="property_listings",
        navigation_steps=DOC_STEPS,
        output_fields=("address", "price", "beds", "baths", "url"),
        target_urls=("https://www.zillow.com/homes/",),
        description="Property listing extraction",
    ),
)
|