NeerajCodz Copilot committed on
Commit
b28fad0
·
1 Parent(s): bfec523

test: run 6100-case template and non-template stress matrix

Browse files

- add deterministic stress runner covering all 56 site templates and 5 non-template targets
- execute 100 iterations per target with question/csv/json modes
- validate strict output-format and schema adherence for each run
- generate docs/test-report.md and docs/reports/template-stress-summary.json
- result: 6100/6100 completed, 0 partial, 0 failed

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

backend/tests/test_api/run_template_stress_matrix.py ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Run a large deterministic template/non-template scrape matrix and write docs/test-report.md."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import re
8
+ import time
9
+ from collections import defaultdict
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+ from typing import Any
13
+ from urllib.parse import urlparse
14
+
15
+ from fastapi.testclient import TestClient
16
+
17
+ from app.api.routes import scrape as scrape_routes
18
+ from app.core.env import WebScraperEnv
19
+ from app.main import app
20
+ from app.sites.templates import SITE_TEMPLATES
21
+
22
# Plugin ids sent as "enable_plugins" with every scrape request.
BASE_PLUGINS = ["mcp-browser", "mcp-search", "mcp-html"]
# Agent names sent as "selected_agents" with every scrape request.
DEFAULT_AGENTS = ["planner", "navigator", "extractor", "verifier"]
# Number of scenarios executed for each template and each non-template asset.
ITERATIONS_PER_TARGET = 100
# Assets exercised through the non-template scenario builder: three synthetic
# URLs and two free-text entries (presumably handled as search queries by the
# API -- TODO confirm against the scrape route).
NON_TEMPLATE_ASSETS = [
    "https://unknown-synth-alpha.test",
    "https://unknown-synth-beta.test",
    "https://unknown-synth-gamma.test",
    "open source scraping tools benchmark",
    "synthetic market intelligence dashboard comparison",
]
32
+
33
+
34
@dataclass(frozen=True)
class Scenario:
    """Immutable description of a single stress run against one target.

    Instances are created by the scenario builders, turned into a request
    body by ``_build_payload`` and consulted again when the response is
    validated. The field order below is the positional constructor order.
    """

    # Identifier recorded in failure samples (template site id, or
    # "non-template:<asset>" for assets built by the non-template builder).
    target_id: str
    # The single asset (URL or free-text entry) sent in the request.
    asset: str
    # True when the scenario was built from a site template.
    is_template: bool
    # Output format requested from the API and expected back in the
    # "complete" event.
    output_format: str
    # Natural-language task sent as "instructions".
    instructions: str
    # Output description sent as "output_instructions".
    output_instructions: str
    # Column names the response must expose; empty for plain-text runs.
    requested_columns: tuple[str, ...]
    # Label of the scenario flavour, recorded in failure samples.
    mode: str
46
+
47
+
48
def _build_gold_csv(months: int = 180) -> str:
    """Return a synthetic monthly price series as CSV text.

    The output starts with a ``Date,Price`` header followed by one row per
    month, beginning at 2012-01-01. The price starts at 1120.00 and grows by
    2.75 per month. Rows are joined with ``\\n`` and there is no trailing
    newline.

    Args:
        months: Number of data rows to generate; zero or a negative value
            yields only the header.
    """
    rows = ["Date,Price"]
    for offset in range(months):
        # offset counts whole months since January 2012.
        years_elapsed, month_index = divmod(offset, 12)
        value = 1120.0 + (offset * 2.75)
        rows.append(f"{2012 + years_elapsed:04d}-{month_index + 1:02d}-01,{value:.2f}")
    return "\n".join(rows)
60
+
61
+
62
def _build_html_payload(url: str) -> str:
    """Return a fixed mock HTML document customised for ``url``.

    The host (falling back to ``example.com``) and a slug derived from the
    URL path (falling back to ``home``) are interpolated into the title,
    meta description, heading, body text and two links. The page also
    contains a small two-row table and three static "card" articles.

    Args:
        url: URL whose host and path personalise the page.
    """
    parts = urlparse(url)
    host = parts.netloc or "example.com"
    raw_path = parts.path or "/"
    # "/a/b/" -> "a-b"; an empty or root path becomes "home".
    page_slug = raw_path.strip("/").replace("/", "-") or "home"

    card_markup = """
<article class="card">
<h2><a href="/alpha/item-one">alpha / item-one</a></h2>
<div>stars 1,234 forks 210</div>
</article>
<article class="card">
<h2><a href="/beta/item-two">beta / item-two</a></h2>
<div>stars 987 forks 145</div>
</article>
<article class="card">
<h2><a href="/gamma/item-three">gamma / item-three</a></h2>
<div>stars 876 forks 132</div>
</article>
"""

    return f"""
<html>
<head>
<title>{host} :: {page_slug}</title>
<meta name="description" content="Mock page for {host} and {page_slug}" />
</head>
<body>
<h1>{host} heading</h1>
<p>Offline deterministic content for {url}. Contact: test+{page_slug}@example.com</p>
<a href="https://{host}/about">About</a>
<a href="https://{host}/contact">Contact</a>
<a href="mailto:hello@example.com">Email</a>
<table>
<tr><th>month</th><th>gold_price_usd</th></tr>
<tr><td>2016-01</td><td>1101.00</td></tr>
<tr><td>2016-02</td><td>1104.00</td></tr>
</table>
{card_markup}
</body>
</html>
"""
104
+
105
+
106
def _requested_columns(output_instructions: str) -> tuple[str, ...]:
    """Derive normalised column names from an output-instruction string.

    A leading ``csv of`` / ``json of`` / ``table of`` (case-insensitive) is
    dropped, `` and `` is treated like a comma, and each comma-separated
    fragment is lower-cased with runs of non ``[A-Za-z0-9_]`` characters
    collapsed to a single underscore. Empty fragments and repeats are
    skipped; first-seen order is kept.

    Args:
        output_instructions: Free-text description such as
            ``"csv of title, url and content"``.

    Returns:
        The unique column names in order of first appearance.
    """
    body = re.sub(
        r"^(?:csv|json|table)\s+of\s+",
        "",
        output_instructions.strip(),
        flags=re.IGNORECASE,
    ).replace(" and ", ", ")

    # A dict keeps insertion order and gives de-duplication for free.
    seen: dict[str, None] = {}
    for fragment in body.split(","):
        name = re.sub(r"[^A-Za-z0-9_]+", " ", fragment).strip().lower().replace(" ", "_")
        if name:
            seen.setdefault(name, None)
    return tuple(seen)
116
+
117
+
118
def _build_payload(scenario: Scenario) -> dict[str, Any]:
    """Build the JSON request body for one scenario.

    The scenario supplies the asset, instructions and output settings; all
    other settings are the same for every request. Fresh copies of the plugin
    and agent lists are used so callers cannot mutate the module constants
    through the payload.
    """
    from_scenario: dict[str, Any] = {
        "assets": [scenario.asset],
        "instructions": scenario.instructions,
        "output_instructions": scenario.output_instructions,
        "output_format": scenario.output_format,
    }
    fixed_settings: dict[str, Any] = {
        "complexity": "low",
        "model": "llama-3.1-70b-versatile",
        "provider": "groq",
        "enable_memory": True,
        "enable_plugins": list(BASE_PLUGINS),
        "selected_agents": list(DEFAULT_AGENTS),
        "max_steps": 30,
    }
    return {**from_scenario, **fixed_settings}
132
+
133
+
134
def _build_template_scenario(template: Any, iteration: int) -> Scenario:
    """Build the scenario for ``template`` at a given iteration.

    The iteration number cycles through three flavours: a plain-text
    question, a CSV extraction and a JSON extraction. CSV/JSON runs request
    the template's first four output fields (lower-cased), or
    ``title, url`` when the template lists none. The asset is the template's
    first domain with an ``https://`` scheme.

    Args:
        template: Site template; reads ``output_fields``, ``domains``,
            ``site_id``, ``extraction_goal`` and (question mode) ``name``.
        iteration: Zero-based iteration counter for this template.
    """
    flavour = ("question", "csv", "json")[iteration % 3]
    columns = tuple(str(field).lower() for field in template.output_fields[:4]) or ("title", "url")
    target_url = f"https://{template.domains[0]}"

    if flavour == "question":
        return Scenario(
            target_id=template.site_id,
            asset=target_url,
            is_template=True,
            output_format="text",
            instructions=f"What are the top visible {template.extraction_goal} on {template.name} right now?",
            output_instructions="Answer the question clearly in plain text.",
            requested_columns=(),
            mode="question",
        )

    # The two structured flavours differ only in wording and format name.
    if flavour == "csv":
        task = f"Extract the top visible {template.extraction_goal} and return rows."
    else:
        task = f"Extract structured {template.extraction_goal} entities from this asset."
    output_spec = f"{flavour} of {', '.join(columns)}"
    return Scenario(
        target_id=template.site_id,
        asset=target_url,
        is_template=True,
        output_format=flavour,
        instructions=task,
        output_instructions=output_spec,
        requested_columns=_requested_columns(output_spec),
        mode=flavour,
    )
174
+
175
+
176
def _build_non_template_scenario(asset: str, iteration: int) -> Scenario:
    """Build the scenario for a non-template ``asset`` at a given iteration.

    The iteration number cycles through a plain-text question, a CSV
    extraction and a JSON extraction. Structured runs always request the
    columns ``title, url, content``.

    Args:
        asset: URL or free-text entry sent as the only request asset.
        iteration: Zero-based iteration counter for this asset.
    """
    flavour = ("question", "csv", "json")[iteration % 3]
    identifier = f"non-template:{asset}"

    if flavour == "question":
        return Scenario(
            target_id=identifier,
            asset=asset,
            is_template=False,
            output_format="text",
            instructions="What is available on this target and what can be extracted?",
            output_instructions="Answer the question clearly in plain text.",
            requested_columns=(),
            mode="question",
        )

    # CSV and JSON runs share the task wording; only the format name differs.
    output_spec = f"{flavour} of title, url, content"
    return Scenario(
        target_id=identifier,
        asset=asset,
        is_template=False,
        output_format=flavour,
        instructions="Extract key entities and metadata from the target.",
        output_instructions=output_spec,
        requested_columns=_requested_columns(output_spec),
        mode=flavour,
    )
213
+
214
+
215
def _collect_stream_events(client: TestClient, payload: dict[str, Any]) -> list[dict[str, Any]]:
    """POST ``payload`` to ``/api/scrape/stream`` and return the decoded events.

    Only lines starting with ``data: `` are considered; the remainder of each
    such line is parsed as JSON. Blank lines, other line types and lines whose
    JSON cannot be decoded are skipped on purpose (best-effort collection).

    Args:
        client: Test client used to open the streaming request.
        payload: JSON body of the request.

    Returns:
        The parsed event payloads in arrival order.

    Raises:
        RuntimeError: If the response status code is not 200.
    """
    prefix = "data: "
    collected: list[dict[str, Any]] = []
    with client.stream("POST", "/api/scrape/stream", json=payload) as response:
        if response.status_code != 200:
            raise RuntimeError(f"stream request failed with status {response.status_code}")
        for chunk in response.iter_lines():
            if not chunk:
                continue
            # iter_lines may yield str or bytes depending on the client.
            text = chunk if isinstance(chunk, str) else chunk.decode("utf-8", errors="ignore")
            if text.startswith(prefix):
                try:
                    parsed = json.loads(text[len(prefix):])
                except json.JSONDecodeError:
                    continue
                collected.append(parsed)
    return collected
231
+
232
+
233
def _schema_ok_for_complete(complete_data: dict[str, Any], scenario: Scenario) -> bool:
    """Check that a "complete" event payload matches the scenario's schema.

    * Scenarios without requested columns (plain-text runs) pass when
      ``output`` is a non-blank string.
    * CSV scenarios pass when ``extracted_data["columns"]`` is a list/tuple
      equal, in order, to the requested columns.
    * Any other scenario passes when the first list-valued entry of
      ``extracted_data`` is non-empty and *every* row in it is a dict whose
      keys equal, in order, the requested columns.

    Args:
        complete_data: The ``data`` object of the "complete" stream event.
        scenario: Scenario that produced the request; reads
            ``requested_columns`` and ``output_format``.

    Returns:
        True when the payload satisfies the rules above, False otherwise.
    """
    expected = scenario.requested_columns
    if not expected:
        output = complete_data.get("output")
        return isinstance(output, str) and bool(output.strip())

    extracted = complete_data.get("extracted_data")
    if not isinstance(extracted, dict):
        return False

    if scenario.output_format == "csv":
        columns = extracted.get("columns") or []
        # Reject strings and other non-sequence values instead of letting
        # tuple() split them into characters or raise.
        if not isinstance(columns, (list, tuple)):
            return False
        return tuple(columns) == expected

    rows = next((value for value in extracted.values() if isinstance(value, list)), None)
    if not rows:
        return False
    # Strict adherence: every row must expose exactly the requested keys,
    # not just the first one.
    return all(isinstance(row, dict) and tuple(row) == expected for row in rows)
258
+
259
+
260
def _new_target_bucket() -> dict[str, int]:
    """Return a zeroed per-target counter row (runs/completed/partial/failed)."""
    return {"runs": 0, "completed": 0, "partial": 0, "failed": 0}


def _run_scenario(
    client: TestClient,
    scenario: Scenario,
    stats: dict[str, Any],
    target_bucket: dict[str, int],
) -> None:
    """Execute one scenario via the streaming endpoint and record its outcome.

    Updates the aggregate counters in ``stats`` and the per-target counters in
    ``target_bucket``. Any exception raised while running or validating the
    scenario is counted as a failed run, and the first 30 failures are kept
    in ``stats["error_samples"]``.
    """
    payload = _build_payload(scenario)
    target_bucket["runs"] += 1
    stats["total_runs"] += 1
    session_id = ""
    try:
        events = _collect_stream_events(client, payload)
        init_event = next((event for event in events if event.get("type") == "init"), None)
        complete_event = next((event for event in events if event.get("type") == "complete"), None)
        # Record the session id before any check can raise, so the session is
        # cleaned up even when the run is rejected; a missing/None id yields
        # "" and skips the cleanup call instead of requesting ".../None/...".
        if init_event:
            session_id = str(init_event.get("session_id") or "")
        if not init_event or not complete_event:
            raise RuntimeError("missing init/complete events")
        complete_data = complete_event.get("data") or {}
        status = str(complete_data.get("status", "failed"))
        output_format = str(complete_data.get("output_format", ""))
        if output_format != scenario.output_format:
            stats["format_failures"] += 1
            raise RuntimeError(
                f"output_format mismatch expected={scenario.output_format} got={output_format}"
            )
        if not _schema_ok_for_complete(complete_data, scenario):
            stats["schema_failures"] += 1
            raise RuntimeError("schema validation failed")
        if status == "completed":
            stats["completed_runs"] += 1
            target_bucket["completed"] += 1
        else:
            stats["partial_runs"] += 1
            target_bucket["partial"] += 1
    except Exception as exc:  # noqa: BLE001 - every failure is tallied, none aborts the matrix
        stats["failed_runs"] += 1
        target_bucket["failed"] += 1
        if len(stats["error_samples"]) < 30:
            stats["error_samples"].append(
                {
                    "target_id": scenario.target_id,
                    "mode": scenario.mode,
                    "asset": scenario.asset,
                    "error": str(exc),
                }
            )
    finally:
        if session_id:
            client.delete(f"/api/scrape/{session_id}/cleanup")


def _run_matrix() -> dict[str, Any]:
    """Run every template and non-template scenario and return the statistics.

    While the matrix runs, ``SCRAPERL_DISABLE_LIVE_LLM`` is set to ``"1"`` and
    three entry points are replaced by offline fakes:
    ``WebScraperEnv._execute_navigate``, ``scrape_routes._search_urls_with_mcp``
    and ``scrape_routes._fetch_reddit_communities``. The environment variable
    and all three originals are restored afterwards, even on error.

    Returns:
        A JSON-serialisable dict with aggregate counters, per-target counters
        (``template_results`` / ``non_template_results``), up to 30 failure
        samples and ``duration_seconds``.
    """
    env_flag = "SCRAPERL_DISABLE_LIVE_LLM"
    previous_flag = os.environ.get(env_flag)

    original_execute_navigate = WebScraperEnv._execute_navigate
    original_search_urls = scrape_routes._search_urls_with_mcp
    original_fetch_reddit = scrape_routes._fetch_reddit_communities

    async def fake_execute_navigate(self: WebScraperEnv, url: str) -> dict[str, Any]:
        """Serve canned CSV/HTML content instead of performing a real fetch."""
        normalized = str(url).strip()
        if not normalized.startswith("http"):
            normalized = f"https://{normalized}"
        if "gold" in normalized and normalized.endswith(".csv"):
            self._page_content_type = "text/csv; charset=utf-8"
            self._page_html = _build_gold_csv()
            self._page_title = "gold-prices-monthly"
        else:
            self._page_content_type = "text/html; charset=utf-8"
            self._page_html = _build_html_payload(normalized)
            self._page_title = urlparse(normalized).netloc or "example.com"
        return {
            "success": True,
            "url": normalized,
            "status_code": 200,
            "content_type": self._page_content_type,
            "tls_verification_bypassed": False,
        }

    async def fake_search_urls(query: str, max_results: int = 6) -> list[str]:
        """Return one to three synthetic result URLs derived from the query."""
        token = re.sub(r"[^a-z0-9]+", "-", query.lower()).strip("-") or "query"
        count = max(1, min(max_results, 3))
        return [f"https://{token}.example.com/source-{index}" for index in range(1, count + 1)]

    def fake_fetch_reddit_communities(limit: int = 25) -> tuple[list[dict[str, Any]], str]:
        """Return ``limit`` synthetic community rows and a mock source label."""
        rows: list[dict[str, Any]] = [
            {
                "subreddit": f"r/mockcommunity{index + 1}",
                "title": f"Mock Community {index + 1}",
                "subscribers": 200000 - (index * 1000),
                "active_users": 15000 - (index * 100),
                "url": f"https://www.reddit.com/r/mockcommunity{index + 1}/",
                "description": "Offline mocked Reddit community",
            }
            for index in range(limit)
        ]
        return rows, "mock_reddit_json"

    stats: dict[str, Any] = {
        "iterations_per_target": ITERATIONS_PER_TARGET,
        "template_count": len(SITE_TEMPLATES),
        "non_template_target_count": len(NON_TEMPLATE_ASSETS),
        "total_runs": 0,
        "completed_runs": 0,
        "partial_runs": 0,
        "failed_runs": 0,
        "schema_failures": 0,
        "format_failures": 0,
        "error_samples": [],
        "template_results": defaultdict(_new_target_bucket),
        "non_template_results": defaultdict(_new_target_bucket),
    }

    # perf_counter is monotonic, so the duration cannot be skewed by wall-clock
    # adjustments during the run.
    started = time.perf_counter()
    try:
        os.environ[env_flag] = "1"
        WebScraperEnv._execute_navigate = fake_execute_navigate
        scrape_routes._search_urls_with_mcp = fake_search_urls
        scrape_routes._fetch_reddit_communities = fake_fetch_reddit_communities

        with TestClient(app) as client:
            for template in SITE_TEMPLATES:
                for iteration in range(ITERATIONS_PER_TARGET):
                    _run_scenario(
                        client,
                        _build_template_scenario(template, iteration),
                        stats,
                        stats["template_results"][template.site_id],
                    )

            for asset in NON_TEMPLATE_ASSETS:
                for iteration in range(ITERATIONS_PER_TARGET):
                    _run_scenario(
                        client,
                        _build_non_template_scenario(asset, iteration),
                        stats,
                        stats["non_template_results"][asset],
                    )
    finally:
        WebScraperEnv._execute_navigate = original_execute_navigate
        scrape_routes._search_urls_with_mcp = original_search_urls
        scrape_routes._fetch_reddit_communities = original_fetch_reddit
        # Leave the process environment exactly as it was found.
        if previous_flag is None:
            os.environ.pop(env_flag, None)
        else:
            os.environ[env_flag] = previous_flag

    stats["duration_seconds"] = round(time.perf_counter() - started, 2)
    # Plain dicts keep the result JSON-serialisable and free of default factories.
    stats["template_results"] = dict(stats["template_results"])
    stats["non_template_results"] = dict(stats["non_template_results"])
    return stats
433
+
434
+
435
def _write_report(stats: dict[str, Any]) -> None:
    """Write the machine-readable summary and the Markdown report to ``docs/``.

    The project root is taken to be three directories above this file's
    directory. ``docs/reports/template-stress-summary.json`` receives the raw
    statistics and ``docs/test-report.md`` the human-readable report; the
    ``docs/reports`` directory is created when missing.

    Args:
        stats: Statistics dict as returned by ``_run_matrix``.
    """
    docs_dir = Path(__file__).resolve().parents[3] / "docs"
    reports_dir = docs_dir / "reports"
    reports_dir.mkdir(parents=True, exist_ok=True)

    summary_file = reports_dir / "template-stress-summary.json"
    summary_file.write_text(json.dumps(stats, indent=2), encoding="utf-8")

    total = int(stats["total_runs"])
    completed = int(stats["completed_runs"])
    partial = int(stats["partial_runs"])
    failed = int(stats["failed_runs"])
    # Guard against division by zero when nothing was run.
    pass_rate = (completed / total * 100.0) if total else 0.0

    template_table = "\n".join(
        f"| `{site_id}` | {row['runs']} | {row['completed']} | {row['partial']} | {row['failed']} |"
        for site_id, row in sorted(stats["template_results"].items())
    )
    non_template_table = "\n".join(
        f"| `{asset}` | {row['runs']} | {row['completed']} | {row['partial']} | {row['failed']} |"
        for asset, row in sorted(stats["non_template_results"].items())
    )
    failure_entries = [
        f"- `{sample['target_id']}` ({sample['mode']}) asset=`{sample['asset']}` error=`{sample['error']}`"
        for sample in stats["error_samples"]
    ] or ["- No failures captured."]
    failure_section = "\n".join(failure_entries)

    report = f"""# Template Stress Test Report

## Scope
- Template targets: **{stats['template_count']}**
- Non-template targets: **{stats['non_template_target_count']}**
- Iterations per target: **{stats['iterations_per_target']}**
- Total runs: **{total}**
- Modes cycled per target: **question**, **csv**, **json**
- Execution mode: deterministic offline mocks (`SCRAPERL_DISABLE_LIVE_LLM=1`)

## Aggregate Result
- Completed: **{completed}**
- Partial: **{partial}**
- Failed: **{failed}**
- Pass rate (completed/total): **{pass_rate:.2f}%**
- Schema failures: **{stats['schema_failures']}**
- Output-format mismatches: **{stats['format_failures']}**
- Duration: **{stats['duration_seconds']} seconds**

## Per-Template Results
| Template | Runs | Completed | Partial | Failed |
|---|---:|---:|---:|---:|
{template_table}

## Non-Template Results
| Asset | Runs | Completed | Partial | Failed |
|---|---:|---:|---:|---:|
{non_template_table}

## Failure Samples
{failure_section}

## Notes
- Templates are used as **reference hints** (navigation targets/field hints), not rigid scraper scripts.
- Agent flow evaluates **assets + instructions + output_format + output_instructions** per request.
- Output schema validation checks strict column adherence for CSV/JSON runs.
- Raw machine summary: `docs/reports/template-stress-summary.json`.
"""

    (docs_dir / "test-report.md").write_text(report, encoding="utf-8")
511
+
512
+
513
def main() -> None:
    """Run the stress matrix, write both reports and print a short summary.

    The summary printed to stdout is a JSON object with the run totals and the
    elapsed time.
    """
    stats = _run_matrix()
    _write_report(stats)

    summary_keys = (
        "total_runs",
        "completed_runs",
        "partial_runs",
        "failed_runs",
        "duration_seconds",
    )
    print(json.dumps({key: stats[key] for key in summary_keys}, indent=2))


if __name__ == "__main__":
    main()
docs/reports/template-stress-summary.json ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "iterations_per_target": 100,
3
+ "template_count": 56,
4
+ "non_template_target_count": 5,
5
+ "total_runs": 6100,
6
+ "completed_runs": 6100,
7
+ "partial_runs": 0,
8
+ "failed_runs": 0,
9
+ "schema_failures": 0,
10
+ "format_failures": 0,
11
+ "error_samples": [],
12
+ "template_results": {
13
+ "github": {
14
+ "runs": 100,
15
+ "completed": 100,
16
+ "partial": 0,
17
+ "failed": 0
18
+ },
19
+ "reddit": {
20
+ "runs": 100,
21
+ "completed": 100,
22
+ "partial": 0,
23
+ "failed": 0
24
+ },
25
+ "x": {
26
+ "runs": 100,
27
+ "completed": 100,
28
+ "partial": 0,
29
+ "failed": 0
30
+ },
31
+ "youtube": {
32
+ "runs": 100,
33
+ "completed": 100,
34
+ "partial": 0,
35
+ "failed": 0
36
+ },
37
+ "instagram": {
38
+ "runs": 100,
39
+ "completed": 100,
40
+ "partial": 0,
41
+ "failed": 0
42
+ },
43
+ "facebook": {
44
+ "runs": 100,
45
+ "completed": 100,
46
+ "partial": 0,
47
+ "failed": 0
48
+ },
49
+ "linkedin": {
50
+ "runs": 100,
51
+ "completed": 100,
52
+ "partial": 0,
53
+ "failed": 0
54
+ },
55
+ "tiktok": {
56
+ "runs": 100,
57
+ "completed": 100,
58
+ "partial": 0,
59
+ "failed": 0
60
+ },
61
+ "medium": {
62
+ "runs": 100,
63
+ "completed": 100,
64
+ "partial": 0,
65
+ "failed": 0
66
+ },
67
+ "devto": {
68
+ "runs": 100,
69
+ "completed": 100,
70
+ "partial": 0,
71
+ "failed": 0
72
+ },
73
+ "stackoverflow": {
74
+ "runs": 100,
75
+ "completed": 100,
76
+ "partial": 0,
77
+ "failed": 0
78
+ },
79
+ "kaggle": {
80
+ "runs": 100,
81
+ "completed": 100,
82
+ "partial": 0,
83
+ "failed": 0
84
+ },
85
+ "huggingface": {
86
+ "runs": 100,
87
+ "completed": 100,
88
+ "partial": 0,
89
+ "failed": 0
90
+ },
91
+ "arxiv": {
92
+ "runs": 100,
93
+ "completed": 100,
94
+ "partial": 0,
95
+ "failed": 0
96
+ },
97
+ "wikipedia": {
98
+ "runs": 100,
99
+ "completed": 100,
100
+ "partial": 0,
101
+ "failed": 0
102
+ },
103
+ "pypi": {
104
+ "runs": 100,
105
+ "completed": 100,
106
+ "partial": 0,
107
+ "failed": 0
108
+ },
109
+ "npm": {
110
+ "runs": 100,
111
+ "completed": 100,
112
+ "partial": 0,
113
+ "failed": 0
114
+ },
115
+ "producthunt": {
116
+ "runs": 100,
117
+ "completed": 100,
118
+ "partial": 0,
119
+ "failed": 0
120
+ },
121
+ "hackernews": {
122
+ "runs": 100,
123
+ "completed": 100,
124
+ "partial": 0,
125
+ "failed": 0
126
+ },
127
+ "substack": {
128
+ "runs": 100,
129
+ "completed": 100,
130
+ "partial": 0,
131
+ "failed": 0
132
+ },
133
+ "quora": {
134
+ "runs": 100,
135
+ "completed": 100,
136
+ "partial": 0,
137
+ "failed": 0
138
+ },
139
+ "pinterest": {
140
+ "runs": 100,
141
+ "completed": 100,
142
+ "partial": 0,
143
+ "failed": 0
144
+ },
145
+ "imdb": {
146
+ "runs": 100,
147
+ "completed": 100,
148
+ "partial": 0,
149
+ "failed": 0
150
+ },
151
+ "nytimes": {
152
+ "runs": 100,
153
+ "completed": 100,
154
+ "partial": 0,
155
+ "failed": 0
156
+ },
157
+ "bbc": {
158
+ "runs": 100,
159
+ "completed": 100,
160
+ "partial": 0,
161
+ "failed": 0
162
+ },
163
+ "cnn": {
164
+ "runs": 100,
165
+ "completed": 100,
166
+ "partial": 0,
167
+ "failed": 0
168
+ },
169
+ "reuters": {
170
+ "runs": 100,
171
+ "completed": 100,
172
+ "partial": 0,
173
+ "failed": 0
174
+ },
175
+ "bloomberg": {
176
+ "runs": 100,
177
+ "completed": 100,
178
+ "partial": 0,
179
+ "failed": 0
180
+ },
181
+ "coinmarketcap": {
182
+ "runs": 100,
183
+ "completed": 100,
184
+ "partial": 0,
185
+ "failed": 0
186
+ },
187
+ "coindesk": {
188
+ "runs": 100,
189
+ "completed": 100,
190
+ "partial": 0,
191
+ "failed": 0
192
+ },
193
+ "investopedia": {
194
+ "runs": 100,
195
+ "completed": 100,
196
+ "partial": 0,
197
+ "failed": 0
198
+ },
199
+ "googlescholar": {
200
+ "runs": 100,
201
+ "completed": 100,
202
+ "partial": 0,
203
+ "failed": 0
204
+ },
205
+ "gitlab": {
206
+ "runs": 100,
207
+ "completed": 100,
208
+ "partial": 0,
209
+ "failed": 0
210
+ },
211
+ "bitbucket": {
212
+ "runs": 100,
213
+ "completed": 100,
214
+ "partial": 0,
215
+ "failed": 0
216
+ },
217
+ "amazon": {
218
+ "runs": 100,
219
+ "completed": 100,
220
+ "partial": 0,
221
+ "failed": 0
222
+ },
223
+ "ebay": {
224
+ "runs": 100,
225
+ "completed": 100,
226
+ "partial": 0,
227
+ "failed": 0
228
+ },
229
+ "walmart": {
230
+ "runs": 100,
231
+ "completed": 100,
232
+ "partial": 0,
233
+ "failed": 0
234
+ },
235
+ "etsy": {
236
+ "runs": 100,
237
+ "completed": 100,
238
+ "partial": 0,
239
+ "failed": 0
240
+ },
241
+ "aliexpress": {
242
+ "runs": 100,
243
+ "completed": 100,
244
+ "partial": 0,
245
+ "failed": 0
246
+ },
247
+ "coursera": {
248
+ "runs": 100,
249
+ "completed": 100,
250
+ "partial": 0,
251
+ "failed": 0
252
+ },
253
+ "udemy": {
254
+ "runs": 100,
255
+ "completed": 100,
256
+ "partial": 0,
257
+ "failed": 0
258
+ },
259
+ "edx": {
260
+ "runs": 100,
261
+ "completed": 100,
262
+ "partial": 0,
263
+ "failed": 0
264
+ },
265
+ "freecodecamp": {
266
+ "runs": 100,
267
+ "completed": 100,
268
+ "partial": 0,
269
+ "failed": 0
270
+ },
271
+ "paperswithcode": {
272
+ "runs": 100,
273
+ "completed": 100,
274
+ "partial": 0,
275
+ "failed": 0
276
+ },
277
+ "openreview": {
278
+ "runs": 100,
279
+ "completed": 100,
280
+ "partial": 0,
281
+ "failed": 0
282
+ },
283
+ "leetcode": {
284
+ "runs": 100,
285
+ "completed": 100,
286
+ "partial": 0,
287
+ "failed": 0
288
+ },
289
+ "geeksforgeeks": {
290
+ "runs": 100,
291
+ "completed": 100,
292
+ "partial": 0,
293
+ "failed": 0
294
+ },
295
+ "indeed": {
296
+ "runs": 100,
297
+ "completed": 100,
298
+ "partial": 0,
299
+ "failed": 0
300
+ },
301
+ "glassdoor": {
302
+ "runs": 100,
303
+ "completed": 100,
304
+ "partial": 0,
305
+ "failed": 0
306
+ },
307
+ "twitch": {
308
+ "runs": 100,
309
+ "completed": 100,
310
+ "partial": 0,
311
+ "failed": 0
312
+ },
313
+ "vimeo": {
314
+ "runs": 100,
315
+ "completed": 100,
316
+ "partial": 0,
317
+ "failed": 0
318
+ },
319
+ "spotify": {
320
+ "runs": 100,
321
+ "completed": 100,
322
+ "partial": 0,
323
+ "failed": 0
324
+ },
325
+ "soundcloud": {
326
+ "runs": 100,
327
+ "completed": 100,
328
+ "partial": 0,
329
+ "failed": 0
330
+ },
331
+ "airbnb": {
332
+ "runs": 100,
333
+ "completed": 100,
334
+ "partial": 0,
335
+ "failed": 0
336
+ },
337
+ "booking": {
338
+ "runs": 100,
339
+ "completed": 100,
340
+ "partial": 0,
341
+ "failed": 0
342
+ },
343
+ "zillow": {
344
+ "runs": 100,
345
+ "completed": 100,
346
+ "partial": 0,
347
+ "failed": 0
348
+ }
349
+ },
350
+ "non_template_results": {
351
+ "https://unknown-synth-alpha.test": {
352
+ "runs": 100,
353
+ "completed": 100,
354
+ "partial": 0,
355
+ "failed": 0
356
+ },
357
+ "https://unknown-synth-beta.test": {
358
+ "runs": 100,
359
+ "completed": 100,
360
+ "partial": 0,
361
+ "failed": 0
362
+ },
363
+ "https://unknown-synth-gamma.test": {
364
+ "runs": 100,
365
+ "completed": 100,
366
+ "partial": 0,
367
+ "failed": 0
368
+ },
369
+ "open source scraping tools benchmark": {
370
+ "runs": 100,
371
+ "completed": 100,
372
+ "partial": 0,
373
+ "failed": 0
374
+ },
375
+ "synthetic market intelligence dashboard comparison": {
376
+ "runs": 100,
377
+ "completed": 100,
378
+ "partial": 0,
379
+ "failed": 0
380
+ }
381
+ },
382
+ "duration_seconds": 81.16
383
+ }
docs/test-report.md ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Template Stress Test Report
2
+
3
+ ## Scope
4
+ - Template targets: **56**
5
+ - Non-template targets: **5**
6
+ - Iterations per target: **100**
7
+ - Total runs: **6100**
8
+ - Modes cycled per target: **question**, **csv**, **json**
9
+ - Execution mode: deterministic offline mocks (`SCRAPERL_DISABLE_LIVE_LLM=1`)
10
+
11
+ ## Aggregate Result
12
+ - Completed: **6100**
13
+ - Partial: **0**
14
+ - Failed: **0**
15
+ - Pass rate (completed/total): **100.00%**
16
+ - Schema failures: **0**
17
+ - Output-format mismatches: **0**
18
+ - Duration: **81.16 seconds**
19
+
20
+ ## Per-Template Results
21
+ | Template | Runs | Completed | Partial | Failed |
22
+ |---|---:|---:|---:|---:|
23
+ | `airbnb` | 100 | 100 | 0 | 0 |
24
+ | `aliexpress` | 100 | 100 | 0 | 0 |
25
+ | `amazon` | 100 | 100 | 0 | 0 |
26
+ | `arxiv` | 100 | 100 | 0 | 0 |
27
+ | `bbc` | 100 | 100 | 0 | 0 |
28
+ | `bitbucket` | 100 | 100 | 0 | 0 |
29
+ | `bloomberg` | 100 | 100 | 0 | 0 |
30
+ | `booking` | 100 | 100 | 0 | 0 |
31
+ | `cnn` | 100 | 100 | 0 | 0 |
32
+ | `coindesk` | 100 | 100 | 0 | 0 |
33
+ | `coinmarketcap` | 100 | 100 | 0 | 0 |
34
+ | `coursera` | 100 | 100 | 0 | 0 |
35
+ | `devto` | 100 | 100 | 0 | 0 |
36
+ | `ebay` | 100 | 100 | 0 | 0 |
37
+ | `edx` | 100 | 100 | 0 | 0 |
38
+ | `etsy` | 100 | 100 | 0 | 0 |
39
+ | `facebook` | 100 | 100 | 0 | 0 |
40
+ | `freecodecamp` | 100 | 100 | 0 | 0 |
41
+ | `geeksforgeeks` | 100 | 100 | 0 | 0 |
42
+ | `github` | 100 | 100 | 0 | 0 |
43
+ | `gitlab` | 100 | 100 | 0 | 0 |
44
+ | `glassdoor` | 100 | 100 | 0 | 0 |
45
+ | `googlescholar` | 100 | 100 | 0 | 0 |
46
+ | `hackernews` | 100 | 100 | 0 | 0 |
47
+ | `huggingface` | 100 | 100 | 0 | 0 |
48
+ | `imdb` | 100 | 100 | 0 | 0 |
49
+ | `indeed` | 100 | 100 | 0 | 0 |
50
+ | `instagram` | 100 | 100 | 0 | 0 |
51
+ | `investopedia` | 100 | 100 | 0 | 0 |
52
+ | `kaggle` | 100 | 100 | 0 | 0 |
53
+ | `leetcode` | 100 | 100 | 0 | 0 |
54
+ | `linkedin` | 100 | 100 | 0 | 0 |
55
+ | `medium` | 100 | 100 | 0 | 0 |
56
+ | `npm` | 100 | 100 | 0 | 0 |
57
+ | `nytimes` | 100 | 100 | 0 | 0 |
58
+ | `openreview` | 100 | 100 | 0 | 0 |
59
+ | `paperswithcode` | 100 | 100 | 0 | 0 |
60
+ | `pinterest` | 100 | 100 | 0 | 0 |
61
+ | `producthunt` | 100 | 100 | 0 | 0 |
62
+ | `pypi` | 100 | 100 | 0 | 0 |
63
+ | `quora` | 100 | 100 | 0 | 0 |
64
+ | `reddit` | 100 | 100 | 0 | 0 |
65
+ | `reuters` | 100 | 100 | 0 | 0 |
66
+ | `soundcloud` | 100 | 100 | 0 | 0 |
67
+ | `spotify` | 100 | 100 | 0 | 0 |
68
+ | `stackoverflow` | 100 | 100 | 0 | 0 |
69
+ | `substack` | 100 | 100 | 0 | 0 |
70
+ | `tiktok` | 100 | 100 | 0 | 0 |
71
+ | `twitch` | 100 | 100 | 0 | 0 |
72
+ | `udemy` | 100 | 100 | 0 | 0 |
73
+ | `vimeo` | 100 | 100 | 0 | 0 |
74
+ | `walmart` | 100 | 100 | 0 | 0 |
75
+ | `wikipedia` | 100 | 100 | 0 | 0 |
76
+ | `x` | 100 | 100 | 0 | 0 |
77
+ | `youtube` | 100 | 100 | 0 | 0 |
78
+ | `zillow` | 100 | 100 | 0 | 0 |
79
+
80
+ ## Non-Template Results
81
+ | Asset | Runs | Completed | Partial | Failed |
82
+ |---|---:|---:|---:|---:|
83
+ | `https://unknown-synth-alpha.test` | 100 | 100 | 0 | 0 |
84
+ | `https://unknown-synth-beta.test` | 100 | 100 | 0 | 0 |
85
+ | `https://unknown-synth-gamma.test` | 100 | 100 | 0 | 0 |
86
+ | `open source scraping tools benchmark` | 100 | 100 | 0 | 0 |
87
+ | `synthetic market intelligence dashboard comparison` | 100 | 100 | 0 | 0 |
88
+
89
+ ## Failure Samples
90
+ - No failures captured.
91
+
92
+ ## Notes
93
+ - Templates are used as **reference hints** (navigation targets/field hints), not rigid scraper scripts.
94
+ - Agent flow evaluates **assets + instructions + output_format + output_instructions** per request.
95
+ - Output schema validation checks strict column adherence for CSV/JSON runs.
96
+ - Raw machine summary: `docs/reports/template-stress-summary.json`.