7200045 | """ | |
| Knowledge Universe β Crawler Profiler | |
| ====================================== | |
| John (Performance Track) | |
| Run this to identify which crawlers are causing latency. | |
| Shows per-crawler timing, result count, and success/fail status. | |
| Usage: | |
| python scripts/crawler_profiler.py | |
| NOTE on API_KEY: | |
| The profiler calls crawler.crawl() directly β it does NOT go through | |
| the HTTP API endpoint. Therefore it does NOT need an API key. | |
| The .env file is loaded only to pick up GITHUB_TOKEN, YOUTUBE_API_KEY, | |
| KAGGLE_USERNAME, and KAGGLE_KEY so the keyed crawlers can authenticate. | |
| If you see empty results for GitHub/YouTube/Kaggle, check that those | |
| keys are set in your .env file. | |
| """ | |
import asyncio
import os
import sys
import time

# Ensure project root is on path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dotenv import load_dotenv

load_dotenv()  # Loads GITHUB_TOKEN, YOUTUBE_API_KEY, KAGGLE_* etc.
TEST_QUERIES = [
    ("mixture of experts architecture", 4),
    ("what is machine learning", 1),
    ("RLHF reward model training", 5),
]
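# Pick a query by index from the command line, e.g.
# `python scripts/crawler_profiler.py 2` profiles the RLHF query.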

async def profile_crawlers(query_index: int = 0):
    """Profile each crawler individually and report timing."""
    from src.crawlers.crawler_pool import CrawlerPool
    from config.settings import get_settings

    settings = get_settings()
    pool = CrawlerPool()
    crawlers = pool.get_active_crawlers()
    topic, difficulty = TEST_QUERIES[query_index]

    print("=" * 70)
    print("CRAWLER PROFILER - Knowledge Universe")
    print("=" * 70)
    print(f"Active crawlers: {len(crawlers)}")
    print(f"Default timeout: {settings.CRAWLER_TIMEOUT}s")
    print()
    print(f"Test query: '{topic}' (difficulty={difficulty})")
    print()
| print(f"Keys loaded:") | |
| print(f" GITHUB_TOKEN: {'β SET' if os.getenv('GITHUB_TOKEN') else 'β MISSING'}") | |
| print(f" YOUTUBE_API_KEY:{'β SET' if os.getenv('YOUTUBE_API_KEY') else 'β MISSING'}") | |
| print(f" KAGGLE_USERNAME:{'β SET' if os.getenv('KAGGLE_USERNAME') else 'β MISSING'}") | |
| print() | |
| print("-" * 70) | |
| print(f"{'Crawler':<35} {'Status':<12} {'Results':>8} {'Time':>8} {'Timeout'}") | |
| print("-" * 70) | |
    results_by_crawler = {}
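    # Run crawlers one at a time so each timing is measured in isolation;
    # the "parallel crawl estimate" below models a concurrent run.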
    for crawler in crawlers:
        name = crawler.__class__.__name__
        timeout = settings.get_crawler_timeout(name)
        start = time.time()
        try:
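            # asyncio.wait_for cancels the crawl coroutine once this
            # crawler's individual timeout budget is exhausted.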
            result = await asyncio.wait_for(
                crawler.crawl(topic, difficulty),
                timeout=timeout,
            )
            elapsed = round((time.time() - start) * 1000)
            count = len(result) if result else 0
            status = "✓ OK" if count > 0 else "○ EMPTY"
            results_by_crawler[name] = {
                "status": status,
                "count": count,
                "time_ms": elapsed,
                "timeout": timeout,
            }
            print(
                f"{name:<35} {status:<12} {count:>8} {elapsed:>7}ms "
                f"(limit={timeout}s)"
            )
        except asyncio.TimeoutError:
            elapsed = round((time.time() - start) * 1000)
            results_by_crawler[name] = {
                "status": "⏱ TIMEOUT",
                "count": 0,
                "time_ms": elapsed,
                "timeout": timeout,
            }
            print(
                f"{name:<35} {'⏱ TIMEOUT':<12} {'0':>8} {elapsed:>7}ms "
                f"(limit={timeout}s)"
            )
        except Exception as e:
            elapsed = round((time.time() - start) * 1000)
            results_by_crawler[name] = {
                "status": "✗ ERROR",
                "count": 0,
                "time_ms": elapsed,
                "timeout": timeout,
                "error": str(e)[:60],
            }
            print(
                f"{name:<35} {'✗ ERROR':<12} {'0':>8} {elapsed:>7}ms "
                f"(limit={timeout}s)"
            )
            print(f"  Error: {str(e)[:70]}")
| print("-" * 70) | |
| # Sort by time descending | |
| sorted_crawlers = sorted( | |
| results_by_crawler.items(), | |
| key=lambda x: x[1]["time_ms"], | |
| reverse=True, | |
| ) | |
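    # The slowest crawler bounds wall-clock time for a fully concurrent run.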
| print("\nπ SLOWEST CRAWLERS:") | |
| print("-" * 50) | |
    for name, data in sorted_crawlers[:5]:
        bar = "█" * min(35, data["time_ms"] // 250)
        print(f"{name:<35} {data['time_ms']:>6}ms {bar}")
| all_times = [d["time_ms"] for d in results_by_crawler.values()] | |
| print(f"\nπ PARALLEL CRAWL ESTIMATE:") | |
| print(f" Sequential total: {sum(all_times):>8,}ms ({sum(all_times)//1000}s)") | |
| print(f" Parallel ceiling: {max(all_times):>8,}ms (~{max(all_times)//1000}s) β slowest crawler") | |
    timeouts = [n for n, d in results_by_crawler.items() if "TIMEOUT" in d["status"]]
    empty = [n for n, d in results_by_crawler.items()
             if d["count"] == 0 and "TIMEOUT" not in d["status"]
             and "ERROR" not in d["status"]]
    ok = [n for n, d in results_by_crawler.items() if "OK" in d["status"]]
    errors = [n for n, d in results_by_crawler.items() if "ERROR" in d["status"]]
    total_results = sum(d["count"] for d in results_by_crawler.values())

    print("\nSUMMARY:")
    print(f"  Returning results ({len(ok)}): {ok}")
    print(f"  Timed out ({len(timeouts)}): {timeouts}")
    print(f"  Empty results ({len(empty)}): {empty}")
    print(f"  Errors ({len(errors)}): {errors}")
    print(f"  Total results available: {total_results}")
    # Per-crawler timeout analysis: an empty crawler that finishes early has
    # unused timeout budget (headroom), so its limit could be tightened.
    print("\nTIMEOUT EFFICIENCY:")
    for name, data in sorted_crawlers:
        timeout_ms = data["timeout"] * 1000
        actual_ms = data["time_ms"]
        headroom_ms = max(0, timeout_ms - actual_ms)
        if data["count"] == 0 and headroom_ms > 500:
            print(
                f"  {name}: returned 0 results with {headroom_ms}ms of unused "
                f"timeout budget ({actual_ms}ms actual vs {timeout_ms}ms limit)"
            )
| print("\n" + "=" * 70) | |
| print("CTO RECOMMENDATION (Rick):") | |
| if timeouts: | |
| for t in timeouts: | |
| current = results_by_crawler[t]["timeout"] | |
| print(f" {t}: already set to {current}s β if still timing out,") | |
| print(f" reduce further to 1s (these crawlers are unreachable)") | |
| slow_empty = [ | |
| (n, d) for n, d in results_by_crawler.items() | |
| if d["count"] == 0 and d["time_ms"] > 1000 | |
| and "TIMEOUT" not in d["status"] | |
| ] | |
| if slow_empty: | |
| print(f" Slow empty crawlers (waste time with 0 results):") | |
| for n, d in slow_empty: | |
| print(f" {n}: {d['time_ms']}ms for 0 results") | |
| print("=" * 70) | |

if __name__ == "__main__":
    query_idx = int(sys.argv[1]) if len(sys.argv) > 1 else 0
    if not 0 <= query_idx < len(TEST_QUERIES):
        print(f"Query index must be 0-{len(TEST_QUERIES) - 1}")
        print(f"Available: {[q for q, _ in TEST_QUERIES]}")
        sys.exit(1)
    asyncio.run(profile_crawlers(query_idx))