""" Knowledge Universe — Crawler Profiler ====================================== John (Performance Track) Run this to identify which crawlers are causing latency. Shows per-crawler timing, result count, and success/fail status. Usage: python scripts/crawler_profiler.py NOTE on API_KEY: The profiler calls crawler.crawl() directly — it does NOT go through the HTTP API endpoint. Therefore it does NOT need an API key. The .env file is loaded only to pick up GITHUB_TOKEN, YOUTUBE_API_KEY, KAGGLE_USERNAME, and KAGGLE_KEY so the keyed crawlers can authenticate. If you see empty results for GitHub/YouTube/Kaggle, check that those keys are set in your .env file. """ import asyncio import os import time import sys # Ensure project root is on path sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from dotenv import load_dotenv load_dotenv() # Loads GITHUB_TOKEN, YOUTUBE_API_KEY, KAGGLE_* etc. TEST_QUERIES = [ ("mixture of experts architecture", 4), ("what is machine learning", 1), ("RLHF reward model training", 5), ] async def profile_crawlers(query_index: int = 0): """Profile each crawler individually and report timing.""" from src.crawlers.crawler_pool import CrawlerPool from config.settings import get_settings settings = get_settings() pool = CrawlerPool() crawlers = pool.get_active_crawlers() topic, difficulty = TEST_QUERIES[query_index] print("=" * 70) print("CRAWLER PROFILER — Knowledge Universe") print("=" * 70) print(f"Active crawlers: {len(crawlers)}") print(f"Default timeout: {settings.CRAWLER_TIMEOUT}s") print() print(f"Test query: '{topic}' (difficulty={difficulty})") print() print(f"Keys loaded:") print(f" GITHUB_TOKEN: {'✅ SET' if os.getenv('GITHUB_TOKEN') else '❌ MISSING'}") print(f" YOUTUBE_API_KEY:{'✅ SET' if os.getenv('YOUTUBE_API_KEY') else '❌ MISSING'}") print(f" KAGGLE_USERNAME:{'✅ SET' if os.getenv('KAGGLE_USERNAME') else '❌ MISSING'}") print() print("-" * 70) print(f"{'Crawler':<35} {'Status':<12} {'Results':>8} {'Time':>8} {'Timeout'}") print("-" * 70) results_by_crawler = {} for crawler in crawlers: name = crawler.__class__.__name__ timeout = settings.get_crawler_timeout(name) start = time.time() try: result = await asyncio.wait_for( crawler.crawl(topic, difficulty), timeout=timeout, ) elapsed = round((time.time() - start) * 1000) count = len(result) if result else 0 status = "✅ OK" if count > 0 else "⚠ EMPTY" results_by_crawler[name] = { "status": status, "count": count, "time_ms": elapsed, "timeout": timeout, } print( f"{name:<35} {status:<12} {count:>8} {elapsed:>7}ms " f"(limit={timeout}s)" ) except asyncio.TimeoutError: elapsed = round((time.time() - start) * 1000) results_by_crawler[name] = { "status": "⏱ TIMEOUT", "count": 0, "time_ms": elapsed, "timeout": timeout, } print( f"{name:<35} {'⏱ TIMEOUT':<12} {'0':>8} {elapsed:>7}ms " f"(limit={timeout}s)" ) except Exception as e: elapsed = round((time.time() - start) * 1000) results_by_crawler[name] = { "status": "❌ ERROR", "count": 0, "time_ms": elapsed, "timeout": timeout, "error": str(e)[:60], } print( f"{name:<35} {'❌ ERROR':<12} {'0':>8} {elapsed:>7}ms " f"(limit={timeout}s)" ) print(f" Error: {str(e)[:70]}") print("-" * 70) # Sort by time descending sorted_crawlers = sorted( results_by_crawler.items(), key=lambda x: x[1]["time_ms"], reverse=True, ) print("\n📊 SLOWEST CRAWLERS:") print("-" * 50) for name, data in sorted_crawlers[:5]: bar = "█" * min(35, data["time_ms"] // 250) print(f"{name:<35} {data['time_ms']:>6}ms {bar}") all_times = [d["time_ms"] for d in results_by_crawler.values()] print(f"\n📊 PARALLEL CRAWL ESTIMATE:") print(f" Sequential total: {sum(all_times):>8,}ms ({sum(all_times)//1000}s)") print(f" Parallel ceiling: {max(all_times):>8,}ms (~{max(all_times)//1000}s) — slowest crawler") timeouts = [n for n, d in results_by_crawler.items() if "TIMEOUT" in d["status"]] empty = [n for n, d in results_by_crawler.items() if d["count"] == 0 and "TIMEOUT" not in d["status"] and "ERROR" not in d["status"]] ok = [n for n, d in results_by_crawler.items() if "OK" in d["status"]] errors = [n for n, d in results_by_crawler.items() if "ERROR" in d["status"]] total_results = sum(d["count"] for d in results_by_crawler.values()) print(f"\n📊 SUMMARY:") print(f" Returning results ({len(ok)}): {ok}") print(f" Timed out ({len(timeouts)}): {timeouts}") print(f" Empty results ({len(empty)}): {empty}") print(f" Errors ({len(errors)}): {errors}") print(f" Total results available: {total_results}") # Per-crawler timeout analysis print(f"\n📊 TIMEOUT EFFICIENCY:") for name, data in sorted_crawlers: timeout_ms = data["timeout"] * 1000 actual_ms = data["time_ms"] wasted_ms = max(0, timeout_ms - actual_ms) if data["count"] == 0 and wasted_ms > 500: print( f" {name}: returned 0 results, wasted {wasted_ms}ms " f"({actual_ms}ms actual vs {timeout_ms}ms limit)" ) print("\n" + "=" * 70) print("CTO RECOMMENDATION (Rick):") if timeouts: for t in timeouts: current = results_by_crawler[t]["timeout"] print(f" {t}: already set to {current}s — if still timing out,") print(f" reduce further to 1s (these crawlers are unreachable)") slow_empty = [ (n, d) for n, d in results_by_crawler.items() if d["count"] == 0 and d["time_ms"] > 1000 and "TIMEOUT" not in d["status"] ] if slow_empty: print(f" Slow empty crawlers (waste time with 0 results):") for n, d in slow_empty: print(f" {n}: {d['time_ms']}ms for 0 results") print("=" * 70) if __name__ == "__main__": query_idx = int(sys.argv[1]) if len(sys.argv) > 1 else 0 if query_idx >= len(TEST_QUERIES): print(f"Query index must be 0-{len(TEST_QUERIES)-1}") print(f"Available: {[q for q, _ in TEST_QUERIES]}") sys.exit(1) asyncio.run(profile_crawlers(query_idx))