"""
Knowledge Universe: Crawler Profiler
====================================
Author: John (Performance Track)
Run this to identify which crawlers are causing latency.
Shows per-crawler timing, result count, and success/fail status.
Usage:
    python scripts/crawler_profiler.py [query_index]
NOTE on API_KEY:
The profiler calls crawler.crawl() directly, so it does NOT go through
the HTTP API endpoint. Therefore it does NOT need an API key.
The .env file is loaded only to pick up GITHUB_TOKEN, YOUTUBE_API_KEY,
KAGGLE_USERNAME, and KAGGLE_KEY so the keyed crawlers can authenticate.
If you see empty results for GitHub/YouTube/Kaggle, check that those
keys are set in your .env file.
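For reference, a minimal .env might look like this (all values below are
placeholders, not real credentials):
    GITHUB_TOKEN=ghp_your_token_here
    YOUTUBE_API_KEY=your_youtube_api_key
    KAGGLE_USERNAME=your_kaggle_username
    KAGGLE_KEY=your_kaggle_api_key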
"""
import asyncio
import os
import time
import sys
# Ensure project root is on path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dotenv import load_dotenv
load_dotenv() # Loads GITHUB_TOKEN, YOUTUBE_API_KEY, KAGGLE_* etc.
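
# Each test query is a (topic, difficulty) pair; both values are passed
# straight through to crawler.crawl() in profile_crawlers() below.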
TEST_QUERIES = [
("mixture of experts architecture", 4),
("what is machine learning", 1),
("RLHF reward model training", 5),
]
async def profile_crawlers(query_index: int = 0):
"""Profile each crawler individually and report timing."""
from src.crawlers.crawler_pool import CrawlerPool
from config.settings import get_settings
settings = get_settings()
pool = CrawlerPool()
crawlers = pool.get_active_crawlers()
topic, difficulty = TEST_QUERIES[query_index]
print("=" * 70)
print("CRAWLER PROFILER β€” Knowledge Universe")
print("=" * 70)
print(f"Active crawlers: {len(crawlers)}")
print(f"Default timeout: {settings.CRAWLER_TIMEOUT}s")
print()
print(f"Test query: '{topic}' (difficulty={difficulty})")
print()
print(f"Keys loaded:")
print(f" GITHUB_TOKEN: {'βœ… SET' if os.getenv('GITHUB_TOKEN') else '❌ MISSING'}")
print(f" YOUTUBE_API_KEY:{'βœ… SET' if os.getenv('YOUTUBE_API_KEY') else '❌ MISSING'}")
print(f" KAGGLE_USERNAME:{'βœ… SET' if os.getenv('KAGGLE_USERNAME') else '❌ MISSING'}")
print()
print("-" * 70)
print(f"{'Crawler':<35} {'Status':<12} {'Results':>8} {'Time':>8} {'Timeout'}")
print("-" * 70)
results_by_crawler = {}
for crawler in crawlers:
name = crawler.__class__.__name__
timeout = settings.get_crawler_timeout(name)
start = time.time()
try:
result = await asyncio.wait_for(
crawler.crawl(topic, difficulty),
timeout=timeout,
)
elapsed = round((time.time() - start) * 1000)
count = len(result) if result else 0
status = "βœ… OK" if count > 0 else "⚠ EMPTY"
results_by_crawler[name] = {
"status": status,
"count": count,
"time_ms": elapsed,
"timeout": timeout,
}
print(
f"{name:<35} {status:<12} {count:>8} {elapsed:>7}ms "
f"(limit={timeout}s)"
)
except asyncio.TimeoutError:
elapsed = round((time.time() - start) * 1000)
results_by_crawler[name] = {
"status": "⏱ TIMEOUT",
"count": 0,
"time_ms": elapsed,
"timeout": timeout,
}
print(
f"{name:<35} {'⏱ TIMEOUT':<12} {'0':>8} {elapsed:>7}ms "
f"(limit={timeout}s)"
)
except Exception as e:
elapsed = round((time.time() - start) * 1000)
results_by_crawler[name] = {
"status": "❌ ERROR",
"count": 0,
"time_ms": elapsed,
"timeout": timeout,
"error": str(e)[:60],
}
print(
f"{name:<35} {'❌ ERROR':<12} {'0':>8} {elapsed:>7}ms "
f"(limit={timeout}s)"
)
print(f" Error: {str(e)[:70]}")
print("-" * 70)
# Sort by time descending
sorted_crawlers = sorted(
results_by_crawler.items(),
key=lambda x: x[1]["time_ms"],
reverse=True,
)
print("\nπŸ“Š SLOWEST CRAWLERS:")
print("-" * 50)
for name, data in sorted_crawlers[:5]:
bar = "β–ˆ" * min(35, data["time_ms"] // 250)
print(f"{name:<35} {data['time_ms']:>6}ms {bar}")
all_times = [d["time_ms"] for d in results_by_crawler.values()]
print(f"\nπŸ“Š PARALLEL CRAWL ESTIMATE:")
print(f" Sequential total: {sum(all_times):>8,}ms ({sum(all_times)//1000}s)")
print(f" Parallel ceiling: {max(all_times):>8,}ms (~{max(all_times)//1000}s) β€” slowest crawler")
timeouts = [n for n, d in results_by_crawler.items() if "TIMEOUT" in d["status"]]
empty = [n for n, d in results_by_crawler.items()
if d["count"] == 0 and "TIMEOUT" not in d["status"]
and "ERROR" not in d["status"]]
ok = [n for n, d in results_by_crawler.items() if "OK" in d["status"]]
errors = [n for n, d in results_by_crawler.items() if "ERROR" in d["status"]]
total_results = sum(d["count"] for d in results_by_crawler.values())
print(f"\nπŸ“Š SUMMARY:")
print(f" Returning results ({len(ok)}): {ok}")
print(f" Timed out ({len(timeouts)}): {timeouts}")
print(f" Empty results ({len(empty)}): {empty}")
print(f" Errors ({len(errors)}): {errors}")
print(f" Total results available: {total_results}")
# Per-crawler timeout analysis
print(f"\nπŸ“Š TIMEOUT EFFICIENCY:")
for name, data in sorted_crawlers:
timeout_ms = data["timeout"] * 1000
actual_ms = data["time_ms"]
wasted_ms = max(0, timeout_ms - actual_ms)
if data["count"] == 0 and wasted_ms > 500:
print(
f" {name}: returned 0 results, wasted {wasted_ms}ms "
f"({actual_ms}ms actual vs {timeout_ms}ms limit)"
)
print("\n" + "=" * 70)
print("CTO RECOMMENDATION (Rick):")
if timeouts:
for t in timeouts:
current = results_by_crawler[t]["timeout"]
print(f" {t}: already set to {current}s β€” if still timing out,")
print(f" reduce further to 1s (these crawlers are unreachable)")
slow_empty = [
(n, d) for n, d in results_by_crawler.items()
if d["count"] == 0 and d["time_ms"] > 1000
and "TIMEOUT" not in d["status"]
]
if slow_empty:
print(f" Slow empty crawlers (waste time with 0 results):")
for n, d in slow_empty:
print(f" {n}: {d['time_ms']}ms for 0 results")
print("=" * 70)
if __name__ == "__main__":
    try:
        query_idx = int(sys.argv[1]) if len(sys.argv) > 1 else 0
    except ValueError:
        print(f"Query index must be an integer between 0 and {len(TEST_QUERIES) - 1}")
        sys.exit(1)
    if not 0 <= query_idx < len(TEST_QUERIES):
        print(f"Query index must be 0-{len(TEST_QUERIES) - 1}")
        print(f"Available: {[q for q, _ in TEST_QUERIES]}")
        sys.exit(1)
    asyncio.run(profile_crawlers(query_idx))
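
# Usage examples:
#   python scripts/crawler_profiler.py       # query 0: "mixture of experts architecture"
#   python scripts/crawler_profiler.py 2     # query 2: "RLHF reward model training"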