"""
Knowledge Universe: Crawler Profiler
====================================
Author: John (Performance Track)
Run this to identify which crawlers are causing latency.
Shows per-crawler timing, result count, and success/fail status.
Usage:
    python scripts/crawler_profiler.py [query_index]
NOTE on API_KEY:
The profiler calls crawler.crawl() directly, so it does NOT go through
the HTTP API endpoint. Therefore it does NOT need an API key.
The .env file is loaded only to pick up GITHUB_TOKEN, YOUTUBE_API_KEY,
KAGGLE_USERNAME, and KAGGLE_KEY so the keyed crawlers can authenticate.
If you see empty results for GitHub/YouTube/Kaggle, check that those
keys are set in your .env file.
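For reference, a minimal .env might look like this (all values below are
placeholders, not real credentials):
    GITHUB_TOKEN=ghp_your_token_here
    YOUTUBE_API_KEY=your_youtube_api_key
    KAGGLE_USERNAME=your_kaggle_username
    KAGGLE_KEY=your_kaggle_api_key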
"""
import asyncio
import os
import time
import sys
# Ensure project root is on path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dotenv import load_dotenv
load_dotenv() # Loads GITHUB_TOKEN, YOUTUBE_API_KEY, KAGGLE_* etc.
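
# Each test query is a (topic, difficulty) pair; both values are passed
# straight through to crawler.crawl() in profile_crawlers() below.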
TEST_QUERIES = [
("mixture of experts architecture", 4),
("what is machine learning", 1),
("RLHF reward model training", 5),
]
async def profile_crawlers(query_index: int = 0):
"""Profile each crawler individually and report timing."""
from src.crawlers.crawler_pool import CrawlerPool
from config.settings import get_settings
settings = get_settings()
pool = CrawlerPool()
crawlers = pool.get_active_crawlers()
topic, difficulty = TEST_QUERIES[query_index]
print("=" * 70)
print("CRAWLER PROFILER β€” Knowledge Universe")
print("=" * 70)
print(f"Active crawlers: {len(crawlers)}")
print(f"Default timeout: {settings.CRAWLER_TIMEOUT}s")
print()
print(f"Test query: '{topic}' (difficulty={difficulty})")
print()
print(f"Keys loaded:")
print(f" GITHUB_TOKEN: {'βœ… SET' if os.getenv('GITHUB_TOKEN') else '❌ MISSING'}")
print(f" YOUTUBE_API_KEY:{'βœ… SET' if os.getenv('YOUTUBE_API_KEY') else '❌ MISSING'}")
print(f" KAGGLE_USERNAME:{'βœ… SET' if os.getenv('KAGGLE_USERNAME') else '❌ MISSING'}")
print()
print("-" * 70)
print(f"{'Crawler':<35} {'Status':<12} {'Results':>8} {'Time':>8} {'Timeout'}")
print("-" * 70)
results_by_crawler = {}
for crawler in crawlers:
name = crawler.__class__.__name__
timeout = settings.get_crawler_timeout(name)
start = time.time()
try:
result = await asyncio.wait_for(
crawler.crawl(topic, difficulty),
timeout=timeout,
)
elapsed = round((time.time() - start) * 1000)
count = len(result) if result else 0
status = "βœ… OK" if count > 0 else "⚠ EMPTY"
results_by_crawler[name] = {
"status": status,
"count": count,
"time_ms": elapsed,
"timeout": timeout,
}
print(
f"{name:<35} {status:<12} {count:>8} {elapsed:>7}ms "
f"(limit={timeout}s)"
)
except asyncio.TimeoutError:
elapsed = round((time.time() - start) * 1000)
results_by_crawler[name] = {
"status": "⏱ TIMEOUT",
"count": 0,
"time_ms": elapsed,
"timeout": timeout,
}
print(
f"{name:<35} {'⏱ TIMEOUT':<12} {'0':>8} {elapsed:>7}ms "
f"(limit={timeout}s)"
)
except Exception as e:
elapsed = round((time.time() - start) * 1000)
results_by_crawler[name] = {
"status": "❌ ERROR",
"count": 0,
"time_ms": elapsed,
"timeout": timeout,
"error": str(e)[:60],
}
print(
f"{name:<35} {'❌ ERROR':<12} {'0':>8} {elapsed:>7}ms "
f"(limit={timeout}s)"
)
print(f" Error: {str(e)[:70]}")
print("-" * 70)
# Sort by time descending
sorted_crawlers = sorted(
results_by_crawler.items(),
key=lambda x: x[1]["time_ms"],
reverse=True,
)
print("\nπŸ“Š SLOWEST CRAWLERS:")
print("-" * 50)
for name, data in sorted_crawlers[:5]:
bar = "β–ˆ" * min(35, data["time_ms"] // 250)
print(f"{name:<35} {data['time_ms']:>6}ms {bar}")
all_times = [d["time_ms"] for d in results_by_crawler.values()]
print(f"\nπŸ“Š PARALLEL CRAWL ESTIMATE:")
print(f" Sequential total: {sum(all_times):>8,}ms ({sum(all_times)//1000}s)")
print(f" Parallel ceiling: {max(all_times):>8,}ms (~{max(all_times)//1000}s) β€” slowest crawler")
timeouts = [n for n, d in results_by_crawler.items() if "TIMEOUT" in d["status"]]
empty = [n for n, d in results_by_crawler.items()
if d["count"] == 0 and "TIMEOUT" not in d["status"]
and "ERROR" not in d["status"]]
ok = [n for n, d in results_by_crawler.items() if "OK" in d["status"]]
errors = [n for n, d in results_by_crawler.items() if "ERROR" in d["status"]]
total_results = sum(d["count"] for d in results_by_crawler.values())
print(f"\nπŸ“Š SUMMARY:")
print(f" Returning results ({len(ok)}): {ok}")
print(f" Timed out ({len(timeouts)}): {timeouts}")
print(f" Empty results ({len(empty)}): {empty}")
print(f" Errors ({len(errors)}): {errors}")
print(f" Total results available: {total_results}")
# Per-crawler timeout analysis
print(f"\nπŸ“Š TIMEOUT EFFICIENCY:")
for name, data in sorted_crawlers:
timeout_ms = data["timeout"] * 1000
actual_ms = data["time_ms"]
wasted_ms = max(0, timeout_ms - actual_ms)
if data["count"] == 0 and wasted_ms > 500:
print(
f" {name}: returned 0 results, wasted {wasted_ms}ms "
f"({actual_ms}ms actual vs {timeout_ms}ms limit)"
)
print("\n" + "=" * 70)
print("CTO RECOMMENDATION (Rick):")
if timeouts:
for t in timeouts:
current = results_by_crawler[t]["timeout"]
print(f" {t}: already set to {current}s β€” if still timing out,")
print(f" reduce further to 1s (these crawlers are unreachable)")
slow_empty = [
(n, d) for n, d in results_by_crawler.items()
if d["count"] == 0 and d["time_ms"] > 1000
and "TIMEOUT" not in d["status"]
]
if slow_empty:
print(f" Slow empty crawlers (waste time with 0 results):")
for n, d in slow_empty:
print(f" {n}: {d['time_ms']}ms for 0 results")
print("=" * 70)
if __name__ == "__main__":
    try:
        query_idx = int(sys.argv[1]) if len(sys.argv) > 1 else 0
    except ValueError:
        print(f"Query index must be an integer between 0 and {len(TEST_QUERIES) - 1}")
        sys.exit(1)
    if not 0 <= query_idx < len(TEST_QUERIES):
        print(f"Query index must be 0-{len(TEST_QUERIES) - 1}")
        print(f"Available: {[q for q, _ in TEST_QUERIES]}")
        sys.exit(1)
    asyncio.run(profile_crawlers(query_idx))
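
# Usage examples:
#   python scripts/crawler_profiler.py       # query 0: "mixture of experts architecture"
#   python scripts/crawler_profiler.py 2     # query 2: "RLHF reward model training"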