"""
Knowledge Universe - Crawler Profiler
======================================
John (Performance Track)

Run this to identify which crawlers are causing latency.
Shows per-crawler timing, result count, and success/fail status.

Usage:
    python scripts/crawler_profiler.py [query_index]

NOTE on API_KEY:
    The profiler calls crawler.crawl() directly - it does NOT go through
    the HTTP API endpoint. Therefore it does NOT need an API key.
    The .env file is loaded only to pick up GITHUB_TOKEN, YOUTUBE_API_KEY,
    KAGGLE_USERNAME, and KAGGLE_KEY so the keyed crawlers can authenticate.
    If you see empty results for GitHub/YouTube/Kaggle, check that those
    keys are set in your .env file.
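
Example .env entries (key names taken from the note above; the values are placeholders):
    GITHUB_TOKEN=<your-github-token>
    YOUTUBE_API_KEY=<your-youtube-api-key>
    KAGGLE_USERNAME=<your-kaggle-username>
    KAGGLE_KEY=<your-kaggle-key>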
"""
import asyncio
import os
import time
import sys
# Ensure project root is on path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dotenv import load_dotenv
load_dotenv() # Loads GITHUB_TOKEN, YOUTUBE_API_KEY, KAGGLE_* etc.
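
# Representative (topic, difficulty) pairs passed straight to crawler.crawl().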
TEST_QUERIES = [
("mixture of experts architecture", 4),
("what is machine learning", 1),
("RLHF reward model training", 5),
]
async def profile_crawlers(query_index: int = 0):
"""Profile each crawler individually and report timing."""
from src.crawlers.crawler_pool import CrawlerPool
from config.settings import get_settings
settings = get_settings()
pool = CrawlerPool()
crawlers = pool.get_active_crawlers()
topic, difficulty = TEST_QUERIES[query_index]
print("=" * 70)
print("CRAWLER PROFILER β Knowledge Universe")
print("=" * 70)
print(f"Active crawlers: {len(crawlers)}")
print(f"Default timeout: {settings.CRAWLER_TIMEOUT}s")
print()
print(f"Test query: '{topic}' (difficulty={difficulty})")
print()
print(f"Keys loaded:")
print(f" GITHUB_TOKEN: {'β
SET' if os.getenv('GITHUB_TOKEN') else 'β MISSING'}")
print(f" YOUTUBE_API_KEY:{'β
SET' if os.getenv('YOUTUBE_API_KEY') else 'β MISSING'}")
print(f" KAGGLE_USERNAME:{'β
SET' if os.getenv('KAGGLE_USERNAME') else 'β MISSING'}")
print()
print("-" * 70)
print(f"{'Crawler':<35} {'Status':<12} {'Results':>8} {'Time':>8} {'Timeout'}")
print("-" * 70)
results_by_crawler = {}
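    # Profile each crawler one at a time, each under its own configured timeout.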
    for crawler in crawlers:
        name = crawler.__class__.__name__
        timeout = settings.get_crawler_timeout(name)
        start = time.time()
        try:
            result = await asyncio.wait_for(
                crawler.crawl(topic, difficulty),
                timeout=timeout,
            )
            elapsed = round((time.time() - start) * 1000)
            count = len(result) if result else 0
            status = "OK" if count > 0 else "EMPTY"
            results_by_crawler[name] = {
                "status": status,
                "count": count,
                "time_ms": elapsed,
                "timeout": timeout,
            }
            print(
                f"{name:<35} {status:<12} {count:>8} {elapsed:>7}ms "
                f"(limit={timeout}s)"
            )
        except asyncio.TimeoutError:
            elapsed = round((time.time() - start) * 1000)
            results_by_crawler[name] = {
                "status": "TIMEOUT",
                "count": 0,
                "time_ms": elapsed,
                "timeout": timeout,
            }
            print(
                f"{name:<35} {'TIMEOUT':<12} {'0':>8} {elapsed:>7}ms "
                f"(limit={timeout}s)"
            )
        except Exception as e:
            elapsed = round((time.time() - start) * 1000)
            results_by_crawler[name] = {
                "status": "ERROR",
                "count": 0,
                "time_ms": elapsed,
                "timeout": timeout,
                "error": str(e)[:60],
            }
            print(
                f"{name:<35} {'ERROR':<12} {'0':>8} {elapsed:>7}ms "
                f"(limit={timeout}s)"
            )
            print(f"    Error: {str(e)[:70]}")

    print("-" * 70)
    # Sort by time descending
    sorted_crawlers = sorted(
        results_by_crawler.items(),
        key=lambda x: x[1]["time_ms"],
        reverse=True,
    )
    print("\nSLOWEST CRAWLERS:")
    print("-" * 50)
    for name, data in sorted_crawlers[:5]:
        bar = "█" * min(35, data["time_ms"] // 250)
        print(f"{name:<35} {data['time_ms']:>6}ms {bar}")
    all_times = [d["time_ms"] for d in results_by_crawler.values()]
    print("\nPARALLEL CRAWL ESTIMATE:")
    print(f"  Sequential total: {sum(all_times):>8,}ms ({sum(all_times)//1000}s)")
    print(f"  Parallel ceiling: {max(all_times):>8,}ms (~{max(all_times)//1000}s) - slowest crawler")
    timeouts = [n for n, d in results_by_crawler.items() if "TIMEOUT" in d["status"]]
    empty = [
        n for n, d in results_by_crawler.items()
        if d["count"] == 0 and "TIMEOUT" not in d["status"]
        and "ERROR" not in d["status"]
    ]
    ok = [n for n, d in results_by_crawler.items() if "OK" in d["status"]]
    errors = [n for n, d in results_by_crawler.items() if "ERROR" in d["status"]]
    total_results = sum(d["count"] for d in results_by_crawler.values())

    print("\nSUMMARY:")
    print(f"  Returning results ({len(ok)}): {ok}")
    print(f"  Timed out ({len(timeouts)}): {timeouts}")
    print(f"  Empty results ({len(empty)}): {empty}")
    print(f"  Errors ({len(errors)}): {errors}")
    print(f"  Total results available: {total_results}")
    # Per-crawler timeout analysis
    print("\nTIMEOUT EFFICIENCY:")
    for name, data in sorted_crawlers:
        timeout_ms = data["timeout"] * 1000
        actual_ms = data["time_ms"]
        wasted_ms = max(0, timeout_ms - actual_ms)
        if data["count"] == 0 and wasted_ms > 500:
            print(
                f"  {name}: returned 0 results, wasted {wasted_ms}ms "
                f"({actual_ms}ms actual vs {timeout_ms}ms limit)"
            )
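
    # Actionable suggestions based on the numbers above.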
print("\n" + "=" * 70)
print("CTO RECOMMENDATION (Rick):")
if timeouts:
for t in timeouts:
current = results_by_crawler[t]["timeout"]
print(f" {t}: already set to {current}s β if still timing out,")
print(f" reduce further to 1s (these crawlers are unreachable)")
slow_empty = [
(n, d) for n, d in results_by_crawler.items()
if d["count"] == 0 and d["time_ms"] > 1000
and "TIMEOUT" not in d["status"]
]
if slow_empty:
print(f" Slow empty crawlers (waste time with 0 results):")
for n, d in slow_empty:
print(f" {n}: {d['time_ms']}ms for 0 results")
print("=" * 70)
if __name__ == "__main__":
    query_idx = int(sys.argv[1]) if len(sys.argv) > 1 else 0
    if query_idx >= len(TEST_QUERIES):
        print(f"Query index must be 0-{len(TEST_QUERIES)-1}")
        print(f"Available: {[q for q, _ in TEST_QUERIES]}")
        sys.exit(1)
    asyncio.run(profile_crawlers(query_idx))