"""
Knowledge Universe - Crawler Profiler
======================================
John (Performance Track)

Run this to identify which crawlers are causing latency.
Shows per-crawler timing, result count, and success/fail status.

Usage:
    python scripts/crawler_profiler.py [query_index]

    query_index selects one of the TEST_QUERIES entries below (default: 0).
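
Example:
    python scripts/crawler_profiler.py 2   # profiles TEST_QUERIES[2] (the RLHF query)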

NOTE on API_KEY:
    The profiler calls crawler.crawl() directly - it does NOT go through
    the HTTP API endpoint. Therefore it does NOT need an API key.
    The .env file is loaded only to pick up GITHUB_TOKEN, YOUTUBE_API_KEY,
    KAGGLE_USERNAME, and KAGGLE_KEY so the keyed crawlers can authenticate.
    
    If you see empty results for GitHub/YouTube/Kaggle, check that those
    keys are set in your .env file.
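
    Example .env entries (the values below are placeholders):
        GITHUB_TOKEN=your-github-token
        YOUTUBE_API_KEY=your-youtube-api-key
        KAGGLE_USERNAME=your-kaggle-username
        KAGGLE_KEY=your-kaggle-key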
"""

import asyncio
import os
import time
import sys

# Ensure project root is on path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dotenv import load_dotenv
load_dotenv()  # Loads GITHUB_TOKEN, YOUTUBE_API_KEY, KAGGLE_* etc.

TEST_QUERIES = [
    ("mixture of experts architecture", 4),
    ("what is machine learning", 1),
    ("RLHF reward model training", 5),
]


async def profile_crawlers(query_index: int = 0):
    """Profile each crawler individually and report timing."""

    from src.crawlers.crawler_pool import CrawlerPool
    from config.settings import get_settings

    settings = get_settings()
    pool     = CrawlerPool()
    crawlers = pool.get_active_crawlers()

    topic, difficulty = TEST_QUERIES[query_index]

    print("=" * 70)
    print("CRAWLER PROFILER β€” Knowledge Universe")
    print("=" * 70)
    print(f"Active crawlers: {len(crawlers)}")
    print(f"Default timeout: {settings.CRAWLER_TIMEOUT}s")
    print()
    print(f"Test query: '{topic}' (difficulty={difficulty})")
    print()
    print(f"Keys loaded:")
    print(f"  GITHUB_TOKEN:   {'βœ… SET' if os.getenv('GITHUB_TOKEN') else '❌ MISSING'}")
    print(f"  YOUTUBE_API_KEY:{'βœ… SET' if os.getenv('YOUTUBE_API_KEY') else '❌ MISSING'}")
    print(f"  KAGGLE_USERNAME:{'βœ… SET' if os.getenv('KAGGLE_USERNAME') else '❌ MISSING'}")
    print()
    print("-" * 70)
    print(f"{'Crawler':<35} {'Status':<12} {'Results':>8} {'Time':>8}  {'Timeout'}")
    print("-" * 70)

    results_by_crawler = {}
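
    # NOTE: crawlers are profiled one at a time here, so each timing reflects that
    # crawler in isolation; the "PARALLEL CRAWL ESTIMATE" below models what a
    # concurrent crawl would cost.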

    for crawler in crawlers:
        name    = crawler.__class__.__name__
        timeout = settings.get_crawler_timeout(name)
        start   = time.time()

        try:
            result = await asyncio.wait_for(
                crawler.crawl(topic, difficulty),
                timeout=timeout,
            )
            elapsed = round((time.time() - start) * 1000)
            count   = len(result) if result else 0
            status  = "✅ OK" if count > 0 else "⚠ EMPTY"

            results_by_crawler[name] = {
                "status":   status,
                "count":    count,
                "time_ms":  elapsed,
                "timeout":  timeout,
            }
            print(
                f"{name:<35} {status:<12} {count:>8} {elapsed:>7}ms  "
                f"(limit={timeout}s)"
            )

        except asyncio.TimeoutError:
            elapsed = round((time.time() - start) * 1000)
            results_by_crawler[name] = {
                "status":  "⏱ TIMEOUT",
                "count":   0,
                "time_ms": elapsed,
                "timeout": timeout,
            }
            print(
                f"{name:<35} {'⏱ TIMEOUT':<12} {'0':>8} {elapsed:>7}ms  "
                f"(limit={timeout}s)"
            )

        except Exception as e:
            elapsed = round((time.time() - start) * 1000)
            results_by_crawler[name] = {
                "status":  "❌ ERROR",
                "count":   0,
                "time_ms": elapsed,
                "timeout": timeout,
                "error":   str(e)[:60],
            }
            print(
                f"{name:<35} {'❌ ERROR':<12} {'0':>8} {elapsed:>7}ms  "
                f"(limit={timeout}s)"
            )
            print(f"  Error: {str(e)[:70]}")

    print("-" * 70)

    # Sort by time descending
    sorted_crawlers = sorted(
        results_by_crawler.items(),
        key=lambda x: x[1]["time_ms"],
        reverse=True,
    )

    print("\nπŸ“Š SLOWEST CRAWLERS:")
    print("-" * 50)
    for name, data in sorted_crawlers[:5]:
        bar = "β–ˆ" * min(35, data["time_ms"] // 250)
        print(f"{name:<35} {data['time_ms']:>6}ms  {bar}")

    all_times = [d["time_ms"] for d in results_by_crawler.values()]
    print(f"\nπŸ“Š PARALLEL CRAWL ESTIMATE:")
    print(f"  Sequential total:   {sum(all_times):>8,}ms ({sum(all_times)//1000}s)")
    print(f"  Parallel ceiling:   {max(all_times):>8,}ms (~{max(all_times)//1000}s) β€” slowest crawler")

    timeouts = [n for n, d in results_by_crawler.items() if "TIMEOUT" in d["status"]]
    empty    = [n for n, d in results_by_crawler.items()
                if d["count"] == 0 and "TIMEOUT" not in d["status"]
                and "ERROR" not in d["status"]]
    ok       = [n for n, d in results_by_crawler.items() if "OK" in d["status"]]
    errors   = [n for n, d in results_by_crawler.items() if "ERROR" in d["status"]]
    total_results = sum(d["count"] for d in results_by_crawler.values())

    print(f"\nπŸ“Š SUMMARY:")
    print(f"  Returning results ({len(ok)}):  {ok}")
    print(f"  Timed out ({len(timeouts)}):        {timeouts}")
    print(f"  Empty results ({len(empty)}):   {empty}")
    print(f"  Errors ({len(errors)}):           {errors}")
    print(f"  Total results available: {total_results}")

    # Per-crawler timeout analysis
    print(f"\nπŸ“Š TIMEOUT EFFICIENCY:")
    for name, data in sorted_crawlers:
        timeout_ms  = data["timeout"] * 1000
        actual_ms   = data["time_ms"]
        headroom_ms = max(0, timeout_ms - actual_ms)
        # Flag crawlers that return nothing yet still have a generous timeout budget.
        if data["count"] == 0 and headroom_ms > 500:
            print(
                f"  {name}: 0 results in {actual_ms}ms "
                f"(limit {timeout_ms}ms, {headroom_ms}ms unused headroom)"
            )

    print("\n" + "=" * 70)
    print("CTO RECOMMENDATION (Rick):")
    if timeouts:
        for t in timeouts:
            current = results_by_crawler[t]["timeout"]
            print(f"  {t}: already set to {current}s β€” if still timing out,")
            print(f"    reduce further to 1s (these crawlers are unreachable)")
    slow_empty = [
        (n, d) for n, d in results_by_crawler.items()
        if d["count"] == 0 and d["time_ms"] > 1000
        and "TIMEOUT" not in d["status"]
    ]
    if slow_empty:
        print(f"  Slow empty crawlers (waste time with 0 results):")
        for n, d in slow_empty:
            print(f"    {n}: {d['time_ms']}ms for 0 results")
    print("=" * 70)


if __name__ == "__main__":
    query_idx = int(sys.argv[1]) if len(sys.argv) > 1 else 0
    if query_idx >= len(TEST_QUERIES):
        print(f"Query index must be 0-{len(TEST_QUERIES)-1}")
        print(f"Available: {[q for q, _ in TEST_QUERIES]}")
        sys.exit(1)
    asyncio.run(profile_crawlers(query_idx))