zhiminy committed on
Commit
7485ae4
·
1 Parent(s): f2759c2
Files changed (3) hide show
  1. app.py +459 -1092
  2. msr.py +345 -811
  3. requirements.txt +2 -0
app.py CHANGED
@@ -3,52 +3,31 @@ from gradio_leaderboard import Leaderboard
3
  import json
4
  import os
5
  import time
 
6
  import requests
7
  from datetime import datetime, timezone, timedelta
8
  from collections import defaultdict
9
  from huggingface_hub import HfApi, hf_hub_download
10
- from datasets import load_dataset, Dataset
11
- import threading
12
  from dotenv import load_dotenv
13
  import pandas as pd
14
  import random
15
- import argparse
16
  import plotly.graph_objects as go
17
  from plotly.subplots import make_subplots
18
  from apscheduler.schedulers.background import BackgroundScheduler
19
  from apscheduler.triggers.cron import CronTrigger
 
20
 
21
  # Load environment variables
22
  load_dotenv()
23
 
24
- # Parse command-line arguments
25
- parser = argparse.ArgumentParser(description='SWE Agent Issue Leaderboard')
26
- parser.add_argument('--debug', '--DEBUG', action='store_true',
27
- help='Enable debug mode (limits issue retrieval to 10 per query pattern)')
28
- parser.add_argument('--no-debug', '--production', action='store_true',
29
- help='Explicitly disable debug mode (force production mode)')
30
- args = parser.parse_args()
31
-
32
  # =============================================================================
33
  # CONFIGURATION
34
  # =============================================================================
35
 
36
- # DEBUG MODE: Set to True to limit issue retrieval for testing
37
- # When enabled, only fetches up to 10 issues per query pattern per agent
38
- # Priority: 1) Command-line args, 2) Environment variable, 3) Default (False)
39
- if args.no_debug:
40
- DEBUG_MODE = False
41
- elif args.debug:
42
- DEBUG_MODE = True
43
- else:
44
- DEBUG_MODE = os.getenv('DEBUG_MODE', 'False').lower() in ('true', '1', 'yes')
45
-
46
- # In-memory cache for debug mode (data persists during session but NOT saved to HF)
47
- DEBUG_ISSUE_METADATA_CACHE = defaultdict(list)
48
-
49
  AGENTS_REPO = "SWE-Arena/swe_agents" # HuggingFace dataset for agent metadata
50
  ISSUE_METADATA_REPO = "SWE-Arena/issue_metadata" # HuggingFace dataset for issue metadata
51
- LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for leaderboard (past 6 months)
 
52
 
53
  LEADERBOARD_COLUMNS = [
54
  ("Agent Name", "string"),
@@ -104,7 +83,7 @@ def normalize_date_format(date_string):
104
  """
105
  if not date_string or date_string == 'N/A':
106
  return 'N/A'
107
-
108
  try:
109
  # Parse the date string (handles both with and without microseconds)
110
  if '.' in date_string:
@@ -113,7 +92,7 @@ def normalize_date_format(date_string):
113
  else:
114
  # Already in correct format or GitHub format
115
  return date_string
116
-
117
  # Convert to standardized format
118
  return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
119
  except Exception as e:
@@ -122,306 +101,279 @@ def normalize_date_format(date_string):
122
 
123
 
124
  # =============================================================================
125
- # GITHUB API OPERATIONS
126
  # =============================================================================
127
 
128
- def request_with_backoff(method, url, *, headers=None, params=None, json_body=None, data=None, max_retries=10, timeout=30, token_pool=None, token=None):
129
  """
130
- Perform an HTTP request with exponential backoff and jitter for GitHub API.
131
- Retries on 403/429 (rate limits), 5xx server errors, and transient network exceptions.
132
-
133
- Args:
134
- token_pool: Optional TokenPool instance for automatic rate limit tracking
135
- token: Optional token being used (for marking as rate-limited)
136
 
137
- Returns the final requests.Response on success or non-retryable status, or None after exhausting retries.
 
138
  """
139
- delay = 1.0
140
- for attempt in range(max_retries):
141
- try:
142
- resp = requests.request(
143
- method,
144
- url,
145
- headers=headers or {},
146
- params=params,
147
- json=json_body,
148
- data=data,
149
- timeout=timeout
150
- )
151
-
152
- status = resp.status_code
153
-
154
- # Success
155
- if 200 <= status < 300:
156
- return resp
157
-
158
- # Rate limits or server errors -> retry with backoff
159
- if status in (403, 429) or 500 <= status < 600:
160
- wait = None
161
- reset_timestamp = None
162
-
163
- # Prefer Retry-After when present
164
- retry_after = resp.headers.get('Retry-After') or resp.headers.get('retry-after')
165
- if retry_after:
166
- try:
167
- wait = float(retry_after)
168
- except Exception:
169
- wait = None
170
 
171
- # Fallback to X-RateLimit-Reset when 403/429
172
- if wait is None and status in (403, 429):
173
- reset_hdr = resp.headers.get('X-RateLimit-Reset') or resp.headers.get('x-ratelimit-reset')
174
- if reset_hdr:
175
- try:
176
- reset_ts = int(float(reset_hdr))
177
- reset_timestamp = reset_ts
178
- wait = max(reset_ts - time.time() + 2, 1)
179
- except Exception:
180
- wait = None
181
 
182
- # Mark token as rate-limited if we have token_pool and token
183
- if status in (403, 429) and token_pool and token:
184
- token_pool.mark_rate_limited(token, reset_timestamp)
185
 
186
- # Final fallback: exponential backoff with jitter
187
- if wait is None:
188
- wait = delay + random.uniform(0, 0.5)
189
 
190
- # Cap individual wait to avoid extreme sleeps
191
- wait = max(1.0, min(wait, 120.0))
192
- print(f"GitHub API {status}. Backing off {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
193
- time.sleep(wait)
194
- delay = min(delay * 2, 60.0)
195
- continue
196
 
197
- # Non-retryable error; return response for caller to handle
198
- return resp
199
-
200
- except requests.RequestException as e:
201
- # Network error -> retry with backoff
202
- wait = delay + random.uniform(0, 0.5)
203
- wait = max(1.0, min(wait, 60.0))
204
- print(f"Request error: {e}. Retrying in {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
205
- time.sleep(wait)
206
- delay = min(delay * 2, 60.0)
207
-
208
- print(f"Exceeded max retries for {url}")
209
- return None
210
-
211
- def get_github_tokens():
212
- """Get all GitHub tokens from environment variables (all keys starting with GITHUB_TOKEN)."""
213
- tokens = []
214
- for key, value in os.environ.items():
215
- if key.startswith('GITHUB_TOKEN') and value:
216
- tokens.append(value)
217
-
218
- if not tokens:
219
- print("Warning: No GITHUB_TOKEN found. API rate limits: 60/hour (authenticated: 5000/hour)")
220
  else:
221
- print(f" Loaded {len(tokens)} GitHub token(s) for rotation")
222
-
223
- return tokens
224
-
225
 
226
- def get_github_token():
227
- """Get primary GitHub token from environment variables (backward compatibility)."""
228
- token = os.getenv('GITHUB_TOKEN')
229
- if not token:
230
- print("Warning: GITHUB_TOKEN not found. API rate limits: 60/hour (authenticated: 5000/hour)")
231
- return token
232
 
233
-
234
- class TokenPool:
235
  """
236
- Hybrid token pool with parallel execution and round-robin fallback.
237
 
238
- Splits tokens into two pools:
239
- - 50% for parallel execution (maximize throughput)
240
- - 50% for round-robin backup (handle rate limits)
241
 
242
- Features:
243
- - Automatic rate limit detection and tracking
244
- - Token recovery when rate limits expire
245
- - Statistics monitoring
246
- - Thread-safe operations
247
  """
248
- def __init__(self, tokens):
249
- import threading
 
 
 
 
 
250
 
251
- # Store all tokens
252
- self.all_tokens = tokens if tokens else [None]
253
- total_tokens = len(self.all_tokens)
254
 
255
- # Split tokens into parallel and round-robin pools (50/50)
256
- # For odd numbers, round-robin gets the extra token
257
- split_point = max(1, total_tokens // 2)
258
 
259
- self.parallel_tokens = self.all_tokens[:split_point]
260
- self.roundrobin_tokens = self.all_tokens[split_point:] if split_point < total_tokens else self.all_tokens
 
261
 
262
- # Round-robin index for fallback pool
263
- self.roundrobin_index = 0
 
264
 
265
- # Track rate-limited tokens with reset timestamps
266
- self.rate_limited_tokens = {} # {token: reset_timestamp}
 
 
 
267
 
268
- # Statistics
269
- self.stats = {
270
- 'parallel_calls': 0,
271
- 'roundrobin_calls': 0,
272
- 'fallback_triggers': 0
 
 
 
 
 
 
 
 
273
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
- # Thread lock for thread-safety
276
- self.lock = threading.Lock()
277
-
278
- print(f"🔀 Token Pool Initialized:")
279
- print(f" Total tokens: {total_tokens}")
280
- print(f" Parallel pool: {len(self.parallel_tokens)} tokens")
281
- print(f" Round-robin pool: {len(self.roundrobin_tokens)} tokens")
282
-
283
- def _clean_expired_rate_limits(self):
284
- """Remove tokens from rate-limited set if their reset time has passed."""
285
- current_time = time.time()
286
- expired = [token for token, reset_time in self.rate_limited_tokens.items()
287
- if reset_time and current_time >= reset_time]
288
- for token in expired:
289
- del self.rate_limited_tokens[token]
290
- print(f" ✓ Token recovered from rate limit")
291
-
292
- def get_parallel_token(self):
293
- """Get an available token from the parallel pool."""
294
- with self.lock:
295
- self._clean_expired_rate_limits()
296
-
297
- # Find first available parallel token (not rate-limited)
298
- for token in self.parallel_tokens:
299
- if token not in self.rate_limited_tokens:
300
- self.stats['parallel_calls'] += 1
301
- return token
302
-
303
- # All parallel tokens are rate-limited
304
- return None
305
 
306
- def get_roundrobin_token(self):
307
- """Get next token from round-robin pool."""
308
- with self.lock:
309
- self._clean_expired_rate_limits()
310
-
311
- if not self.roundrobin_tokens:
312
- return None
313
-
314
- # Try up to pool size to find non-rate-limited token
315
- attempts = 0
316
- max_attempts = len(self.roundrobin_tokens)
317
-
318
- while attempts < max_attempts:
319
- token = self.roundrobin_tokens[self.roundrobin_index]
320
- self.roundrobin_index = (self.roundrobin_index + 1) % len(self.roundrobin_tokens)
321
- attempts += 1
322
-
323
- if token not in self.rate_limited_tokens:
324
- self.stats['roundrobin_calls'] += 1
325
- return token
326
-
327
- # All round-robin tokens are rate-limited, return one anyway
328
- # (request_with_backoff will handle the rate limit)
329
- token = self.roundrobin_tokens[self.roundrobin_index]
330
- self.roundrobin_index = (self.roundrobin_index + 1) % len(self.roundrobin_tokens)
331
- self.stats['roundrobin_calls'] += 1
332
- return token
333
-
334
- def get_next_token(self):
335
- """
336
- Get next available token using hybrid strategy:
337
- 1. Try parallel pool first
338
- 2. Fall back to round-robin if parallel is exhausted
339
- """
340
- # Try parallel pool first
341
- token = self.get_parallel_token()
342
-
343
- if token is not None:
344
- return token
345
-
346
- # Parallel pool exhausted, fall back to round-robin
347
- with self.lock:
348
- self.stats['fallback_triggers'] += 1
349
-
350
- return self.get_roundrobin_token()
351
-
352
- def get_headers(self):
353
- """Get headers with the next token in rotation."""
354
- token = self.get_next_token()
355
- return {'Authorization': f'token {token}'} if token else {}
356
-
357
- def mark_rate_limited(self, token, reset_timestamp=None):
358
- """
359
- Mark a token as rate-limited with optional reset timestamp.
360
-
361
- Args:
362
- token: The token to mark
363
- reset_timestamp: Unix timestamp when rate limit resets (optional)
364
- """
365
- with self.lock:
366
- self.rate_limited_tokens[token] = reset_timestamp
367
- pool_type = "parallel" if token in self.parallel_tokens else "round-robin"
368
- if reset_timestamp:
369
- reset_time = datetime.fromtimestamp(reset_timestamp, timezone.utc).strftime('%H:%M:%S UTC')
370
- print(f" ⚠️ Token marked as rate-limited ({pool_type} pool, resets at {reset_time})")
371
- else:
372
- print(f" ⚠️ Token marked as rate-limited ({pool_type} pool)")
373
-
374
- def get_available_parallel_tokens(self):
375
- """Get list of all available (non-rate-limited) parallel tokens."""
376
- with self.lock:
377
- self._clean_expired_rate_limits()
378
- return [token for token in self.parallel_tokens
379
- if token not in self.rate_limited_tokens]
380
-
381
- def get_stats(self):
382
- """Get current statistics."""
383
- with self.lock:
384
- self._clean_expired_rate_limits()
385
- parallel_rate_limited = sum(1 for t in self.parallel_tokens
386
- if t in self.rate_limited_tokens)
387
- roundrobin_rate_limited = sum(1 for t in self.roundrobin_tokens
388
- if t in self.rate_limited_tokens)
389
-
390
- return {
391
- **self.stats,
392
- 'parallel_rate_limited': parallel_rate_limited,
393
- 'roundrobin_rate_limited': roundrobin_rate_limited
394
- }
395
 
396
- def print_stats(self):
397
- """Print statistics about token pool usage."""
398
- stats = self.get_stats()
399
- total_calls = stats['parallel_calls'] + stats['roundrobin_calls']
400
 
401
- if total_calls == 0:
402
- print("📊 No API calls made yet")
403
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404
 
405
- parallel_pct = (stats['parallel_calls'] / total_calls * 100) if total_calls > 0 else 0
406
- roundrobin_pct = (stats['roundrobin_calls'] / total_calls * 100) if total_calls > 0 else 0
 
 
 
 
 
 
 
 
407
 
408
- print(f"📊 Token Pool Statistics:")
409
- print(f" Total API calls: {total_calls}")
410
- print(f" Parallel calls: {stats['parallel_calls']} ({parallel_pct:.1f}%)")
411
- print(f" Round-robin calls: {stats['roundrobin_calls']} ({roundrobin_pct:.1f}%)")
412
- print(f" Fallback triggers: {stats['fallback_triggers']}")
413
- print(f" Currently rate-limited: {stats['parallel_rate_limited']} parallel, {stats['roundrobin_rate_limited']} round-robin")
414
 
415
 
416
  def validate_github_username(identifier):
417
- """Verify that a GitHub identifier exists with backoff-aware requests."""
418
  try:
419
  token = get_github_token()
420
  headers = {'Authorization': f'token {token}'} if token else {}
421
  url = f'https://api.github.com/users/{identifier}'
422
- response = request_with_backoff('GET', url, headers=headers, max_retries=1)
423
- if response is None:
424
- return False, "Validation error: network/rate limit exhausted"
425
  if response.status_code == 200:
426
  return True, "Username is valid"
427
  elif response.status_code == 404:
@@ -432,320 +384,15 @@ def validate_github_username(identifier):
432
  return False, f"Validation error: {str(e)}"
433
 
434
 
435
- def fetch_issues_parallel(query_patterns, start_date, end_date, token_pool, issues_by_id, debug_limit=None):
436
- """
437
- Fetch issues for multiple query patterns in parallel using available parallel tokens.
438
-
439
- Args:
440
- query_patterns: List of query patterns to search
441
- start_date: Start date for time range
442
- end_date: End date for time range
443
- token_pool: TokenPool instance for token management
444
- issues_by_id: Shared dictionary to store issues (thread-safe operations)
445
- debug_limit: If set, stops fetching after this many issues per pattern
446
-
447
- Returns:
448
- Total number of issues found across all patterns
449
- """
450
- import concurrent.futures
451
- import threading
452
-
453
- # Get available parallel tokens
454
- available_tokens = token_pool.get_available_parallel_tokens()
455
-
456
- if not available_tokens:
457
- print(" ⚠️ No parallel tokens available, using sequential fallback")
458
- total_found = 0
459
- for pattern in query_patterns:
460
- count = fetch_issues_with_time_partition(
461
- pattern, start_date, end_date, token_pool, issues_by_id, debug_limit, depth=0
462
- )
463
- total_found += count
464
- return total_found
465
-
466
- # Determine max workers based on available tokens
467
- max_workers = min(len(query_patterns), len(available_tokens))
468
-
469
- print(f" 🚀 Using parallel execution with {max_workers} workers")
470
-
471
- # Thread-safe lock for issues_by_id updates
472
- lock = threading.Lock()
473
-
474
- def fetch_pattern(pattern, token):
475
- """Worker function to fetch issues for a single pattern."""
476
- # Create temporary dict for this pattern
477
- pattern_issues = {}
478
-
479
- try:
480
- # Fetch issues for this pattern
481
- count = fetch_issues_with_time_partition(
482
- pattern,
483
- start_date,
484
- end_date,
485
- token_pool,
486
- pattern_issues,
487
- debug_limit,
488
- depth=0
489
- )
490
-
491
- # Merge into shared dict with lock
492
- with lock:
493
- for issue_id, issue in pattern_issues.items():
494
- if issue_id not in issues_by_id:
495
- issues_by_id[issue_id] = issue
496
-
497
- return count
498
-
499
- except Exception as e:
500
- print(f" ✗ Error in parallel fetch for pattern '{pattern}': {str(e)}")
501
- return 0
502
-
503
- # Execute patterns in parallel
504
- total_found = 0
505
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
506
- # Map patterns to tokens
507
- futures = []
508
- for i, pattern in enumerate(query_patterns):
509
- token = available_tokens[i % len(available_tokens)]
510
- future = executor.submit(fetch_pattern, pattern, token)
511
- futures.append(future)
512
-
513
- # Collect results
514
- for future in concurrent.futures.as_completed(futures):
515
- try:
516
- count = future.result()
517
- total_found += count
518
- except Exception as e:
519
- print(f" ✗ Parallel execution error: {str(e)}")
520
-
521
- return total_found
522
-
523
-
524
- def fetch_issues_with_time_partition(base_query, start_date, end_date, token_pool, issues_by_id, debug_limit=None, depth=0):
525
- """
526
- Fetch issues within a specific time range using time-based partitioning.
527
- Recursively splits the time range if hitting the 1000-result limit.
528
- Supports splitting by day, hour, minute, and second as needed.
529
-
530
- Args:
531
- base_query: Base GitHub search query
532
- start_date: Start date for time range
533
- end_date: End date for time range
534
- token_pool: TokenPool instance for rotating tokens
535
- issues_by_id: Dictionary to store issues (deduplicated by ID)
536
- debug_limit: If set, stops fetching after this many issues (for testing)
537
- depth: Current recursion depth (for tracking)
538
-
539
- Returns the number of issues found in this time partition.
540
- """
541
- # Calculate time difference
542
- time_diff = end_date - start_date
543
- total_seconds = time_diff.total_seconds()
544
-
545
- # Determine granularity and format dates accordingly
546
- if total_seconds >= 86400: # >= 1 day
547
- # Use day granularity (YYYY-MM-DD)
548
- start_str = start_date.strftime('%Y-%m-%d')
549
- end_str = end_date.strftime('%Y-%m-%d')
550
- elif total_seconds >= 3600: # >= 1 hour but < 1 day
551
- # Use hour granularity (YYYY-MM-DDTHH:MM:SSZ)
552
- start_str = start_date.strftime('%Y-%m-%dT%H:00:00Z')
553
- end_str = end_date.strftime('%Y-%m-%dT%H:59:59Z')
554
- elif total_seconds >= 60: # >= 1 minute but < 1 hour
555
- # Use minute granularity (YYYY-MM-DDTHH:MM:SSZ)
556
- start_str = start_date.strftime('%Y-%m-%dT%H:%M:00Z')
557
- end_str = end_date.strftime('%Y-%m-%dT%H:%M:59Z')
558
- else: # < 1 minute
559
- # Use second granularity (YYYY-MM-DDTHH:MM:SSZ)
560
- start_str = start_date.strftime('%Y-%m-%dT%H:%M:%SZ')
561
- end_str = end_date.strftime('%Y-%m-%dT%H:%M:%SZ')
562
-
563
- # Add date range to query
564
- query = f'{base_query} created:{start_str}..{end_str}'
565
-
566
- indent = " " + " " * depth
567
- print(f"{indent}Searching range {start_str} to {end_str}...")
568
-
569
- page = 1
570
- per_page = 100
571
- total_in_partition = 0
572
-
573
- while True:
574
- # Check debug limit
575
- if debug_limit is not None and total_in_partition >= debug_limit:
576
- print(f"{indent} 🐛 DEBUG MODE: Reached limit of {debug_limit} issues, stopping...")
577
- return total_in_partition
578
- url = 'https://api.github.com/search/issues'
579
- params = {
580
- 'q': query,
581
- 'per_page': per_page,
582
- 'page': page,
583
- 'sort': 'created',
584
- 'order': 'asc'
585
- }
586
-
587
- try:
588
- headers = token_pool.get_headers()
589
- response = request_with_backoff('GET', url, headers=headers, params=params)
590
- if response is None:
591
- print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
592
- return total_in_partition
593
-
594
- if response.status_code != 200:
595
- print(f"{indent} Error: HTTP {response.status_code} for range {start_str} to {end_str}")
596
- return total_in_partition
597
-
598
- data = response.json()
599
- total_count = data.get('total_count', 0)
600
- items = data.get('items', [])
601
-
602
- if not items:
603
- break
604
-
605
- # Add issues to global dict
606
- for issue in items:
607
- issue_id = issue.get('id')
608
- if issue_id and issue_id not in issues_by_id:
609
- issues_by_id[issue_id] = issue
610
- total_in_partition += 1
611
-
612
- # Check if we hit the 1000-result limit
613
- if total_count > 1000 and page == 10:
614
- print(f"{indent} ⚠️ Hit 1000-result limit ({total_count} total). Splitting time range...")
615
-
616
- # Determine how to split based on time range duration
617
- if total_seconds < 2: # Less than 2 seconds - can't split further
618
- print(f"{indent} ⚠️ Cannot split further (range < 2 seconds). Some results may be missing.")
619
- break
620
-
621
- elif total_seconds < 120: # Less than 2 minutes - split by seconds
622
- # Split into 2-4 parts depending on range
623
- num_splits = min(4, max(2, int(total_seconds / 30)))
624
- split_duration = time_diff / num_splits
625
- split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
626
-
627
- total_from_splits = 0
628
- for i in range(num_splits):
629
- split_start = split_dates[i]
630
- split_end = split_dates[i + 1]
631
- # Avoid overlapping ranges (add 1 second to start)
632
- if i > 0:
633
- split_start = split_start + timedelta(seconds=1)
634
-
635
- count = fetch_issues_with_time_partition(
636
- base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
637
- )
638
- total_from_splits += count
639
-
640
- return total_from_splits
641
-
642
- elif total_seconds < 7200: # Less than 2 hours - split by minutes
643
- # Split into 2-4 parts
644
- num_splits = min(4, max(2, int(total_seconds / 1800)))
645
- split_duration = time_diff / num_splits
646
- split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
647
-
648
- total_from_splits = 0
649
- for i in range(num_splits):
650
- split_start = split_dates[i]
651
- split_end = split_dates[i + 1]
652
- # Avoid overlapping ranges (add 1 minute to start)
653
- if i > 0:
654
- split_start = split_start + timedelta(minutes=1)
655
-
656
- count = fetch_issues_with_time_partition(
657
- base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
658
- )
659
- total_from_splits += count
660
-
661
- return total_from_splits
662
-
663
- elif total_seconds < 172800: # Less than 2 days - split by hours
664
- # Split into 2-4 parts
665
- num_splits = min(4, max(2, int(total_seconds / 43200)))
666
- split_duration = time_diff / num_splits
667
- split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
668
-
669
- total_from_splits = 0
670
- for i in range(num_splits):
671
- split_start = split_dates[i]
672
- split_end = split_dates[i + 1]
673
- # Avoid overlapping ranges (add 1 hour to start)
674
- if i > 0:
675
- split_start = split_start + timedelta(hours=1)
676
-
677
- count = fetch_issues_with_time_partition(
678
- base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
679
- )
680
- total_from_splits += count
681
-
682
- return total_from_splits
683
-
684
- else: # 2+ days - split by days
685
- days_diff = time_diff.days
686
-
687
- # Use aggressive splitting for large ranges or deep recursion
688
- # Split into 4 parts if range is > 30 days, otherwise split in half
689
- if days_diff > 30 or depth > 5:
690
- # Split into 4 parts for more aggressive partitioning
691
- quarter_diff = time_diff / 4
692
- split_dates = [
693
- start_date,
694
- start_date + quarter_diff,
695
- start_date + quarter_diff * 2,
696
- start_date + quarter_diff * 3,
697
- end_date
698
- ]
699
-
700
- total_from_splits = 0
701
- for i in range(4):
702
- split_start = split_dates[i]
703
- split_end = split_dates[i + 1]
704
- # Avoid overlapping ranges
705
- if i > 0:
706
- split_start = split_start + timedelta(days=1)
707
-
708
- count = fetch_issues_with_time_partition(
709
- base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
710
- )
711
- total_from_splits += count
712
-
713
- return total_from_splits
714
- else:
715
- # Binary split for smaller ranges
716
- mid_date = start_date + time_diff / 2
717
-
718
- # Recursively fetch both halves
719
- count1 = fetch_issues_with_time_partition(
720
- base_query, start_date, mid_date, token_pool, issues_by_id, debug_limit, depth + 1
721
- )
722
- count2 = fetch_issues_with_time_partition(
723
- base_query, mid_date + timedelta(days=1), end_date, token_pool, issues_by_id, debug_limit, depth + 1
724
- )
725
-
726
- return count1 + count2
727
-
728
- # Normal pagination: check if there are more pages
729
- if len(items) < per_page or page >= 10:
730
- break
731
-
732
- page += 1
733
- time.sleep(0.5) # Courtesy delay between pages
734
-
735
- except Exception as e:
736
- print(f"{indent} Error fetching range {start_str} to {end_str}: {str(e)}")
737
- return total_in_partition
738
-
739
- if total_in_partition > 0:
740
- print(f"{indent} ✓ Found {total_in_partition} issues in range {start_str} to {end_str}")
741
-
742
- return total_in_partition
743
 
744
 
745
  def extract_issue_metadata(issue):
746
  """
747
  Extract minimal issue metadata for efficient storage.
748
- Only keeps essential fields: html_url, created_at, closed_at, state_reason.
749
  Note: agent_name is not stored as it's inferred from the folder structure.
750
 
751
  Issue states:
@@ -759,7 +406,7 @@ def extract_issue_metadata(issue):
759
  state_reason = issue.get('state_reason')
760
 
761
  return {
762
- 'html_url': issue.get('html_url'),
763
  'created_at': created_at,
764
  'closed_at': closed_at,
765
  'state': state,
@@ -772,37 +419,48 @@ def extract_issue_metadata(issue):
772
  def calculate_issue_stats_from_metadata(metadata_list):
773
  """
774
  Calculate statistics from a list of issue metadata (lightweight objects).
775
- Works with minimal metadata: html_url, created_at, closed_at, state, state_reason.
776
 
777
  Returns a dictionary with comprehensive issue metrics.
778
 
779
  Resolved Rate is calculated as:
780
- resolved issues / total issues * 100
781
 
782
- Resolved Issues = issues closed as completed (state_reason="completed")
783
- We do NOT count issues closed as not planned (state_reason="not_planned")
 
 
784
  """
785
  total_issues = len(metadata_list)
786
 
787
- # Count resolved issues - those with state_reason="completed"
788
- resolved = sum(1 for issue_meta in metadata_list
789
- if issue_meta.get('state_reason') == 'completed')
790
 
791
- # Calculate resolved rate
792
- resolved_rate = (resolved / total_issues * 100) if total_issues > 0 else 0
 
 
 
 
793
 
794
  return {
795
  'total_issues': total_issues,
796
- 'resolved_issues': resolved,
 
797
  'resolved_rate': round(resolved_rate, 2),
798
  }
799
 
800
 
801
- def calculate_monthly_metrics_by_agent():
802
  """
803
- Calculate monthly metrics for all agents for visualization.
804
  Loads data directly from SWE-Arena/issue_metadata dataset.
805
 
 
 
 
 
806
  Returns:
807
  dict: {
808
  'agents': list of agent names,
@@ -865,18 +523,21 @@ def calculate_monthly_metrics_by_agent():
865
  for month in months:
866
  issues_in_month = month_dict.get(month, [])
867
 
868
- # Count resolved issues (those with state_reason="completed")
869
- resolved_count = sum(1 for issue in issues_in_month if issue.get('state_reason') == 'completed')
 
 
 
870
 
871
  # Total issues created in this month
872
  total_count = len(issues_in_month)
873
 
874
- # Calculate resolved rate
875
- resolved_rate = (resolved_count / total_count * 100) if total_count > 0 else None
876
 
877
  resolved_rates.append(resolved_rate)
878
  total_issues_list.append(total_count)
879
- resolved_issues_list.append(resolved_count)
880
 
881
  result_data[agent_name] = {
882
  'resolved_rates': resolved_rates,
@@ -884,8 +545,25 @@ def calculate_monthly_metrics_by_agent():
884
  'resolved_issues': resolved_issues_list
885
  }
886
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
887
  return {
888
- 'agents': sorted(list(agent_month_data.keys())),
889
  'months': months,
890
  'data': result_data
891
  }
@@ -921,26 +599,14 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
921
  """
922
  Save issue metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
923
  Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
924
- In debug mode, saves to in-memory cache only.
925
 
926
- This function APPENDS new metadata and DEDUPLICATES by html_url.
927
- Uses batch folder upload to minimize commits (1 commit per agent instead of 1 per file).
928
 
929
  Args:
930
  metadata_list: List of issue metadata dictionaries
931
  agent_identifier: GitHub identifier of the agent (used as folder name)
932
  """
933
- # Skip saving to HF in debug mode - use in-memory cache instead
934
- if DEBUG_MODE:
935
- global DEBUG_ISSUE_METADATA_CACHE
936
- # Merge with existing cache, deduplicating by html_url
937
- existing = {issue['html_url']: issue for issue in DEBUG_ISSUE_METADATA_CACHE[agent_identifier] if issue.get('html_url')}
938
- new = {issue['html_url']: issue for issue in metadata_list if issue.get('html_url')}
939
- existing.update(new)
940
- DEBUG_ISSUE_METADATA_CACHE[agent_identifier] = list(existing.values())
941
- print(f"🐛 DEBUG MODE: Saved to in-memory cache only ({len(metadata_list)} issues) - NOT saved to HuggingFace")
942
- return True
943
-
944
  import tempfile
945
  import shutil
946
 
@@ -950,63 +616,43 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
950
  if not token:
951
  raise Exception("No HuggingFace token found")
952
 
953
- api = HfApi()
 
 
 
 
 
 
 
954
 
955
  # Create temporary directory for batch upload
956
  temp_dir = tempfile.mkdtemp()
957
  agent_folder = os.path.join(temp_dir, agent_identifier)
958
  os.makedirs(agent_folder, exist_ok=True)
959
 
960
- # Group by exact date (year, month, day)
961
- grouped = group_metadata_by_date(metadata_list)
962
-
963
- print(f"📤 Preparing batch upload for {agent_identifier} ({len(grouped)} daily files)...")
964
 
 
965
  for (issue_year, month, day), day_metadata in grouped.items():
966
- # New structure: [agent_identifier]/YYYY.MM.DD.jsonl
967
  filename = f"{agent_identifier}/{issue_year}.{month:02d}.{day:02d}.jsonl"
968
- local_filename = f"{issue_year}.{month:02d}.{day:02d}.jsonl"
969
- local_path = os.path.join(agent_folder, local_filename)
970
 
971
- print(f" Preparing {len(day_metadata)} issues for {filename}...")
 
972
 
973
- # Download existing file if it exists
974
- existing_metadata = []
975
- try:
976
- file_path = hf_hub_download(
977
- repo_id=ISSUE_METADATA_REPO,
978
- filename=filename,
979
- repo_type="dataset",
980
- token=token
981
- )
982
- existing_metadata = load_jsonl(file_path)
983
- print(f" Found {len(existing_metadata)} existing issues in {filename}")
984
- except Exception:
985
- print(f" No existing file found for {filename}, creating new")
986
 
987
- # Merge and deduplicate by html_url
988
- existing_by_url = {meta['html_url']: meta for meta in existing_metadata if meta.get('html_url')}
989
- new_by_url = {meta['html_url']: meta for meta in day_metadata if meta.get('html_url')}
990
-
991
- # Update with new data (new data overwrites old)
992
- existing_by_url.update(new_by_url)
993
- merged_metadata = list(existing_by_url.values())
994
-
995
- # Save to temporary folder
996
- save_jsonl(local_path, merged_metadata)
997
- print(f" ✓ Prepared {len(merged_metadata)} total issues for {local_filename}")
998
-
999
- # Upload entire folder in a single commit
1000
- print(f"📤 Uploading folder {agent_identifier} to HuggingFace (1 commit)...")
1001
- api.upload_folder(
1002
- folder_path=agent_folder,
1003
- path_in_repo=agent_identifier,
1004
  repo_id=ISSUE_METADATA_REPO,
1005
- repo_type="dataset",
1006
- token=token,
1007
- commit_message=f"Update metadata for {agent_identifier}"
1008
  )
1009
- print(f" ✓ Successfully uploaded {len(grouped)} files in 1 commit")
1010
 
1011
  return True
1012
 
@@ -1022,8 +668,7 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
1022
  def load_issue_metadata():
1023
  """
1024
  Load issue metadata from the last LEADERBOARD_TIME_FRAME_DAYS only.
1025
- In debug mode, loads from in-memory cache if available.
1026
-
1027
  Structure: [agent_identifier]/YYYY.MM.DD.jsonl
1028
 
1029
  Returns:
@@ -1034,28 +679,6 @@ def load_issue_metadata():
1034
  current_time = datetime.now(timezone.utc)
1035
  cutoff_date = current_time - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
1036
 
1037
- # In debug mode, check in-memory cache first
1038
- if DEBUG_MODE and DEBUG_ISSUE_METADATA_CACHE:
1039
- all_metadata = []
1040
- for agent_identifier, metadata_list in DEBUG_ISSUE_METADATA_CACHE.items():
1041
- for issue_meta in metadata_list:
1042
- # Filter by time frame in debug mode too
1043
- created_at = issue_meta.get('created_at')
1044
- if created_at:
1045
- try:
1046
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
1047
- if dt < cutoff_date:
1048
- continue # Skip issues outside time frame
1049
- except Exception:
1050
- pass # Keep issues with unparseable dates
1051
-
1052
- issue_with_agent = issue_meta.copy()
1053
- issue_with_agent['agent_identifier'] = agent_identifier
1054
- all_metadata.append(issue_with_agent)
1055
- if all_metadata:
1056
- print(f"🐛 DEBUG MODE: Loading issue metadata from in-memory cache from last {LEADERBOARD_TIME_FRAME_DAYS} days ({len(all_metadata)} issues)")
1057
- return all_metadata
1058
-
1059
  try:
1060
  api = HfApi()
1061
  token = get_hf_token()
@@ -1086,7 +709,7 @@ def load_issue_metadata():
1086
  # Skip files with unparseable dates
1087
  continue
1088
 
1089
- print(f"📥 Loading issue metadata from last {LEADERBOARD_TIME_FRAME_DAYS} days ({len(time_frame_files)} daily files across all agents)...")
1090
 
1091
  all_metadata = []
1092
  for filename in time_frame_files:
@@ -1193,13 +816,12 @@ def get_latest_issue_date_for_agent(agent_identifier):
1193
  return None
1194
 
1195
 
1196
- def get_daily_files_last_n_months(agent_identifier, n_months=6):
1197
  """
1198
- Get list of daily file paths for an agent from the last N months.
1199
 
1200
  Args:
1201
  agent_identifier: GitHub identifier of the agent
1202
- n_months: Number of months to look back (default: 6)
1203
 
1204
  Returns:
1205
  List of file paths in format: [agent_identifier]/YYYY.MM.DD.jsonl
@@ -1208,9 +830,9 @@ def get_daily_files_last_n_months(agent_identifier, n_months=6):
1208
  api = HfApi()
1209
  token = get_hf_token()
1210
 
1211
- # Calculate date range
1212
  today = datetime.now(timezone.utc)
1213
- n_months_ago = today - timedelta(days=30 * n_months)
1214
 
1215
  # List all files in the repository
1216
  files = api.list_repo_files(repo_id=ISSUE_METADATA_REPO, repo_type="dataset")
@@ -1236,8 +858,8 @@ def get_daily_files_last_n_months(agent_identifier, n_months=6):
1236
  file_year, file_month, file_day = map(int, date_components)
1237
  file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
1238
 
1239
- # Include if within last n_months
1240
- if n_months_ago <= file_date <= today:
1241
  recent_files.append(filename)
1242
  except Exception:
1243
  continue
@@ -1249,169 +871,6 @@ def get_daily_files_last_n_months(agent_identifier, n_months=6):
1249
  return []
1250
 
1251
 
1252
-
1253
-
1254
- def fetch_issue_current_status(issue_url, token):
1255
- """
1256
- Fetch the current status of a single issue from GitHub API.
1257
-
1258
- Args:
1259
- issue_url: Issue HTML URL (e.g., https://github.com/owner/repo/issues/123)
1260
- token: GitHub API token
1261
-
1262
- Returns:
1263
- Dictionary with updated state, state_reason, and closed_at, or None if failed
1264
- """
1265
- try:
1266
- # Convert HTML URL to API URL
1267
- # https://github.com/owner/repo/issues/123 -> https://api.github.com/repos/owner/repo/issues/123
1268
- parts = issue_url.replace('https://github.com/', '').split('/')
1269
- if len(parts) < 4:
1270
- return None
1271
-
1272
- owner, repo, issue_word, issue_number = parts[0], parts[1], parts[2], parts[3]
1273
- api_url = f'https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}'
1274
-
1275
- headers = {'Authorization': f'token {token}'} if token else {}
1276
- response = request_with_backoff('GET', api_url, headers=headers, max_retries=3)
1277
-
1278
- if response is None or response.status_code != 200:
1279
- return None
1280
-
1281
- issue_data = response.json()
1282
- state = issue_data.get('state')
1283
- state_reason = issue_data.get('state_reason')
1284
- closed_at = issue_data.get('closed_at')
1285
-
1286
- return {
1287
- 'state': state,
1288
- 'state_reason': state_reason,
1289
- 'closed_at': closed_at
1290
- }
1291
-
1292
- except Exception as e:
1293
- print(f" Error fetching issue status for {issue_url}: {str(e)}")
1294
- return None
1295
-
1296
-
1297
- def refresh_open_issues_for_agent(agent_identifier, token):
1298
- """
1299
- Refresh status for all open issues from the last 6 months for an agent.
1300
- Only updates issues that are still open (state="open" or no state_reason).
1301
-
1302
- This implements the smart update strategy:
1303
- - Skip issues that are already closed/resolved
1304
- - Fetch current status for open issues
1305
- - Update and save back to daily files
1306
-
1307
- Args:
1308
- agent_identifier: GitHub identifier of the agent
1309
- token: GitHub API token
1310
-
1311
- Returns:
1312
- Tuple: (total_checked, updated_count)
1313
- """
1314
- print(f"\n🔄 Refreshing open issues for {agent_identifier} (last 6 months)...")
1315
-
1316
- try:
1317
- # Get daily files from last 6 months
1318
- recent_files = get_daily_files_last_n_months(agent_identifier, n_months=6)
1319
-
1320
- if not recent_files:
1321
- print(f" No recent files found for {agent_identifier}")
1322
- return (0, 0)
1323
-
1324
- print(f" Found {len(recent_files)} daily files to check")
1325
-
1326
- total_checked = 0
1327
- updated_count = 0
1328
-
1329
- # Process each file
1330
- for filename in recent_files:
1331
- try:
1332
- # Download file
1333
- file_path = hf_hub_download(
1334
- repo_id=ISSUE_METADATA_REPO,
1335
- filename=filename,
1336
- repo_type="dataset",
1337
- token=get_hf_token()
1338
- )
1339
- issues = load_jsonl(file_path)
1340
-
1341
- if not issues:
1342
- continue
1343
-
1344
- updated_issues = []
1345
- file_had_updates = False
1346
-
1347
- # Check each issue
1348
- for issue in issues:
1349
- # Skip if already closed (has a state_reason)
1350
- if issue.get('state') == 'closed' and issue.get('state_reason'):
1351
- updated_issues.append(issue)
1352
- continue
1353
-
1354
- # Issue is open, fetch current status
1355
- total_checked += 1
1356
- issue_url = issue.get('html_url')
1357
-
1358
- if not issue_url:
1359
- updated_issues.append(issue)
1360
- continue
1361
-
1362
- current_status = fetch_issue_current_status(issue_url, token)
1363
-
1364
- if current_status:
1365
- # Check if status changed (now closed)
1366
- if current_status['state'] == 'closed':
1367
- print(f" ✓ Issue status changed: {issue_url}")
1368
- issue['state'] = current_status['state']
1369
- issue['state_reason'] = current_status['state_reason']
1370
- issue['closed_at'] = current_status['closed_at']
1371
- updated_count += 1
1372
- file_had_updates = True
1373
-
1374
- updated_issues.append(issue)
1375
- time.sleep(0.1) # Rate limiting courtesy delay
1376
-
1377
- # Save file if there were updates
1378
- if file_had_updates:
1379
- # Extract filename components for local save
1380
- parts = filename.split('/')
1381
- local_filename = parts[-1] # Just YYYY.MM.DD.jsonl
1382
-
1383
- # Save locally
1384
- save_jsonl(local_filename, updated_issues)
1385
-
1386
- try:
1387
- # Upload back to HuggingFace
1388
- api = HfApi()
1389
- upload_with_retry(
1390
- api=api,
1391
- path_or_fileobj=local_filename,
1392
- path_in_repo=filename,
1393
- repo_id=ISSUE_METADATA_REPO,
1394
- repo_type="dataset",
1395
- token=get_hf_token()
1396
- )
1397
- print(f" 💾 Updated {filename}")
1398
- finally:
1399
- # Always clean up local file, even if upload fails
1400
- if os.path.exists(local_filename):
1401
- os.remove(local_filename)
1402
-
1403
- except Exception as e:
1404
- print(f" Warning: Could not process {filename}: {str(e)}")
1405
- continue
1406
-
1407
- print(f" ✅ Refresh complete: {total_checked} open issues checked, {updated_count} updated")
1408
- return (total_checked, updated_count)
1409
-
1410
- except Exception as e:
1411
- print(f" ✗ Error refreshing issues for {agent_identifier}: {str(e)}")
1412
- return (0, 0)
1413
-
1414
-
1415
  # =============================================================================
1416
  # HUGGINGFACE DATASET OPERATIONS
1417
  # =============================================================================
@@ -1428,8 +887,6 @@ def load_agents_from_hf():
1428
  # Filter for JSON files only
1429
  json_files = [f for f in files if f.endswith('.json')]
1430
 
1431
- print(f"Found {len(json_files)} agent files in {AGENTS_REPO}")
1432
-
1433
  # Download and parse each JSON file
1434
  for json_file in json_files:
1435
  try:
@@ -1441,6 +898,19 @@ def load_agents_from_hf():
1441
 
1442
  with open(file_path, 'r') as f:
1443
  agent_data = json.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
1444
  agents.append(agent_data)
1445
 
1446
  except Exception as e:
@@ -1552,188 +1022,102 @@ def save_agent_to_hf(data):
1552
  # DATA MANAGEMENT
1553
  # =============================================================================
1554
 
1555
- def fetch_new_issues_for_agent(agent_identifier, token_pool, query_patterns=None, use_parallel=True):
1556
  """
1557
- Fetch and save new issues for an agent from yesterday 12am UTC to today 12am UTC.
 
1558
 
1559
- Args:
1560
- agent_identifier: GitHub identifier of the agent
1561
- token_pool: TokenPool instance for rotating tokens
1562
- query_patterns: List of query patterns to search (if None, uses default)
1563
- use_parallel: Whether to use parallel execution (default: True)
1564
-
1565
- Returns:
1566
- Number of new issues found and saved
1567
  """
1568
- if not query_patterns:
1569
- query_patterns = [
1570
- 'label:good-first-issue',
1571
- 'label:bug',
1572
- 'label:enhancement',
1573
- 'label:documentation',
1574
- ]
1575
-
1576
- # Calculate time range: yesterday 12am UTC to today 12am UTC
1577
- now_utc = datetime.now(timezone.utc)
1578
- today_midnight = now_utc.replace(hour=0, minute=0, second=0, microsecond=0)
1579
- yesterday_midnight = today_midnight - timedelta(days=1)
1580
-
1581
- print(f"\n 📥 Fetching new issues for {agent_identifier}...")
1582
- print(f" Time range: {yesterday_midnight.isoformat()} to {today_midnight.isoformat()}")
1583
-
1584
- total_new_issues = 0
1585
- issues_by_id = {}
1586
-
1587
- # Add agent identifier to query patterns
1588
- full_query_patterns = [f'author:{agent_identifier} {pattern}' for pattern in query_patterns]
1589
-
1590
- # Use parallel execution if enabled and multiple patterns exist and not in debug mode
1591
- if use_parallel and len(full_query_patterns) > 1 and not DEBUG_MODE:
1592
- try:
1593
- total_new_issues = fetch_issues_parallel(
1594
- full_query_patterns,
1595
- yesterday_midnight,
1596
- today_midnight,
1597
- token_pool,
1598
- issues_by_id,
1599
- debug_limit=10 if DEBUG_MODE else None
1600
- )
1601
- except Exception as e:
1602
- print(f" ⚠️ Parallel execution failed, falling back to sequential: {str(e)}")
1603
- use_parallel = False
1604
-
1605
- # Fall back to sequential if parallel is disabled or failed
1606
- if not use_parallel or len(full_query_patterns) == 1 or DEBUG_MODE:
1607
- for base_query in full_query_patterns:
1608
- try:
1609
- count = fetch_issues_with_time_partition(
1610
- base_query,
1611
- yesterday_midnight,
1612
- today_midnight,
1613
- token_pool,
1614
- issues_by_id,
1615
- debug_limit=10 if DEBUG_MODE else None,
1616
- depth=0
1617
- )
1618
- total_new_issues += count
1619
-
1620
- except Exception as e:
1621
- print(f" ⚠️ Error fetching pattern '{base_query}': {str(e)}")
1622
- continue
1623
-
1624
- # Extract metadata from fetched issues
1625
- if issues_by_id:
1626
- metadata_list = [extract_issue_metadata(issue) for issue in issues_by_id.values()]
1627
-
1628
- # Save to HuggingFace
1629
- success = save_issue_metadata_to_hf(metadata_list, agent_identifier)
1630
-
1631
- if success:
1632
- print(f" ✓ Saved {len(metadata_list)} new issues for {agent_identifier}")
1633
- else:
1634
- print(f" ✗ Failed to save issues for {agent_identifier}")
1635
-
1636
- return total_new_issues
1637
-
1638
 
1639
- def update_all_agents_incremental():
1640
- """
1641
- Daily incremental update that:
1642
- 1. Refreshes all open issues from the last (LEADERBOARD_TIME_FRAME_DAYS - 1) days
1643
- to check if they've been closed
1644
- 2. Fetches and adds new issues from yesterday 12am UTC to today 12am UTC
1645
 
1646
- Runs daily at 12:00 AM UTC as a scheduled task.
1647
- """
1648
  print(f"\n{'='*80}")
1649
- print(f"🕛 Daily incremental mining started at {datetime.now(timezone.utc).isoformat()}")
1650
- print(f"{'='*80}")
 
 
 
1651
 
 
1652
  try:
1653
- # Load all GitHub tokens and create token pool
1654
- tokens = get_github_tokens()
1655
- token_pool = TokenPool(tokens)
1656
-
1657
- # Get first token for functions that still need single token
1658
- token = tokens[0] if tokens else None
1659
-
1660
- # Load agent metadata from HuggingFace
1661
- agents = load_agents_from_hf()
1662
- if not agents:
1663
- print("No agents found in HuggingFace dataset")
1664
- return
1665
-
1666
- print(f"\n🔄 Phase 1: Refreshing open issues from last {LEADERBOARD_TIME_FRAME_DAYS - 1} days")
1667
- print(f" (checking if previously open issues have been closed)")
1668
-
1669
- total_checked = 0
1670
- total_updated = 0
1671
-
1672
- # Step 1: Refresh all open issues from the last (LEADERBOARD_TIME_FRAME_DAYS - 1) days
1673
- for agent in agents:
1674
- identifier = agent.get('github_identifier')
1675
- if not identifier:
1676
- continue
1677
-
1678
- try:
1679
- checked, updated = refresh_open_issues_for_agent(identifier, token)
1680
- total_checked += checked
1681
- total_updated += updated
1682
- except Exception as e:
1683
- print(f" ⚠️ Error refreshing {identifier}: {str(e)}")
1684
- continue
1685
-
1686
- print(f"\n ✅ Phase 1 complete: {total_checked} open issues checked, {total_updated} updated")
1687
-
1688
- print(f"\n📥 Phase 2: Fetching new issues from yesterday 12am UTC to today 12am UTC")
1689
-
1690
- total_new_issues = 0
1691
-
1692
- # Step 2: Fetch new issues for each agent
1693
- for agent in agents:
1694
- identifier = agent.get('github_identifier')
1695
- if not identifier:
1696
- continue
1697
 
1698
- try:
1699
- new_count = fetch_new_issues_for_agent(identifier, token_pool)
1700
- total_new_issues += new_count
1701
- except Exception as e:
1702
- print(f" ⚠️ Error fetching new issues for {identifier}: {str(e)}")
1703
- continue
1704
 
1705
- print(f"\n ✅ Phase 2 complete: {total_new_issues} new issues fetched")
 
 
 
 
 
 
 
 
1706
 
1707
- # Load updated metadata and calculate stats
1708
- print(f"\n📊 Calculating updated statistics...")
1709
- all_metadata = load_issue_metadata()
 
1710
 
1711
- for agent in agents:
1712
- identifier = agent.get('github_identifier')
1713
- agent_name = agent.get('agent_name', 'Unknown')
1714
 
1715
- if not identifier:
1716
- continue
 
1717
 
1718
- try:
1719
- # Filter metadata for this agent
1720
- agent_metadata = [issue for issue in all_metadata if issue.get('agent_identifier') == identifier]
 
1721
 
1722
- # Calculate stats from metadata
1723
- stats = calculate_issue_stats_from_metadata(agent_metadata)
1724
 
1725
- print(f"{identifier}: {stats['total_issues']} issues, {stats['resolved_rate']}% resolved")
1726
 
1727
- except Exception as e:
1728
- print(f" ✗ Error processing {identifier}: {str(e)}")
1729
- continue
 
 
 
 
 
 
 
1730
 
1731
- print(f"\n✅ Daily incremental mining completed at {datetime.now(timezone.utc).isoformat()}")
 
 
 
 
 
1732
 
1733
- except Exception as e:
1734
- print(f" Daily incremental mining failed: {str(e)}")
1735
- import traceback
1736
- traceback.print_exc()
 
 
 
 
1737
 
1738
 
1739
  def construct_leaderboard_from_metadata():
@@ -1779,6 +1163,14 @@ def construct_leaderboard_from_metadata():
1779
  # UI FUNCTIONS
1780
  # =============================================================================
1781
 
 
 
 
 
 
 
 
 
1782
  def create_monthly_metrics_plot():
1783
  """
1784
  Create a Plotly figure with dual y-axes showing:
@@ -1786,8 +1178,9 @@ def create_monthly_metrics_plot():
1786
  - Right y-axis: Total Issues created as bar charts
1787
 
1788
  Each agent gets a unique color for both their line and bars.
 
1789
  """
1790
- metrics = calculate_monthly_metrics_by_agent()
1791
 
1792
  if not metrics['agents'] or not metrics['months']:
1793
  # Return an empty figure with a message
@@ -1808,19 +1201,16 @@ def create_monthly_metrics_plot():
1808
  # Create figure with secondary y-axis
1809
  fig = make_subplots(specs=[[{"secondary_y": True}]])
1810
 
1811
- # Define colors for agents (using a color palette)
1812
- colors = [
1813
- '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
1814
- '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'
1815
- ]
1816
-
1817
  agents = metrics['agents']
1818
  months = metrics['months']
1819
  data = metrics['data']
1820
 
 
 
 
1821
  # Add traces for each agent
1822
- for idx, agent_name in enumerate(agents):
1823
- color = colors[idx % len(colors)]
1824
  agent_data = data[agent_name]
1825
 
1826
  # Add line trace for resolved rate (left y-axis)
@@ -1941,7 +1331,7 @@ def get_leaderboard_dataframe():
1941
  return df
1942
 
1943
 
1944
- def submit_agent(identifier, agent_name, organization, description, website):
1945
  """
1946
  Submit a new agent to the leaderboard.
1947
  Validates input and saves submission. Issue data will be populated by daily incremental updates.
@@ -1951,16 +1341,15 @@ def submit_agent(identifier, agent_name, organization, description, website):
1951
  return "❌ GitHub identifier is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1952
  if not agent_name or not agent_name.strip():
1953
  return "❌ Agent name is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1954
- if not organization or not organization.strip():
1955
- return "❌ Organization name is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1956
  if not website or not website.strip():
1957
  return "❌ Website URL is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1958
 
1959
  # Clean inputs
1960
  identifier = identifier.strip()
1961
  agent_name = agent_name.strip()
1962
- organization = organization.strip()
1963
- description = description.strip()
1964
  website = website.strip()
1965
 
1966
  # Validate GitHub identifier
@@ -1978,9 +1367,8 @@ def submit_agent(identifier, agent_name, organization, description, website):
1978
  # Create submission
1979
  submission = {
1980
  'agent_name': agent_name,
1981
- 'organization': organization,
1982
  'github_identifier': identifier,
1983
- 'description': description,
1984
  'website': website,
1985
  }
1986
 
@@ -2000,51 +1388,35 @@ def submit_agent(identifier, agent_name, organization, description, website):
2000
  # GRADIO APPLICATION
2001
  # =============================================================================
2002
 
2003
- # Initialize data before creating UI
2004
- if DEBUG_MODE:
2005
- print("\n" + "="*80)
2006
- print("🐛 DEBUG MODE ENABLED 🐛")
2007
- print("="*80)
2008
- print("Issue retrieval is limited to 10 issues per query pattern per agent")
2009
-
2010
- # Show how debug mode was enabled
2011
- if args.debug:
2012
- print("Enabled via: command-line flag '--debug'")
2013
- print("To disable: run without '--debug' flag")
2014
- else:
2015
- print("Enabled via: DEBUG_MODE environment variable")
2016
- print("To disable: run with '--no-debug' flag or unset DEBUG_MODE")
2017
-
2018
- print("="*80 + "\n")
2019
- else:
2020
- print("\n🚀 Starting in PRODUCTION MODE - full issue retrieval enabled")
2021
- if args.no_debug:
2022
- print(" (Explicitly set via '--no-debug' flag)")
2023
- print()
2024
-
2025
- # Start APScheduler for daily regular issue mining at 12:00 AM UTC
2026
  scheduler = BackgroundScheduler(timezone="UTC")
2027
  scheduler.add_job(
2028
- update_all_agents_incremental,
2029
- trigger=CronTrigger(hour=0, minute=0), # 12:00 AM UTC daily
2030
- id='daily_regular_mining',
2031
- name='Daily Regular Issue Mining',
2032
  replace_existing=True
2033
  )
2034
  scheduler.start()
2035
- print("✓ Scheduler started: Daily regular issue mining at 12:00 AM UTC")
 
 
 
 
2036
 
2037
  # Create Gradio interface
2038
  with gr.Blocks(title="SWE Agent Issue Leaderboard", theme=gr.themes.Soft()) as app:
2039
 
2040
  gr.Markdown("# 🏆 SWE Agent Issue Leaderboard")
2041
- gr.Markdown("Track and compare GitHub issue resolution statistics for SWE agents (last 6 months)")
2042
-
2043
  with gr.Tabs():
2044
-
2045
  # Leaderboard Tab
2046
  with gr.Tab("📊 Leaderboard"):
2047
- gr.Markdown("*All statistics are based on issues from the last 6 months*")
2048
  leaderboard_table = Leaderboard(
2049
  value=get_leaderboard_dataframe(),
2050
  datatype=LEADERBOARD_COLUMNS,
@@ -2078,14 +1450,9 @@ with gr.Blocks(title="SWE Agent Issue Leaderboard", theme=gr.themes.Soft()) as a
2078
  )
2079
 
2080
  with gr.Column():
2081
- organization_input = gr.Textbox(
2082
- label="Organization*",
2083
- placeholder="Your organization or team name"
2084
- )
2085
- description_input = gr.Textbox(
2086
- label="Description",
2087
- placeholder="Brief description of your agent",
2088
- lines=3
2089
  )
2090
  website_input = gr.Textbox(
2091
  label="Website",
@@ -2104,7 +1471,7 @@ with gr.Blocks(title="SWE Agent Issue Leaderboard", theme=gr.themes.Soft()) as a
2104
  # Event handler
2105
  submit_button.click(
2106
  fn=submit_agent,
2107
- inputs=[github_input, name_input, organization_input, description_input, website_input],
2108
  outputs=[submission_status, leaderboard_table, monthly_plot]
2109
  )
2110
 
 
3
  import json
4
  import os
5
  import time
6
+ import tempfile
7
  import requests
8
  from datetime import datetime, timezone, timedelta
9
  from collections import defaultdict
10
  from huggingface_hub import HfApi, hf_hub_download
 
 
11
  from dotenv import load_dotenv
12
  import pandas as pd
13
  import random
 
14
  import plotly.graph_objects as go
15
  from plotly.subplots import make_subplots
16
  from apscheduler.schedulers.background import BackgroundScheduler
17
  from apscheduler.triggers.cron import CronTrigger
18
+ from google.cloud import bigquery
19
 
20
  # Load environment variables
21
  load_dotenv()
22
 
 
 
 
 
 
 
 
 
23
  # =============================================================================
24
  # CONFIGURATION
25
  # =============================================================================
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  AGENTS_REPO = "SWE-Arena/swe_agents" # HuggingFace dataset for agent metadata
28
  ISSUE_METADATA_REPO = "SWE-Arena/issue_metadata" # HuggingFace dataset for issue metadata
29
+ LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for leaderboard
30
+ UPDATE_TIME_FRAME_DAYS = 30 # How often to re-mine data via BigQuery
31
 
32
  LEADERBOARD_COLUMNS = [
33
  ("Agent Name", "string"),
 
83
  """
84
  if not date_string or date_string == 'N/A':
85
  return 'N/A'
86
+
87
  try:
88
  # Parse the date string (handles both with and without microseconds)
89
  if '.' in date_string:
 
92
  else:
93
  # Already in correct format or GitHub format
94
  return date_string
95
+
96
  # Convert to standardized format
97
  return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
98
  except Exception as e:
 
101
 
102
 
103
  # =============================================================================
104
+ # BIGQUERY OPERATIONS
105
  # =============================================================================
106
 
107
def get_bigquery_client():
    """
    Initialize a BigQuery client using credentials from an environment variable.

    Expects the GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable to
    contain the service account JSON credentials as a string. The JSON is
    written to a temporary file that GOOGLE_APPLICATION_CREDENTIALS points at
    while the client is constructed.

    Returns:
        A bigquery.Client instance.

    Raises:
        ValueError: If GOOGLE_APPLICATION_CREDENTIALS_JSON is missing or empty.
    """
    creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
    if not creds_json:
        raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")

    # Remember what the variable held so this call does not leave the process
    # pointing at a credentials file that no longer exists.
    previous_path = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')

    temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json')
    temp_path = temp_file.name
    try:
        with temp_file:
            temp_file.write(creds_json)

        # Point the Google auth machinery at the temporary credentials file
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path
        return bigquery.Client()
    finally:
        # Always restore the environment and remove the secret from disk,
        # including when writing the file or building the client fails.
        if previous_path is None:
            os.environ.pop('GOOGLE_APPLICATION_CREDENTIALS', None)
        else:
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = previous_path
        os.unlink(temp_path)
 
 
 
135
 
 
 
 
 
 
 
136
 
137
def generate_table_union_statements(start_date, end_date):
    """
    Build the UNION ALL chain over the githubarchive.day tables for a date range.

    One `SELECT * FROM` clause is produced per day, starting at start_date and
    stepping one day at a time while the stepped value is still before end_date.

    Args:
        start_date: Start datetime (first table in the chain)
        end_date: End datetime (exclusive bound for the stepping)

    Returns:
        String of SELECT statements joined with " UNION ALL "; empty string
        when start_date is not before end_date.
    """
    select_clauses = []
    day = start_date

    while day < end_date:
        # Daily tables are named by their YYYYMMDD date
        select_clauses.append(f"SELECT * FROM `githubarchive.day.{day:%Y%m%d}`")
        day = day + timedelta(days=1)

    return " UNION ALL ".join(select_clauses)
159
 
 
 
 
160
 
161
def _identifier_variants(identifier):
    """Return the identifier plus its '[bot]'-stripped form when the two differ."""
    stripped = identifier.replace('[bot]', '')
    if stripped == identifier:
        return [identifier]
    return [identifier, stripped]


def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
    """
    Fetch issue metadata for ALL agents using ONE comprehensive BigQuery query.

    This query reads IssuesEvent and IssueCommentEvent rows from GitHub Archive,
    keeps the events whose issue author, commenter, or assignee is one of the
    given identifiers, and deduplicates to the most recent such event per issue.

    Args:
        client: BigQuery client instance
        identifiers: List of GitHub usernames/bot identifiers
        start_date: Start datetime (timezone-aware)
        end_date: End datetime (timezone-aware)

    Returns:
        Dictionary mapping agent identifier to list of issue metadata:
        {
            'agent-identifier': [
                {
                    'url': Issue URL,
                    'created_at': Issue creation timestamp,
                    'closed_at': Close timestamp (if closed, else None),
                    'state_reason': Reason for closure (completed/not_planned/etc.)
                },
                ...
            ],
            ...
        }
        Agents with no matching issues are omitted. An empty dict is returned
        when `identifiers` is empty or when the query fails (the error is
        printed with a traceback rather than raised).
    """
    if not identifiers:
        # Nothing to look up; avoid issuing a query that cannot match anything.
        return {}

    print(f"\n🔍 Querying BigQuery for ALL {len(identifiers)} agents in ONE QUERY")
    print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

    # Generate table UNION statements for issue events
    issue_tables = generate_table_union_statements(start_date, end_date)

    # Logins to match: every identifier plus its version without the [bot] suffix.
    # Sorted so the parameter value is deterministic.
    lookup_logins = sorted({
        variant
        for identifier in identifiers
        for variant in _identifier_variants(identifier)
    })

    # The identifiers are passed as a query parameter (@identifiers) instead of
    # being spliced into the SQL text, so a login containing a quote cannot
    # break or alter the statement. Only the generated table names are interpolated.
    query = f"""
    WITH issue_events AS (
      -- Get all issue events and comment events for ALL agents
      SELECT
        JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') as url,
        JSON_EXTRACT_SCALAR(payload, '$.issue.created_at') as created_at,
        JSON_EXTRACT_SCALAR(payload, '$.issue.closed_at') as closed_at,
        JSON_EXTRACT_SCALAR(payload, '$.issue.state_reason') as state_reason,
        JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') as author,
        JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') as assignee,
        JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') as commenter,
        JSON_EXTRACT_SCALAR(payload, '$.issue.number') as issue_number,
        repo.name as repo_name,
        created_at as event_time
      FROM (
        {issue_tables}
      )
      WHERE
        type IN ('IssuesEvent', 'IssueCommentEvent')
        -- Exclude pull requests (they have pull_request field)
        AND JSON_EXTRACT(payload, '$.issue.pull_request') IS NULL
        AND JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') IS NOT NULL
        -- Filter by author OR commenter OR assignee
        AND (
          JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') IN UNNEST(@identifiers)
          OR JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') IN UNNEST(@identifiers)
          OR JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') IN UNNEST(@identifiers)
        )
    ),

    latest_states AS (
      -- Deduplicate to get latest state for each issue
      SELECT
        url,
        created_at,
        closed_at,
        state_reason,
        author,
        assignee,
        commenter
      FROM issue_events
      QUALIFY ROW_NUMBER() OVER (
        PARTITION BY repo_name, issue_number
        ORDER BY event_time DESC
      ) = 1
    ),

    agent_issues AS (
      -- Map each issue to its relevant agent(s)
      SELECT DISTINCT
        CASE
          WHEN author IN UNNEST(@identifiers) THEN author
          WHEN commenter IN UNNEST(@identifiers) THEN commenter
          WHEN assignee IN UNNEST(@identifiers) THEN assignee
          ELSE NULL
        END as agent_identifier,
        url,
        created_at,
        closed_at,
        state_reason
      FROM latest_states
      WHERE
        author IN UNNEST(@identifiers)
        OR commenter IN UNNEST(@identifiers)
        OR assignee IN UNNEST(@identifiers)
    )

    SELECT
      agent_identifier,
      url,
      created_at,
      closed_at,
      state_reason
    FROM agent_issues
    WHERE agent_identifier IS NOT NULL
    ORDER BY agent_identifier, created_at DESC
    """

    # Calculate number of days for reporting
    query_days = (end_date - start_date).days

    print(f" Querying {query_days} days for issue and comment events...")
    print(f" Agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")

    try:
        job_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ArrayQueryParameter('identifiers', 'STRING', lookup_logins),
            ]
        )
        query_job = client.query(query, job_config=job_config)
        results = list(query_job.result())

        print(f" ✓ Found {len(results)} total issue records across all agents")

        # Group results by the login BigQuery attributed each issue to
        metadata_by_agent = defaultdict(list)

        for row in results:
            # Convert datetime objects to ISO strings (strings pass through unchanged)
            created_at = row.created_at
            if hasattr(created_at, 'isoformat'):
                created_at = created_at.isoformat()

            closed_at = row.closed_at
            if hasattr(closed_at, 'isoformat'):
                closed_at = closed_at.isoformat()

            metadata_by_agent[row.agent_identifier].append({
                'url': row.url,
                'created_at': created_at,
                'closed_at': closed_at,
                'state_reason': row.state_reason,
            })

        # Merge bot/non-bot versions into one list per requested identifier.
        # A fresh list is built each time so the grouped lists are never
        # mutated and a plain (non-bot) identifier is not counted twice.
        final_metadata = {}
        for identifier in identifiers:
            combined = [
                record
                for variant in _identifier_variants(identifier)
                for record in metadata_by_agent.get(variant, [])
            ]
            if combined:
                final_metadata[identifier] = combined

        # Print breakdown by agent
        print(f"\n 📊 Results breakdown by agent:")
        for identifier in identifiers:
            agent_records = final_metadata.get(identifier, [])
            count = len(agent_records)
            if count > 0:
                completed_count = sum(1 for m in agent_records if m['state_reason'] == 'completed')
                closed_count = sum(1 for m in agent_records if m['closed_at'] is not None)
                open_count = count - closed_count
                print(f" {identifier}: {count} issues ({completed_count} completed, {closed_count} closed, {open_count} open)")

        return final_metadata

    except Exception as e:
        # Best-effort boundary: report the failure and let the caller continue
        print(f" ✗ BigQuery error: {str(e)}")
        import traceback
        traceback.print_exc()
        return {}
355
+
356
+
357
+ # =============================================================================
358
+ # GITHUB API OPERATIONS (Minimal - for validation only)
359
+ # =============================================================================
360
 
361
def get_github_token():
    """
    Look up the GitHub token used when validating submitted identifiers.

    Reads the GITHUB_TOKEN environment variable. When it is unset or empty a
    warning is printed, and the (falsy) value is still handed back so callers
    can fall back to unauthenticated requests.

    Returns:
        The value of GITHUB_TOKEN, or None when the variable is not defined.
    """
    github_token = os.environ.get('GITHUB_TOKEN')
    if github_token:
        return github_token

    # Missing/empty token is not fatal: warn and return whatever was read.
    print("Warning: GITHUB_TOKEN not found for validation")
    return github_token
367
 
368
 
369
  def validate_github_username(identifier):
370
+ """Verify that a GitHub identifier exists (simple validation for submission)."""
371
  try:
372
  token = get_github_token()
373
  headers = {'Authorization': f'token {token}'} if token else {}
374
  url = f'https://api.github.com/users/{identifier}'
375
+ response = requests.get(url, headers=headers, timeout=10)
376
+
 
377
  if response.status_code == 200:
378
  return True, "Username is valid"
379
  elif response.status_code == 404:
 
384
  return False, f"Validation error: {str(e)}"
385
 
386
 
387
+ # =============================================================================
388
+ # ISSUE METADATA OPERATIONS
389
+ # =============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
 
391
 
392
  def extract_issue_metadata(issue):
393
  """
394
  Extract minimal issue metadata for efficient storage.
395
+ Only keeps essential fields: url, created_at, closed_at, state_reason.
396
  Note: agent_name is not stored as it's inferred from the folder structure.
397
 
398
  Issue states:
 
406
  state_reason = issue.get('state_reason')
407
 
408
  return {
409
+ 'url': issue.get('url'),
410
  'created_at': created_at,
411
  'closed_at': closed_at,
412
  'state': state,
 
419
def calculate_issue_stats_from_metadata(metadata_list):
    """
    Summarise a list of lightweight issue-metadata dicts.

    Each entry is read through ``.get`` for two keys only:
      * ``closed_at``    - any non-None value marks the issue as closed
      * ``state_reason`` - the value ``"completed"`` marks it as resolved

    Resolved rate = completed issues / closed issues * 100 (rounded to two
    decimals). Issues closed for another reason (e.g. ``"not_planned"``) are
    not resolved but still count in the denominator. When nothing is closed
    the rate is reported as 0.

    Args:
        metadata_list: List of issue metadata dictionaries.

    Returns:
        dict with ``total_issues``, ``closed_issues``, ``resolved_issues``
        and ``resolved_rate``.
    """
    issue_total = len(metadata_list)

    # Single pass: tally closed and completed issues independently, exactly as
    # the two criteria are defined above (neither check implies the other).
    closed_total = 0
    completed_total = 0
    for record in metadata_list:
        if record.get('closed_at') is not None:
            closed_total += 1
        if record.get('state_reason') == 'completed':
            completed_total += 1

    # Denominator is closed issues, not all issues; avoid dividing by zero.
    if closed_total > 0:
        rate = completed_total / closed_total * 100
    else:
        rate = 0

    return {
        'total_issues': issue_total,
        'closed_issues': closed_total,
        'resolved_issues': completed_total,
        'resolved_rate': round(rate, 2),
    }
453
 
454
 
455
+ def calculate_monthly_metrics_by_agent(top_n=None):
456
  """
457
+ Calculate monthly metrics for all agents (or top N agents) for visualization.
458
  Loads data directly from SWE-Arena/issue_metadata dataset.
459
 
460
+ Args:
461
+ top_n: If specified, only return metrics for the top N agents by total issues.
462
+ Agents are ranked by their total issue count across all months.
463
+
464
  Returns:
465
  dict: {
466
  'agents': list of agent names,
 
523
  for month in months:
524
  issues_in_month = month_dict.get(month, [])
525
 
526
+ # Count completed issues (those with state_reason="completed")
527
+ completed_count = sum(1 for issue in issues_in_month if issue.get('state_reason') == 'completed')
528
+
529
+ # Count closed issues (those with closed_at timestamp)
530
+ closed_count = sum(1 for issue in issues_in_month if issue.get('closed_at') is not None)
531
 
532
  # Total issues created in this month
533
  total_count = len(issues_in_month)
534
 
535
+ # Calculate resolved rate as: completed / closed (not completed / total)
536
+ resolved_rate = (completed_count / closed_count * 100) if closed_count > 0 else None
537
 
538
  resolved_rates.append(resolved_rate)
539
  total_issues_list.append(total_count)
540
+ resolved_issues_list.append(completed_count)
541
 
542
  result_data[agent_name] = {
543
  'resolved_rates': resolved_rates,
 
545
  'resolved_issues': resolved_issues_list
546
  }
547
 
548
+ # Filter to top N agents if specified
549
+ agents_list = sorted(list(agent_month_data.keys()))
550
+ if top_n is not None and top_n > 0:
551
+ # Calculate total issues for each agent across all months
552
+ agent_totals = []
553
+ for agent_name in agents_list:
554
+ total_issues = sum(result_data[agent_name]['total_issues'])
555
+ agent_totals.append((agent_name, total_issues))
556
+
557
+ # Sort by total issues (descending) and take top N
558
+ agent_totals.sort(key=lambda x: x[1], reverse=True)
559
+ top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
560
+
561
+ # Filter result_data to only include top agents
562
+ result_data = {agent: result_data[agent] for agent in top_agents if agent in result_data}
563
+ agents_list = top_agents
564
+
565
  return {
566
+ 'agents': agents_list,
567
  'months': months,
568
  'data': result_data
569
  }
 
599
  """
600
  Save issue metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
601
  Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
 
602
 
603
+ This function uses COMPLETE OVERWRITE strategy (not append/deduplicate).
604
+ Uses upload_large_folder for optimized batch uploads.
605
 
606
  Args:
607
  metadata_list: List of issue metadata dictionaries
608
  agent_identifier: GitHub identifier of the agent (used as folder name)
609
  """
 
 
 
 
 
 
 
 
 
 
 
610
  import tempfile
611
  import shutil
612
 
 
616
  if not token:
617
  raise Exception("No HuggingFace token found")
618
 
619
+ api = HfApi(token=token)
620
+
621
+ # Group by exact date (year, month, day)
622
+ grouped = group_metadata_by_date(metadata_list)
623
+
624
+ if not grouped:
625
+ print(f" No valid metadata to save for {agent_identifier}")
626
+ return False
627
 
628
  # Create temporary directory for batch upload
629
  temp_dir = tempfile.mkdtemp()
630
  agent_folder = os.path.join(temp_dir, agent_identifier)
631
  os.makedirs(agent_folder, exist_ok=True)
632
 
633
+ print(f"📦 Preparing batch upload for {agent_identifier} ({len(grouped)} daily files)...")
 
 
 
634
 
635
+ # Process each daily file
636
  for (issue_year, month, day), day_metadata in grouped.items():
 
637
  filename = f"{agent_identifier}/{issue_year}.{month:02d}.{day:02d}.jsonl"
638
+ local_filename = os.path.join(agent_folder, f"{issue_year}.{month:02d}.{day:02d}.jsonl")
 
639
 
640
+ # Sort by created_at for better organization
641
+ day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
642
 
643
+ # Save to temp directory (complete overwrite, no merging)
644
+ save_jsonl(local_filename, day_metadata)
645
+ print(f" Prepared {len(day_metadata)} issues for {filename}")
 
 
 
 
 
 
 
 
 
 
646
 
647
+ # Upload entire folder using upload_large_folder (optimized for large files)
648
+ # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
649
+ print(f"🤗 Uploading {len(grouped)} files ({len(metadata_list)} total issues)...")
650
+ api.upload_large_folder(
651
+ folder_path=temp_dir,
 
 
 
 
 
 
 
 
 
 
 
 
652
  repo_id=ISSUE_METADATA_REPO,
653
+ repo_type="dataset"
 
 
654
  )
655
+ print(f" ✓ Batch upload complete for {agent_identifier}")
656
 
657
  return True
658
 
 
668
  def load_issue_metadata():
669
  """
670
  Load issue metadata from the last LEADERBOARD_TIME_FRAME_DAYS only.
671
+
 
672
  Structure: [agent_identifier]/YYYY.MM.DD.jsonl
673
 
674
  Returns:
 
679
  current_time = datetime.now(timezone.utc)
680
  cutoff_date = current_time - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
681
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
682
  try:
683
  api = HfApi()
684
  token = get_hf_token()
 
709
  # Skip files with unparseable dates
710
  continue
711
 
712
+ print(f"📥 [LOAD] Reading cached issue metadata from HuggingFace ({len(time_frame_files)} files, last {LEADERBOARD_TIME_FRAME_DAYS} days)...")
713
 
714
  all_metadata = []
715
  for filename in time_frame_files:
 
816
  return None
817
 
818
 
819
+ def get_daily_files_last_time_frame(agent_identifier):
820
  """
821
+ Get list of daily file paths for an agent from the configured time frame.
822
 
823
  Args:
824
  agent_identifier: GitHub identifier of the agent
 
825
 
826
  Returns:
827
  List of file paths in format: [agent_identifier]/YYYY.MM.DD.jsonl
 
830
  api = HfApi()
831
  token = get_hf_token()
832
 
833
+ # Calculate date range using configured time frame
834
  today = datetime.now(timezone.utc)
835
+ cutoff_date = today - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
836
 
837
  # List all files in the repository
838
  files = api.list_repo_files(repo_id=ISSUE_METADATA_REPO, repo_type="dataset")
 
858
  file_year, file_month, file_day = map(int, date_components)
859
  file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
860
 
861
+ # Include if within configured time frame
862
+ if cutoff_date <= file_date <= today:
863
  recent_files.append(filename)
864
  except Exception:
865
  continue
 
871
  return []
872
 
873
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
874
  # =============================================================================
875
  # HUGGINGFACE DATASET OPERATIONS
876
  # =============================================================================
 
887
  # Filter for JSON files only
888
  json_files = [f for f in files if f.endswith('.json')]
889
 
 
 
890
  # Download and parse each JSON file
891
  for json_file in json_files:
892
  try:
 
898
 
899
  with open(file_path, 'r') as f:
900
  agent_data = json.load(f)
901
+
902
+ # Extract github_identifier from filename (e.g., "agent[bot].json" -> "agent[bot]")
903
+ filename_identifier = json_file.replace('.json', '')
904
+
905
+ # Add or override github_identifier to match filename
906
+ agent_data['github_identifier'] = filename_identifier
907
+
908
+ # Normalize name field: use 'name' if exists, otherwise use identifier
909
+ if 'name' in agent_data:
910
+ agent_data['agent_name'] = agent_data['name']
911
+ elif 'agent_name' not in agent_data:
912
+ agent_data['agent_name'] = filename_identifier
913
+
914
  agents.append(agent_data)
915
 
916
  except Exception as e:
 
1022
  # DATA MANAGEMENT
1023
  # =============================================================================
1024
 
1025
def mine_all_agents():
    """
    Mine issue metadata for every registered agent and store it on HuggingFace.

    Steps:
      1. Load the agent list via ``load_agents_from_hf`` and collect every
         non-empty ``github_identifier``.
      2. Build a BigQuery client and fetch metadata for ALL identifiers with a
         single call to ``fetch_all_issue_metadata_single_query``. The window
         spans ``LEADERBOARD_TIME_FRAME_DAYS`` days and ends at today's
         midnight UTC, so the current (partial) day is excluded.
      3. Save each agent's records with ``save_issue_metadata_to_hf`` and print
         a summary of saved / empty / failed agents.

    The function only prints progress and returns None. It returns early when
    there are no agents, no identifiers, the client cannot be created, or the
    fetch raises.
    """
    import traceback

    registered = load_agents_from_hf()
    if not registered:
        print("No agents found in HuggingFace dataset")
        return

    # Keep only agents that actually carry a GitHub identifier.
    identifiers = []
    for entry in registered:
        if entry.get('github_identifier'):
            identifiers.append(entry['github_identifier'])
    if not identifiers:
        print("No valid agent identifiers found")
        return

    rule = '=' * 80
    print(f"\n{rule}")
    print(f"⛏️ [MINE] Starting BigQuery data mining for {len(identifiers)} agents")
    print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
    print("Data source: BigQuery + GitHub Archive (ONE QUERY FOR ALL AGENTS)")
    print("⚠️ This will query BigQuery and may take several minutes")
    print(f"{rule}\n")

    try:
        client = get_bigquery_client()
    except Exception as exc:
        print(f"✗ Failed to initialize BigQuery client: {str(exc)}")
        return

    # Window: [midnight UTC today - LEADERBOARD_TIME_FRAME_DAYS, midnight UTC today)
    midnight_utc = datetime.now(timezone.utc).replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    end_date = midnight_utc
    start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)

    try:
        all_metadata = fetch_all_issue_metadata_single_query(
            client, identifiers, start_date, end_date
        )
    except Exception as exc:
        print(f"✗ Error during BigQuery fetch: {str(exc)}")
        traceback.print_exc()
        return

    print(f"\n{rule}")
    print("💾 Saving results to HuggingFace for each agent...")
    print(f"{rule}\n")

    saved = 0
    failed = 0
    empty = 0
    agent_total = len(registered)

    for position, entry in enumerate(registered, 1):
        identifier = entry.get('github_identifier')
        display_name = entry.get('agent_name', 'Unknown')
        progress = f"[{position}/{agent_total}]"

        # Agents without an identifier cannot be matched to results; count as errors.
        if not identifier:
            print(f"{progress} Skipping agent without identifier")
            failed += 1
            continue

        records = all_metadata.get(identifier, [])
        print(f"{progress} {display_name} ({identifier}):")

        try:
            if not records:
                print(" No issues found")
                empty += 1
                continue

            print(f" 💾 Saving {len(records)} issue records...")
            if save_issue_metadata_to_hf(records, identifier):
                saved += 1
            else:
                failed += 1
        except Exception as exc:
            # One agent failing to upload must not abort the remaining agents.
            print(f" ✗ Error saving {identifier}: {str(exc)}")
            traceback.print_exc()
            failed += 1

    print(f"\n{rule}")
    print(" Mining complete!")
    print(f" Total agents: {agent_total}")
    print(f" Successfully saved: {saved}")
    print(f" No data (skipped): {empty}")
    print(f" Errors: {failed}")
    print(" BigQuery queries executed: 1")
    print(f"{rule}\n")
1121
 
1122
 
1123
  def construct_leaderboard_from_metadata():
 
1163
  # UI FUNCTIONS
1164
  # =============================================================================
1165
 
1166
def generate_color(index, total):
    """
    Build a CSS ``hsl(...)`` colour string for item ``index`` out of ``total``.

    Hues are spread evenly around the 360-degree colour wheel; saturation
    cycles through 70/80/90 % and lightness alternates between 45/55 % so
    neighbouring hues are easier to tell apart.

    Args:
        index: Zero-based position of the item being coloured.
        total: Number of items sharing the palette. A value of 0 would make
            the hue spacing undefined, so hue falls back to 0.0 instead of
            raising ZeroDivisionError.

    Returns:
        str: Colour in the form ``'hsl(<hue>, <saturation>%, <lightness>%)'``.
    """
    # Guard the division: with no items there is no spacing to compute.
    hue = (index * 360 / total) % 360 if total else 0.0
    saturation = 70 + (index % 3) * 10  # Vary saturation slightly
    lightness = 45 + (index % 2) * 10  # Vary lightness slightly
    return f'hsl({hue}, {saturation}%, {lightness}%)'
1172
+
1173
+
1174
  def create_monthly_metrics_plot():
1175
  """
1176
  Create a Plotly figure with dual y-axes showing:
 
1178
  - Right y-axis: Total Issues created as bar charts
1179
 
1180
  Each agent gets a unique color for both their line and bars.
1181
+ Shows only top 5 agents by total issue count.
1182
  """
1183
+ metrics = calculate_monthly_metrics_by_agent(top_n=5)
1184
 
1185
  if not metrics['agents'] or not metrics['months']:
1186
  # Return an empty figure with a message
 
1201
  # Create figure with secondary y-axis
1202
  fig = make_subplots(specs=[[{"secondary_y": True}]])
1203
 
 
 
 
 
 
 
1204
  agents = metrics['agents']
1205
  months = metrics['months']
1206
  data = metrics['data']
1207
 
1208
+ # Generate unique colors for many agents using HSL color space
1209
+ agent_colors = {agent: generate_color(idx, len(agents)) for idx, agent in enumerate(agents)}
1210
+
1211
  # Add traces for each agent
1212
+ for agent_name in agents:
1213
+ color = agent_colors[agent_name]
1214
  agent_data = data[agent_name]
1215
 
1216
  # Add line trace for resolved rate (left y-axis)
 
1331
  return df
1332
 
1333
 
1334
+ def submit_agent(identifier, agent_name, developer, website):
1335
  """
1336
  Submit a new agent to the leaderboard.
1337
  Validates input and saves submission. Issue data will be populated by daily incremental updates.
 
1341
  return "❌ GitHub identifier is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1342
  if not agent_name or not agent_name.strip():
1343
  return "❌ Agent name is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1344
+ if not developer or not developer.strip():
1345
+ return "❌ Developer name is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1346
  if not website or not website.strip():
1347
  return "❌ Website URL is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1348
 
1349
  # Clean inputs
1350
  identifier = identifier.strip()
1351
  agent_name = agent_name.strip()
1352
+ developer = developer.strip()
 
1353
  website = website.strip()
1354
 
1355
  # Validate GitHub identifier
 
1367
  # Create submission
1368
  submission = {
1369
  'agent_name': agent_name,
1370
+ 'developer': developer,
1371
  'github_identifier': identifier,
 
1372
  'website': website,
1373
  }
1374
 
 
1388
  # GRADIO APPLICATION
1389
  # =============================================================================
1390
 
1391
# Background job: re-mine issue data from BigQuery on a weekly cron schedule.
# Startup itself never mines; the app only reads the data already cached on
# HuggingFace, and BigQuery is queried solely when the cron trigger fires.
scheduler = BackgroundScheduler(timezone="UTC")
scheduler.add_job(
    mine_all_agents,
    # Fires every Monday at 00:00 in the scheduler's timezone (UTC).
    trigger=CronTrigger(day_of_week='mon', hour=0, minute=0),
    id='periodic_bigquery_mining',
    name='Periodic BigQuery Issue Mining',
    replace_existing=True,
)
scheduler.start()

# Startup banner, emitted as one newline-separated print.
print(
    f"\n{'='*80}",
    "✓ Scheduler initialized successfully",
    "⛏️ Mining schedule: Every Monday at 12:00 AM UTC",
    "📥 On startup: Only loads cached data from HuggingFace (no mining)",
    f"{'='*80}\n",
    sep="\n",
)
1408
 
1409
  # Create Gradio interface
1410
  with gr.Blocks(title="SWE Agent Issue Leaderboard", theme=gr.themes.Soft()) as app:
1411
 
1412
  gr.Markdown("# 🏆 SWE Agent Issue Leaderboard")
1413
+ gr.Markdown(f"Track and compare GitHub issue resolution statistics for SWE agents")
1414
+
1415
  with gr.Tabs():
1416
+
1417
  # Leaderboard Tab
1418
  with gr.Tab("📊 Leaderboard"):
1419
+ gr.Markdown(f"*All statistics are based on issues from the last {LEADERBOARD_TIME_FRAME_DAYS // 30} months*")
1420
  leaderboard_table = Leaderboard(
1421
  value=get_leaderboard_dataframe(),
1422
  datatype=LEADERBOARD_COLUMNS,
 
1450
  )
1451
 
1452
  with gr.Column():
1453
+ developer_input = gr.Textbox(
1454
+ label="Developer*",
1455
+ placeholder="Your developer or team name"
 
 
 
 
 
1456
  )
1457
  website_input = gr.Textbox(
1458
  label="Website",
 
1471
  # Event handler
1472
  submit_button.click(
1473
  fn=submit_agent,
1474
+ inputs=[github_input, name_input, developer_input, website_input],
1475
  outputs=[submission_status, leaderboard_table, monthly_plot]
1476
  )
1477
 
msr.py CHANGED
@@ -1,17 +1,16 @@
1
  """
2
  Minimalist Issue Metadata Mining Script
3
- Mines issue metadata from GitHub and saves to HuggingFace dataset.
4
  """
5
 
6
  import json
7
  import os
8
- import time
9
- import requests
10
  from datetime import datetime, timezone, timedelta
11
  from collections import defaultdict
12
  from huggingface_hub import HfApi, hf_hub_download
13
  from dotenv import load_dotenv
14
- import random
15
 
16
  # Load environment variables
17
  load_dotenv()
@@ -22,7 +21,7 @@ load_dotenv()
22
 
23
  AGENTS_REPO = "SWE-Arena/swe_agents"
24
  ISSUE_METADATA_REPO = "SWE-Arena/issue_metadata"
25
- LEADERBOARD_TIME_FRAME_DAYS = 180 # 6 months
26
 
27
  # =============================================================================
28
  # UTILITY FUNCTIONS
@@ -52,213 +51,6 @@ def save_jsonl(filename, data):
52
  f.write(json.dumps(item) + '\n')
53
 
54
 
55
- def get_github_tokens():
56
- """Get all GitHub tokens from environment variables (all keys starting with GITHUB_TOKEN)."""
57
- tokens = []
58
- for key, value in os.environ.items():
59
- if key.startswith('GITHUB_TOKEN') and value:
60
- tokens.append(value)
61
-
62
- if not tokens:
63
- print("Warning: No GITHUB_TOKEN found. API rate limits: 60/hour (authenticated: 5000/hour)")
64
- else:
65
- print(f"✓ Loaded {len(tokens)} GitHub token(s) for rotation")
66
-
67
- return tokens
68
-
69
-
70
- def get_github_token():
71
- """Get primary GitHub token from environment variables (backward compatibility)."""
72
- token = os.getenv('GITHUB_TOKEN')
73
- if not token:
74
- print("Warning: GITHUB_TOKEN not found. API rate limits: 60/hour (authenticated: 5000/hour)")
75
- return token
76
-
77
-
78
- class TokenPool:
79
- """
80
- Hybrid token pool with parallel execution and round-robin fallback.
81
-
82
- Splits tokens into two pools:
83
- - 50% for parallel execution (maximize throughput)
84
- - 50% for round-robin backup (handle rate limits)
85
-
86
- Features:
87
- - Automatic rate limit detection and tracking
88
- - Token recovery when rate limits expire
89
- - Statistics monitoring
90
- - Thread-safe operations
91
- """
92
- def __init__(self, tokens):
93
- import threading
94
-
95
- # Store all tokens
96
- self.all_tokens = tokens if tokens else [None]
97
- total_tokens = len(self.all_tokens)
98
-
99
- # Split tokens into parallel and round-robin pools (50/50)
100
- # For odd numbers, round-robin gets the extra token
101
- split_point = max(1, total_tokens // 2)
102
-
103
- self.parallel_tokens = self.all_tokens[:split_point]
104
- self.roundrobin_tokens = self.all_tokens[split_point:] if split_point < total_tokens else self.all_tokens
105
-
106
- # Round-robin index for fallback pool
107
- self.roundrobin_index = 0
108
-
109
- # Track rate-limited tokens with reset timestamps
110
- self.rate_limited_tokens = {} # {token: reset_timestamp}
111
-
112
- # Statistics
113
- self.stats = {
114
- 'parallel_calls': 0,
115
- 'roundrobin_calls': 0,
116
- 'fallback_triggers': 0
117
- }
118
-
119
- # Thread lock for thread-safety
120
- self.lock = threading.Lock()
121
-
122
- print(f"🔀 Token Pool Initialized:")
123
- print(f" Total tokens: {total_tokens}")
124
- print(f" Parallel pool: {len(self.parallel_tokens)} tokens")
125
- print(f" Round-robin pool: {len(self.roundrobin_tokens)} tokens")
126
-
127
- def _clean_expired_rate_limits(self):
128
- """Remove tokens from rate-limited set if their reset time has passed."""
129
- import time
130
- current_time = time.time()
131
- expired = [token for token, reset_time in self.rate_limited_tokens.items()
132
- if reset_time and current_time >= reset_time]
133
- for token in expired:
134
- del self.rate_limited_tokens[token]
135
- print(f" ✓ Token recovered from rate limit")
136
-
137
- def get_parallel_token(self):
138
- """Get an available token from the parallel pool."""
139
- with self.lock:
140
- self._clean_expired_rate_limits()
141
-
142
- # Find first available parallel token (not rate-limited)
143
- for token in self.parallel_tokens:
144
- if token not in self.rate_limited_tokens:
145
- self.stats['parallel_calls'] += 1
146
- return token
147
-
148
- # All parallel tokens are rate-limited
149
- return None
150
-
151
- def get_roundrobin_token(self):
152
- """Get next token from round-robin pool."""
153
- with self.lock:
154
- self._clean_expired_rate_limits()
155
-
156
- if not self.roundrobin_tokens:
157
- return None
158
-
159
- # Try up to pool size to find non-rate-limited token
160
- attempts = 0
161
- max_attempts = len(self.roundrobin_tokens)
162
-
163
- while attempts < max_attempts:
164
- token = self.roundrobin_tokens[self.roundrobin_index]
165
- self.roundrobin_index = (self.roundrobin_index + 1) % len(self.roundrobin_tokens)
166
- attempts += 1
167
-
168
- if token not in self.rate_limited_tokens:
169
- self.stats['roundrobin_calls'] += 1
170
- return token
171
-
172
- # All round-robin tokens are rate-limited, return one anyway
173
- # (request_with_backoff will handle the rate limit)
174
- token = self.roundrobin_tokens[self.roundrobin_index]
175
- self.roundrobin_index = (self.roundrobin_index + 1) % len(self.roundrobin_tokens)
176
- self.stats['roundrobin_calls'] += 1
177
- return token
178
-
179
- def get_next_token(self):
180
- """
181
- Get next available token using hybrid strategy:
182
- 1. Try parallel pool first
183
- 2. Fall back to round-robin if parallel is exhausted
184
- """
185
- # Try parallel pool first
186
- token = self.get_parallel_token()
187
-
188
- if token is not None:
189
- return token
190
-
191
- # Parallel pool exhausted, fall back to round-robin
192
- with self.lock:
193
- self.stats['fallback_triggers'] += 1
194
-
195
- return self.get_roundrobin_token()
196
-
197
- def get_headers(self):
198
- """Get headers with the next token in rotation."""
199
- token = self.get_next_token()
200
- return {'Authorization': f'token {token}'} if token else {}
201
-
202
- def mark_rate_limited(self, token, reset_timestamp=None):
203
- """
204
- Mark a token as rate-limited with optional reset timestamp.
205
-
206
- Args:
207
- token: The token to mark
208
- reset_timestamp: Unix timestamp when rate limit resets (optional)
209
- """
210
- with self.lock:
211
- from datetime import datetime, timezone
212
- self.rate_limited_tokens[token] = reset_timestamp
213
- pool_type = "parallel" if token in self.parallel_tokens else "round-robin"
214
- if reset_timestamp:
215
- reset_time = datetime.fromtimestamp(reset_timestamp, timezone.utc).strftime('%H:%M:%S UTC')
216
- print(f" ⚠️ Token marked as rate-limited ({pool_type} pool, resets at {reset_time})")
217
- else:
218
- print(f" ⚠️ Token marked as rate-limited ({pool_type} pool)")
219
-
220
- def get_available_parallel_tokens(self):
221
- """Get list of all available (non-rate-limited) parallel tokens."""
222
- with self.lock:
223
- self._clean_expired_rate_limits()
224
- return [token for token in self.parallel_tokens
225
- if token not in self.rate_limited_tokens]
226
-
227
- def get_stats(self):
228
- """Get current statistics."""
229
- with self.lock:
230
- self._clean_expired_rate_limits()
231
- parallel_rate_limited = sum(1 for t in self.parallel_tokens
232
- if t in self.rate_limited_tokens)
233
- roundrobin_rate_limited = sum(1 for t in self.roundrobin_tokens
234
- if t in self.rate_limited_tokens)
235
-
236
- return {
237
- **self.stats,
238
- 'parallel_rate_limited': parallel_rate_limited,
239
- 'roundrobin_rate_limited': roundrobin_rate_limited
240
- }
241
-
242
- def print_stats(self):
243
- """Print statistics about token pool usage."""
244
- stats = self.get_stats()
245
- total_calls = stats['parallel_calls'] + stats['roundrobin_calls']
246
-
247
- if total_calls == 0:
248
- print("📊 No API calls made yet")
249
- return
250
-
251
- parallel_pct = (stats['parallel_calls'] / total_calls * 100) if total_calls > 0 else 0
252
- roundrobin_pct = (stats['roundrobin_calls'] / total_calls * 100) if total_calls > 0 else 0
253
-
254
- print(f"📊 Token Pool Statistics:")
255
- print(f" Total API calls: {total_calls}")
256
- print(f" Parallel calls: {stats['parallel_calls']} ({parallel_pct:.1f}%)")
257
- print(f" Round-robin calls: {stats['roundrobin_calls']} ({roundrobin_pct:.1f}%)")
258
- print(f" Fallback triggers: {stats['fallback_triggers']}")
259
- print(f" Currently rate-limited: {stats['parallel_rate_limited']} parallel, {stats['roundrobin_rate_limited']} round-robin")
260
-
261
-
262
  def get_hf_token():
263
  """Get HuggingFace token from environment variables."""
264
  token = os.getenv('HF_TOKEN')
@@ -267,523 +59,258 @@ def get_hf_token():
267
  return token
268
 
269
 
270
- # =============================================================================
271
- # GITHUB API FUNCTIONS
272
- # =============================================================================
273
-
274
- def request_with_backoff(method, url, *, headers=None, params=None, json_body=None, data=None, max_retries=10, timeout=30, token_pool=None, token=None):
275
  """
276
- Perform an HTTP request with exponential backoff and jitter for GitHub API.
277
- Retries on 403/429 (rate limits), 5xx server errors, and transient network exceptions.
278
-
279
- Args:
280
- token_pool: Optional TokenPool instance for automatic rate limit tracking
281
- token: Optional token being used (for marking as rate-limited)
282
 
283
- Returns the final requests.Response on success or non-retryable status, or None after exhausting retries.
 
284
  """
285
- delay = 1.0
286
- for attempt in range(max_retries):
287
- try:
288
- resp = requests.request(
289
- method,
290
- url,
291
- headers=headers or {},
292
- params=params,
293
- json=json_body,
294
- data=data,
295
- timeout=timeout
296
- )
297
 
298
- status = resp.status_code
299
-
300
- # Success
301
- if 200 <= status < 300:
302
- return resp
303
-
304
- # Rate limits or server errors -> retry with backoff
305
- if status in (403, 429) or 500 <= status < 600:
306
- wait = None
307
- reset_timestamp = None
308
-
309
- # Prefer Retry-After when present
310
- retry_after = resp.headers.get('Retry-After') or resp.headers.get('retry-after')
311
- if retry_after:
312
- try:
313
- wait = float(retry_after)
314
- except Exception:
315
- wait = None
316
-
317
- # Fallback to X-RateLimit-Reset when 403/429
318
- if wait is None and status in (403, 429):
319
- reset_hdr = resp.headers.get('X-RateLimit-Reset') or resp.headers.get('x-ratelimit-reset')
320
- if reset_hdr:
321
- try:
322
- reset_ts = int(float(reset_hdr))
323
- reset_timestamp = reset_ts
324
- wait = max(reset_ts - time.time() + 2, 1)
325
- except Exception:
326
- wait = None
327
-
328
- # Mark token as rate-limited if we have token_pool and token
329
- if status in (403, 429) and token_pool and token:
330
- token_pool.mark_rate_limited(token, reset_timestamp)
331
-
332
- # Final fallback: exponential backoff with jitter
333
- if wait is None:
334
- wait = delay + random.uniform(0, 0.5)
335
-
336
- # Cap individual wait to avoid extreme sleeps
337
- wait = max(1.0, min(wait, 120.0))
338
- print(f"GitHub API {status}. Backing off {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
339
- time.sleep(wait)
340
- delay = min(delay * 2, 60.0)
341
- continue
342
 
343
- # Non-retryable error; return response for caller to handle
344
- return resp
345
 
346
- except requests.RequestException as e:
347
- # Network error -> retry with backoff
348
- wait = delay + random.uniform(0, 0.5)
349
- wait = max(1.0, min(wait, 60.0))
350
- print(f"Request error: {e}. Retrying in {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
351
- time.sleep(wait)
352
- delay = min(delay * 2, 60.0)
353
 
354
- print(f"Exceeded max retries for {url}")
355
- return None
356
 
 
 
 
357
 
358
- def fetch_issues_parallel(query_patterns, start_date, end_date, token_pool, issues_by_id):
 
359
  """
360
- Fetch issues for multiple query patterns in parallel using available parallel tokens.
361
 
362
  Args:
363
- query_patterns: List of query patterns to search
364
- start_date: Start date for time range
365
- end_date: End date for time range
366
- token_pool: TokenPool instance for token management
367
- issues_by_id: Shared dictionary to store issues (thread-safe operations)
368
 
369
  Returns:
370
- Total number of issues found across all patterns
371
  """
372
- import concurrent.futures
373
- import threading
374
-
375
- # Get available parallel tokens
376
- available_tokens = token_pool.get_available_parallel_tokens()
377
-
378
- if not available_tokens:
379
- print(" ⚠️ No parallel tokens available, using sequential fallback")
380
- total_found = 0
381
- for pattern in query_patterns:
382
- count = fetch_issues_with_time_partition(
383
- pattern, start_date, end_date, token_pool, issues_by_id, depth=0
384
- )
385
- total_found += count
386
- return total_found
387
 
388
- # Determine max workers based on available tokens
389
- max_workers = min(len(query_patterns), len(available_tokens))
 
 
390
 
391
- print(f" 🚀 Using parallel execution with {max_workers} workers")
 
 
392
 
393
- # Thread-safe lock for issues_by_id updates
394
- lock = threading.Lock()
395
-
396
- def fetch_pattern(pattern, token):
397
- """Worker function to fetch issues for a single pattern."""
398
- # Create temporary dict for this pattern
399
- pattern_issues = {}
400
-
401
- try:
402
- # Fetch issues for this pattern
403
- count = fetch_issues_with_time_partition(
404
- pattern,
405
- start_date,
406
- end_date,
407
- token_pool,
408
- pattern_issues,
409
- depth=0
410
- )
411
-
412
- # Merge into shared dict with lock
413
- with lock:
414
- for issue_id, issue in pattern_issues.items():
415
- if issue_id not in issues_by_id:
416
- issues_by_id[issue_id] = issue
417
-
418
- return count
419
-
420
- except Exception as e:
421
- print(f" ✗ Error in parallel fetch for pattern '{pattern}': {str(e)}")
422
- return 0
423
-
424
- # Execute patterns in parallel
425
- total_found = 0
426
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
427
- # Map patterns to tokens
428
- futures = []
429
- for i, pattern in enumerate(query_patterns):
430
- token = available_tokens[i % len(available_tokens)]
431
- future = executor.submit(fetch_pattern, pattern, token)
432
- futures.append(future)
433
-
434
- # Collect results
435
- for future in concurrent.futures.as_completed(futures):
436
- try:
437
- count = future.result()
438
- total_found += count
439
- except Exception as e:
440
- print(f" ✗ Parallel execution error: {str(e)}")
441
-
442
- return total_found
443
-
444
-
445
- def fetch_issues_with_time_partition(base_query, start_date, end_date, token_pool, issues_by_id, depth=0):
446
- """
447
- Fetch issues within a specific time range using time-based partitioning.
448
- Recursively splits the time range if hitting the 1000-result limit.
449
- Supports splitting by day, hour, minute, and second as needed.
450
-
451
- Args:
452
- base_query: Base GitHub search query
453
- start_date: Start date for time range
454
- end_date: End date for time range
455
- token_pool: TokenPool instance for rotating tokens
456
- issues_by_id: Dictionary to store issues (deduplicated by ID)
457
- depth: Current recursion depth
458
-
459
- Returns the number of issues found in this time partition.
460
- """
461
- # Calculate time difference
462
- time_diff = end_date - start_date
463
- total_seconds = time_diff.total_seconds()
464
-
465
- # Determine granularity and format dates accordingly
466
- if total_seconds >= 86400: # >= 1 day
467
- # Use day granularity (YYYY-MM-DD)
468
- start_str = start_date.strftime('%Y-%m-%d')
469
- end_str = end_date.strftime('%Y-%m-%d')
470
- elif total_seconds >= 3600: # >= 1 hour but < 1 day
471
- # Use hour granularity (YYYY-MM-DDTHH:MM:SSZ)
472
- start_str = start_date.strftime('%Y-%m-%dT%H:00:00Z')
473
- end_str = end_date.strftime('%Y-%m-%dT%H:59:59Z')
474
- elif total_seconds >= 60: # >= 1 minute but < 1 hour
475
- # Use minute granularity (YYYY-MM-DDTHH:MM:SSZ)
476
- start_str = start_date.strftime('%Y-%m-%dT%H:%M:00Z')
477
- end_str = end_date.strftime('%Y-%m-%dT%H:%M:59Z')
478
- else: # < 1 minute
479
- # Use second granularity (YYYY-MM-DDTHH:MM:SSZ)
480
- start_str = start_date.strftime('%Y-%m-%dT%H:%M:%SZ')
481
- end_str = end_date.strftime('%Y-%m-%dT%H:%M:%SZ')
482
-
483
- # Add date range to query
484
- query = f'{base_query} created:{start_str}..{end_str}'
485
-
486
- indent = " " + " " * depth
487
- print(f"{indent}Searching range {start_str} to {end_str}...")
488
-
489
- page = 1
490
- per_page = 100
491
- total_in_partition = 0
492
-
493
- while True:
494
- url = 'https://api.github.com/search/issues'
495
- params = {
496
- 'q': query,
497
- 'per_page': per_page,
498
- 'page': page,
499
- 'sort': 'created',
500
- 'order': 'asc'
501
- }
502
-
503
- try:
504
- headers = token_pool.get_headers()
505
- response = request_with_backoff('GET', url, headers=headers, params=params)
506
- if response is None:
507
- print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
508
- return total_in_partition
509
-
510
- if response.status_code != 200:
511
- print(f"{indent} Error: HTTP {response.status_code} for range {start_str} to {end_str}")
512
- return total_in_partition
513
-
514
- data = response.json()
515
- total_count = data.get('total_count', 0)
516
- items = data.get('items', [])
517
-
518
- if not items:
519
- break
520
-
521
- # Add issues to global dict
522
- for issue in items:
523
- issue_id = issue.get('id')
524
- if issue_id and issue_id not in issues_by_id:
525
- issues_by_id[issue_id] = issue
526
- total_in_partition += 1
527
-
528
- # Check if we hit the 1000-result limit
529
- if total_count > 1000 and page == 10:
530
- print(f"{indent} ⚠️ Hit 1000-result limit ({total_count} total). Splitting time range...")
531
-
532
- # Determine how to split based on time range duration
533
- if total_seconds < 2: # Less than 2 seconds - can't split further
534
- print(f"{indent} ⚠️ Cannot split further (range < 2 seconds). Some results may be missing.")
535
- break
536
-
537
- elif total_seconds < 120: # Less than 2 minutes - split by seconds
538
- # Split into 2-4 parts depending on range
539
- num_splits = min(4, max(2, int(total_seconds / 30)))
540
- split_duration = time_diff / num_splits
541
- split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
542
-
543
- total_from_splits = 0
544
- for i in range(num_splits):
545
- split_start = split_dates[i]
546
- split_end = split_dates[i + 1]
547
- # Avoid overlapping ranges (add 1 second to start)
548
- if i > 0:
549
- split_start = split_start + timedelta(seconds=1)
550
-
551
- count = fetch_issues_with_time_partition(
552
- base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
553
- )
554
- total_from_splits += count
555
-
556
- return total_from_splits
557
-
558
- elif total_seconds < 7200: # Less than 2 hours - split by minutes
559
- # Split into 2-4 parts
560
- num_splits = min(4, max(2, int(total_seconds / 1800)))
561
- split_duration = time_diff / num_splits
562
- split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
563
-
564
- total_from_splits = 0
565
- for i in range(num_splits):
566
- split_start = split_dates[i]
567
- split_end = split_dates[i + 1]
568
- # Avoid overlapping ranges (add 1 minute to start)
569
- if i > 0:
570
- split_start = split_start + timedelta(minutes=1)
571
-
572
- count = fetch_issues_with_time_partition(
573
- base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
574
- )
575
- total_from_splits += count
576
-
577
- return total_from_splits
578
-
579
- elif total_seconds < 172800: # Less than 2 days - split by hours
580
- # Split into 2-4 parts
581
- num_splits = min(4, max(2, int(total_seconds / 43200)))
582
- split_duration = time_diff / num_splits
583
- split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
584
-
585
- total_from_splits = 0
586
- for i in range(num_splits):
587
- split_start = split_dates[i]
588
- split_end = split_dates[i + 1]
589
- # Avoid overlapping ranges (add 1 hour to start)
590
- if i > 0:
591
- split_start = split_start + timedelta(hours=1)
592
-
593
- count = fetch_issues_with_time_partition(
594
- base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
595
- )
596
- total_from_splits += count
597
-
598
- return total_from_splits
599
-
600
- else: # 2+ days - split by days
601
- days_diff = time_diff.days
602
-
603
- # Use aggressive splitting for large ranges or deep recursion
604
- # Split into 4 parts if range is > 30 days, otherwise split in half
605
- if days_diff > 30 or depth > 5:
606
- # Split into 4 parts for more aggressive partitioning
607
- quarter_diff = time_diff / 4
608
- split_dates = [
609
- start_date,
610
- start_date + quarter_diff,
611
- start_date + quarter_diff * 2,
612
- start_date + quarter_diff * 3,
613
- end_date
614
- ]
615
-
616
- total_from_splits = 0
617
- for i in range(4):
618
- split_start = split_dates[i]
619
- split_end = split_dates[i + 1]
620
- # Avoid overlapping ranges
621
- if i > 0:
622
- split_start = split_start + timedelta(days=1)
623
-
624
- count = fetch_issues_with_time_partition(
625
- base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
626
- )
627
- total_from_splits += count
628
-
629
- return total_from_splits
630
- else:
631
- # Binary split for smaller ranges
632
- mid_date = start_date + time_diff / 2
633
-
634
- # Recursively fetch both halves
635
- count1 = fetch_issues_with_time_partition(
636
- base_query, start_date, mid_date, token_pool, issues_by_id, depth + 1
637
- )
638
- count2 = fetch_issues_with_time_partition(
639
- base_query, mid_date + timedelta(days=1), end_date, token_pool, issues_by_id, depth + 1
640
- )
641
-
642
- return count1 + count2
643
-
644
- # Normal pagination: check if there are more pages
645
- if len(items) < per_page or page >= 10:
646
- break
647
-
648
- page += 1
649
- time.sleep(0.5) # Courtesy delay between pages
650
-
651
- except Exception as e:
652
- print(f"{indent} Error fetching range {start_str} to {end_str}: {str(e)}")
653
- return total_in_partition
654
-
655
- if total_in_partition > 0:
656
- print(f"{indent} ✓ Found {total_in_partition} issues in range {start_str} to {end_str}")
657
-
658
- return total_in_partition
659
-
660
-
661
- def extract_issue_metadata(issue):
662
- """
663
- Extract minimal issue metadata for efficient storage.
664
- Only keeps essential fields: html_url, created_at, closed_at, state_reason.
665
-
666
- Issue states:
667
- - state: "open" or "closed"
668
- - state_reason: "completed" (resolved), "not_planned" (closed as not planned), or None (still open)
669
- """
670
- created_at = issue.get('created_at')
671
- closed_at = issue.get('closed_at')
672
- state = issue.get('state')
673
- state_reason = issue.get('state_reason')
674
-
675
- return {
676
- 'html_url': issue.get('html_url'),
677
- 'created_at': created_at,
678
- 'closed_at': closed_at,
679
- 'state': state,
680
- 'state_reason': state_reason
681
- }
682
 
 
 
 
683
 
684
- def fetch_all_issues_metadata(identifier, agent_name, token_pool, use_parallel=True):
685
  """
686
- Fetch issues associated with a GitHub user or bot for the past LEADERBOARD_TIME_FRAME_DAYS.
687
- Returns lightweight metadata instead of full issue objects.
688
 
689
- This function employs time-based partitioning to navigate GitHub's 1000-result limit per query.
690
- It searches using multiple query patterns:
691
- - is:issue author:{identifier} (issues authored by the bot)
692
- - is:issue assignee:{identifier} (issues assigned to the bot)
693
 
694
  Args:
695
- identifier: GitHub username or bot identifier
696
- agent_name: Human-readable name of the agent for metadata purposes
697
- token_pool: TokenPool instance for rotating tokens
698
- use_parallel: Whether to use parallel execution (default: True)
699
 
700
  Returns:
701
- List of dictionaries containing minimal issue metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
702
  """
703
 
704
- # Define query patterns for issues:
705
- # 1) author pattern: issues authored by the identifier
706
- # 2) assignee pattern: issues assigned to the identifier
707
- stripped_id = identifier.replace('[bot]', '')
708
- query_patterns = []
709
-
710
- # Always add author and assignee pattern
711
- query_patterns.append(f'is:issue author:{identifier}')
712
- query_patterns.append(f'is:issue assignee:{identifier}')
713
- query_patterns.append(f'is:issue assignee:{stripped_id}')
714
-
715
- # Use a dict to deduplicate issues by ID
716
- issues_by_id = {}
717
-
718
- # Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
719
- current_time = datetime.now(timezone.utc)
720
- end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0) # 12:00 AM UTC today
721
- start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
722
-
723
- print(f"\n🔍 Fetching issues for {identifier}")
724
- print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')} (today excluded)")
725
- print(f" Query patterns: {len(query_patterns)}")
726
-
727
- total_start_time = time.time()
728
-
729
- # Use parallel execution if enabled and multiple patterns exist
730
- if use_parallel and len(query_patterns) > 1:
731
- try:
732
- print(f"\n 🚀 Using parallel execution for {len(query_patterns)} query patterns")
733
- total_found = fetch_issues_parallel(
734
- query_patterns,
735
- start_date,
736
- end_date,
737
- token_pool,
738
- issues_by_id
739
- )
740
- except Exception as e:
741
- print(f" ⚠️ Parallel execution failed, falling back to sequential: {str(e)}")
742
- use_parallel = False
743
-
744
- # Fall back to sequential if parallel is disabled or failed
745
- if not use_parallel or len(query_patterns) == 1:
746
- for query_pattern in query_patterns:
747
- print(f"\n🔍 Searching with query: {query_pattern}")
748
-
749
- pattern_start_time = time.time()
750
- initial_count = len(issues_by_id)
751
-
752
- # Fetch with time partitioning
753
- issues_found = fetch_issues_with_time_partition(
754
- query_pattern,
755
- start_date,
756
- end_date,
757
- token_pool,
758
- issues_by_id
759
- )
760
-
761
- pattern_duration = time.time() - pattern_start_time
762
- new_issues = len(issues_by_id) - initial_count
763
-
764
- print(f" ✓ Pattern complete: {new_issues} new issues found ({issues_found} total fetched, {len(issues_by_id) - initial_count - (issues_found - new_issues)} duplicates)")
765
- print(f" ⏱️ Time taken: {pattern_duration:.1f} seconds")
766
-
767
- time.sleep(1.0)
768
-
769
- total_duration = time.time() - total_start_time
770
- all_issues = list(issues_by_id.values())
771
-
772
- print(f"\n✅ COMPLETE: Found {len(all_issues)} unique issues for {identifier}")
773
- print(f" ⏱️ Total time: {total_duration:.1f} seconds")
774
- print(f"📦 Extracting minimal metadata...")
775
-
776
- metadata_list = [extract_issue_metadata(issue) for issue in all_issues]
777
 
778
- # Calculate memory savings
779
- import sys
780
- original_size = sys.getsizeof(str(all_issues))
781
- metadata_size = sys.getsizeof(str(metadata_list))
782
- savings_pct = ((original_size - metadata_size) / original_size * 100) if original_size > 0 else 0
783
 
784
- print(f"💾 Memory efficiency: {original_size // 1024}KB → {metadata_size // 1024}KB (saved {savings_pct:.1f}%)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
785
 
786
- return metadata_list
 
 
 
 
787
 
788
 
789
  # =============================================================================
@@ -812,128 +339,84 @@ def group_metadata_by_date(metadata_list):
812
  return dict(grouped)
813
 
814
 
815
- def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
816
- """
817
- Upload file to HuggingFace with exponential backoff retry logic.
818
- """
819
- delay = 2.0
820
-
821
- for attempt in range(max_retries):
822
- try:
823
- api.upload_file(
824
- path_or_fileobj=path_or_fileobj,
825
- path_in_repo=path_in_repo,
826
- repo_id=repo_id,
827
- repo_type=repo_type,
828
- token=token
829
- )
830
- if attempt > 0:
831
- print(f" ✓ Upload succeeded on attempt {attempt + 1}/{max_retries}")
832
- return True
833
-
834
- except Exception as e:
835
- if attempt < max_retries - 1:
836
- wait_time = delay + random.uniform(0, 1.0)
837
- print(f" ⚠️ Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
838
- print(f" ⏳ Retrying in {wait_time:.1f} seconds...")
839
- time.sleep(wait_time)
840
- delay = min(delay * 2, 60.0)
841
- else:
842
- print(f" ✗ Upload failed after {max_retries} attempts: {str(e)}")
843
- raise
844
-
845
-
846
  def save_issue_metadata_to_hf(metadata_list, agent_identifier):
847
  """
848
  Save issue metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
849
  Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
850
 
851
- This function APPENDS new metadata and DEDUPLICATES by html_url.
852
- Uses batch folder upload to minimize commits (1 commit per agent instead of 1 per file).
853
 
854
  Args:
855
  metadata_list: List of issue metadata dictionaries
856
  agent_identifier: GitHub identifier of the agent (used as folder name)
857
  """
858
- import tempfile
859
  import shutil
860
 
861
- temp_dir = None
862
  try:
863
  token = get_hf_token()
864
  if not token:
865
  raise Exception("No HuggingFace token found")
866
 
867
- api = HfApi()
868
 
869
- # Create temporary directory for batch upload
870
- temp_dir = tempfile.mkdtemp()
871
- agent_folder = os.path.join(temp_dir, agent_identifier)
872
- os.makedirs(agent_folder, exist_ok=True)
873
-
874
- # Group by exact date (year, month, day)
875
  grouped = group_metadata_by_date(metadata_list)
876
 
877
- print(f"📤 Preparing batch upload for {agent_identifier} ({len(grouped)} daily files)...")
 
 
878
 
879
- for (issue_year, month, day), day_metadata in grouped.items():
880
- filename = f"{agent_identifier}/{issue_year}.{month:02d}.{day:02d}.jsonl"
881
- local_filename = f"{issue_year}.{month:02d}.{day:02d}.jsonl"
882
- local_path = os.path.join(agent_folder, local_filename)
883
 
884
- print(f" Preparing {len(day_metadata)} issues for {filename}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
885
 
886
- # Download existing file if it exists
887
- existing_metadata = []
888
- try:
889
- file_path = hf_hub_download(
890
- repo_id=ISSUE_METADATA_REPO,
891
- filename=filename,
892
- repo_type="dataset",
893
- token=token
894
- )
895
- existing_metadata = load_jsonl(file_path)
896
- print(f" Found {len(existing_metadata)} existing issues in {filename}")
897
- except Exception:
898
- print(f" No existing file found for {filename}, creating new")
899
-
900
- # Merge and deduplicate by html_url
901
- existing_by_url = {meta['html_url']: meta for meta in existing_metadata if meta.get('html_url')}
902
- new_by_url = {meta['html_url']: meta for meta in day_metadata if meta.get('html_url')}
903
-
904
- # Update with new data (new data overwrites old)
905
- existing_by_url.update(new_by_url)
906
- merged_metadata = list(existing_by_url.values())
907
-
908
- # Save to temporary folder
909
- save_jsonl(local_path, merged_metadata)
910
- print(f" ✓ Prepared {len(merged_metadata)} total issues for {local_filename}")
911
-
912
- # Upload entire folder in a single commit
913
- print(f"📤 Uploading folder {agent_identifier} to HuggingFace (1 commit)...")
914
- api.upload_folder(
915
- folder_path=agent_folder,
916
- path_in_repo=agent_identifier,
917
- repo_id=ISSUE_METADATA_REPO,
918
- repo_type="dataset",
919
- token=token,
920
- commit_message=f"Update metadata for {agent_identifier}"
921
- )
922
- print(f" ✓ Successfully uploaded {len(grouped)} files in 1 commit")
923
 
924
- return True
 
 
 
925
 
926
  except Exception as e:
927
- print(f"✗ Error saving issue metadata: {str(e)}")
 
 
928
  return False
929
- finally:
930
- # Always clean up temporary directory
931
- if temp_dir and os.path.exists(temp_dir):
932
- shutil.rmtree(temp_dir)
933
 
934
 
935
  def load_agents_from_hf():
936
- """Load all agent metadata JSON files from HuggingFace dataset."""
 
 
 
 
937
  try:
938
  api = HfApi()
939
  agents = []
@@ -957,6 +440,16 @@ def load_agents_from_hf():
957
 
958
  with open(file_path, 'r') as f:
959
  agent_data = json.load(f)
 
 
 
 
 
 
 
 
 
 
960
  agents.append(agent_data)
961
 
962
  except Exception as e:
@@ -978,54 +471,95 @@ def load_agents_from_hf():
978
  def mine_all_agents():
979
  """
980
  Mine issue metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
 
981
  """
982
- # Load all GitHub tokens and create token pool
983
- tokens = get_github_tokens()
984
- token_pool = TokenPool(tokens)
985
-
986
  # Load agent metadata from HuggingFace
987
  agents = load_agents_from_hf()
988
  if not agents:
989
  print("No agents found in HuggingFace dataset")
990
  return
991
 
 
 
 
 
 
 
992
  print(f"\n{'='*80}")
993
- print(f"Starting issue metadata mining for {len(agents)} agents")
994
  print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
995
  print(f"{'='*80}\n")
996
 
997
- # Mine each agent
998
- for agent in agents:
 
 
 
999
  identifier = agent.get('github_identifier')
1000
- agent_name = agent.get('agent_name', 'Unknown')
1001
 
1002
  if not identifier:
1003
- print(f"Warning: Skipping agent without identifier: {agent}")
 
1004
  continue
1005
 
1006
- try:
1007
- print(f"\n{'='*80}")
1008
- print(f"Processing: {agent_name} ({identifier})")
1009
- print(f"{'='*80}")
1010
 
1011
- # Fetch issue metadata
1012
- metadata = fetch_all_issues_metadata(identifier, agent_name, token_pool)
1013
 
 
1014
  if metadata:
1015
- print(f"💾 Saving {len(metadata)} issue records...")
1016
- save_issue_metadata_to_hf(metadata, identifier)
1017
- print(f"✓ Successfully processed {agent_name}")
 
 
1018
  else:
1019
- print(f" No issues found for {agent_name}")
 
1020
 
1021
  except Exception as e:
1022
- print(f"✗ Error processing {identifier}: {str(e)}")
1023
  import traceback
1024
  traceback.print_exc()
 
1025
  continue
1026
 
1027
  print(f"\n{'='*80}")
1028
- print(f"✅ Mining complete for all agents")
 
 
 
 
 
1029
  print(f"{'='*80}\n")
1030
 
1031
 
 
1
  """
2
  Minimalist Issue Metadata Mining Script
3
+ Mines issue metadata from GitHub Archive via BigQuery and saves to HuggingFace dataset.
4
  """
5
 
6
  import json
7
  import os
8
+ import tempfile
 
9
  from datetime import datetime, timezone, timedelta
10
  from collections import defaultdict
11
  from huggingface_hub import HfApi, hf_hub_download
12
  from dotenv import load_dotenv
13
+ from google.cloud import bigquery
14
 
15
  # Load environment variables
16
  load_dotenv()
 
21
 
22
  AGENTS_REPO = "SWE-Arena/swe_agents"
23
  ISSUE_METADATA_REPO = "SWE-Arena/issue_metadata"
24
+ LEADERBOARD_TIME_FRAME_DAYS = 3 # Time frame for leaderboard
25
 
26
  # =============================================================================
27
  # UTILITY FUNCTIONS
 
51
  f.write(json.dumps(item) + '\n')
52
 
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def get_hf_token():
55
  """Get HuggingFace token from environment variables."""
56
  token = os.getenv('HF_TOKEN')
 
59
  return token
60
 
61
 
62
+ def get_bigquery_client():
 
 
 
 
63
  """
64
+ Initialize BigQuery client using credentials from environment variable.
 
 
 
 
 
65
 
66
+ Expects GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable containing
67
+ the service account JSON credentials as a string.
68
  """
69
+ # Get the JSON content from environment variable
70
+ creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
 
 
 
 
 
 
 
 
 
 
71
 
72
+ if creds_json:
73
+ # Create a temporary file to store credentials
74
+ with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
75
+ temp_file.write(creds_json)
76
+ temp_path = temp_file.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ # Set environment variable to point to temp file
79
+ os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path
80
 
81
+ # Initialize BigQuery client
82
+ client = bigquery.Client()
 
 
 
 
 
83
 
84
+ # Clean up temp file
85
+ os.unlink(temp_path)
86
 
87
+ return client
88
+ else:
89
+ raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
90
 
91
+
92
+ def generate_table_union_statements(start_date, end_date):
93
  """
94
+ Generate UNION ALL statements for githubarchive.day tables in date range.
95
 
96
  Args:
97
+ start_date: Start datetime
98
+ end_date: End datetime
 
 
 
99
 
100
  Returns:
101
+ String with UNION ALL SELECT statements for all tables in range
102
  """
103
+ table_names = []
104
+ current_date = start_date
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
+ while current_date < end_date:
107
+ table_name = f"`githubarchive.day.{current_date.strftime('%Y%m%d')}`"
108
+ table_names.append(table_name)
109
+ current_date += timedelta(days=1)
110
 
111
+ # Create UNION ALL chain
112
+ union_parts = [f"SELECT * FROM {table}" for table in table_names]
113
+ return " UNION ALL ".join(union_parts)
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
+ # =============================================================================
117
+ # BIGQUERY FUNCTIONS
118
+ # =============================================================================
119
 
120
def _login_variants(identifier):
    """Return the logins to match for an agent: the identifier itself plus its '[bot]'-less form."""
    stripped = identifier.replace('[bot]', '')
    if stripped == identifier:
        return (identifier,)
    return (identifier, stripped)


def _sql_string_literal(value):
    """Render a Python string as a single-quoted SQL string literal with backslash escapes."""
    escaped = value.replace('\\', '\\\\').replace("'", "\\'")
    return f"'{escaped}'"


def _to_iso(value):
    """Convert datetime-like values to ISO strings; pass anything else (str, None) through unchanged."""
    return value.isoformat() if hasattr(value, 'isoformat') else value


def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
    """
    Fetch issue metadata for ALL agents using ONE comprehensive BigQuery query.

    This query fetches IssuesEvent and IssueCommentEvent from GitHub Archive and
    deduplicates to get the latest state of each issue. Filters by issue author,
    commenter, or assignee. Each identifier is matched both verbatim and with any
    '[bot]' suffix removed; results for both forms are merged under the original
    identifier.

    Args:
        client: BigQuery client instance
        identifiers: List of GitHub usernames/bot identifiers
        start_date: Start datetime (timezone-aware)
        end_date: End datetime (timezone-aware)

    Returns:
        Dictionary mapping agent identifier to list of issue metadata:
        {
            'agent-identifier': [
                {
                    'url': Issue URL,
                    'created_at': Issue creation timestamp,
                    'closed_at': Close timestamp (if closed, else None),
                    'state_reason': Reason for closure (completed/not_planned/etc.)
                },
                ...
            ],
            ...
        }
        Agents with no matching issues are omitted. An empty dict is returned
        when `identifiers` is empty or when the query fails (the error is
        printed, not raised).
    """
    print(f"\n🔍 Querying BigQuery for ALL {len(identifiers)} agents in ONE QUERY")
    print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

    # An empty identifier list would render `IN ()`, which is invalid SQL;
    # skip the round trip entirely.
    if not identifiers:
        print(f" No agent identifiers supplied; nothing to query")
        return {}

    # Generate table UNION statements for issue events
    issue_tables = generate_table_union_statements(start_date, end_date)

    # Build identifier list for IN clause (handle both bot and non-bot versions).
    # Sorting keeps the query text stable between runs, and every login is
    # escaped so a quote in an identifier cannot break out of its literal.
    logins = sorted({
        login
        for identifier in identifiers
        for login in _login_variants(identifier)
    })
    identifier_list = ', '.join(_sql_string_literal(login) for login in logins)

    # Build comprehensive query with CTEs
    query = f"""
    WITH issue_events AS (
        -- Get all issue events and comment events for ALL agents
        SELECT
            JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') as url,
            JSON_EXTRACT_SCALAR(payload, '$.issue.created_at') as created_at,
            JSON_EXTRACT_SCALAR(payload, '$.issue.closed_at') as closed_at,
            JSON_EXTRACT_SCALAR(payload, '$.issue.state_reason') as state_reason,
            JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') as author,
            JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') as assignee,
            JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') as commenter,
            JSON_EXTRACT_SCALAR(payload, '$.issue.number') as issue_number,
            repo.name as repo_name,
            created_at as event_time
        FROM (
            {issue_tables}
        )
        WHERE
            type IN ('IssuesEvent', 'IssueCommentEvent')
            -- Exclude pull requests (they have pull_request field)
            AND JSON_EXTRACT(payload, '$.issue.pull_request') IS NULL
            AND JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') IS NOT NULL
            -- Filter by author OR commenter OR assignee
            AND (
                JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') IN ({identifier_list})
                OR JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') IN ({identifier_list})
                OR JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') IN ({identifier_list})
            )
    ),

    latest_states AS (
        -- Deduplicate to get latest state for each issue
        SELECT
            url,
            created_at,
            closed_at,
            state_reason,
            author,
            assignee,
            commenter
        FROM issue_events
        QUALIFY ROW_NUMBER() OVER (
            PARTITION BY repo_name, issue_number
            ORDER BY event_time DESC
        ) = 1
    ),

    agent_issues AS (
        -- Map each issue to its relevant agent(s)
        SELECT DISTINCT
            CASE
                WHEN author IN ({identifier_list}) THEN author
                WHEN commenter IN ({identifier_list}) THEN commenter
                WHEN assignee IN ({identifier_list}) THEN assignee
                ELSE NULL
            END as agent_identifier,
            url,
            created_at,
            closed_at,
            state_reason
        FROM latest_states
        WHERE
            author IN ({identifier_list})
            OR commenter IN ({identifier_list})
            OR assignee IN ({identifier_list})
    )

    SELECT
        agent_identifier,
        url,
        created_at,
        closed_at,
        state_reason
    FROM agent_issues
    WHERE agent_identifier IS NOT NULL
    ORDER BY agent_identifier, created_at DESC
    """

    # Calculate number of days for reporting
    query_days = (end_date - start_date).days

    print(f" Querying {query_days} days for issue and comment events...")
    print(f" Agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")

    try:
        query_job = client.query(query)
        results = list(query_job.result())

        print(f" ✓ Found {len(results)} total issue records across all agents")

        # Group results by the login BigQuery attributed each issue to
        metadata_by_login = defaultdict(list)
        for row in results:
            metadata_by_login[row.agent_identifier].append({
                'url': row.url,
                'created_at': _to_iso(row.created_at),
                'closed_at': _to_iso(row.closed_at),
                'state_reason': row.state_reason,
            })

        # Merge bot/non-bot versions under the caller's identifier. A fresh list
        # is built per agent so the grouped lists are never mutated, and each
        # login variant contributes exactly once (no double counting for
        # identifiers that have no '[bot]' suffix).
        final_metadata = {}
        for identifier in identifiers:
            if identifier in final_metadata:
                continue  # identifier listed more than once
            combined = [
                record
                for login in _login_variants(identifier)
                for record in metadata_by_login.get(login, [])
            ]
            if combined:
                final_metadata[identifier] = combined

        # Print breakdown by agent
        print(f"\n 📊 Results breakdown by agent:")
        for identifier, combined in final_metadata.items():
            count = len(combined)
            completed_count = sum(1 for m in combined if m['state_reason'] == 'completed')
            closed_count = sum(1 for m in combined if m['closed_at'] is not None)
            open_count = count - closed_count
            print(f" {identifier}: {count} issues ({completed_count} completed, {closed_count} closed, {open_count} open)")

        return final_metadata

    except Exception as e:
        # Best-effort boundary: report the failure and let the caller continue
        # with an empty result instead of aborting the whole mining run.
        print(f" ✗ BigQuery error: {str(e)}")
        import traceback
        traceback.print_exc()
        return {}
314
 
315
 
316
  # =============================================================================
 
339
  return dict(grouped)
340
 
341
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
def save_issue_metadata_to_hf(metadata_list, agent_identifier):
    """
    Save issue metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
    Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.

    This function OVERWRITES existing files completely with fresh data from BigQuery.
    All daily files are staged in a temporary directory and pushed with one
    `upload_large_folder` call to avoid rate limits.

    Args:
        metadata_list: List of issue metadata dictionaries
        agent_identifier: GitHub identifier of the agent (used as folder name)

    Returns:
        True when the upload call completed; False when there was nothing valid
        to save or when any step failed (the error is printed, not raised).
    """
    try:
        token = get_hf_token()
        if not token:
            raise Exception("No HuggingFace token found")

        api = HfApi(token=token)

        # Group by date (year, month, day)
        grouped = group_metadata_by_date(metadata_list)

        if not grouped:
            print(f" No valid metadata to save for {agent_identifier}")
            return False

        # The context manager removes the staging directory on every exit path,
        # including a failure while creating the agent sub-folder.
        with tempfile.TemporaryDirectory() as temp_dir:
            agent_folder = os.path.join(temp_dir, agent_identifier)
            os.makedirs(agent_folder, exist_ok=True)

            print(f" 📦 Preparing batch upload for {len(grouped)} daily files...")

            # Process each daily file
            for (issue_year, month, day), day_metadata in grouped.items():
                day_file = f"{issue_year}.{month:02d}.{day:02d}.jsonl"
                filename = f"{agent_identifier}/{day_file}"
                local_filename = os.path.join(agent_folder, day_file)

                # Newest first. `or ''` keeps the sort from raising when a
                # record carries created_at=None; sorted() leaves the caller's
                # lists untouched.
                ordered = sorted(
                    day_metadata,
                    key=lambda record: record.get('created_at') or '',
                    reverse=True,
                )

                # Save to temp directory (complete overwrite, no merging)
                save_jsonl(local_filename, ordered)
                print(f" Prepared {len(ordered)} issues for {filename}")

            # Upload entire folder using upload_large_folder (optimized for large files)
            # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
            print(f" 🤗 Uploading {len(grouped)} files ({len(metadata_list)} total issues)...")
            api.upload_large_folder(
                folder_path=temp_dir,
                repo_id=ISSUE_METADATA_REPO,
                repo_type="dataset"
            )
            print(f" ✓ Batch upload complete for {agent_identifier}")

            return True

    except Exception as e:
        # Best-effort boundary: callers rely on the boolean result, so report
        # the failure here instead of propagating it.
        print(f" ✗ Error saving issue metadata: {str(e)}")
        import traceback
        traceback.print_exc()
        return False
 
 
 
 
412
 
413
 
414
  def load_agents_from_hf():
415
+ """
416
+ Load all agent metadata JSON files from HuggingFace dataset.
417
+
418
+ The github_identifier is extracted from the filename (e.g., 'agent-name[bot].json' -> 'agent-name[bot]')
419
+ """
420
  try:
421
  api = HfApi()
422
  agents = []
 
440
 
441
  with open(file_path, 'r') as f:
442
  agent_data = json.load(f)
443
+
444
+ # Only process agents with status == "public"
445
+ if agent_data.get('status') != 'public':
446
+ print(f"Skipping {json_file}: status is not 'public'")
447
+ continue
448
+
449
+ # Extract github_identifier from filename (remove .json extension)
450
+ github_identifier = json_file.replace('.json', '')
451
+ agent_data['github_identifier'] = github_identifier
452
+
453
  agents.append(agent_data)
454
 
455
  except Exception as e:
 
471
def mine_all_agents():
    """
    Mine issue metadata for every agent over the last LEADERBOARD_TIME_FRAME_DAYS
    days and store the results on HuggingFace.

    The agent list is loaded from HuggingFace, a single BigQuery query is issued
    for all agents at once, and each agent's records are then saved individually.
    Progress and a final summary are printed; nothing is returned.
    """
    banner = '=' * 80

    # Load agent metadata from HuggingFace
    agents = load_agents_from_hf()
    if not agents:
        print("No agents found in HuggingFace dataset")
        return

    # Collect the identifiers that are actually set
    identifiers = []
    for agent in agents:
        login = agent.get('github_identifier')
        if login:
            identifiers.append(login)
    if not identifiers:
        print("No valid agent identifiers found")
        return

    print(f"\n{banner}")
    print(f"Starting issue metadata mining for {len(identifiers)} agents")
    print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
    print(f"Data source: BigQuery + GitHub Archive (ONE QUERY FOR ALL AGENTS)")
    print(f"{banner}\n")

    # Initialize BigQuery client
    try:
        client = get_bigquery_client()
    except Exception as e:
        print(f"✗ Failed to initialize BigQuery client: {e}")
        return

    # Time window: the past LEADERBOARD_TIME_FRAME_DAYS full days, ending at
    # today's UTC midnight (today itself is excluded)
    midnight_utc = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
    end_date = midnight_utc
    start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)

    try:
        all_metadata = fetch_all_issue_metadata_single_query(
            client, identifiers, start_date, end_date
        )
    except Exception as e:
        print(f"✗ Error during BigQuery fetch: {e}")
        import traceback
        traceback.print_exc()
        return

    # Save results for each agent
    print(f"\n{banner}")
    print(f"💾 Saving results to HuggingFace for each agent...")
    print(f"{banner}\n")

    total_agents = len(agents)
    success_count = 0
    error_count = 0
    no_data_count = 0

    for position, agent in enumerate(agents, start=1):
        identifier = agent.get('github_identifier')
        agent_name = agent.get('name', agent.get('agent_name', 'Unknown'))

        if not identifier:
            print(f"[{position}/{total_agents}] Skipping agent without identifier")
            error_count += 1
            continue

        metadata = all_metadata.get(identifier, [])

        print(f"[{position}/{total_agents}] {agent_name} ({identifier}):")

        try:
            if not metadata:
                print(f" No issues found")
                no_data_count += 1
                continue

            print(f" 💾 Saving {len(metadata)} issue records...")
            saved = save_issue_metadata_to_hf(metadata, identifier)
            if saved:
                success_count += 1
            else:
                error_count += 1

        except Exception as e:
            print(f" ✗ Error saving {identifier}: {e}")
            import traceback
            traceback.print_exc()
            error_count += 1

    print(f"\n{banner}")
    print(f"✅ Mining complete!")
    print(f" Total agents: {total_agents}")
    print(f" Successfully saved: {success_count}")
    print(f" No data (skipped): {no_data_count}")
    print(f" Errors: {error_count}")
    print(f" BigQuery queries executed: 1")
    print(f"{banner}\n")
564
 
565
 
requirements.txt CHANGED
@@ -1,5 +1,7 @@
1
  APScheduler
2
  datasets
 
 
3
  gradio
4
  gradio_leaderboard
5
  huggingface_hub
 
1
  APScheduler
2
  datasets
3
+ db-dtypes
4
+ google-cloud-bigquery
5
  gradio
6
  gradio_leaderboard
7
  huggingface_hub