zhiminy committed on
Commit
ee9e618
·
1 Parent(s): 37c1612
Files changed (2) hide show
  1. app.py +24 -26
  2. msr.py +3 -3
app.py CHANGED
@@ -220,7 +220,7 @@ def generate_table_union_statements(start_date, end_date):
220
  return " UNION ALL ".join(union_parts)
221
 
222
 
223
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100):
224
  """
225
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
226
 
@@ -232,7 +232,7 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
232
  identifiers: List of GitHub usernames/bot identifiers
233
  start_date: Start datetime (timezone-aware)
234
  end_date: End datetime (timezone-aware)
235
- batch_size: Number of agents per batch (default: 100)
236
 
237
  Returns:
238
  Dictionary mapping agent identifier to list of issue metadata
@@ -1283,9 +1283,7 @@ def save_leaderboard_and_metrics_to_hf():
1283
  def mine_all_agents():
1284
  """
1285
  Mine issue metadata for all agents within UPDATE_TIME_FRAME_DAYS and save to HuggingFace.
1286
- Uses ONE BigQuery query for ALL agents (most efficient approach).
1287
-
1288
- Runs periodically based on UPDATE_TIME_FRAME_DAYS (e.g., weekly).
1289
  """
1290
  # Load agent metadata from HuggingFace
1291
  agents = load_agents_from_hf()
@@ -1321,7 +1319,7 @@ def mine_all_agents():
1321
  try:
1322
  # Use batched approach for better performance
1323
  all_metadata = fetch_issue_metadata_batched(
1324
- client, identifiers, start_date, end_date, batch_size=100
1325
  )
1326
  except Exception as e:
1327
  print(f"✗ Error during BigQuery fetch: {str(e)}")
@@ -1370,7 +1368,7 @@ def mine_all_agents():
1370
  continue
1371
 
1372
  # Calculate number of batches executed
1373
- batch_size = 100
1374
  num_batches = (len(identifiers) + batch_size - 1) // batch_size
1375
 
1376
  print(f"\n{'='*80}")
@@ -1441,14 +1439,16 @@ def generate_color(index, total):
1441
  return f'hsl({hue}, {saturation}%, {lightness}%)'
1442
 
1443
 
1444
- def create_monthly_metrics_plot():
1445
  """
1446
  Create a Plotly figure with dual y-axes showing:
1447
  - Left y-axis: Resolved Rate (%) as line curves
1448
  - Right y-axis: Total Issues created as bar charts
1449
 
1450
  Each agent gets a unique color for both their line and bars.
1451
- Shows only top 5 agents by total issue count.
 
 
1452
  """
1453
  # Try to load from cache first
1454
  cached_data = load_cached_leaderboard_and_metrics()
@@ -1457,7 +1457,7 @@ def create_monthly_metrics_plot():
1457
  # Use cached monthly metrics
1458
  all_metrics = cached_data['monthly_metrics']
1459
 
1460
- # Filter to top 5 agents by total issue count
1461
  if all_metrics.get('agents') and all_metrics.get('data'):
1462
  # Calculate total issues for each agent
1463
  agent_totals = []
@@ -1465,9 +1465,9 @@ def create_monthly_metrics_plot():
1465
  total_issues = sum(all_metrics['data'][agent_name]['total_issues'])
1466
  agent_totals.append((agent_name, total_issues))
1467
 
1468
- # Sort and take top 5
1469
  agent_totals.sort(key=lambda x: x[1], reverse=True)
1470
- top_agents = [agent_name for agent_name, _ in agent_totals[:5]]
1471
 
1472
  # Filter metrics to only include top agents
1473
  metrics = {
@@ -1480,7 +1480,7 @@ def create_monthly_metrics_plot():
1480
  else:
1481
  # Fallback: Calculate from issue metadata
1482
  print(" Calculating monthly metrics from issue metadata...")
1483
- metrics = calculate_monthly_metrics_by_agent(top_n=5)
1484
 
1485
  if not metrics['agents'] or not metrics['months']:
1486
  # Return an empty figure with a message
@@ -1642,7 +1642,8 @@ def get_leaderboard_dataframe():
1642
  def submit_agent(identifier, agent_name, developer, website):
1643
  """
1644
  Submit a new agent to the leaderboard.
1645
- Validates input and saves submission. Issue data will be populated by daily incremental updates.
 
1646
  """
1647
  # Validate required fields
1648
  if not identifier or not identifier.strip():
@@ -1687,30 +1688,27 @@ def submit_agent(identifier, agent_name, developer, website):
1687
  return f"✅ Successfully submitted {agent_name}! Issue data will be populated by daily incremental updates.", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1688
 
1689
 
1690
- # =============================================================================
1691
- # BACKGROUND TASKS
1692
- # =============================================================================
1693
-
1694
-
1695
  # =============================================================================
1696
  # GRADIO APPLICATION
1697
  # =============================================================================
1698
 
1699
- # Start APScheduler for periodic issue mining via BigQuery
1700
- # NOTE: On app startup, we only LOAD existing cached data from HuggingFace
1701
- # Mining (BigQuery queries) ONLY happens on schedule (weekly on Mondays)
 
 
1702
  scheduler = BackgroundScheduler(timezone="UTC")
1703
  scheduler.add_job(
1704
  mine_all_agents,
1705
- trigger=CronTrigger(day_of_week='mon', hour=0, minute=0), # Every Monday at 12:00 AM UTC
1706
- id='periodic_bigquery_mining',
1707
- name='Periodic BigQuery Issue Mining',
1708
  replace_existing=True
1709
  )
1710
  scheduler.start()
1711
  print(f"\n{'='*80}")
1712
  print(f"✓ Scheduler initialized successfully")
1713
- print(f"⛏️ Mining schedule: Every Monday at 12:00 AM UTC")
1714
  print(f"📥 On startup: Only loads cached data from HuggingFace (no mining)")
1715
  print(f"{'='*80}\n")
1716
 
 
220
  return " UNION ALL ".join(union_parts)
221
 
222
 
223
+ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
224
  """
225
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
226
 
 
232
  identifiers: List of GitHub usernames/bot identifiers
233
  start_date: Start datetime (timezone-aware)
234
  end_date: End datetime (timezone-aware)
235
+ batch_size: Number of agents per batch (default: 50)
236
 
237
  Returns:
238
  Dictionary mapping agent identifier to list of issue metadata
 
1283
  def mine_all_agents():
1284
  """
1285
  Mine issue metadata for all agents within UPDATE_TIME_FRAME_DAYS and save to HuggingFace.
1286
+ Uses BATCHED BigQuery queries for all agents (efficient approach).
 
 
1287
  """
1288
  # Load agent metadata from HuggingFace
1289
  agents = load_agents_from_hf()
 
1319
  try:
1320
  # Use batched approach for better performance
1321
  all_metadata = fetch_issue_metadata_batched(
1322
+ client, identifiers, start_date, end_date, batch_size=50
1323
  )
1324
  except Exception as e:
1325
  print(f"✗ Error during BigQuery fetch: {str(e)}")
 
1368
  continue
1369
 
1370
  # Calculate number of batches executed
1371
+ batch_size = 50
1372
  num_batches = (len(identifiers) + batch_size - 1) // batch_size
1373
 
1374
  print(f"\n{'='*80}")
 
1439
  return f'hsl({hue}, {saturation}%, {lightness}%)'
1440
 
1441
 
1442
+ def create_monthly_metrics_plot(top_n=5):
1443
  """
1444
  Create a Plotly figure with dual y-axes showing:
1445
  - Left y-axis: Resolved Rate (%) as line curves
1446
  - Right y-axis: Total Issues created as bar charts
1447
 
1448
  Each agent gets a unique color for both their line and bars.
1449
+
1450
+ Args:
1451
+ top_n: Number of top agents to show (default: 5)
1452
  """
1453
  # Try to load from cache first
1454
  cached_data = load_cached_leaderboard_and_metrics()
 
1457
  # Use cached monthly metrics
1458
  all_metrics = cached_data['monthly_metrics']
1459
 
1460
+ # Filter to top_n agents by total issue count
1461
  if all_metrics.get('agents') and all_metrics.get('data'):
1462
  # Calculate total issues for each agent
1463
  agent_totals = []
 
1465
  total_issues = sum(all_metrics['data'][agent_name]['total_issues'])
1466
  agent_totals.append((agent_name, total_issues))
1467
 
1468
+ # Sort and take top_n agents
1469
  agent_totals.sort(key=lambda x: x[1], reverse=True)
1470
+ top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
1471
 
1472
  # Filter metrics to only include top agents
1473
  metrics = {
 
1480
  else:
1481
  # Fallback: Calculate from issue metadata
1482
  print(" Calculating monthly metrics from issue metadata...")
1483
+ metrics = calculate_monthly_metrics_by_agent(top_n=top_n)
1484
 
1485
  if not metrics['agents'] or not metrics['months']:
1486
  # Return an empty figure with a message
 
1642
  def submit_agent(identifier, agent_name, developer, website):
1643
  """
1644
  Submit a new agent to the leaderboard.
1645
+ Validates input and saves submission.
1646
+ Issue data will be populated by the monthly mining task.
1647
  """
1648
  # Validate required fields
1649
  if not identifier or not identifier.strip():
 
1688
  return f"✅ Successfully submitted {agent_name}! Issue data will be populated by daily incremental updates.", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1689
 
1690
 
 
 
 
 
 
1691
  # =============================================================================
1692
  # GRADIO APPLICATION
1693
  # =============================================================================
1694
 
1695
+ print(f"\n🚀 Starting SWE Agent PR Leaderboard")
1696
+ print(f" Leaderboard time frame: {LEADERBOARD_TIME_FRAME_DAYS} days ({LEADERBOARD_TIME_FRAME_DAYS // 30} months)")
1697
+ print(f" Mining update frequency: Every {UPDATE_TIME_FRAME_DAYS} days\n")
1698
+
1699
+ # Start APScheduler for monthly issue mining at 12:00 AM UTC on the 1st of each month
1700
  scheduler = BackgroundScheduler(timezone="UTC")
1701
  scheduler.add_job(
1702
  mine_all_agents,
1703
+ trigger=CronTrigger(day=1, hour=0, minute=0), # 12:00 AM UTC every 1st of the month
1704
+ id='monthly_issue_mining',
1705
+ name='Monthly Issue Mining',
1706
  replace_existing=True
1707
  )
1708
  scheduler.start()
1709
  print(f"\n{'='*80}")
1710
  print(f"✓ Scheduler initialized successfully")
1711
+ print(f"⛏️ Mining schedule: Every 1st of the month at 12:00 AM UTC")
1712
  print(f"📥 On startup: Only loads cached data from HuggingFace (no mining)")
1713
  print(f"{'='*80}\n")
1714
 
msr.py CHANGED
@@ -176,7 +176,7 @@ def generate_table_union_statements(start_date, end_date):
176
  # BIGQUERY FUNCTIONS
177
  # =============================================================================
178
 
179
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100):
180
  """
181
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
182
 
@@ -859,7 +859,7 @@ def mine_all_agents():
859
  try:
860
  # Use batched approach for better performance
861
  all_metadata = fetch_issue_metadata_batched(
862
- client, identifiers, start_date, end_date, batch_size=100
863
  )
864
  except Exception as e:
865
  print(f"✗ Error during BigQuery fetch: {str(e)}")
@@ -908,7 +908,7 @@ def mine_all_agents():
908
  continue
909
 
910
  # Calculate number of batches executed
911
- batch_size = 100
912
  num_batches = (len(identifiers) + batch_size - 1) // batch_size
913
 
914
  print(f"\n{'='*80}")
 
176
  # BIGQUERY FUNCTIONS
177
  # =============================================================================
178
 
179
+ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
180
  """
181
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
182
 
 
859
  try:
860
  # Use batched approach for better performance
861
  all_metadata = fetch_issue_metadata_batched(
862
+ client, identifiers, start_date, end_date, batch_size=50
863
  )
864
  except Exception as e:
865
  print(f"✗ Error during BigQuery fetch: {str(e)}")
 
908
  continue
909
 
910
  # Calculate number of batches executed
911
+ batch_size = 50
912
  num_batches = (len(identifiers) + batch_size - 1) // batch_size
913
 
914
  print(f"\n{'='*80}")