zhiminy committed on
Commit
725070c
·
1 Parent(s): ee9e618

refine workflow

Browse files
Files changed (2) hide show
  1. app.py +32 -55
  2. msr.py +33 -56
app.py CHANGED
@@ -220,7 +220,7 @@ def generate_table_union_statements(start_date, end_date):
220
  return " UNION ALL ".join(union_parts)
221
 
222
 
223
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
224
  """
225
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
226
 
@@ -233,12 +233,14 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
233
  start_date: Start datetime (timezone-aware)
234
  end_date: End datetime (timezone-aware)
235
  batch_size: Number of agents per batch (default: 50)
 
236
 
237
  Returns:
238
  Dictionary mapping agent identifier to list of issue metadata
239
  """
240
  print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
241
  print(f" Batch size: {batch_size} agents per query")
 
242
 
243
  # Split identifiers into batches
244
  batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
@@ -266,6 +268,21 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
266
 
267
  print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  except Exception as e:
270
  print(f" ✗ Batch {batch_num} failed: {str(e)}")
271
  print(f" Continuing with remaining batches...")
@@ -1318,68 +1335,28 @@ def mine_all_agents():
1318
 
1319
  try:
1320
  # Use batched approach for better performance
 
1321
  all_metadata = fetch_issue_metadata_batched(
1322
- client, identifiers, start_date, end_date, batch_size=50
1323
  )
 
 
 
 
 
 
 
 
 
 
 
 
1324
  except Exception as e:
1325
  print(f"✗ Error during BigQuery fetch: {str(e)}")
1326
  import traceback
1327
  traceback.print_exc()
1328
  return
1329
 
1330
- # Save results for each agent
1331
- print(f"\n{'='*80}")
1332
- print(f"💾 Saving results to HuggingFace for each agent...")
1333
- print(f"{'='*80}\n")
1334
-
1335
- success_count = 0
1336
- error_count = 0
1337
- no_data_count = 0
1338
-
1339
- for i, agent in enumerate(agents, 1):
1340
- identifier = agent.get('github_identifier')
1341
- agent_name = agent.get('name', 'Unknown')
1342
-
1343
- if not identifier:
1344
- print(f"[{i}/{len(agents)}] Skipping agent without identifier")
1345
- error_count += 1
1346
- continue
1347
-
1348
- metadata = all_metadata.get(identifier, [])
1349
-
1350
- print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
1351
-
1352
- try:
1353
- if metadata:
1354
- print(f" 💾 Saving {len(metadata)} issue records...")
1355
- if save_issue_metadata_to_hf(metadata, identifier):
1356
- success_count += 1
1357
- else:
1358
- error_count += 1
1359
- else:
1360
- print(f" No issues found")
1361
- no_data_count += 1
1362
-
1363
- except Exception as e:
1364
- print(f" ✗ Error saving {identifier}: {str(e)}")
1365
- import traceback
1366
- traceback.print_exc()
1367
- error_count += 1
1368
- continue
1369
-
1370
- # Calculate number of batches executed
1371
- batch_size = 50
1372
- num_batches = (len(identifiers) + batch_size - 1) // batch_size
1373
-
1374
- print(f"\n{'='*80}")
1375
- print(f"✅ Mining complete!")
1376
- print(f" Total agents: {len(agents)}")
1377
- print(f" Successfully saved: {success_count}")
1378
- print(f" No data (skipped): {no_data_count}")
1379
- print(f" Errors: {error_count}")
1380
- print(f" BigQuery batches executed: {num_batches} (batch size: {batch_size})")
1381
- print(f"{'='*80}\n")
1382
-
1383
  # After mining is complete, save leaderboard and metrics to HuggingFace
1384
  print(f"📤 Uploading leaderboard and metrics data...")
1385
  if save_leaderboard_and_metrics_to_hf():
 
220
  return " UNION ALL ".join(union_parts)
221
 
222
 
223
+ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
224
  """
225
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
226
 
 
233
  start_date: Start datetime (timezone-aware)
234
  end_date: End datetime (timezone-aware)
235
  batch_size: Number of agents per batch (default: 50)
236
+ upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
237
 
238
  Returns:
239
  Dictionary mapping agent identifier to list of issue metadata
240
  """
241
  print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
242
  print(f" Batch size: {batch_size} agents per query")
243
+ print(f" Upload mode: {'Immediate (after each batch)' if upload_immediately else 'Deferred (after all batches)'}")
244
 
245
  # Split identifiers into batches
246
  batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
 
268
 
269
  print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
270
 
271
+ # Upload immediately after this batch if enabled
272
+ if upload_immediately and batch_results:
273
+ print(f"\n 🤗 Uploading batch {batch_num}/{len(batches)} results to HuggingFace...")
274
+ upload_success = 0
275
+ upload_errors = 0
276
+
277
+ for identifier, metadata_list in batch_results.items():
278
+ if metadata_list:
279
+ if save_issue_metadata_to_hf(metadata_list, identifier):
280
+ upload_success += 1
281
+ else:
282
+ upload_errors += 1
283
+
284
+ print(f" ✓ Batch {batch_num}/{len(batches)} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
285
+
286
  except Exception as e:
287
  print(f" ✗ Batch {batch_num} failed: {str(e)}")
288
  print(f" Continuing with remaining batches...")
 
1335
 
1336
  try:
1337
  # Use batched approach for better performance
1338
+ # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
1339
  all_metadata = fetch_issue_metadata_batched(
1340
+ client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
1341
  )
1342
+
1343
+ # Calculate summary statistics
1344
+ total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
1345
+ agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
1346
+
1347
+ print(f"\n{'='*80}")
1348
+ print(f"✅ BigQuery mining and upload complete!")
1349
+ print(f" Total agents: {len(agents)}")
1350
+ print(f" Agents with data: {agents_with_data}")
1351
+ print(f" Total PRs found: {total_prs}")
1352
+ print(f"{'='*80}\n")
1353
+
1354
  except Exception as e:
1355
  print(f"✗ Error during BigQuery fetch: {str(e)}")
1356
  import traceback
1357
  traceback.print_exc()
1358
  return
1359
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1360
  # After mining is complete, save leaderboard and metrics to HuggingFace
1361
  print(f"📤 Uploading leaderboard and metrics data...")
1362
  if save_leaderboard_and_metrics_to_hf():
msr.py CHANGED
@@ -176,7 +176,7 @@ def generate_table_union_statements(start_date, end_date):
176
  # BIGQUERY FUNCTIONS
177
  # =============================================================================
178
 
179
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
180
  """
181
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
182
 
@@ -188,13 +188,15 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
188
  identifiers: List of GitHub usernames/bot identifiers
189
  start_date: Start datetime (timezone-aware)
190
  end_date: End datetime (timezone-aware)
191
- batch_size: Number of agents per batch (default: 100)
 
192
 
193
  Returns:
194
  Dictionary mapping agent identifier to list of issue metadata
195
  """
196
  print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
197
  print(f" Batch size: {batch_size} agents per query")
 
198
 
199
  # Split identifiers into batches
200
  batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
@@ -222,6 +224,21 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
222
 
223
  print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  except Exception as e:
226
  print(f" ✗ Batch {batch_num} failed: {str(e)}")
227
  print(f" Continuing with remaining batches...")
@@ -858,68 +875,28 @@ def mine_all_agents():
858
 
859
  try:
860
  # Use batched approach for better performance
 
861
  all_metadata = fetch_issue_metadata_batched(
862
- client, identifiers, start_date, end_date, batch_size=50
863
  )
 
 
 
 
 
 
 
 
 
 
 
 
864
  except Exception as e:
865
  print(f"✗ Error during BigQuery fetch: {str(e)}")
866
  import traceback
867
  traceback.print_exc()
868
  return
869
 
870
- # Save results for each agent
871
- print(f"\n{'='*80}")
872
- print(f"💾 Saving results to HuggingFace for each agent...")
873
- print(f"{'='*80}\n")
874
-
875
- success_count = 0
876
- error_count = 0
877
- no_data_count = 0
878
-
879
- for i, agent in enumerate(agents, 1):
880
- identifier = agent.get('github_identifier')
881
- agent_name = agent.get('name', 'Unknown')
882
-
883
- if not identifier:
884
- print(f"[{i}/{len(agents)}] Skipping agent without identifier")
885
- error_count += 1
886
- continue
887
-
888
- metadata = all_metadata.get(identifier, [])
889
-
890
- print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
891
-
892
- try:
893
- if metadata:
894
- print(f" 💾 Saving {len(metadata)} issue records...")
895
- if save_issue_metadata_to_hf(metadata, identifier):
896
- success_count += 1
897
- else:
898
- error_count += 1
899
- else:
900
- print(f" No issues found")
901
- no_data_count += 1
902
-
903
- except Exception as e:
904
- print(f" ✗ Error saving {identifier}: {str(e)}")
905
- import traceback
906
- traceback.print_exc()
907
- error_count += 1
908
- continue
909
-
910
- # Calculate number of batches executed
911
- batch_size = 50
912
- num_batches = (len(identifiers) + batch_size - 1) // batch_size
913
-
914
- print(f"\n{'='*80}")
915
- print(f"✅ Mining complete!")
916
- print(f" Total agents: {len(agents)}")
917
- print(f" Successfully saved: {success_count}")
918
- print(f" No data (skipped): {no_data_count}")
919
- print(f" Errors: {error_count}")
920
- print(f" BigQuery batches executed: {num_batches} (batch size: {batch_size})")
921
- print(f"{'='*80}\n")
922
-
923
  # After mining is complete, save leaderboard and metrics to HuggingFace
924
  print(f"📤 Uploading leaderboard and metrics data...")
925
  if save_leaderboard_and_metrics_to_hf(all_metadata, agents):
 
176
  # BIGQUERY FUNCTIONS
177
  # =============================================================================
178
 
179
+ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
180
  """
181
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
182
 
 
188
  identifiers: List of GitHub usernames/bot identifiers
189
  start_date: Start datetime (timezone-aware)
190
  end_date: End datetime (timezone-aware)
191
+ batch_size: Number of agents per batch (default: 50)
192
+ upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
193
 
194
  Returns:
195
  Dictionary mapping agent identifier to list of issue metadata
196
  """
197
  print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
198
  print(f" Batch size: {batch_size} agents per query")
199
+ print(f" Upload mode: {'Immediate (after each batch)' if upload_immediately else 'Deferred (after all batches)'}")
200
 
201
  # Split identifiers into batches
202
  batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
 
224
 
225
  print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
226
 
227
+ # Upload immediately after this batch if enabled
228
+ if upload_immediately and batch_results:
229
+ print(f"\n 🤗 Uploading batch {batch_num}/{len(batches)} results to HuggingFace...")
230
+ upload_success = 0
231
+ upload_errors = 0
232
+
233
+ for identifier, metadata_list in batch_results.items():
234
+ if metadata_list:
235
+ if save_issue_metadata_to_hf(metadata_list, identifier):
236
+ upload_success += 1
237
+ else:
238
+ upload_errors += 1
239
+
240
+ print(f" ✓ Batch {batch_num}/{len(batches)} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
241
+
242
  except Exception as e:
243
  print(f" ✗ Batch {batch_num} failed: {str(e)}")
244
  print(f" Continuing with remaining batches...")
 
875
 
876
  try:
877
  # Use batched approach for better performance
878
+ # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
879
  all_metadata = fetch_issue_metadata_batched(
880
+ client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
881
  )
882
+
883
+ # Calculate summary statistics
884
+ total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
885
+ agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
886
+
887
+ print(f"\n{'='*80}")
888
+ print(f"✅ BigQuery mining and upload complete!")
889
+ print(f" Total agents: {len(agents)}")
890
+ print(f" Agents with data: {agents_with_data}")
891
+ print(f" Total PRs found: {total_prs}")
892
+ print(f"{'='*80}\n")
893
+
894
  except Exception as e:
895
  print(f"✗ Error during BigQuery fetch: {str(e)}")
896
  import traceback
897
  traceback.print_exc()
898
  return
899
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
900
  # After mining is complete, save leaderboard and metrics to HuggingFace
901
  print(f"📤 Uploading leaderboard and metrics data...")
902
  if save_leaderboard_and_metrics_to_hf(all_metadata, agents):