refine workflow
Browse files
app.py
CHANGED
|
@@ -220,7 +220,7 @@ def generate_table_union_statements(start_date, end_date):
|
|
| 220 |
return " UNION ALL ".join(union_parts)
|
| 221 |
|
| 222 |
|
| 223 |
-
def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
|
| 224 |
"""
|
| 225 |
Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
|
| 226 |
|
|
@@ -233,12 +233,14 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
|
|
| 233 |
start_date: Start datetime (timezone-aware)
|
| 234 |
end_date: End datetime (timezone-aware)
|
| 235 |
batch_size: Number of agents per batch (default: 50)
|
|
|
|
| 236 |
|
| 237 |
Returns:
|
| 238 |
Dictionary mapping agent identifier to list of issue metadata
|
| 239 |
"""
|
| 240 |
print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
|
| 241 |
print(f" Batch size: {batch_size} agents per query")
|
|
|
|
| 242 |
|
| 243 |
# Split identifiers into batches
|
| 244 |
batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
|
|
@@ -266,6 +268,21 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
|
|
| 266 |
|
| 267 |
print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
|
| 268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
except Exception as e:
|
| 270 |
print(f" ✗ Batch {batch_num} failed: {str(e)}")
|
| 271 |
print(f" Continuing with remaining batches...")
|
|
@@ -1318,68 +1335,28 @@ def mine_all_agents():
|
|
| 1318 |
|
| 1319 |
try:
|
| 1320 |
# Use batched approach for better performance
|
|
|
|
| 1321 |
all_metadata = fetch_issue_metadata_batched(
|
| 1322 |
-
client, identifiers, start_date, end_date, batch_size=50
|
| 1323 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1324 |
except Exception as e:
|
| 1325 |
print(f"✗ Error during BigQuery fetch: {str(e)}")
|
| 1326 |
import traceback
|
| 1327 |
traceback.print_exc()
|
| 1328 |
return
|
| 1329 |
|
| 1330 |
-
# Save results for each agent
|
| 1331 |
-
print(f"\n{'='*80}")
|
| 1332 |
-
print(f"💾 Saving results to HuggingFace for each agent...")
|
| 1333 |
-
print(f"{'='*80}\n")
|
| 1334 |
-
|
| 1335 |
-
success_count = 0
|
| 1336 |
-
error_count = 0
|
| 1337 |
-
no_data_count = 0
|
| 1338 |
-
|
| 1339 |
-
for i, agent in enumerate(agents, 1):
|
| 1340 |
-
identifier = agent.get('github_identifier')
|
| 1341 |
-
agent_name = agent.get('name', 'Unknown')
|
| 1342 |
-
|
| 1343 |
-
if not identifier:
|
| 1344 |
-
print(f"[{i}/{len(agents)}] Skipping agent without identifier")
|
| 1345 |
-
error_count += 1
|
| 1346 |
-
continue
|
| 1347 |
-
|
| 1348 |
-
metadata = all_metadata.get(identifier, [])
|
| 1349 |
-
|
| 1350 |
-
print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
|
| 1351 |
-
|
| 1352 |
-
try:
|
| 1353 |
-
if metadata:
|
| 1354 |
-
print(f" 💾 Saving {len(metadata)} issue records...")
|
| 1355 |
-
if save_issue_metadata_to_hf(metadata, identifier):
|
| 1356 |
-
success_count += 1
|
| 1357 |
-
else:
|
| 1358 |
-
error_count += 1
|
| 1359 |
-
else:
|
| 1360 |
-
print(f" No issues found")
|
| 1361 |
-
no_data_count += 1
|
| 1362 |
-
|
| 1363 |
-
except Exception as e:
|
| 1364 |
-
print(f" ✗ Error saving {identifier}: {str(e)}")
|
| 1365 |
-
import traceback
|
| 1366 |
-
traceback.print_exc()
|
| 1367 |
-
error_count += 1
|
| 1368 |
-
continue
|
| 1369 |
-
|
| 1370 |
-
# Calculate number of batches executed
|
| 1371 |
-
batch_size = 50
|
| 1372 |
-
num_batches = (len(identifiers) + batch_size - 1) // batch_size
|
| 1373 |
-
|
| 1374 |
-
print(f"\n{'='*80}")
|
| 1375 |
-
print(f"✅ Mining complete!")
|
| 1376 |
-
print(f" Total agents: {len(agents)}")
|
| 1377 |
-
print(f" Successfully saved: {success_count}")
|
| 1378 |
-
print(f" No data (skipped): {no_data_count}")
|
| 1379 |
-
print(f" Errors: {error_count}")
|
| 1380 |
-
print(f" BigQuery batches executed: {num_batches} (batch size: {batch_size})")
|
| 1381 |
-
print(f"{'='*80}\n")
|
| 1382 |
-
|
| 1383 |
# After mining is complete, save leaderboard and metrics to HuggingFace
|
| 1384 |
print(f"📤 Uploading leaderboard and metrics data...")
|
| 1385 |
if save_leaderboard_and_metrics_to_hf():
|
|
|
|
| 220 |
return " UNION ALL ".join(union_parts)
|
| 221 |
|
| 222 |
|
| 223 |
+
def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
|
| 224 |
"""
|
| 225 |
Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
|
| 226 |
|
|
|
|
| 233 |
start_date: Start datetime (timezone-aware)
|
| 234 |
end_date: End datetime (timezone-aware)
|
| 235 |
batch_size: Number of agents per batch (default: 50)
|
| 236 |
+
upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
|
| 237 |
|
| 238 |
Returns:
|
| 239 |
Dictionary mapping agent identifier to list of issue metadata
|
| 240 |
"""
|
| 241 |
print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
|
| 242 |
print(f" Batch size: {batch_size} agents per query")
|
| 243 |
+
print(f" Upload mode: {'Immediate (after each batch)' if upload_immediately else 'Deferred (after all batches)'}")
|
| 244 |
|
| 245 |
# Split identifiers into batches
|
| 246 |
batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
|
|
|
|
| 268 |
|
| 269 |
print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
|
| 270 |
|
| 271 |
+
# Upload immediately after this batch if enabled
|
| 272 |
+
if upload_immediately and batch_results:
|
| 273 |
+
print(f"\n 🤗 Uploading batch {batch_num}/{len(batches)} results to HuggingFace...")
|
| 274 |
+
upload_success = 0
|
| 275 |
+
upload_errors = 0
|
| 276 |
+
|
| 277 |
+
for identifier, metadata_list in batch_results.items():
|
| 278 |
+
if metadata_list:
|
| 279 |
+
if save_issue_metadata_to_hf(metadata_list, identifier):
|
| 280 |
+
upload_success += 1
|
| 281 |
+
else:
|
| 282 |
+
upload_errors += 1
|
| 283 |
+
|
| 284 |
+
print(f" ✓ Batch {batch_num}/{len(batches)} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
|
| 285 |
+
|
| 286 |
except Exception as e:
|
| 287 |
print(f" ✗ Batch {batch_num} failed: {str(e)}")
|
| 288 |
print(f" Continuing with remaining batches...")
|
|
|
|
| 1335 |
|
| 1336 |
try:
|
| 1337 |
# Use batched approach for better performance
|
| 1338 |
+
# upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
|
| 1339 |
all_metadata = fetch_issue_metadata_batched(
|
| 1340 |
+
client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
|
| 1341 |
)
|
| 1342 |
+
|
| 1343 |
+
# Calculate summary statistics
|
| 1344 |
+
total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
|
| 1345 |
+
agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
|
| 1346 |
+
|
| 1347 |
+
print(f"\n{'='*80}")
|
| 1348 |
+
print(f"✅ BigQuery mining and upload complete!")
|
| 1349 |
+
print(f" Total agents: {len(agents)}")
|
| 1350 |
+
print(f" Agents with data: {agents_with_data}")
|
| 1351 |
+
print(f" Total PRs found: {total_prs}")
|
| 1352 |
+
print(f"{'='*80}\n")
|
| 1353 |
+
|
| 1354 |
except Exception as e:
|
| 1355 |
print(f"✗ Error during BigQuery fetch: {str(e)}")
|
| 1356 |
import traceback
|
| 1357 |
traceback.print_exc()
|
| 1358 |
return
|
| 1359 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1360 |
# After mining is complete, save leaderboard and metrics to HuggingFace
|
| 1361 |
print(f"📤 Uploading leaderboard and metrics data...")
|
| 1362 |
if save_leaderboard_and_metrics_to_hf():
|
msr.py
CHANGED
|
@@ -176,7 +176,7 @@ def generate_table_union_statements(start_date, end_date):
|
|
| 176 |
# BIGQUERY FUNCTIONS
|
| 177 |
# =============================================================================
|
| 178 |
|
| 179 |
-
def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
|
| 180 |
"""
|
| 181 |
Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
|
| 182 |
|
|
@@ -188,13 +188,15 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
|
|
| 188 |
identifiers: List of GitHub usernames/bot identifiers
|
| 189 |
start_date: Start datetime (timezone-aware)
|
| 190 |
end_date: End datetime (timezone-aware)
|
| 191 |
-
batch_size: Number of agents per batch (default: 50)
|
|
|
|
| 192 |
|
| 193 |
Returns:
|
| 194 |
Dictionary mapping agent identifier to list of issue metadata
|
| 195 |
"""
|
| 196 |
print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
|
| 197 |
print(f" Batch size: {batch_size} agents per query")
|
|
|
|
| 198 |
|
| 199 |
# Split identifiers into batches
|
| 200 |
batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
|
|
@@ -222,6 +224,21 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
|
|
| 222 |
|
| 223 |
print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
|
| 224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
except Exception as e:
|
| 226 |
print(f" ✗ Batch {batch_num} failed: {str(e)}")
|
| 227 |
print(f" Continuing with remaining batches...")
|
|
@@ -858,68 +875,28 @@ def mine_all_agents():
|
|
| 858 |
|
| 859 |
try:
|
| 860 |
# Use batched approach for better performance
|
|
|
|
| 861 |
all_metadata = fetch_issue_metadata_batched(
|
| 862 |
-
client, identifiers, start_date, end_date, batch_size=50
|
| 863 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 864 |
except Exception as e:
|
| 865 |
print(f"✗ Error during BigQuery fetch: {str(e)}")
|
| 866 |
import traceback
|
| 867 |
traceback.print_exc()
|
| 868 |
return
|
| 869 |
|
| 870 |
-
# Save results for each agent
|
| 871 |
-
print(f"\n{'='*80}")
|
| 872 |
-
print(f"💾 Saving results to HuggingFace for each agent...")
|
| 873 |
-
print(f"{'='*80}\n")
|
| 874 |
-
|
| 875 |
-
success_count = 0
|
| 876 |
-
error_count = 0
|
| 877 |
-
no_data_count = 0
|
| 878 |
-
|
| 879 |
-
for i, agent in enumerate(agents, 1):
|
| 880 |
-
identifier = agent.get('github_identifier')
|
| 881 |
-
agent_name = agent.get('name', 'Unknown')
|
| 882 |
-
|
| 883 |
-
if not identifier:
|
| 884 |
-
print(f"[{i}/{len(agents)}] Skipping agent without identifier")
|
| 885 |
-
error_count += 1
|
| 886 |
-
continue
|
| 887 |
-
|
| 888 |
-
metadata = all_metadata.get(identifier, [])
|
| 889 |
-
|
| 890 |
-
print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
|
| 891 |
-
|
| 892 |
-
try:
|
| 893 |
-
if metadata:
|
| 894 |
-
print(f" 💾 Saving {len(metadata)} issue records...")
|
| 895 |
-
if save_issue_metadata_to_hf(metadata, identifier):
|
| 896 |
-
success_count += 1
|
| 897 |
-
else:
|
| 898 |
-
error_count += 1
|
| 899 |
-
else:
|
| 900 |
-
print(f" No issues found")
|
| 901 |
-
no_data_count += 1
|
| 902 |
-
|
| 903 |
-
except Exception as e:
|
| 904 |
-
print(f" ✗ Error saving {identifier}: {str(e)}")
|
| 905 |
-
import traceback
|
| 906 |
-
traceback.print_exc()
|
| 907 |
-
error_count += 1
|
| 908 |
-
continue
|
| 909 |
-
|
| 910 |
-
# Calculate number of batches executed
|
| 911 |
-
batch_size = 50
|
| 912 |
-
num_batches = (len(identifiers) + batch_size - 1) // batch_size
|
| 913 |
-
|
| 914 |
-
print(f"\n{'='*80}")
|
| 915 |
-
print(f"✅ Mining complete!")
|
| 916 |
-
print(f" Total agents: {len(agents)}")
|
| 917 |
-
print(f" Successfully saved: {success_count}")
|
| 918 |
-
print(f" No data (skipped): {no_data_count}")
|
| 919 |
-
print(f" Errors: {error_count}")
|
| 920 |
-
print(f" BigQuery batches executed: {num_batches} (batch size: {batch_size})")
|
| 921 |
-
print(f"{'='*80}\n")
|
| 922 |
-
|
| 923 |
# After mining is complete, save leaderboard and metrics to HuggingFace
|
| 924 |
print(f"📤 Uploading leaderboard and metrics data...")
|
| 925 |
if save_leaderboard_and_metrics_to_hf(all_metadata, agents):
|
|
|
|
| 176 |
# BIGQUERY FUNCTIONS
|
| 177 |
# =============================================================================
|
| 178 |
|
| 179 |
+
def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
|
| 180 |
"""
|
| 181 |
Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
|
| 182 |
|
|
|
|
| 188 |
identifiers: List of GitHub usernames/bot identifiers
|
| 189 |
start_date: Start datetime (timezone-aware)
|
| 190 |
end_date: End datetime (timezone-aware)
|
| 191 |
+
batch_size: Number of agents per batch (default: 50)
|
| 192 |
+
upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
|
| 193 |
|
| 194 |
Returns:
|
| 195 |
Dictionary mapping agent identifier to list of issue metadata
|
| 196 |
"""
|
| 197 |
print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
|
| 198 |
print(f" Batch size: {batch_size} agents per query")
|
| 199 |
+
print(f" Upload mode: {'Immediate (after each batch)' if upload_immediately else 'Deferred (after all batches)'}")
|
| 200 |
|
| 201 |
# Split identifiers into batches
|
| 202 |
batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
|
|
|
|
| 224 |
|
| 225 |
print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
|
| 226 |
|
| 227 |
+
# Upload immediately after this batch if enabled
|
| 228 |
+
if upload_immediately and batch_results:
|
| 229 |
+
print(f"\n 🤗 Uploading batch {batch_num}/{len(batches)} results to HuggingFace...")
|
| 230 |
+
upload_success = 0
|
| 231 |
+
upload_errors = 0
|
| 232 |
+
|
| 233 |
+
for identifier, metadata_list in batch_results.items():
|
| 234 |
+
if metadata_list:
|
| 235 |
+
if save_issue_metadata_to_hf(metadata_list, identifier):
|
| 236 |
+
upload_success += 1
|
| 237 |
+
else:
|
| 238 |
+
upload_errors += 1
|
| 239 |
+
|
| 240 |
+
print(f" ✓ Batch {batch_num}/{len(batches)} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
|
| 241 |
+
|
| 242 |
except Exception as e:
|
| 243 |
print(f" ✗ Batch {batch_num} failed: {str(e)}")
|
| 244 |
print(f" Continuing with remaining batches...")
|
|
|
|
| 875 |
|
| 876 |
try:
|
| 877 |
# Use batched approach for better performance
|
| 878 |
+
# upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
|
| 879 |
all_metadata = fetch_issue_metadata_batched(
|
| 880 |
+
client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
|
| 881 |
)
|
| 882 |
+
|
| 883 |
+
# Calculate summary statistics
|
| 884 |
+
total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
|
| 885 |
+
agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
|
| 886 |
+
|
| 887 |
+
print(f"\n{'='*80}")
|
| 888 |
+
print(f"✅ BigQuery mining and upload complete!")
|
| 889 |
+
print(f" Total agents: {len(agents)}")
|
| 890 |
+
print(f" Agents with data: {agents_with_data}")
|
| 891 |
+
print(f" Total PRs found: {total_prs}")
|
| 892 |
+
print(f"{'='*80}\n")
|
| 893 |
+
|
| 894 |
except Exception as e:
|
| 895 |
print(f"✗ Error during BigQuery fetch: {str(e)}")
|
| 896 |
import traceback
|
| 897 |
traceback.print_exc()
|
| 898 |
return
|
| 899 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 900 |
# After mining is complete, save leaderboard and metrics to HuggingFace
|
| 901 |
print(f"📤 Uploading leaderboard and metrics data...")
|
| 902 |
if save_leaderboard_and_metrics_to_hf(all_metadata, agents):
|