zhiminy committed on
Commit
725070c
·
1 Parent(s): ee9e618

refine workflow

Browse files
Files changed (2) hide show
  1. app.py +32 -55
  2. msr.py +33 -56
app.py CHANGED
@@ -220,7 +220,7 @@ def generate_table_union_statements(start_date, end_date):
220
  return " UNION ALL ".join(union_parts)
221
 
222
 
223
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
224
  """
225
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
226
 
@@ -233,12 +233,14 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
233
  start_date: Start datetime (timezone-aware)
234
  end_date: End datetime (timezone-aware)
235
  batch_size: Number of agents per batch (default: 50)
 
236
 
237
  Returns:
238
  Dictionary mapping agent identifier to list of issue metadata
239
  """
240
  print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
241
  print(f" Batch size: {batch_size} agents per query")
 
242
 
243
  # Split identifiers into batches
244
  batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
@@ -266,6 +268,21 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
266
 
267
  print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  except Exception as e:
270
  print(f" ✗ Batch {batch_num} failed: {str(e)}")
271
  print(f" Continuing with remaining batches...")
@@ -1318,68 +1335,28 @@ def mine_all_agents():
1318
 
1319
  try:
1320
  # Use batched approach for better performance
 
1321
  all_metadata = fetch_issue_metadata_batched(
1322
- client, identifiers, start_date, end_date, batch_size=50
1323
  )
 
 
 
 
 
 
 
 
 
 
 
 
1324
  except Exception as e:
1325
  print(f"✗ Error during BigQuery fetch: {str(e)}")
1326
  import traceback
1327
  traceback.print_exc()
1328
  return
1329
 
1330
- # Save results for each agent
1331
- print(f"\n{'='*80}")
1332
- print(f"💾 Saving results to HuggingFace for each agent...")
1333
- print(f"{'='*80}\n")
1334
-
1335
- success_count = 0
1336
- error_count = 0
1337
- no_data_count = 0
1338
-
1339
- for i, agent in enumerate(agents, 1):
1340
- identifier = agent.get('github_identifier')
1341
- agent_name = agent.get('name', 'Unknown')
1342
-
1343
- if not identifier:
1344
- print(f"[{i}/{len(agents)}] Skipping agent without identifier")
1345
- error_count += 1
1346
- continue
1347
-
1348
- metadata = all_metadata.get(identifier, [])
1349
-
1350
- print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
1351
-
1352
- try:
1353
- if metadata:
1354
- print(f" 💾 Saving {len(metadata)} issue records...")
1355
- if save_issue_metadata_to_hf(metadata, identifier):
1356
- success_count += 1
1357
- else:
1358
- error_count += 1
1359
- else:
1360
- print(f" No issues found")
1361
- no_data_count += 1
1362
-
1363
- except Exception as e:
1364
- print(f" ✗ Error saving {identifier}: {str(e)}")
1365
- import traceback
1366
- traceback.print_exc()
1367
- error_count += 1
1368
- continue
1369
-
1370
- # Calculate number of batches executed
1371
- batch_size = 50
1372
- num_batches = (len(identifiers) + batch_size - 1) // batch_size
1373
-
1374
- print(f"\n{'='*80}")
1375
- print(f"✅ Mining complete!")
1376
- print(f" Total agents: {len(agents)}")
1377
- print(f" Successfully saved: {success_count}")
1378
- print(f" No data (skipped): {no_data_count}")
1379
- print(f" Errors: {error_count}")
1380
- print(f" BigQuery batches executed: {num_batches} (batch size: {batch_size})")
1381
- print(f"{'='*80}\n")
1382
-
1383
  # After mining is complete, save leaderboard and metrics to HuggingFace
1384
  print(f"📤 Uploading leaderboard and metrics data...")
1385
  if save_leaderboard_and_metrics_to_hf():
 
220
  return " UNION ALL ".join(union_parts)
221
 
222
 
223
+ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
224
  """
225
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
226
 
 
233
  start_date: Start datetime (timezone-aware)
234
  end_date: End datetime (timezone-aware)
235
  batch_size: Number of agents per batch (default: 50)
236
+ upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
237
 
238
  Returns:
239
  Dictionary mapping agent identifier to list of issue metadata
240
  """
241
  print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
242
  print(f" Batch size: {batch_size} agents per query")
243
+ print(f" Upload mode: {'Immediate (after each batch)' if upload_immediately else 'Deferred (after all batches)'}")
244
 
245
  # Split identifiers into batches
246
  batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
 
268
 
269
  print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
270
 
271
+ # Upload immediately after this batch if enabled
272
+ if upload_immediately and batch_results:
273
+ print(f"\n 🤗 Uploading batch {batch_num}/{len(batches)} results to HuggingFace...")
274
+ upload_success = 0
275
+ upload_errors = 0
276
+
277
+ for identifier, metadata_list in batch_results.items():
278
+ if metadata_list:
279
+ if save_issue_metadata_to_hf(metadata_list, identifier):
280
+ upload_success += 1
281
+ else:
282
+ upload_errors += 1
283
+
284
+ print(f" ✓ Batch {batch_num}/{len(batches)} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
285
+
286
  except Exception as e:
287
  print(f" ✗ Batch {batch_num} failed: {str(e)}")
288
  print(f" Continuing with remaining batches...")
 
1335
 
1336
  try:
1337
  # Use batched approach for better performance
1338
+ # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
1339
  all_metadata = fetch_issue_metadata_batched(
1340
+ client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
1341
  )
1342
+
1343
+ # Calculate summary statistics
1344
+ total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
1345
+ agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
1346
+
1347
+ print(f"\n{'='*80}")
1348
+ print(f"✅ BigQuery mining and upload complete!")
1349
+ print(f" Total agents: {len(agents)}")
1350
+ print(f" Agents with data: {agents_with_data}")
1351
+ print(f" Total PRs found: {total_prs}")
1352
+ print(f"{'='*80}\n")
1353
+
1354
  except Exception as e:
1355
  print(f"✗ Error during BigQuery fetch: {str(e)}")
1356
  import traceback
1357
  traceback.print_exc()
1358
  return
1359
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1360
  # After mining is complete, save leaderboard and metrics to HuggingFace
1361
  print(f"📤 Uploading leaderboard and metrics data...")
1362
  if save_leaderboard_and_metrics_to_hf():
msr.py CHANGED
@@ -176,7 +176,7 @@ def generate_table_union_statements(start_date, end_date):
176
  # BIGQUERY FUNCTIONS
177
  # =============================================================================
178
 
179
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50):
180
  """
181
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
182
 
@@ -188,13 +188,15 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
188
  identifiers: List of GitHub usernames/bot identifiers
189
  start_date: Start datetime (timezone-aware)
190
  end_date: End datetime (timezone-aware)
191
- batch_size: Number of agents per batch (default: 100)
 
192
 
193
  Returns:
194
  Dictionary mapping agent identifier to list of issue metadata
195
  """
196
  print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
197
  print(f" Batch size: {batch_size} agents per query")
 
198
 
199
  # Split identifiers into batches
200
  batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
@@ -222,6 +224,21 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
222
 
223
  print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  except Exception as e:
226
  print(f" ✗ Batch {batch_num} failed: {str(e)}")
227
  print(f" Continuing with remaining batches...")
@@ -858,68 +875,28 @@ def mine_all_agents():
858
 
859
  try:
860
  # Use batched approach for better performance
 
861
  all_metadata = fetch_issue_metadata_batched(
862
- client, identifiers, start_date, end_date, batch_size=50
863
  )
 
 
 
 
 
 
 
 
 
 
 
 
864
  except Exception as e:
865
  print(f"✗ Error during BigQuery fetch: {str(e)}")
866
  import traceback
867
  traceback.print_exc()
868
  return
869
 
870
- # Save results for each agent
871
- print(f"\n{'='*80}")
872
- print(f"💾 Saving results to HuggingFace for each agent...")
873
- print(f"{'='*80}\n")
874
-
875
- success_count = 0
876
- error_count = 0
877
- no_data_count = 0
878
-
879
- for i, agent in enumerate(agents, 1):
880
- identifier = agent.get('github_identifier')
881
- agent_name = agent.get('name', 'Unknown')
882
-
883
- if not identifier:
884
- print(f"[{i}/{len(agents)}] Skipping agent without identifier")
885
- error_count += 1
886
- continue
887
-
888
- metadata = all_metadata.get(identifier, [])
889
-
890
- print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")
891
-
892
- try:
893
- if metadata:
894
- print(f" 💾 Saving {len(metadata)} issue records...")
895
- if save_issue_metadata_to_hf(metadata, identifier):
896
- success_count += 1
897
- else:
898
- error_count += 1
899
- else:
900
- print(f" No issues found")
901
- no_data_count += 1
902
-
903
- except Exception as e:
904
- print(f" ✗ Error saving {identifier}: {str(e)}")
905
- import traceback
906
- traceback.print_exc()
907
- error_count += 1
908
- continue
909
-
910
- # Calculate number of batches executed
911
- batch_size = 50
912
- num_batches = (len(identifiers) + batch_size - 1) // batch_size
913
-
914
- print(f"\n{'='*80}")
915
- print(f"✅ Mining complete!")
916
- print(f" Total agents: {len(agents)}")
917
- print(f" Successfully saved: {success_count}")
918
- print(f" No data (skipped): {no_data_count}")
919
- print(f" Errors: {error_count}")
920
- print(f" BigQuery batches executed: {num_batches} (batch size: {batch_size})")
921
- print(f"{'='*80}\n")
922
-
923
  # After mining is complete, save leaderboard and metrics to HuggingFace
924
  print(f"📤 Uploading leaderboard and metrics data...")
925
  if save_leaderboard_and_metrics_to_hf(all_metadata, agents):
 
176
  # BIGQUERY FUNCTIONS
177
  # =============================================================================
178
 
179
+ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
180
  """
181
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
182
 
 
188
  identifiers: List of GitHub usernames/bot identifiers
189
  start_date: Start datetime (timezone-aware)
190
  end_date: End datetime (timezone-aware)
191
+ batch_size: Number of agents per batch (default: 50)
192
+ upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
193
 
194
  Returns:
195
  Dictionary mapping agent identifier to list of issue metadata
196
  """
197
  print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
198
  print(f" Batch size: {batch_size} agents per query")
199
+ print(f" Upload mode: {'Immediate (after each batch)' if upload_immediately else 'Deferred (after all batches)'}")
200
 
201
  # Split identifiers into batches
202
  batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
 
224
 
225
  print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
226
 
227
+ # Upload immediately after this batch if enabled
228
+ if upload_immediately and batch_results:
229
+ print(f"\n 🤗 Uploading batch {batch_num}/{len(batches)} results to HuggingFace...")
230
+ upload_success = 0
231
+ upload_errors = 0
232
+
233
+ for identifier, metadata_list in batch_results.items():
234
+ if metadata_list:
235
+ if save_issue_metadata_to_hf(metadata_list, identifier):
236
+ upload_success += 1
237
+ else:
238
+ upload_errors += 1
239
+
240
+ print(f" ✓ Batch {batch_num}/{len(batches)} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
241
+
242
  except Exception as e:
243
  print(f" ✗ Batch {batch_num} failed: {str(e)}")
244
  print(f" Continuing with remaining batches...")
 
875
 
876
  try:
877
  # Use batched approach for better performance
878
+ # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
879
  all_metadata = fetch_issue_metadata_batched(
880
+ client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
881
  )
882
+
883
+ # Calculate summary statistics
884
+ total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
885
+ agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
886
+
887
+ print(f"\n{'='*80}")
888
+ print(f"✅ BigQuery mining and upload complete!")
889
+ print(f" Total agents: {len(agents)}")
890
+ print(f" Agents with data: {agents_with_data}")
891
+ print(f" Total PRs found: {total_prs}")
892
+ print(f"{'='*80}\n")
893
+
894
  except Exception as e:
895
  print(f"✗ Error during BigQuery fetch: {str(e)}")
896
  import traceback
897
  traceback.print_exc()
898
  return
899
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
900
  # After mining is complete, save leaderboard and metrics to HuggingFace
901
  print(f"📤 Uploading leaderboard and metrics data...")
902
  if save_leaderboard_and_metrics_to_hf(all_metadata, agents):