zhiminy committed on
Commit
cabb3ce
·
1 Parent(s): 728ac40
Files changed (2) hide show
  1. app.py +3 -4
  2. msr.py +3 -3
app.py CHANGED
@@ -228,7 +228,7 @@ def generate_table_union_statements(start_date, end_date):
228
  return " UNION ALL ".join(union_parts)
229
 
230
 
231
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
232
  """
233
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
234
 
@@ -240,7 +240,7 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
240
  identifiers: List of GitHub usernames/bot identifiers
241
  start_date: Start datetime (timezone-aware)
242
  end_date: End datetime (timezone-aware)
243
- batch_size: Number of agents per batch (default: 50)
244
  upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
245
 
246
  Returns:
@@ -1065,7 +1065,6 @@ def load_agents_from_hf():
1065
 
1066
  # Only process agents with status == "public"
1067
  if agent_data.get('status') != 'public':
1068
- print(f"Skipping {json_file}: status is not 'public'")
1069
  continue
1070
 
1071
  # Extract github_identifier from filename (e.g., "agent[bot].json" -> "agent[bot]")
@@ -1345,7 +1344,7 @@ def mine_all_agents():
1345
  # Use batched approach for better performance
1346
  # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
1347
  all_metadata = fetch_issue_metadata_batched(
1348
- client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
1349
  )
1350
 
1351
  # Calculate summary statistics
 
228
  return " UNION ALL ".join(union_parts)
229
 
230
 
231
+ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
232
  """
233
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
234
 
 
240
  identifiers: List of GitHub usernames/bot identifiers
241
  start_date: Start datetime (timezone-aware)
242
  end_date: End datetime (timezone-aware)
243
+ batch_size: Number of agents per batch (default: 100)
244
  upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
245
 
246
  Returns:
 
1065
 
1066
  # Only process agents with status == "public"
1067
  if agent_data.get('status') != 'public':
 
1068
  continue
1069
 
1070
  # Extract github_identifier from filename (e.g., "agent[bot].json" -> "agent[bot]")
 
1344
  # Use batched approach for better performance
1345
  # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
1346
  all_metadata = fetch_issue_metadata_batched(
1347
+ client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True
1348
  )
1349
 
1350
  # Calculate summary statistics
msr.py CHANGED
@@ -184,7 +184,7 @@ def generate_table_union_statements(start_date, end_date):
184
  # BIGQUERY FUNCTIONS
185
  # =============================================================================
186
 
187
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True):
188
  """
189
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
190
 
@@ -196,7 +196,7 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
196
  identifiers: List of GitHub usernames/bot identifiers
197
  start_date: Start datetime (timezone-aware)
198
  end_date: End datetime (timezone-aware)
199
- batch_size: Number of agents per batch (default: 50)
200
  upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
201
 
202
  Returns:
@@ -885,7 +885,7 @@ def mine_all_agents():
885
  # Use batched approach for better performance
886
  # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
887
  all_metadata = fetch_issue_metadata_batched(
888
- client, identifiers, start_date, end_date, batch_size=50, upload_immediately=True
889
  )
890
 
891
  # Calculate summary statistics
 
184
  # BIGQUERY FUNCTIONS
185
  # =============================================================================
186
 
187
+ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
188
  """
189
  Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
190
 
 
196
  identifiers: List of GitHub usernames/bot identifiers
197
  start_date: Start datetime (timezone-aware)
198
  end_date: End datetime (timezone-aware)
199
+ batch_size: Number of agents per batch (default: 100)
200
  upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
201
 
202
  Returns:
 
885
  # Use batched approach for better performance
886
  # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
887
  all_metadata = fetch_issue_metadata_batched(
888
+ client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True
889
  )
890
 
891
  # Calculate summary statistics