refine
app.py
CHANGED
@@ -228,7 +228,7 @@ def generate_table_union_statements(start_date, end_date):
     return " UNION ALL ".join(union_parts)
 
 
-def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=
+def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
     """
     Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
 
@@ -240,7 +240,7 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
         identifiers: List of GitHub usernames/bot identifiers
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
-        batch_size: Number of agents per batch (default:
+        batch_size: Number of agents per batch (default: 100)
         upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
 
     Returns:
@@ -1065,7 +1065,6 @@ def load_agents_from_hf():
 
         # Only process agents with status == "public"
         if agent_data.get('status') != 'public':
-            print(f"Skipping {json_file}: status is not 'public'")
             continue
 
         # Extract github_identifier from filename (e.g., "agent[bot].json" -> "agent[bot]")
@@ -1345,7 +1344,7 @@ def mine_all_agents():
     # Use batched approach for better performance
     # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
     all_metadata = fetch_issue_metadata_batched(
-        client, identifiers, start_date, end_date, batch_size=
+        client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True
     )
 
     # Calculate summary statistics
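The body of fetch_issue_metadata_batched is not part of this diff, so the following is only a minimal sketch of how a batch_size/upload_immediately pair like the one added above could be wired up against the BigQuery client. The table name, column names, and the upload_batch callback are hypothetical stand-ins, not this repo's actual code.

from google.cloud import bigquery

def fetch_metadata_in_batches(client, identifiers, start_date, end_date,
                              batch_size=100, upload_batch=None):
    """Hypothetical sketch: run one BigQuery job per batch of identifiers."""
    all_metadata = []
    for i in range(0, len(identifiers), batch_size):
        batch = identifiers[i:i + batch_size]  # at most batch_size agents per query
        job = client.query(
            "SELECT * FROM `project.dataset.issue_events` "  # placeholder table
            "WHERE actor_login IN UNNEST(@agents) "
            "AND created_at BETWEEN @start AND @end",
            job_config=bigquery.QueryJobConfig(
                query_parameters=[
                    bigquery.ArrayQueryParameter("agents", "STRING", batch),
                    bigquery.ScalarQueryParameter("start", "TIMESTAMP", start_date),
                    bigquery.ScalarQueryParameter("end", "TIMESTAMP", end_date),
                ]
            ),
        )
        rows = [dict(row) for row in job.result()]  # wait for this batch to finish
        all_metadata.extend(rows)
        if upload_batch is not None:
            upload_batch(rows)  # e.g. push this batch to HuggingFace right away
    return all_metadata

A caller like mine_all_agents would then pass the same client, identifiers, and date range it already has, as the updated call site in the hunk above does.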
msr.py
CHANGED
@@ -184,7 +184,7 @@ def generate_table_union_statements(start_date, end_date):
 # BIGQUERY FUNCTIONS
 # =============================================================================
 
-def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=
+def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
     """
     Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
 
@@ -196,7 +196,7 @@ def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batc
         identifiers: List of GitHub usernames/bot identifiers
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
-        batch_size: Number of agents per batch (default:
+        batch_size: Number of agents per batch (default: 100)
         upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
 
     Returns:
@@ -885,7 +885,7 @@ def mine_all_agents():
     # Use batched approach for better performance
     # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
     all_metadata = fetch_issue_metadata_batched(
-        client, identifiers, start_date, end_date, batch_size=
+        client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True
     )
 
     # Calculate summary statistics
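The first hunk in each file sits inside generate_table_union_statements, whose body is also outside this diff apart from its closing return of " UNION ALL ".join(union_parts). A minimal sketch, assuming the function enumerates month-sharded GH Archive tables between the two dates; the githubarchive.month naming is an assumption, not something the diff confirms:

from datetime import datetime

def generate_table_union_statements(start_date: datetime, end_date: datetime) -> str:
    """Sketch: one SELECT per month shard, stitched together with UNION ALL."""
    union_parts = []
    year, month = start_date.year, start_date.month
    while (year, month) <= (end_date.year, end_date.month):
        # Assumed GH Archive-style month shards, e.g. githubarchive.month.202501
        union_parts.append(f"SELECT * FROM `githubarchive.month.{year}{month:02d}`")
        month += 1
        if month > 12:
            year, month = year + 1, 1
    return " UNION ALL ".join(union_parts)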