zhimin-z
commited on
Commit
·
f348397
1
Parent(s):
91984f1
fixing
Browse files
msr.py
CHANGED
|
@@ -316,26 +316,21 @@ def get_duckdb_connection():
|
|
| 316 |
raise
|
| 317 |
|
| 318 |
# CORE MEMORY & THREADING SETTINGS
|
| 319 |
-
conn.execute(f"SET threads TO
|
| 320 |
conn.execute(f"SET max_memory = '48GB';") # Hard cap
|
| 321 |
conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
|
| 322 |
|
| 323 |
-
# JSON STREAMING OPTIMIZATIONS (critical for performance)
|
| 324 |
-
conn.execute("SET json.read_objects = true;") # Enable streaming JSON objects
|
| 325 |
-
conn.execute("SET json.read_buffer_size = '64MB';") # Increase from 256KB default for large fields
|
| 326 |
-
conn.execute("SET json.format = 'newline_delimited';") # Skip array parsing, double throughput
|
| 327 |
-
|
| 328 |
# GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
|
|
|
|
|
|
|
| 329 |
try:
|
| 330 |
conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
|
| 331 |
-
|
| 332 |
-
conn.execute("LOAD 'gzip';")
|
| 333 |
except Exception as e:
|
| 334 |
-
print(f" ⚠ Warning: Could not
|
| 335 |
|
| 336 |
# PERFORMANCE OPTIMIZATIONS
|
| 337 |
conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
|
| 338 |
-
conn.execute("SET default_order = 'ORDER BY NONE';") # Skip unnecessary sorting
|
| 339 |
conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
|
| 340 |
|
| 341 |
return conn
|
|
@@ -374,18 +369,11 @@ def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_LO
|
|
| 374 |
|
| 375 |
def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
| 376 |
"""
|
| 377 |
-
|
| 378 |
- IssuesEvent, IssueCommentEvent (for assistant-assigned issues AND wanted issues)
|
| 379 |
- PullRequestEvent (for wanted issue tracking)
|
| 380 |
- DiscussionEvent (for discussion tracking)
|
| 381 |
|
| 382 |
-
Then post-processes in Python to separate into:
|
| 383 |
-
1. Assistant-assigned issues: Issues where assistants are assigned to or commented on
|
| 384 |
-
2. Wanted issues: Long-standing issues from tracked orgs linked to merged PRs by assistants
|
| 385 |
-
3. Discussions: GitHub discussions created by assistants
|
| 386 |
-
|
| 387 |
-
This approach is more efficient than running separate queries for each category.
|
| 388 |
-
|
| 389 |
Args:
|
| 390 |
conn: DuckDB connection instance
|
| 391 |
identifiers: List of GitHub usernames/bot identifiers
|
|
@@ -425,7 +413,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 425 |
current_date = start_date
|
| 426 |
batch_num = 0
|
| 427 |
|
| 428 |
-
print(f" Streaming {total_batches} batches
|
| 429 |
|
| 430 |
while current_date <= end_date:
|
| 431 |
batch_num += 1
|
|
|
|
| 316 |
raise
|
| 317 |
|
| 318 |
# CORE MEMORY & THREADING SETTINGS
|
| 319 |
+
conn.execute(f"SET threads TO 4;")
|
| 320 |
conn.execute(f"SET max_memory = '48GB';") # Hard cap
|
| 321 |
conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
|
| 322 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
# GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
|
| 324 |
+
# Note: Modern DuckDB versions have built-in gzip support via compression='gzip' parameter
|
| 325 |
+
# If extension is needed, it will be auto-installed when accessing .gz files
|
| 326 |
try:
|
| 327 |
conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
|
| 328 |
+
# Auto-install will happen when reading gzip files - no need to pre-install
|
|
|
|
| 329 |
except Exception as e:
|
| 330 |
+
print(f" ⚠ Warning: Could not set extension directory: {e}")
|
| 331 |
|
| 332 |
# PERFORMANCE OPTIMIZATIONS
|
| 333 |
conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
|
|
|
|
| 334 |
conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
|
| 335 |
|
| 336 |
return conn
|
|
|
|
| 369 |
|
| 370 |
def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
| 371 |
"""
|
| 372 |
+
QUERY: Fetch both issue and discussion metadata using streaming batch processing:
|
| 373 |
- IssuesEvent, IssueCommentEvent (for assistant-assigned issues AND wanted issues)
|
| 374 |
- PullRequestEvent (for wanted issue tracking)
|
| 375 |
- DiscussionEvent (for discussion tracking)
|
| 376 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
Args:
|
| 378 |
conn: DuckDB connection instance
|
| 379 |
identifiers: List of GitHub usernames/bot identifiers
|
|
|
|
| 413 |
current_date = start_date
|
| 414 |
batch_num = 0
|
| 415 |
|
| 416 |
+
print(f" Streaming {total_batches} batches of {BATCH_SIZE_DAYS}-day intervals...")
|
| 417 |
|
| 418 |
while current_date <= end_date:
|
| 419 |
batch_num += 1
|