zhimin-z
committed on
Commit
·
4ad0589
1
Parent(s):
0522808
fix
Browse files
msr.py
CHANGED
|
@@ -320,15 +320,6 @@ def get_duckdb_connection():
|
|
| 320 |
conn.execute(f"SET max_memory = '50GB';")
|
| 321 |
conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
|
| 322 |
|
| 323 |
-
# GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
|
| 324 |
-
# Note: Modern DuckDB versions have built-in gzip support via compression='gzip' parameter
|
| 325 |
-
# If extension is needed, it will be auto-installed when accessing .gz files
|
| 326 |
-
try:
|
| 327 |
-
conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
|
| 328 |
-
# Auto-install will happen when reading gzip files - no need to pre-install
|
| 329 |
-
except Exception as e:
|
| 330 |
-
print(f" ⚠ Warning: Could not set extension directory: {e}")
|
| 331 |
-
|
| 332 |
# PERFORMANCE OPTIMIZATIONS
|
| 333 |
conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
|
| 334 |
conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
|
|
@@ -433,8 +424,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 433 |
file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
|
| 434 |
|
| 435 |
try:
|
| 436 |
-
# UNIFIED QUERY:
|
| 437 |
-
# Post-process in Python to separate into assistant-assigned issues, wanted issues, PRs, and discussions
|
| 438 |
unified_query = f"""
|
| 439 |
SELECT
|
| 440 |
type,
|
|
@@ -454,7 +444,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 454 |
json_extract_string(payload, '$.issue.assignee.login') as issue_assignee,
|
| 455 |
json_extract(payload, '$.issue.assignees') as issue_assignees,
|
| 456 |
json_extract_string(payload, '$.comment.user.login') as commenter,
|
| 457 |
-
-- PR fields
|
| 458 |
COALESCE(
|
| 459 |
json_extract_string(payload, '$.issue.html_url'),
|
| 460 |
json_extract_string(payload, '$.pull_request.html_url')
|
|
@@ -478,7 +468,15 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 478 |
json_extract_string(payload, '$.discussion.answer_chosen_at') as discussion_closed_at,
|
| 479 |
json_extract_string(payload, '$.discussion.state_reason') as discussion_state_reason,
|
| 480 |
json_extract_string(payload, '$.action') as action
|
| 481 |
-
FROM read_json(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
WHERE
|
| 483 |
type IN ('IssuesEvent', 'IssueCommentEvent', 'PullRequestEvent', 'DiscussionEvent')
|
| 484 |
AND (
|
|
|
|
| 320 |
conn.execute(f"SET max_memory = '50GB';")
|
| 321 |
conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
|
| 322 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
# PERFORMANCE OPTIMIZATIONS
|
| 324 |
conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
|
| 325 |
conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
|
|
|
|
| 424 |
file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
|
| 425 |
|
| 426 |
try:
|
| 427 |
+
# UNIFIED QUERY: Optimized for gzip decompression
|
|
|
|
| 428 |
unified_query = f"""
|
| 429 |
SELECT
|
| 430 |
type,
|
|
|
|
| 444 |
json_extract_string(payload, '$.issue.assignee.login') as issue_assignee,
|
| 445 |
json_extract(payload, '$.issue.assignees') as issue_assignees,
|
| 446 |
json_extract_string(payload, '$.comment.user.login') as commenter,
|
| 447 |
+
-- PR fields - simplified with COALESCE
|
| 448 |
COALESCE(
|
| 449 |
json_extract_string(payload, '$.issue.html_url'),
|
| 450 |
json_extract_string(payload, '$.pull_request.html_url')
|
|
|
|
| 468 |
json_extract_string(payload, '$.discussion.answer_chosen_at') as discussion_closed_at,
|
| 469 |
json_extract_string(payload, '$.discussion.state_reason') as discussion_state_reason,
|
| 470 |
json_extract_string(payload, '$.action') as action
|
| 471 |
+
FROM read_json(
|
| 472 |
+
{file_patterns_sql},
|
| 473 |
+
union_by_name=true,
|
| 474 |
+
filename=true,
|
| 475 |
+
compression='gzip',
|
| 476 |
+
format='newline_delimited',
|
| 477 |
+
ignore_errors=true,
|
| 478 |
+
strptime_format='%Y-%m-%dT%H:%M:%SZ'
|
| 479 |
+
)
|
| 480 |
WHERE
|
| 481 |
type IN ('IssuesEvent', 'IssueCommentEvent', 'PullRequestEvent', 'DiscussionEvent')
|
| 482 |
AND (
|