zhimin-z committed on
Commit
4ad0589
·
1 Parent(s): 0522808
Files changed (1) hide show
  1. msr.py +11 -13
msr.py CHANGED
@@ -320,15 +320,6 @@ def get_duckdb_connection():
320
  conn.execute(f"SET max_memory = '50GB';")
321
  conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
322
 
323
- # GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
324
- # Note: Modern DuckDB versions have built-in gzip support via compression='gzip' parameter
325
- # If extension is needed, it will be auto-installed when accessing .gz files
326
- try:
327
- conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
328
- # Auto-install will happen when reading gzip files - no need to pre-install
329
- except Exception as e:
330
- print(f" ⚠ Warning: Could not set extension directory: {e}")
331
-
332
  # PERFORMANCE OPTIMIZATIONS
333
  conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
334
  conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
@@ -433,8 +424,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
433
  file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
434
 
435
  try:
436
- # UNIFIED QUERY: Fetch ALL event types in ONE query
437
- # Post-process in Python to separate into assistant-assigned issues, wanted issues, PRs, and discussions
438
  unified_query = f"""
439
  SELECT
440
  type,
@@ -454,7 +444,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
454
  json_extract_string(payload, '$.issue.assignee.login') as issue_assignee,
455
  json_extract(payload, '$.issue.assignees') as issue_assignees,
456
  json_extract_string(payload, '$.comment.user.login') as commenter,
457
- -- PR fields
458
  COALESCE(
459
  json_extract_string(payload, '$.issue.html_url'),
460
  json_extract_string(payload, '$.pull_request.html_url')
@@ -478,7 +468,15 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
478
  json_extract_string(payload, '$.discussion.answer_chosen_at') as discussion_closed_at,
479
  json_extract_string(payload, '$.discussion.state_reason') as discussion_state_reason,
480
  json_extract_string(payload, '$.action') as action
481
- FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
 
 
 
 
 
 
 
 
482
  WHERE
483
  type IN ('IssuesEvent', 'IssueCommentEvent', 'PullRequestEvent', 'DiscussionEvent')
484
  AND (
 
320
  conn.execute(f"SET max_memory = '50GB';")
321
  conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
322
 
 
 
 
 
 
 
 
 
 
323
  # PERFORMANCE OPTIMIZATIONS
324
  conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
325
  conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
 
424
  file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
425
 
426
  try:
427
+ # UNIFIED QUERY: Optimized for gzip decompression
 
428
  unified_query = f"""
429
  SELECT
430
  type,
 
444
  json_extract_string(payload, '$.issue.assignee.login') as issue_assignee,
445
  json_extract(payload, '$.issue.assignees') as issue_assignees,
446
  json_extract_string(payload, '$.comment.user.login') as commenter,
447
+ -- PR fields - simplified with COALESCE
448
  COALESCE(
449
  json_extract_string(payload, '$.issue.html_url'),
450
  json_extract_string(payload, '$.pull_request.html_url')
 
468
  json_extract_string(payload, '$.discussion.answer_chosen_at') as discussion_closed_at,
469
  json_extract_string(payload, '$.discussion.state_reason') as discussion_state_reason,
470
  json_extract_string(payload, '$.action') as action
471
+ FROM read_json(
472
+ {file_patterns_sql},
473
+ union_by_name=true,
474
+ filename=true,
475
+ compression='gzip',
476
+ format='newline_delimited',
477
+ ignore_errors=true,
478
+ strptime_format='%Y-%m-%dT%H:%M:%SZ'
479
+ )
480
  WHERE
481
  type IN ('IssuesEvent', 'IssueCommentEvent', 'PullRequestEvent', 'DiscussionEvent')
482
  AND (