zhimin-z committed on
Commit
f348397
·
1 Parent(s): 91984f1
Files changed (1) hide show
  1. msr.py +7 -19
msr.py CHANGED
@@ -316,26 +316,21 @@ def get_duckdb_connection():
316
  raise
317
 
318
  # CORE MEMORY & THREADING SETTINGS
319
- conn.execute(f"SET threads TO 8;")
320
  conn.execute(f"SET max_memory = '48GB';") # Hard cap
321
  conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
322
 
323
- # JSON STREAMING OPTIMIZATIONS (critical for performance)
324
- conn.execute("SET json.read_objects = true;") # Enable streaming JSON objects
325
- conn.execute("SET json.read_buffer_size = '64MB';") # Increase from 256KB default for large fields
326
- conn.execute("SET json.format = 'newline_delimited';") # Skip array parsing, double throughput
327
-
328
  # GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
 
 
329
  try:
330
  conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
331
- conn.execute("INSTALL 'gzip';")
332
- conn.execute("LOAD 'gzip';")
333
  except Exception as e:
334
- print(f" ⚠ Warning: Could not load gzip extension: {e}")
335
 
336
  # PERFORMANCE OPTIMIZATIONS
337
  conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
338
- conn.execute("SET default_order = 'ORDER BY NONE';") # Skip unnecessary sorting
339
  conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
340
 
341
  return conn
@@ -374,18 +369,11 @@ def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_LO
374
 
375
  def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
376
  """
377
- UNIFIED QUERY: Fetches ALL metadata types in ONE query per batch:
378
  - IssuesEvent, IssueCommentEvent (for assistant-assigned issues AND wanted issues)
379
  - PullRequestEvent (for wanted issue tracking)
380
  - DiscussionEvent (for discussion tracking)
381
 
382
- Then post-processes in Python to separate into:
383
- 1. Assistant-assigned issues: Issues where assistants are assigned to or commented on
384
- 2. Wanted issues: Long-standing issues from tracked orgs linked to merged PRs by assistants
385
- 3. Discussions: GitHub discussions created by assistants
386
-
387
- This approach is more efficient than running separate queries for each category.
388
-
389
  Args:
390
  conn: DuckDB connection instance
391
  identifiers: List of GitHub usernames/bot identifiers
@@ -425,7 +413,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
425
  current_date = start_date
426
  batch_num = 0
427
 
428
- print(f" Streaming {total_batches} batches with unified query...")
429
 
430
  while current_date <= end_date:
431
  batch_num += 1
 
316
  raise
317
 
318
  # CORE MEMORY & THREADING SETTINGS
319
+ conn.execute(f"SET threads TO 4;")
320
  conn.execute(f"SET max_memory = '48GB';") # Hard cap
321
  conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
322
 
 
 
 
 
 
323
  # GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
324
+ # Note: Modern DuckDB versions have built-in gzip support via compression='gzip' parameter
325
+ # If extension is needed, it will be auto-installed when accessing .gz files
326
  try:
327
  conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
328
+ # Auto-install will happen when reading gzip files - no need to pre-install
 
329
  except Exception as e:
330
+ print(f" ⚠ Warning: Could not set extension directory: {e}")
331
 
332
  # PERFORMANCE OPTIMIZATIONS
333
  conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
 
334
  conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
335
 
336
  return conn
 
369
 
370
  def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
371
  """
372
+ QUERY: Fetch both issue and discussion metadata using streaming batch processing:
373
  - IssuesEvent, IssueCommentEvent (for assistant-assigned issues AND wanted issues)
374
  - PullRequestEvent (for wanted issue tracking)
375
  - DiscussionEvent (for discussion tracking)
376
 
 
 
 
 
 
 
 
377
  Args:
378
  conn: DuckDB connection instance
379
  identifiers: List of GitHub usernames/bot identifiers
 
413
  current_date = start_date
414
  batch_num = 0
415
 
416
+ print(f" Streaming {total_batches} batches of {BATCH_SIZE_DAYS}-day intervals...")
417
 
418
  while current_date <= end_date:
419
  batch_num += 1