zhimin-z commited on
Commit
76e8b98
·
1 Parent(s): 3da62f9
Files changed (1) hide show
  1. msr.py +57 -10
msr.py CHANGED
@@ -283,8 +283,23 @@ def get_duckdb_connection():
283
  """
284
  Initialize DuckDB connection with OPTIMIZED memory settings.
285
  Uses persistent database and reduced memory footprint.
 
286
  """
287
- conn = duckdb.connect(DUCKDB_CACHE_FILE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
  # OPTIMIZED SETTINGS
290
  conn.execute(f"SET threads TO {DUCKDB_THREADS};")
@@ -383,16 +398,14 @@ def fetch_all_issue_metadata_streaming(conn, identifiers, start_date, end_date):
383
  # Build file patterns SQL for THIS BATCH
384
  file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
385
 
386
- # Query for this batch - IssuesEvent filtered by assignee only
387
  query = f"""
388
- WITH issue_events AS (
389
  SELECT
390
- CONCAT(
391
- REPLACE(repo.url, 'api.github.com/repos/', 'github.com/'),
392
- '/issues/',
393
- CAST(payload.issue.number AS VARCHAR)
394
- ) as url,
395
- payload.issue.assignee.login as assignee,
396
  created_at as event_time,
397
  payload.issue.created_at as issue_created_at,
398
  payload.issue.closed_at as issue_closed_at,
@@ -402,7 +415,41 @@ def fetch_all_issue_metadata_streaming(conn, identifiers, start_date, end_date):
402
  type = 'IssuesEvent'
403
  AND payload.issue.number IS NOT NULL
404
  AND payload.issue.pull_request IS NULL
405
- AND payload.issue.assignee.login IN ({identifier_list})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  ),
407
  issue_timeline AS (
408
  SELECT
 
283
  """
284
  Initialize DuckDB connection with OPTIMIZED memory settings.
285
  Uses persistent database and reduced memory footprint.
286
+ Automatically removes cache file if lock conflict is detected.
287
  """
288
+ try:
289
+ conn = duckdb.connect(DUCKDB_CACHE_FILE)
290
+ except Exception as e:
291
+ # Check if it's a locking error
292
+ error_msg = str(e)
293
+ if "lock" in error_msg.lower() or "conflicting" in error_msg.lower():
294
+ print(f" ⚠ Lock conflict detected, removing {DUCKDB_CACHE_FILE}...")
295
+ if os.path.exists(DUCKDB_CACHE_FILE):
296
+ os.remove(DUCKDB_CACHE_FILE)
297
+ print(f" ✓ Cache file removed, retrying connection...")
298
+ # Retry connection after removing cache
299
+ conn = duckdb.connect(DUCKDB_CACHE_FILE)
300
+ else:
301
+ # Re-raise if it's not a locking error
302
+ raise
303
 
304
  # OPTIMIZED SETTINGS
305
  conn.execute(f"SET threads TO {DUCKDB_THREADS};")
 
398
  # Build file patterns SQL for THIS BATCH
399
  file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
400
 
401
+ # Query for this batch - IssuesEvent filtered by assignee (singular) and assignees (array)
402
  query = f"""
403
+ WITH raw_events AS (
404
  SELECT
405
+ repo.url,
406
+ payload.issue.number,
407
+ payload.issue.assignee.login as single_assignee,
408
+ payload.issue.assignees as assignees_array,
 
 
409
  created_at as event_time,
410
  payload.issue.created_at as issue_created_at,
411
  payload.issue.closed_at as issue_closed_at,
 
415
  type = 'IssuesEvent'
416
  AND payload.issue.number IS NOT NULL
417
  AND payload.issue.pull_request IS NULL
418
+ ),
419
+ -- Extract assignees from both single assignee field and assignees array
420
+ issue_events AS (
421
+ -- Get from single assignee field
422
+ SELECT
423
+ CONCAT(
424
+ REPLACE(url, 'api.github.com/repos/', 'github.com/'),
425
+ '/issues/',
426
+ CAST(number AS VARCHAR)
427
+ ) as url,
428
+ single_assignee as assignee,
429
+ event_time,
430
+ issue_created_at,
431
+ issue_closed_at,
432
+ state_reason
433
+ FROM raw_events
434
+ WHERE single_assignee IN ({identifier_list})
435
+
436
+ UNION ALL
437
+
438
+ -- Get from assignees array
439
+ SELECT
440
+ CONCAT(
441
+ REPLACE(url, 'api.github.com/repos/', 'github.com/'),
442
+ '/issues/',
443
+ CAST(number AS VARCHAR)
444
+ ) as url,
445
+ assignee_data.login as assignee,
446
+ event_time,
447
+ issue_created_at,
448
+ issue_closed_at,
449
+ state_reason
450
+ FROM raw_events,
451
+ UNNEST(assignees_array) as assignee_data
452
+ WHERE assignee_data.login IN ({identifier_list})
453
  ),
454
  issue_timeline AS (
455
  SELECT