zhimin-z committed on
Commit
6994ebb
·
1 Parent(s): ae37963
Files changed (1) hide show
  1. msr.py +29 -15
msr.py CHANGED
@@ -398,8 +398,9 @@ def fetch_all_issue_metadata_streaming(conn, identifiers, start_date, end_date):
398
  # Build file patterns SQL for THIS BATCH
399
  file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
400
 
401
- # Query for this batch - IssuesEvent filtered by assignee
402
- # Note: We check BOTH single assignee field AND assignees array
 
403
  query = f"""
404
  WITH issue_events AS (
405
  SELECT
@@ -408,37 +409,50 @@ def fetch_all_issue_metadata_streaming(conn, identifiers, start_date, end_date):
408
  '/issues/',
409
  CAST(payload.issue.number AS VARCHAR)
410
  ) as url,
411
- COALESCE(payload.issue.assignee.login,
412
- (SELECT a.login
413
- FROM (SELECT UNNEST(payload.issue.assignees) as a)
414
- WHERE a.login IN ({identifier_list})
415
- LIMIT 1)) as assignee,
 
 
 
 
 
 
 
 
 
416
  created_at as event_time,
417
  payload.issue.created_at as issue_created_at,
418
  payload.issue.closed_at as issue_closed_at,
419
  payload.issue.state_reason as state_reason
420
  FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
421
  WHERE
422
- type = 'IssuesEvent'
423
  AND payload.issue.number IS NOT NULL
424
  AND payload.issue.pull_request IS NULL
425
  AND (
426
- payload.issue.assignee.login IN ({identifier_list})
427
- OR EXISTS (
428
- SELECT 1 FROM (SELECT UNNEST(payload.issue.assignees) as a)
429
- WHERE a.login IN ({identifier_list})
430
- )
 
 
 
 
431
  )
432
  ),
433
  issue_timeline AS (
434
  SELECT
435
  url,
436
- assignee as agent_identifier,
437
  MIN(issue_created_at) as created_at,
438
  MAX(issue_closed_at) as closed_at,
439
  MAX(state_reason) as state_reason
440
  FROM issue_events
441
- GROUP BY url, assignee
442
  )
443
  SELECT url, agent_identifier, created_at, closed_at, state_reason
444
  FROM issue_timeline
 
398
  # Build file patterns SQL for THIS BATCH
399
  file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
400
 
401
+ # Query for this batch - IssuesEvent (by author OR assignee) and IssueCommentEvent (by comment author)
402
+ # Note: For IssuesEvent, we check issue author, single assignee field, AND assignees array
403
+ # For IssueCommentEvent, we use the comment author
404
  query = f"""
405
  WITH issue_events AS (
406
  SELECT
 
409
  '/issues/',
410
  CAST(payload.issue.number AS VARCHAR)
411
  ) as url,
412
+ CASE
413
+ WHEN type = 'IssuesEvent' THEN
414
+ COALESCE(
415
+ CASE WHEN payload.issue.user.login IN ({identifier_list}) THEN payload.issue.user.login END,
416
+ payload.issue.assignee.login,
417
+ (SELECT a.login
418
+ FROM (SELECT UNNEST(payload.issue.assignees) as a)
419
+ WHERE a.login IN ({identifier_list})
420
+ LIMIT 1)
421
+ )
422
+ WHEN type = 'IssueCommentEvent' THEN
423
+ payload.comment.user.login
424
+ ELSE NULL
425
+ END as agent_identifier,
426
  created_at as event_time,
427
  payload.issue.created_at as issue_created_at,
428
  payload.issue.closed_at as issue_closed_at,
429
  payload.issue.state_reason as state_reason
430
  FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
431
  WHERE
432
+ type IN ('IssuesEvent', 'IssueCommentEvent')
433
  AND payload.issue.number IS NOT NULL
434
  AND payload.issue.pull_request IS NULL
435
  AND (
436
+ (type = 'IssuesEvent' AND (
437
+ payload.issue.user.login IN ({identifier_list})
438
+ OR payload.issue.assignee.login IN ({identifier_list})
439
+ OR EXISTS (
440
+ SELECT 1 FROM (SELECT UNNEST(payload.issue.assignees) as a)
441
+ WHERE a.login IN ({identifier_list})
442
+ )
443
+ ))
444
+ OR (type = 'IssueCommentEvent' AND payload.comment.user.login IN ({identifier_list}))
445
  )
446
  ),
447
  issue_timeline AS (
448
  SELECT
449
  url,
450
+ agent_identifier,
451
  MIN(issue_created_at) as created_at,
452
  MAX(issue_closed_at) as closed_at,
453
  MAX(state_reason) as state_reason
454
  FROM issue_events
455
+ GROUP BY url, agent_identifier
456
  )
457
  SELECT url, agent_identifier, created_at, closed_at, state_reason
458
  FROM issue_timeline