zhimin-z
committed on
Commit
·
6994ebb
1
Parent(s):
ae37963
refine
Browse files
msr.py
CHANGED
|
@@ -398,8 +398,9 @@ def fetch_all_issue_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 398 |
# Build file patterns SQL for THIS BATCH
|
| 399 |
file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
|
| 400 |
|
| 401 |
-
# Query for this batch - IssuesEvent
|
| 402 |
-
# Note:
|
|
|
|
| 403 |
query = f"""
|
| 404 |
WITH issue_events AS (
|
| 405 |
SELECT
|
|
@@ -408,37 +409,50 @@ def fetch_all_issue_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 408 |
'/issues/',
|
| 409 |
CAST(payload.issue.number AS VARCHAR)
|
| 410 |
) as url,
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
created_at as event_time,
|
| 417 |
payload.issue.created_at as issue_created_at,
|
| 418 |
payload.issue.closed_at as issue_closed_at,
|
| 419 |
payload.issue.state_reason as state_reason
|
| 420 |
FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
|
| 421 |
WHERE
|
| 422 |
-
type
|
| 423 |
AND payload.issue.number IS NOT NULL
|
| 424 |
AND payload.issue.pull_request IS NULL
|
| 425 |
AND (
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
)
|
| 432 |
),
|
| 433 |
issue_timeline AS (
|
| 434 |
SELECT
|
| 435 |
url,
|
| 436 |
-
|
| 437 |
MIN(issue_created_at) as created_at,
|
| 438 |
MAX(issue_closed_at) as closed_at,
|
| 439 |
MAX(state_reason) as state_reason
|
| 440 |
FROM issue_events
|
| 441 |
-
GROUP BY url,
|
| 442 |
)
|
| 443 |
SELECT url, agent_identifier, created_at, closed_at, state_reason
|
| 444 |
FROM issue_timeline
|
|
|
|
| 398 |
# Build file patterns SQL for THIS BATCH
|
| 399 |
file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
|
| 400 |
|
| 401 |
+
# Query for this batch - IssuesEvent (by author OR assignee) and IssueCommentEvent (by comment author)
|
| 402 |
+
# Note: For IssuesEvent, we check issue author, single assignee field, AND assignees array
|
| 403 |
+
# For IssueCommentEvent, we use the comment author
|
| 404 |
query = f"""
|
| 405 |
WITH issue_events AS (
|
| 406 |
SELECT
|
|
|
|
| 409 |
'/issues/',
|
| 410 |
CAST(payload.issue.number AS VARCHAR)
|
| 411 |
) as url,
|
| 412 |
+
CASE
|
| 413 |
+
WHEN type = 'IssuesEvent' THEN
|
| 414 |
+
COALESCE(
|
| 415 |
+
CASE WHEN payload.issue.user.login IN ({identifier_list}) THEN payload.issue.user.login END,
|
| 416 |
+
payload.issue.assignee.login,
|
| 417 |
+
(SELECT a.login
|
| 418 |
+
FROM (SELECT UNNEST(payload.issue.assignees) as a)
|
| 419 |
+
WHERE a.login IN ({identifier_list})
|
| 420 |
+
LIMIT 1)
|
| 421 |
+
)
|
| 422 |
+
WHEN type = 'IssueCommentEvent' THEN
|
| 423 |
+
payload.comment.user.login
|
| 424 |
+
ELSE NULL
|
| 425 |
+
END as agent_identifier,
|
| 426 |
created_at as event_time,
|
| 427 |
payload.issue.created_at as issue_created_at,
|
| 428 |
payload.issue.closed_at as issue_closed_at,
|
| 429 |
payload.issue.state_reason as state_reason
|
| 430 |
FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
|
| 431 |
WHERE
|
| 432 |
+
type IN ('IssuesEvent', 'IssueCommentEvent')
|
| 433 |
AND payload.issue.number IS NOT NULL
|
| 434 |
AND payload.issue.pull_request IS NULL
|
| 435 |
AND (
|
| 436 |
+
(type = 'IssuesEvent' AND (
|
| 437 |
+
payload.issue.user.login IN ({identifier_list})
|
| 438 |
+
OR payload.issue.assignee.login IN ({identifier_list})
|
| 439 |
+
OR EXISTS (
|
| 440 |
+
SELECT 1 FROM (SELECT UNNEST(payload.issue.assignees) as a)
|
| 441 |
+
WHERE a.login IN ({identifier_list})
|
| 442 |
+
)
|
| 443 |
+
))
|
| 444 |
+
OR (type = 'IssueCommentEvent' AND payload.comment.user.login IN ({identifier_list}))
|
| 445 |
)
|
| 446 |
),
|
| 447 |
issue_timeline AS (
|
| 448 |
SELECT
|
| 449 |
url,
|
| 450 |
+
agent_identifier,
|
| 451 |
MIN(issue_created_at) as created_at,
|
| 452 |
MAX(issue_closed_at) as closed_at,
|
| 453 |
MAX(state_reason) as state_reason
|
| 454 |
FROM issue_events
|
| 455 |
+
GROUP BY url, agent_identifier
|
| 456 |
)
|
| 457 |
SELECT url, agent_identifier, created_at, closed_at, state_reason
|
| 458 |
FROM issue_timeline
|