Spaces:

SWE-Arena
/

SWE-Issue

Running

App Files Community

zhimin-z committed on Dec 1, 2025

Commit

4ad0589

1 Parent(s): 0522808

fix

Browse files

Files changed (1) hide show

msr.py +11 -13

msr.py CHANGED Viewed

@@ -320,15 +320,6 @@ def get_duckdb_connection():
     conn.execute(f"SET max_memory = '50GB';")
     conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
-    # GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
-    # Note: Modern DuckDB versions have built-in gzip support via compression='gzip' parameter
-    # If extension is needed, it will be auto-installed when accessing .gz files
-    try:
-        conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
-        # Auto-install will happen when reading gzip files - no need to pre-install
-    except Exception as e:
-        print(f"   ⚠ Warning: Could not set extension directory: {e}")
     # PERFORMANCE OPTIMIZATIONS
     conn.execute("SET preserve_insertion_order = false;")  # Disable expensive ordering
     conn.execute("SET enable_object_cache = true;")  # Cache repeatedly read files
@@ -433,8 +424,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
         file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
         try:
-            # UNIFIED QUERY: Fetch ALL event types in ONE query
-            # Post-process in Python to separate into assistant-assigned issues, wanted issues, PRs, and discussions
             unified_query = f"""
             SELECT
                 type,
@@ -454,7 +444,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
                 json_extract_string(payload, '$.issue.assignee.login') as issue_assignee,
                 json_extract(payload, '$.issue.assignees') as issue_assignees,
                 json_extract_string(payload, '$.comment.user.login') as commenter,
-                -- PR fields
                 COALESCE(
                     json_extract_string(payload, '$.issue.html_url'),
                     json_extract_string(payload, '$.pull_request.html_url')
@@ -478,7 +468,15 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
                 json_extract_string(payload, '$.discussion.answer_chosen_at') as discussion_closed_at,
                 json_extract_string(payload, '$.discussion.state_reason') as discussion_state_reason,
                 json_extract_string(payload, '$.action') as action
-            FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
             WHERE
                 type IN ('IssuesEvent', 'IssueCommentEvent', 'PullRequestEvent', 'DiscussionEvent')
                 AND (

     conn.execute(f"SET max_memory = '50GB';")
     conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
     # PERFORMANCE OPTIMIZATIONS
     conn.execute("SET preserve_insertion_order = false;")  # Disable expensive ordering
     conn.execute("SET enable_object_cache = true;")  # Cache repeatedly read files
         file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
         try:
+            # UNIFIED QUERY: Optimized for gzip decompression
             unified_query = f"""
             SELECT
                 type,
                 json_extract_string(payload, '$.issue.assignee.login') as issue_assignee,
                 json_extract(payload, '$.issue.assignees') as issue_assignees,
                 json_extract_string(payload, '$.comment.user.login') as commenter,
+                -- PR fields - simplified with COALESCE
                 COALESCE(
                     json_extract_string(payload, '$.issue.html_url'),
                     json_extract_string(payload, '$.pull_request.html_url')
                 json_extract_string(payload, '$.discussion.answer_chosen_at') as discussion_closed_at,
                 json_extract_string(payload, '$.discussion.state_reason') as discussion_state_reason,
                 json_extract_string(payload, '$.action') as action
+            FROM read_json(
+                {file_patterns_sql},
+                union_by_name=true,
+                filename=true,
+                compression='gzip',
+                format='newline_delimited',
+                ignore_errors=true,
+                strptime_format='%Y-%m-%dT%H:%M:%SZ'
+            )
             WHERE
                 type IN ('IssuesEvent', 'IssueCommentEvent', 'PullRequestEvent', 'DiscussionEvent')
                 AND (