zhimin-z committed on
Commit
f348397
·
1 Parent(s): 91984f1
Files changed (1) hide show
  1. msr.py +7 -19
msr.py CHANGED
@@ -316,26 +316,21 @@ def get_duckdb_connection():
316
  raise
317
 
318
  # CORE MEMORY & THREADING SETTINGS
319
- conn.execute(f"SET threads TO 8;")
320
  conn.execute(f"SET max_memory = '48GB';") # Hard cap
321
  conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
322
 
323
- # JSON STREAMING OPTIMIZATIONS (critical for performance)
324
- conn.execute("SET json.read_objects = true;") # Enable streaming JSON objects
325
- conn.execute("SET json.read_buffer_size = '64MB';") # Increase from 256KB default for large fields
326
- conn.execute("SET json.format = 'newline_delimited';") # Skip array parsing, double throughput
327
-
328
  # GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
 
 
329
  try:
330
  conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
331
- conn.execute("INSTALL 'gzip';")
332
- conn.execute("LOAD 'gzip';")
333
  except Exception as e:
334
- print(f" ⚠ Warning: Could not load gzip extension: {e}")
335
 
336
  # PERFORMANCE OPTIMIZATIONS
337
  conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
338
- conn.execute("SET default_order = 'ORDER BY NONE';") # Skip unnecessary sorting
339
  conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
340
 
341
  return conn
@@ -374,18 +369,11 @@ def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_LO
374
 
375
  def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
376
  """
377
- UNIFIED QUERY: Fetches ALL metadata types in ONE query per batch:
378
  - IssuesEvent, IssueCommentEvent (for assistant-assigned issues AND wanted issues)
379
  - PullRequestEvent (for wanted issue tracking)
380
  - DiscussionEvent (for discussion tracking)
381
 
382
- Then post-processes in Python to separate into:
383
- 1. Assistant-assigned issues: Issues where assistants are assigned to or commented on
384
- 2. Wanted issues: Long-standing issues from tracked orgs linked to merged PRs by assistants
385
- 3. Discussions: GitHub discussions created by assistants
386
-
387
- This approach is more efficient than running separate queries for each category.
388
-
389
  Args:
390
  conn: DuckDB connection instance
391
  identifiers: List of GitHub usernames/bot identifiers
@@ -425,7 +413,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
425
  current_date = start_date
426
  batch_num = 0
427
 
428
- print(f" Streaming {total_batches} batches with unified query...")
429
 
430
  while current_date <= end_date:
431
  batch_num += 1
 
316
  raise
317
 
318
  # CORE MEMORY & THREADING SETTINGS
319
+ conn.execute(f"SET threads TO 4;")
320
  conn.execute(f"SET max_memory = '48GB';") # Hard cap
321
  conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
322
 
 
 
 
 
 
323
  # GZIP PARALLEL DECOMPRESSION (only needed for .json.gz files)
324
+ # Note: Modern DuckDB versions have built-in gzip support via compression='gzip' parameter
325
+ # If extension is needed, it will be auto-installed when accessing .gz files
326
  try:
327
  conn.execute("SET extension_directory = '/tmp/duckdb_ext';")
328
+ # Auto-install will happen when reading gzip files - no need to pre-install
 
329
  except Exception as e:
330
+ print(f" ⚠ Warning: Could not set extension directory: {e}")
331
 
332
  # PERFORMANCE OPTIMIZATIONS
333
  conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
 
334
  conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
335
 
336
  return conn
 
369
 
370
  def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
371
  """
372
+ QUERY: Fetch both issue and discussion metadata using streaming batch processing:
373
  - IssuesEvent, IssueCommentEvent (for assistant-assigned issues AND wanted issues)
374
  - PullRequestEvent (for wanted issue tracking)
375
  - DiscussionEvent (for discussion tracking)
376
 
 
 
 
 
 
 
 
377
  Args:
378
  conn: DuckDB connection instance
379
  identifiers: List of GitHub usernames/bot identifiers
 
413
  current_date = start_date
414
  batch_num = 0
415
 
416
+ print(f" Streaming {total_batches} batches of {BATCH_SIZE_DAYS}-day intervals...")
417
 
418
  while current_date <= end_date:
419
  batch_num += 1