Spaces:

jonathanagustin
/

lawforge-data-api

Running

App Files Community

jonathanagustin committed on Dec 23, 2025

Commit

15bf590

verified ·

1 Parent(s): d8953ff

fix: query ALL parquet shards, not just shard 0

Browse files

Files changed (1) hide show

app.py +128 -212

app.py CHANGED Viewed

@@ -1,23 +1,23 @@
 """LawForge Data API - HuggingFace Space
 FastAPI service to query CourtListener parquet data directly.
-Bypasses datasets-server limitations for private datasets.
 """
 import os
-from functools import lru_cache
-from typing import Optional
 import duckdb
 from fastapi import FastAPI, HTTPException, Query
 from fastapi.middleware.cors import CORSMiddleware
 from huggingface_hub import hf_hub_download
-import pandas as pd
 app = FastAPI(
     title="LawForge Data API",
     description="Query CourtListener legal data",
-    version="1.0.0"
 )
 app.add_middleware(
@@ -31,50 +31,87 @@ app.add_middleware(
 # Configuration
 DATASET_ID = "jonathanagustin/courtlistener-1"
 HF_TOKEN = os.environ.get("HF_TOKEN")
-# Cache for DuckDB connections
-_db_cache = {}
-def get_parquet_path(config: str, shard: int = 0) -> str:
-    """Download and cache parquet file, return local path."""
-    cache_key = f"{config}_{shard}"
-    if cache_key not in _db_cache:
-        filename = f"data/{config}/{config}-{shard:05d}.parquet"
-        print(f"Downloading: {filename}")
         try:
             local_path = hf_hub_download(
                 repo_id=DATASET_ID,
                 filename=filename,
                 repo_type="dataset",
-                token=HF_TOKEN
             )
-            print(f"Downloaded to: {local_path}")
-            _db_cache[cache_key] = local_path
         except Exception as e:
             print(f"Error downloading {filename}: {e}")
-            raise HTTPException(status_code=404, detail=f"Parquet file not found: {filename}. Error: {str(e)}")
-    return _db_cache[cache_key]
-def query_parquet(config: str, sql: str, params: dict = None) -> list:
-    """Execute SQL query on parquet file."""
-    path = get_parquet_path(config)
     try:
         conn = duckdb.connect(":memory:")
-        conn.execute(f"CREATE VIEW data AS SELECT * FROM read_parquet('{path}')")
-        if params:
-            result = conn.execute(sql, params).fetchdf()
         else:
-            result = conn.execute(sql).fetchdf()
         conn.close()
-        # Convert to JSON-safe format
-        import json
-        import numpy as np
         def clean_value(v):
             if v is None:
                 return None
@@ -85,224 +122,103 @@ def query_parquet(config: str, sql: str, params: dict = None) -> list:
             if isinstance(v, (np.floating, np.float64)):
                 return float(v)
             return v
-        records = []
-        for _, row in result.iterrows():
-            record = {k: clean_value(v) for k, v in row.items()}
-            records.append(record)
-        return records
     except Exception as e:
-        import traceback
-        raise HTTPException(status_code=500, detail=f"Query error: {str(e)}. Traceback: {traceback.format_exc()}")
 @app.get("/")
 def root():
     return {
         "name": "LawForge Data API",
-        "version": "1.0.0",
         "endpoints": {
             "/health": "Health check",
-            "/rows/{config}": "Get rows from a config",
             "/search/{config}": "Full-text search",
             "/filter/{config}": "SQL WHERE filter",
         }
     }
 @app.get("/health")
 def health():
-    token_status = "set" if HF_TOKEN else "not set"
-    token_len = len(HF_TOKEN) if HF_TOKEN else 0
-    return {"status": "ok", "hf_token": token_status, "token_len": token_len}
-@app.get("/test-download")
-def test_download():
-    """Test downloading a parquet file."""
-    from huggingface_hub import hf_hub_download
-    try:
-        local_path = hf_hub_download(
-            repo_id=DATASET_ID,
-            filename="data/courts/courts-00000.parquet",
-            repo_type="dataset",
-            token=HF_TOKEN
-        )
-        import os
-        size = os.path.getsize(local_path)
-        return {"status": "ok", "path": local_path, "size_bytes": size}
-    except Exception as e:
-        return {"status": "error", "error": str(e), "type": type(e).__name__}
-@app.get("/test-query")
-def test_query():
-    """Test querying a parquet file."""
-    try:
-        path = get_parquet_path("courts")
-        conn = duckdb.connect(":memory:")
-        conn.execute(f"CREATE VIEW data AS SELECT * FROM read_parquet('{path}')")
-        result = conn.execute("SELECT COUNT(*) as cnt FROM data").fetchdf()
-        count = int(result['cnt'].iloc[0])
-        # Get one row
-        row = conn.execute("SELECT * FROM data LIMIT 1").fetchdf()
-        conn.close()
-        # Convert to dict
-        row_dict = row.to_dict(orient="records")[0] if len(row) > 0 else {}
-        return {"status": "ok", "count": count, "sample_row_keys": list(row_dict.keys())}
-    except Exception as e:
-        import traceback
-        return {"status": "error", "error": str(e), "type": type(e).__name__, "traceback": traceback.format_exc()}
 @app.get("/rows/{config}")
-def get_rows(
-    config: str,
-    offset: int = Query(0, ge=0),
-    limit: int = Query(20, ge=1, le=100)
-):
-    """Get paginated rows from a config."""
-    import traceback
-    try:
-        sql = f"SELECT * FROM data LIMIT {limit} OFFSET {offset}"
-        rows = query_parquet(config, sql)
-        # Get total count
-        count_sql = "SELECT COUNT(*) as cnt FROM data"
-        count_result = query_parquet(config, count_sql)
-        total = count_result[0]["cnt"] if count_result else 0
-        return {
-            "rows": rows,
-            "total": total,
-            "offset": offset,
-            "limit": limit
-        }
-    except HTTPException:
-        raise
-    except Exception as e:
-        return {"error": str(e), "traceback": traceback.format_exc()}
 @app.get("/search/{config}")
-def search(
-    config: str,
-    q: str = Query(..., min_length=1),
-    offset: int = Query(0, ge=0),
-    limit: int = Query(20, ge=1, le=100)
-):
-    """Full-text search on a config."""
-    try:
-        # Build search query based on config
-        if config == "opinions":
-            search_cols = ["plain_text", "html"]
-        elif config == "opinion-clusters":
-            search_cols = ["case_name", "case_name_full", "syllabus"]
-        elif config == "dockets":
-            search_cols = ["case_name", "case_name_full", "docket_number"]
-        else:
-            search_cols = ["*"]
-        # Create WHERE clause for text search
-        where_clauses = []
-        for col in search_cols:
-            if col == "*":
-                where_clauses.append(f"CAST(data AS VARCHAR) ILIKE '%{q}%'")
-            else:
-                where_clauses.append(f"COALESCE({col}, '') ILIKE '%{q}%'")
-        where = " OR ".join(where_clauses)
-        sql = f"SELECT * FROM data WHERE {where} LIMIT {limit} OFFSET {offset}"
-        rows = query_parquet(config, sql)
-        return {
-            "rows": rows,
-            "query": q,
-            "offset": offset,
-            "limit": limit
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
 @app.get("/filter/{config}")
-def filter_rows(
-    config: str,
-    where: str = Query(..., min_length=1),
-    offset: int = Query(0, ge=0),
-    limit: int = Query(20, ge=1, le=100)
-):
-    """Filter rows using SQL WHERE clause."""
-    try:
-        # Sanitize WHERE clause (basic protection)
-        forbidden = ["DROP", "DELETE", "INSERT", "UPDATE", "ALTER", "CREATE", ";"]
-        where_upper = where.upper()
-        for word in forbidden:
-            if word in where_upper:
-                raise HTTPException(status_code=400, detail=f"Forbidden SQL keyword: {word}")
-        sql = f"SELECT * FROM data WHERE {where} LIMIT {limit} OFFSET {offset}"
-        rows = query_parquet(config, sql)
-        return {
-            "rows": rows,
-            "where": where,
-            "offset": offset,
-            "limit": limit
-        }
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
 @app.get("/opinion/{opinion_id}")
 def get_opinion(opinion_id: int):
-    """Get a specific opinion by ID."""
-    try:
-        sql = f"SELECT * FROM data WHERE id = {opinion_id}"
-        rows = query_parquet("opinions", sql)
-        if not rows:
-            raise HTTPException(status_code=404, detail="Opinion not found")
-        return rows[0]
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
 @app.get("/cluster/{cluster_id}")
 def get_cluster(cluster_id: int):
-    """Get a specific opinion cluster by ID."""
-    try:
-        sql = f"SELECT * FROM data WHERE id = {cluster_id}"
-        rows = query_parquet("opinion-clusters", sql)
-        if not rows:
-            raise HTTPException(status_code=404, detail="Cluster not found")
-        return rows[0]
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
 @app.get("/docket/{docket_id}")
 def get_docket(docket_id: int):
-    """Get a specific docket by ID."""
-    try:
-        sql = f"SELECT * FROM data WHERE id = {docket_id}"
-        rows = query_parquet("dockets", sql)
-        if not rows:
-            raise HTTPException(status_code=404, detail="Docket not found")
-        return rows[0]
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":

 """LawForge Data API - HuggingFace Space
 FastAPI service to query CourtListener parquet data directly.
+Uses DuckDB to query ALL parquet shards.
 """
 import os
+import json
+from pathlib import Path
 import duckdb
+import numpy as np
 from fastapi import FastAPI, HTTPException, Query
 from fastapi.middleware.cors import CORSMiddleware
 from huggingface_hub import hf_hub_download
 app = FastAPI(
     title="LawForge Data API",
     description="Query CourtListener legal data",
+    version="2.0.0"
 )
 app.add_middleware(
 # Configuration
 DATASET_ID = "jonathanagustin/courtlistener-1"
 HF_TOKEN = os.environ.get("HF_TOKEN")
+CACHE_DIR = Path("/tmp/hf_cache")
+CACHE_DIR.mkdir(parents=True, exist_ok=True)
+# Cache
+_shard_cache: dict[str, list[str]] = {}
+_manifest_cache: dict = {}
+def get_manifest() -> dict:
+    """Download and cache the manifest."""
+    global _manifest_cache
+    if not _manifest_cache:
+        try:
+            path = hf_hub_download(
+                repo_id=DATASET_ID,
+                filename="manifest.json",
+                repo_type="dataset",
+                token=HF_TOKEN,
+                cache_dir=str(CACHE_DIR)
+            )
+            with open(path) as f:
+                _manifest_cache = json.load(f)
+        except Exception as e:
+            print(f"Error loading manifest: {e}")
+            _manifest_cache = {"tables": {}}
+    return _manifest_cache
+def get_shard_count(config: str) -> int:
+    """Get number of shards for a config from manifest."""
+    manifest = get_manifest()
+    table_info = manifest.get("tables", {}).get(config, {})
+    return table_info.get("shard_count", 1)
+def download_all_shards(config: str) -> list[str]:
+    """Download all parquet shards for a config."""
+    if config in _shard_cache:
+        return _shard_cache[config]
+    shard_count = get_shard_count(config)
+    print(f"Downloading {shard_count} shards for {config}...")
+    paths = []
+    for i in range(shard_count):
+        filename = f"data/{config}/{config}-{i:05d}.parquet"
         try:
             local_path = hf_hub_download(
                 repo_id=DATASET_ID,
                 filename=filename,
                 repo_type="dataset",
+                token=HF_TOKEN,
+                cache_dir=str(CACHE_DIR)
             )
+            paths.append(local_path)
         except Exception as e:
             print(f"Error downloading {filename}: {e}")
+    print(f"Downloaded {len(paths)}/{shard_count} shards for {config}")
+    _shard_cache[config] = paths
+    return paths
+def query_config(config: str, sql_template: str) -> list[dict]:
+    """Execute SQL query across all shards of a config."""
+    paths = download_all_shards(config)
+    if not paths:
+        raise HTTPException(status_code=404, detail=f"No data found for config: {config}")
     try:
         conn = duckdb.connect(":memory:")
+        if len(paths) == 1:
+            conn.execute(f"CREATE VIEW data AS SELECT * FROM read_parquet('{paths[0]}')")
         else:
+            paths_str = ", ".join(f"'{p}'" for p in paths)
+            conn.execute(f"CREATE VIEW data AS SELECT * FROM read_parquet([{paths_str}])")
+        result = conn.execute(sql_template).fetchdf()
         conn.close()
         def clean_value(v):
             if v is None:
                 return None
             if isinstance(v, (np.floating, np.float64)):
                 return float(v)
             return v
+        return [{k: clean_value(v) for k, v in row.items()} for _, row in result.iterrows()]
+    except HTTPException:
+        raise
     except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Query error: {str(e)}")
 @app.get("/")
 def root():
+    manifest = get_manifest()
+    tables = list(manifest.get("tables", {}).keys())
     return {
         "name": "LawForge Data API",
+        "version": "2.0.0",
+        "tables": tables,
         "endpoints": {
             "/health": "Health check",
+            "/rows/{config}": "Get rows (all shards)",
             "/search/{config}": "Full-text search",
             "/filter/{config}": "SQL WHERE filter",
+            "/stats": "Dataset statistics",
         }
     }
 @app.get("/health")
 def health():
+    return {"status": "ok", "hf_token": "set" if HF_TOKEN else "not set", "token_len": len(HF_TOKEN) if HF_TOKEN else 0}
+@app.get("/stats")
+def stats():
+    manifest = get_manifest()
+    tables = {name: {"total_rows": info.get("total_rows", 0), "shard_count": info.get("shard_count", 0)}
+              for name, info in manifest.get("tables", {}).items()}
+    return {"updated_at": manifest.get("updated_at"), "tables": tables}
 @app.get("/rows/{config}")
+def get_rows(config: str, offset: int = Query(0, ge=0), limit: int = Query(20, ge=1, le=1000)):
+    manifest = get_manifest()
+    total = manifest.get("tables", {}).get(config, {}).get("total_rows", 0)
+    rows = query_config(config, f"SELECT * FROM data LIMIT {limit} OFFSET {offset}")
+    return {"rows": rows, "total": total, "offset": offset, "limit": limit}
 @app.get("/search/{config}")
+def search(config: str, q: str = Query(..., min_length=1), offset: int = Query(0, ge=0), limit: int = Query(20, ge=1, le=100)):
+    if config == "opinions":
+        cols = ["plain_text", "html", "author_str"]
+    elif config == "opinion-clusters":
+        cols = ["case_name", "case_name_full", "syllabus", "judges"]
+    elif config == "dockets":
+        cols = ["case_name", "case_name_full", "docket_number"]
+    else:
+        cols = ["id"]
+    where = " OR ".join(f"COALESCE(CAST({c} AS VARCHAR), '') ILIKE '%{q}%'" for c in cols)
+    rows = query_config(config, f"SELECT * FROM data WHERE {where} LIMIT {limit} OFFSET {offset}")
+    return {"rows": rows, "query": q, "offset": offset, "limit": limit}
 @app.get("/filter/{config}")
+def filter_rows(config: str, where: str = Query(..., min_length=1), offset: int = Query(0, ge=0), limit: int = Query(20, ge=1, le=1000)):
+    forbidden = ["DROP", "DELETE", "INSERT", "UPDATE", "ALTER", "CREATE", ";", "--"]
+    for word in forbidden:
+        if word in where.upper():
+            raise HTTPException(status_code=400, detail=f"Forbidden: {word}")
+    rows = query_config(config, f"SELECT * FROM data WHERE {where} LIMIT {limit} OFFSET {offset}")
+    return {"rows": rows, "where": where, "offset": offset, "limit": limit}
 @app.get("/opinion/{opinion_id}")
 def get_opinion(opinion_id: int):
+    rows = query_config("opinions", f"SELECT * FROM data WHERE id = '{opinion_id}'")
+    if not rows:
+        raise HTTPException(status_code=404, detail="Opinion not found")
+    return rows[0]
 @app.get("/cluster/{cluster_id}")
 def get_cluster(cluster_id: int):
+    rows = query_config("opinion-clusters", f"SELECT * FROM data WHERE id = '{cluster_id}'")
+    if not rows:
+        raise HTTPException(status_code=404, detail="Cluster not found")
+    return rows[0]
 @app.get("/docket/{docket_id}")
 def get_docket(docket_id: int):
+    rows = query_config("dockets", f"SELECT * FROM data WHERE id = '{docket_id}'")
+    if not rows:
+        raise HTTPException(status_code=404, detail="Docket not found")
+    return rows[0]
 if __name__ == "__main__":