jonathanagustin commited on
Commit
fca3f00
·
verified ·
1 Parent(s): 0660570

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. Dockerfile +25 -0
  2. README.md +32 -5
  3. app.py +235 -0
  4. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal image for the LawForge Data API (FastAPI + DuckDB), served on 7860
# (the port HuggingFace Spaces expects a Docker Space to listen on).
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies
# build-essential lets pip compile any dependency that ships no prebuilt wheel.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for caching
# (this layer only rebuilds when requirements.txt changes, not on app edits)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy app code
COPY app.py .

# Create cache directory for parquet files
# HF_HOME points huggingface_hub's download cache (hf_hub_download in app.py) here.
RUN mkdir -p /app/cache
ENV HF_HOME=/app/cache

# Expose port
EXPOSE 7860

# Run the app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,37 @@
1
  ---
2
- title: Lawforge Data Api
3
- emoji: 📈
4
- colorFrom: indigo
5
- colorTo: red
6
  sdk: docker
7
  pinned: false
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: LawForge Data API
3
+ emoji: ⚖️
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
+ license: mit
9
+ datasets:
10
+ - jonathanagustin/courtlistener-1
11
  ---
12
 
13
+ # LawForge Data API
14
+
15
+ FastAPI service for querying CourtListener legal data via DuckDB.
16
+
17
+ ## Endpoints
18
+
19
+ - `GET /` - API info
20
+ - `GET /health` - Health check
21
+ - `GET /rows/{config}` - Get paginated rows
22
+ - `GET /search/{config}?q=query` - Full-text search
23
+ - `GET /filter/{config}?where=clause` - SQL WHERE filter
24
+ - `GET /opinion/{id}` - Get opinion by ID
25
+ - `GET /cluster/{id}` - Get cluster by ID
26
+ - `GET /docket/{id}` - Get docket by ID
27
+
28
+ ## Available Configs
29
+
30
+ - `opinions` - Court opinions
31
+ - `opinion-clusters` - Opinion metadata
32
+ - `dockets` - Case dockets
33
+ - `courts` - Court information
34
+ - `citations` - Citation data
35
+ - `people-db-people` - Judges and people
36
+ - `people-db-positions` - Positions held
37
+ - `people-db-schools` - Law schools
app.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LawForge Data API - HuggingFace Space
2
+
3
+ FastAPI service to query CourtListener parquet data directly.
4
+ Bypasses datasets-server limitations for private datasets.
5
+ """
6
+
7
+ import os
8
+ from functools import lru_cache
9
+ from typing import Optional
10
+
11
+ import duckdb
12
+ from fastapi import FastAPI, HTTPException, Query
13
+ from fastapi.middleware.cors import CORSMiddleware
14
+ from huggingface_hub import hf_hub_download
15
+ import pandas as pd
16
+
17
+ app = FastAPI(
18
+ title="LawForge Data API",
19
+ description="Query CourtListener legal data",
20
+ version="1.0.0"
21
+ )
22
+
23
+ app.add_middleware(
24
+ CORSMiddleware,
25
+ allow_origins=["*"],
26
+ allow_credentials=True,
27
+ allow_methods=["*"],
28
+ allow_headers=["*"],
29
+ )
30
+
31
+ # Configuration
32
+ DATASET_ID = "jonathanagustin/courtlistener-1"
33
+ HF_TOKEN = os.environ.get("HF_TOKEN")
34
+
35
+ # Cache for DuckDB connections
36
+ _db_cache = {}
37
+
38
+
39
def get_parquet_path(config: str, shard: int = 0) -> str:
    """Download (and memoize) one parquet shard for *config*; return its local path.

    Args:
        config: Dataset config name (e.g. "opinions"). Must be a plain name:
            path separators are rejected because the value is interpolated
            into the repo filename below.
        shard: Zero-based shard index; files are named ``{config}-00000.parquet``.

    Returns:
        Local filesystem path of the cached parquet file.

    Raises:
        HTTPException: 400 for a malformed config name, 404 when the shard
            does not exist in the dataset repo.
    """
    # `config` comes straight from the URL path; block "../" style traversal
    # before it reaches the hf_hub_download filename.
    if not config or "/" in config or "\\" in config or ".." in config:
        raise HTTPException(status_code=400, detail=f"Invalid config name: {config}")

    cache_key = f"{config}_{shard}"
    if cache_key not in _db_cache:
        filename = f"data/{config}/{config}-{shard:05d}.parquet"
        try:
            local_path = hf_hub_download(
                repo_id=DATASET_ID,
                filename=filename,
                repo_type="dataset",
                token=HF_TOKEN,
            )
        except Exception as e:
            # Bug fix: the original detail was the literal "(unknown)" —
            # include the filename and chain the cause for debuggability.
            raise HTTPException(
                status_code=404,
                detail=f"Parquet file not found: {filename}",
            ) from e
        _db_cache[cache_key] = local_path
    return _db_cache[cache_key]
55
+
56
+
57
def query_parquet(config: str, sql: str, params=None) -> list:
    """Execute *sql* against the parquet file for *config*; return rows as dicts.

    A ``data`` view over the parquet file is created in a throwaway in-memory
    DuckDB database, so queries must reference ``FROM data``.

    Args:
        config: Dataset config name, resolved via ``get_parquet_path``.
        sql: SQL text; may contain ``?`` placeholders.
        params: Optional positional parameters bound to the placeholders
            (forwarded to ``duckdb`` as-is).

    Returns:
        List of row dicts (pandas ``to_dict(orient="records")``).
    """
    path = get_parquet_path(config)
    conn = duckdb.connect(":memory:")
    try:
        # `path` is a local cache path from hf_hub_download, but escape quotes
        # so the string literal stays well-formed on any path.
        safe_path = path.replace("'", "''")
        conn.execute(f"CREATE VIEW data AS SELECT * FROM read_parquet('{safe_path}')")
        if params:
            result = conn.execute(sql, params).fetchdf()
        else:
            result = conn.execute(sql).fetchdf()
    finally:
        # Bug fix: the original leaked the connection whenever a query raised.
        conn.close()
    return result.to_dict(orient="records")
70
+
71
+
72
+ @app.get("/")
73
+ def root():
74
+ return {
75
+ "name": "LawForge Data API",
76
+ "version": "1.0.0",
77
+ "endpoints": {
78
+ "/health": "Health check",
79
+ "/rows/{config}": "Get rows from a config",
80
+ "/search/{config}": "Full-text search",
81
+ "/filter/{config}": "SQL WHERE filter",
82
+ }
83
+ }
84
+
85
+
86
+ @app.get("/health")
87
+ def health():
88
+ return {"status": "ok"}
89
+
90
+
91
+ @app.get("/rows/{config}")
92
+ def get_rows(
93
+ config: str,
94
+ offset: int = Query(0, ge=0),
95
+ limit: int = Query(20, ge=1, le=100)
96
+ ):
97
+ """Get paginated rows from a config."""
98
+ try:
99
+ sql = f"SELECT * FROM data LIMIT {limit} OFFSET {offset}"
100
+ rows = query_parquet(config, sql)
101
+
102
+ # Get total count
103
+ count_sql = "SELECT COUNT(*) as cnt FROM data"
104
+ total = query_parquet(config, count_sql)[0]["cnt"]
105
+
106
+ return {
107
+ "rows": rows,
108
+ "total": total,
109
+ "offset": offset,
110
+ "limit": limit
111
+ }
112
+ except Exception as e:
113
+ raise HTTPException(status_code=500, detail=str(e))
114
+
115
+
116
+ @app.get("/search/{config}")
117
+ def search(
118
+ config: str,
119
+ q: str = Query(..., min_length=1),
120
+ offset: int = Query(0, ge=0),
121
+ limit: int = Query(20, ge=1, le=100)
122
+ ):
123
+ """Full-text search on a config."""
124
+ try:
125
+ # Build search query based on config
126
+ if config == "opinions":
127
+ search_cols = ["plain_text", "html"]
128
+ elif config == "opinion-clusters":
129
+ search_cols = ["case_name", "case_name_full", "syllabus"]
130
+ elif config == "dockets":
131
+ search_cols = ["case_name", "case_name_full", "docket_number"]
132
+ else:
133
+ search_cols = ["*"]
134
+
135
+ # Create WHERE clause for text search
136
+ where_clauses = []
137
+ for col in search_cols:
138
+ if col == "*":
139
+ where_clauses.append(f"CAST(data AS VARCHAR) ILIKE '%{q}%'")
140
+ else:
141
+ where_clauses.append(f"COALESCE({col}, '') ILIKE '%{q}%'")
142
+
143
+ where = " OR ".join(where_clauses)
144
+ sql = f"SELECT * FROM data WHERE {where} LIMIT {limit} OFFSET {offset}"
145
+ rows = query_parquet(config, sql)
146
+
147
+ return {
148
+ "rows": rows,
149
+ "query": q,
150
+ "offset": offset,
151
+ "limit": limit
152
+ }
153
+ except Exception as e:
154
+ raise HTTPException(status_code=500, detail=str(e))
155
+
156
+
157
+ @app.get("/filter/{config}")
158
+ def filter_rows(
159
+ config: str,
160
+ where: str = Query(..., min_length=1),
161
+ offset: int = Query(0, ge=0),
162
+ limit: int = Query(20, ge=1, le=100)
163
+ ):
164
+ """Filter rows using SQL WHERE clause."""
165
+ try:
166
+ # Sanitize WHERE clause (basic protection)
167
+ forbidden = ["DROP", "DELETE", "INSERT", "UPDATE", "ALTER", "CREATE", ";"]
168
+ where_upper = where.upper()
169
+ for word in forbidden:
170
+ if word in where_upper:
171
+ raise HTTPException(status_code=400, detail=f"Forbidden SQL keyword: {word}")
172
+
173
+ sql = f"SELECT * FROM data WHERE {where} LIMIT {limit} OFFSET {offset}"
174
+ rows = query_parquet(config, sql)
175
+
176
+ return {
177
+ "rows": rows,
178
+ "where": where,
179
+ "offset": offset,
180
+ "limit": limit
181
+ }
182
+ except HTTPException:
183
+ raise
184
+ except Exception as e:
185
+ raise HTTPException(status_code=500, detail=str(e))
186
+
187
+
188
+ @app.get("/opinion/{opinion_id}")
189
+ def get_opinion(opinion_id: int):
190
+ """Get a specific opinion by ID."""
191
+ try:
192
+ sql = f"SELECT * FROM data WHERE id = {opinion_id}"
193
+ rows = query_parquet("opinions", sql)
194
+ if not rows:
195
+ raise HTTPException(status_code=404, detail="Opinion not found")
196
+ return rows[0]
197
+ except HTTPException:
198
+ raise
199
+ except Exception as e:
200
+ raise HTTPException(status_code=500, detail=str(e))
201
+
202
+
203
+ @app.get("/cluster/{cluster_id}")
204
+ def get_cluster(cluster_id: int):
205
+ """Get a specific opinion cluster by ID."""
206
+ try:
207
+ sql = f"SELECT * FROM data WHERE id = {cluster_id}"
208
+ rows = query_parquet("opinion-clusters", sql)
209
+ if not rows:
210
+ raise HTTPException(status_code=404, detail="Cluster not found")
211
+ return rows[0]
212
+ except HTTPException:
213
+ raise
214
+ except Exception as e:
215
+ raise HTTPException(status_code=500, detail=str(e))
216
+
217
+
218
+ @app.get("/docket/{docket_id}")
219
+ def get_docket(docket_id: int):
220
+ """Get a specific docket by ID."""
221
+ try:
222
+ sql = f"SELECT * FROM data WHERE id = {docket_id}"
223
+ rows = query_parquet("dockets", sql)
224
+ if not rows:
225
+ raise HTTPException(status_code=404, detail="Docket not found")
226
+ return rows[0]
227
+ except HTTPException:
228
+ raise
229
+ except Exception as e:
230
+ raise HTTPException(status_code=500, detail=str(e))
231
+
232
+
233
+ if __name__ == "__main__":
234
+ import uvicorn
235
+ uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Web framework and ASGI server
fastapi>=0.104.0
uvicorn[standard]>=0.24.0
# Query engine and dataframe/parquet support
duckdb>=0.9.0
pandas>=2.0.0
pyarrow>=14.0.0
# Dataset download and authentication (hf_hub_download)
huggingface_hub>=0.19.0