jonathanagustin commited on
Commit
fca3f00
·
verified ·
1 Parent(s): 0660570

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. Dockerfile +25 -0
  2. README.md +32 -5
  3. app.py +235 -0
  4. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal image for the LawForge Data API (FastAPI + DuckDB), served on 7860
# (the port HuggingFace Spaces expects a Docker Space to listen on).
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies
# build-essential lets pip compile any dependency that ships no prebuilt wheel.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for caching
# (this layer only rebuilds when requirements.txt changes, not on app edits)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy app code
COPY app.py .

# Create cache directory for parquet files
# HF_HOME points huggingface_hub's download cache (hf_hub_download in app.py) here.
RUN mkdir -p /app/cache
ENV HF_HOME=/app/cache

# Expose port
EXPOSE 7860

# Run the app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,37 @@
1
  ---
2
- title: Lawforge Data Api
3
- emoji: 📈
4
- colorFrom: indigo
5
- colorTo: red
6
  sdk: docker
7
  pinned: false
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: LawForge Data API
3
+ emoji: ⚖️
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
+ license: mit
9
+ datasets:
10
+ - jonathanagustin/courtlistener-1
11
  ---
12
 
13
+ # LawForge Data API
14
+
15
+ FastAPI service for querying CourtListener legal data via DuckDB.
16
+
17
+ ## Endpoints
18
+
19
+ - `GET /` - API info
20
+ - `GET /health` - Health check
21
+ - `GET /rows/{config}` - Get paginated rows
22
+ - `GET /search/{config}?q=query` - Full-text search
23
+ - `GET /filter/{config}?where=clause` - SQL WHERE filter
24
+ - `GET /opinion/{id}` - Get opinion by ID
25
+ - `GET /cluster/{id}` - Get cluster by ID
26
+ - `GET /docket/{id}` - Get docket by ID
27
+
28
+ ## Available Configs
29
+
30
+ - `opinions` - Court opinions
31
+ - `opinion-clusters` - Opinion metadata
32
+ - `dockets` - Case dockets
33
+ - `courts` - Court information
34
+ - `citations` - Citation data
35
+ - `people-db-people` - Judges and people
36
+ - `people-db-positions` - Positions held
37
+ - `people-db-schools` - Law schools
app.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LawForge Data API - HuggingFace Space
2
+
3
+ FastAPI service to query CourtListener parquet data directly.
4
+ Bypasses datasets-server limitations for private datasets.
5
+ """
6
+
7
+ import os
8
+ from functools import lru_cache
9
+ from typing import Optional
10
+
11
+ import duckdb
12
+ from fastapi import FastAPI, HTTPException, Query
13
+ from fastapi.middleware.cors import CORSMiddleware
14
+ from huggingface_hub import hf_hub_download
15
+ import pandas as pd
16
+
17
+ app = FastAPI(
18
+ title="LawForge Data API",
19
+ description="Query CourtListener legal data",
20
+ version="1.0.0"
21
+ )
22
+
23
+ app.add_middleware(
24
+ CORSMiddleware,
25
+ allow_origins=["*"],
26
+ allow_credentials=True,
27
+ allow_methods=["*"],
28
+ allow_headers=["*"],
29
+ )
30
+
31
+ # Configuration
32
+ DATASET_ID = "jonathanagustin/courtlistener-1"
33
+ HF_TOKEN = os.environ.get("HF_TOKEN")
34
+
35
+ # Cache for DuckDB connections
36
+ _db_cache = {}
37
+
38
+
39
def get_parquet_path(config: str, shard: int = 0) -> str:
    """Download (and memoize) one parquet shard for *config*; return its local path.

    Args:
        config: Dataset config name (e.g. "opinions"). Must be a plain name:
            path separators are rejected because the value is interpolated
            into the repo filename below.
        shard: Zero-based shard index; files are named ``{config}-00000.parquet``.

    Returns:
        Local filesystem path of the cached parquet file.

    Raises:
        HTTPException: 400 for a malformed config name, 404 when the shard
            does not exist in the dataset repo.
    """
    # `config` comes straight from the URL path; block "../" style traversal
    # before it reaches the hf_hub_download filename.
    if not config or "/" in config or "\\" in config or ".." in config:
        raise HTTPException(status_code=400, detail=f"Invalid config name: {config}")

    cache_key = f"{config}_{shard}"
    if cache_key not in _db_cache:
        filename = f"data/{config}/{config}-{shard:05d}.parquet"
        try:
            local_path = hf_hub_download(
                repo_id=DATASET_ID,
                filename=filename,
                repo_type="dataset",
                token=HF_TOKEN,
            )
        except Exception as e:
            # Bug fix: the original detail was the literal "(unknown)" —
            # include the filename and chain the cause for debuggability.
            raise HTTPException(
                status_code=404,
                detail=f"Parquet file not found: {filename}",
            ) from e
        _db_cache[cache_key] = local_path
    return _db_cache[cache_key]
55
+
56
+
57
def query_parquet(config: str, sql: str, params=None) -> list:
    """Execute *sql* against the parquet file for *config*; return rows as dicts.

    A ``data`` view over the parquet file is created in a throwaway in-memory
    DuckDB database, so queries must reference ``FROM data``.

    Args:
        config: Dataset config name, resolved via ``get_parquet_path``.
        sql: SQL text; may contain ``?`` placeholders.
        params: Optional positional parameters bound to the placeholders
            (forwarded to ``duckdb`` as-is).

    Returns:
        List of row dicts (pandas ``to_dict(orient="records")``).
    """
    path = get_parquet_path(config)
    conn = duckdb.connect(":memory:")
    try:
        # `path` is a local cache path from hf_hub_download, but escape quotes
        # so the string literal stays well-formed on any path.
        safe_path = path.replace("'", "''")
        conn.execute(f"CREATE VIEW data AS SELECT * FROM read_parquet('{safe_path}')")
        if params:
            result = conn.execute(sql, params).fetchdf()
        else:
            result = conn.execute(sql).fetchdf()
    finally:
        # Bug fix: the original leaked the connection whenever a query raised.
        conn.close()
    return result.to_dict(orient="records")
70
+
71
+
72
+ @app.get("/")
73
+ def root():
74
+ return {
75
+ "name": "LawForge Data API",
76
+ "version": "1.0.0",
77
+ "endpoints": {
78
+ "/health": "Health check",
79
+ "/rows/{config}": "Get rows from a config",
80
+ "/search/{config}": "Full-text search",
81
+ "/filter/{config}": "SQL WHERE filter",
82
+ }
83
+ }
84
+
85
+
86
+ @app.get("/health")
87
+ def health():
88
+ return {"status": "ok"}
89
+
90
+
91
+ @app.get("/rows/{config}")
92
+ def get_rows(
93
+ config: str,
94
+ offset: int = Query(0, ge=0),
95
+ limit: int = Query(20, ge=1, le=100)
96
+ ):
97
+ """Get paginated rows from a config."""
98
+ try:
99
+ sql = f"SELECT * FROM data LIMIT {limit} OFFSET {offset}"
100
+ rows = query_parquet(config, sql)
101
+
102
+ # Get total count
103
+ count_sql = "SELECT COUNT(*) as cnt FROM data"
104
+ total = query_parquet(config, count_sql)[0]["cnt"]
105
+
106
+ return {
107
+ "rows": rows,
108
+ "total": total,
109
+ "offset": offset,
110
+ "limit": limit
111
+ }
112
+ except Exception as e:
113
+ raise HTTPException(status_code=500, detail=str(e))
114
+
115
+
116
+ @app.get("/search/{config}")
117
+ def search(
118
+ config: str,
119
+ q: str = Query(..., min_length=1),
120
+ offset: int = Query(0, ge=0),
121
+ limit: int = Query(20, ge=1, le=100)
122
+ ):
123
+ """Full-text search on a config."""
124
+ try:
125
+ # Build search query based on config
126
+ if config == "opinions":
127
+ search_cols = ["plain_text", "html"]
128
+ elif config == "opinion-clusters":
129
+ search_cols = ["case_name", "case_name_full", "syllabus"]
130
+ elif config == "dockets":
131
+ search_cols = ["case_name", "case_name_full", "docket_number"]
132
+ else:
133
+ search_cols = ["*"]
134
+
135
+ # Create WHERE clause for text search
136
+ where_clauses = []
137
+ for col in search_cols:
138
+ if col == "*":
139
+ where_clauses.append(f"CAST(data AS VARCHAR) ILIKE '%{q}%'")
140
+ else:
141
+ where_clauses.append(f"COALESCE({col}, '') ILIKE '%{q}%'")
142
+
143
+ where = " OR ".join(where_clauses)
144
+ sql = f"SELECT * FROM data WHERE {where} LIMIT {limit} OFFSET {offset}"
145
+ rows = query_parquet(config, sql)
146
+
147
+ return {
148
+ "rows": rows,
149
+ "query": q,
150
+ "offset": offset,
151
+ "limit": limit
152
+ }
153
+ except Exception as e:
154
+ raise HTTPException(status_code=500, detail=str(e))
155
+
156
+
157
+ @app.get("/filter/{config}")
158
+ def filter_rows(
159
+ config: str,
160
+ where: str = Query(..., min_length=1),
161
+ offset: int = Query(0, ge=0),
162
+ limit: int = Query(20, ge=1, le=100)
163
+ ):
164
+ """Filter rows using SQL WHERE clause."""
165
+ try:
166
+ # Sanitize WHERE clause (basic protection)
167
+ forbidden = ["DROP", "DELETE", "INSERT", "UPDATE", "ALTER", "CREATE", ";"]
168
+ where_upper = where.upper()
169
+ for word in forbidden:
170
+ if word in where_upper:
171
+ raise HTTPException(status_code=400, detail=f"Forbidden SQL keyword: {word}")
172
+
173
+ sql = f"SELECT * FROM data WHERE {where} LIMIT {limit} OFFSET {offset}"
174
+ rows = query_parquet(config, sql)
175
+
176
+ return {
177
+ "rows": rows,
178
+ "where": where,
179
+ "offset": offset,
180
+ "limit": limit
181
+ }
182
+ except HTTPException:
183
+ raise
184
+ except Exception as e:
185
+ raise HTTPException(status_code=500, detail=str(e))
186
+
187
+
188
+ @app.get("/opinion/{opinion_id}")
189
+ def get_opinion(opinion_id: int):
190
+ """Get a specific opinion by ID."""
191
+ try:
192
+ sql = f"SELECT * FROM data WHERE id = {opinion_id}"
193
+ rows = query_parquet("opinions", sql)
194
+ if not rows:
195
+ raise HTTPException(status_code=404, detail="Opinion not found")
196
+ return rows[0]
197
+ except HTTPException:
198
+ raise
199
+ except Exception as e:
200
+ raise HTTPException(status_code=500, detail=str(e))
201
+
202
+
203
+ @app.get("/cluster/{cluster_id}")
204
+ def get_cluster(cluster_id: int):
205
+ """Get a specific opinion cluster by ID."""
206
+ try:
207
+ sql = f"SELECT * FROM data WHERE id = {cluster_id}"
208
+ rows = query_parquet("opinion-clusters", sql)
209
+ if not rows:
210
+ raise HTTPException(status_code=404, detail="Cluster not found")
211
+ return rows[0]
212
+ except HTTPException:
213
+ raise
214
+ except Exception as e:
215
+ raise HTTPException(status_code=500, detail=str(e))
216
+
217
+
218
+ @app.get("/docket/{docket_id}")
219
+ def get_docket(docket_id: int):
220
+ """Get a specific docket by ID."""
221
+ try:
222
+ sql = f"SELECT * FROM data WHERE id = {docket_id}"
223
+ rows = query_parquet("dockets", sql)
224
+ if not rows:
225
+ raise HTTPException(status_code=404, detail="Docket not found")
226
+ return rows[0]
227
+ except HTTPException:
228
+ raise
229
+ except Exception as e:
230
+ raise HTTPException(status_code=500, detail=str(e))
231
+
232
+
233
+ if __name__ == "__main__":
234
+ import uvicorn
235
+ uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Web framework and ASGI server
fastapi>=0.104.0
uvicorn[standard]>=0.24.0
# Query engine and dataframe/parquet support
duckdb>=0.9.0
pandas>=2.0.0
pyarrow>=14.0.0
# Dataset download and authentication (hf_hub_download)
huggingface_hub>=0.19.0