ItsMaxNorm commited on
Commit
dd1bdfb
·
1 Parent(s): 6f62e7a

Initial deploy

Browse files
Files changed (4) hide show
  1. Dockerfile +12 -0
  2. README.md +22 -5
  3. app.py +444 -0
  4. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base keeps the image small; version matches what the app targets.
FROM python:3.11-slim

WORKDIR /app

# Copy and install requirements before the app code so the pip layer is
# cached and only re-runs when requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# HuggingFace Spaces routes traffic to port 7860 (see app_port in README front matter).
EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,29 @@
1
  ---
2
- title: Papercircle Papers Api
3
- emoji: 💻
4
- colorFrom: red
5
- colorTo: blue
6
  sdk: docker
 
7
  pinned: false
8
  license: mit
9
  short_description: Paper circle offline database
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: PaperCircle Papers API
3
+ emoji: 📄
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
9
  license: mit
10
  short_description: Paper circle offline database
11
  ---
12
 
13
+ # PaperCircle Papers API
14
+
15
+ FastAPI service that serves conference papers from a Parquet dataset via DuckDB.
16
+ Provides full-text search and filtered browsing for 230K+ academic papers.
17
+
18
+ ## Environment Variables
19
+
20
+ - `HF_DATASET_REPO`: HuggingFace dataset repo ID (default: `ItsMaxNorm/pc-database`)
21
+ - `PARQUET_PATH`: Local path to papers.parquet (alternative to HF download)
22
+
23
+ ## Endpoints
24
+
25
+ - `GET /health` — Health check
26
+ - `GET /api/community/papers` — Paginated papers with filters
27
+ - `GET /api/community/papers/{paper_id}` — Single paper
28
+ - `GET /api/community/filters` — Filter options
29
+ - `GET /api/search?query=...` — Full-text search
app.py ADDED
@@ -0,0 +1,444 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PaperCircle Papers API — HuggingFace Spaces
3
+ =============================================
4
+ Lightweight FastAPI serving conference papers from a Parquet dataset via DuckDB.
5
+ Deployed on HuggingFace Spaces (free tier).
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import time
11
+ from contextlib import asynccontextmanager
12
+ from typing import Optional, List
13
+
14
+ import duckdb
15
+ from fastapi import FastAPI, Query, HTTPException
16
+ from fastapi.middleware.cors import CORSMiddleware
17
+ from huggingface_hub import hf_hub_download
18
+
19
# =============================================================================
# Configuration
# =============================================================================

# HF Hub dataset repo to download data/papers.parquet from (used when no
# local PARQUET_PATH is provided).
HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "ItsMaxNorm/pc-database")
# Optional local path to papers.parquet; takes precedence over the HF download.
PARQUET_PATH = os.getenv("PARQUET_PATH", "")

# =============================================================================
# Database
# =============================================================================

# Module-level in-memory DuckDB connection, created by init_database() at startup.
db: Optional[duckdb.DuckDBPyConnection] = None
# Flips to True once the Parquet is loaded and the FTS index is built;
# endpoints return 503 until then.
ready = False
32
+
33
+
34
def init_database():
    """Load the papers Parquet file into an in-memory DuckDB and build an FTS index.

    Resolution order for the Parquet file:
      1. ``PARQUET_PATH`` env var pointing at a local file,
      2. download ``data/papers.parquet`` from ``HF_DATASET_REPO`` on the HF Hub,
      3. a bundled ``data/papers.parquet`` next to this module.

    Sets the module-level ``db`` connection and flips ``ready`` to True on success.

    Raises:
        RuntimeError: if no Parquet file could be located.
    """
    global db, ready

    start = time.time()
    db = duckdb.connect(":memory:")

    # Find the parquet file
    parquet_file = None

    # Option 1: Local parquet file
    if PARQUET_PATH and os.path.exists(PARQUET_PATH):
        parquet_file = PARQUET_PATH
        print(f"[DB] Using local Parquet: {parquet_file}")

    # Option 2: Download from HF Hub
    elif HF_DATASET_REPO:
        print(f"[DB] Downloading dataset from HF Hub: {HF_DATASET_REPO}")
        parquet_file = hf_hub_download(
            repo_id=HF_DATASET_REPO,
            filename="data/papers.parquet",
            repo_type="dataset",
        )
        print(f"[DB] Downloaded to: {parquet_file}")

    # Option 3: Look in local data/ directory
    else:
        local_path = os.path.join(os.path.dirname(__file__), "data", "papers.parquet")
        if os.path.exists(local_path):
            parquet_file = local_path
            print(f"[DB] Using bundled Parquet: {parquet_file}")

    if not parquet_file:
        raise RuntimeError(
            "No Parquet file found. Set HF_DATASET_REPO or PARQUET_PATH env var, "
            "or place data/papers.parquet in the app directory."
        )

    # Load into DuckDB. Bind the path as a query parameter instead of
    # interpolating it into the SQL text — a cache path containing a single
    # quote would otherwise break (or alter) the statement.
    db.execute(
        "CREATE TABLE papers AS SELECT * FROM read_parquet(?)",
        [parquet_file],
    )

    row_count = db.execute("SELECT COUNT(*) FROM papers").fetchone()[0]
    print(f"[DB] Loaded {row_count} papers in {time.time() - start:.1f}s")

    # Install and load the full-text-search extension (idempotent if cached).
    db.execute("INSTALL fts")
    db.execute("LOAD fts")

    # Create the FTS index over title/abstract/tldr, keyed by paper_id;
    # overwrite=1 makes re-initialization safe.
    db.execute("""
        PRAGMA create_fts_index(
            'papers', 'paper_id',
            'title', 'abstract', 'tldr',
            overwrite=1
        )
    """)
    print(f"[DB] FTS index created in {time.time() - start:.1f}s total")

    ready = True
96
+
97
+
98
+ # =============================================================================
99
+ # App
100
+ # =============================================================================
101
+
102
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup: load papers into DuckDB. Shutdown: release the connection."""
    init_database()
    yield
    conn = db
    if conn:
        conn.close()
108
+
109
+
110
# FastAPI application; `lifespan` loads the DuckDB database before the first
# request is served and closes it on shutdown.
app = FastAPI(
    title="PaperCircle Papers API",
    version="1.0.0",
    lifespan=lifespan,
)

# Fully open CORS: the API is read-only and consumed from arbitrary origins.
# NOTE(review): allow_credentials=True combined with a "*" origin is rejected
# by browsers for credentialed requests — confirm credentials are needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
123
+
124
+
125
+ # =============================================================================
126
+ # Endpoints
127
+ # =============================================================================
128
+
129
+ @app.get("/health")
130
+ async def health():
131
+ return {"status": "healthy" if ready else "loading", "ready": ready}
132
+
133
+
134
+ @app.get("/api/community/papers")
135
+ async def get_community_papers(
136
+ page: int = Query(1, ge=1),
137
+ limit: int = Query(20, ge=1, le=100),
138
+ year: Optional[int] = None,
139
+ conference: Optional[str] = None,
140
+ source: Optional[str] = None,
141
+ track: Optional[str] = None,
142
+ status: Optional[str] = None,
143
+ primary_area: Optional[str] = None,
144
+ min_rating: Optional[float] = None,
145
+ keywords: Optional[str] = None,
146
+ sort_by: str = Query("year", regex="^(year|rating|combined_score|recency|title)$"),
147
+ ):
148
+ """Get paginated community papers with filters."""
149
+ if not ready:
150
+ raise HTTPException(status_code=503, detail="Database loading, please retry")
151
+
152
+ offset = (page - 1) * limit
153
+
154
+ where_clauses = []
155
+ params = []
156
+
157
+ if year is not None:
158
+ where_clauses.append("year = ?")
159
+ params.append(year)
160
+ if conference:
161
+ where_clauses.append("conference = ?")
162
+ params.append(conference)
163
+ if source:
164
+ where_clauses.append("source = ?")
165
+ params.append(source)
166
+ if track:
167
+ where_clauses.append("track = ?")
168
+ params.append(track)
169
+ if status:
170
+ where_clauses.append("paper_status = ?")
171
+ params.append(status)
172
+ if primary_area:
173
+ where_clauses.append("primary_area = ?")
174
+ params.append(primary_area)
175
+ if min_rating is not None:
176
+ where_clauses.append("rating_avg >= ?")
177
+ params.append(min_rating)
178
+ if keywords:
179
+ # Simple ILIKE search for keyword filtering
180
+ where_clauses.append("(title ILIKE ? OR abstract ILIKE ? OR keywords ILIKE ?)")
181
+ pattern = f"%{keywords}%"
182
+ params.extend([pattern, pattern, pattern])
183
+
184
+ where_sql = " AND ".join(where_clauses) if where_clauses else "1=1"
185
+
186
+ # Sort mapping
187
+ sort_map = {
188
+ "year": "year DESC NULLS LAST",
189
+ "rating": "rating_avg DESC NULLS LAST",
190
+ "recency": "year DESC NULLS LAST",
191
+ "title": "title ASC",
192
+ "combined_score": "rating_avg DESC NULLS LAST",
193
+ }
194
+ order_sql = sort_map.get(sort_by, "year DESC NULLS LAST")
195
+
196
+ # Get total count
197
+ count_result = db.execute(
198
+ f"SELECT COUNT(*) FROM papers WHERE {where_sql}", params
199
+ ).fetchone()
200
+ total = count_result[0]
201
+
202
+ # Get papers
203
+ rows = db.execute(
204
+ f"""
205
+ SELECT paper_id, title, authors, abstract, year, venue, conference,
206
+ source, track, paper_status, primary_area, keywords, tldr,
207
+ pdf_url, arxiv_id, rating_avg, github_url
208
+ FROM papers
209
+ WHERE {where_sql}
210
+ ORDER BY {order_sql}
211
+ LIMIT ? OFFSET ?
212
+ """,
213
+ params + [limit, offset],
214
+ ).fetchall()
215
+
216
+ columns = [
217
+ "paper_id", "title", "authors", "abstract", "year", "venue", "conference",
218
+ "source", "track", "paper_status", "primary_area", "keywords", "tldr",
219
+ "pdf_url", "arxiv_id", "rating_avg", "github_url",
220
+ ]
221
+
222
+ papers = []
223
+ for row in rows:
224
+ paper = dict(zip(columns, row))
225
+ # Parse JSON strings back to lists
226
+ paper["authors"] = json.loads(paper["authors"]) if paper["authors"] else []
227
+ paper["keywords"] = json.loads(paper["keywords"]) if paper["keywords"] else []
228
+ papers.append(paper)
229
+
230
+ total_pages = (total + limit - 1) // limit if total > 0 else 1
231
+
232
+ return {
233
+ "papers": papers,
234
+ "total": total,
235
+ "page": page,
236
+ "limit": limit,
237
+ "total_pages": total_pages,
238
+ }
239
+
240
+
241
+ @app.get("/api/community/papers/{paper_id}")
242
+ async def get_community_paper(paper_id: str):
243
+ """Get a single paper by paper_id."""
244
+ if not ready:
245
+ raise HTTPException(status_code=503, detail="Database loading")
246
+
247
+ row = db.execute(
248
+ """
249
+ SELECT paper_id, title, authors, abstract, year, venue, conference,
250
+ source, track, paper_status, primary_area, keywords, tldr,
251
+ pdf_url, arxiv_id, rating_avg, github_url, bibtex
252
+ FROM papers WHERE paper_id = ?
253
+ """,
254
+ [paper_id],
255
+ ).fetchone()
256
+
257
+ if not row:
258
+ raise HTTPException(status_code=404, detail="Paper not found")
259
+
260
+ columns = [
261
+ "paper_id", "title", "authors", "abstract", "year", "venue", "conference",
262
+ "source", "track", "paper_status", "primary_area", "keywords", "tldr",
263
+ "pdf_url", "arxiv_id", "rating_avg", "github_url", "bibtex",
264
+ ]
265
+ paper = dict(zip(columns, row))
266
+ paper["authors"] = json.loads(paper["authors"]) if paper["authors"] else []
267
+ paper["keywords"] = json.loads(paper["keywords"]) if paper["keywords"] else []
268
+ return paper
269
+
270
+
271
+ @app.get("/api/community/filters")
272
+ async def get_filter_options():
273
+ """Get available filter options."""
274
+ if not ready:
275
+ raise HTTPException(status_code=503, detail="Database loading")
276
+
277
+ years = [r[0] for r in db.execute(
278
+ "SELECT DISTINCT year FROM papers WHERE year IS NOT NULL ORDER BY year DESC"
279
+ ).fetchall()]
280
+
281
+ conferences = [r[0] for r in db.execute(
282
+ "SELECT DISTINCT conference FROM papers WHERE conference IS NOT NULL AND conference != '' ORDER BY conference"
283
+ ).fetchall()]
284
+
285
+ sources = [r[0] for r in db.execute(
286
+ "SELECT DISTINCT source FROM papers WHERE source IS NOT NULL AND source != '' ORDER BY source"
287
+ ).fetchall()]
288
+
289
+ tracks = [r[0] for r in db.execute(
290
+ "SELECT DISTINCT track FROM papers WHERE track IS NOT NULL AND track != '' ORDER BY track"
291
+ ).fetchall()]
292
+
293
+ statuses = [r[0] for r in db.execute(
294
+ "SELECT DISTINCT paper_status FROM papers WHERE paper_status IS NOT NULL AND paper_status != '' ORDER BY paper_status"
295
+ ).fetchall()]
296
+
297
+ primary_areas = [r[0] for r in db.execute(
298
+ "SELECT DISTINCT primary_area FROM papers WHERE primary_area IS NOT NULL AND primary_area != '' ORDER BY primary_area"
299
+ ).fetchall()]
300
+
301
+ return {
302
+ "years": years,
303
+ "conferences": conferences,
304
+ "sources": sources,
305
+ "tracks": tracks,
306
+ "statuses": statuses,
307
+ "primary_areas": primary_areas,
308
+ }
309
+
310
+
311
+ @app.get("/api/search")
312
+ async def search_papers(
313
+ query: str = Query(..., min_length=1),
314
+ conferences: Optional[str] = None,
315
+ start_year: Optional[int] = None,
316
+ end_year: Optional[int] = None,
317
+ limit: int = Query(50, ge=1, le=200),
318
+ offset: int = Query(0, ge=0),
319
+ ):
320
+ """Full-text search with optional filters. conferences is comma-separated."""
321
+ if not ready:
322
+ raise HTTPException(status_code=503, detail="Database loading")
323
+
324
+ conf_list = [c.strip() for c in conferences.split(",")] if conferences else None
325
+
326
+ # Try FTS first
327
+ try:
328
+ papers = _search_fts(query, conf_list, start_year, end_year, limit, offset)
329
+ if papers:
330
+ return {"papers": papers, "search_type": "fts", "count": len(papers)}
331
+ except Exception as e:
332
+ print(f"[Search] FTS failed: {e}, falling back to simple search")
333
+
334
+ # Fallback to simple ILIKE search
335
+ papers = _search_simple(query, conf_list, start_year, end_year, limit, offset)
336
+ return {"papers": papers, "search_type": "simple", "count": len(papers)}
337
+
338
+
339
def _search_fts(query, conferences, start_year, end_year, limit, offset):
    """Full-text search using DuckDB FTS extension.

    Ranks rows with BM25 against the index built in init_database() and
    applies optional conference/year filters on top. Returns a list of paper
    dicts (authors/keywords decoded from their JSON-string columns),
    best match first.
    """
    where_clauses = []
    params = []

    # Optional filters; all values are bound parameters, only the
    # placeholder list is interpolated into the SQL text.
    if conferences:
        placeholders = ",".join(["?" for _ in conferences])
        where_clauses.append(f"p.conference IN ({placeholders})")
        params.extend(conferences)
    if start_year is not None:
        where_clauses.append("p.year >= ?")
        params.append(start_year)
    if end_year is not None:
        where_clauses.append("p.year <= ?")
        params.append(end_year)

    extra_where = (" AND " + " AND ".join(where_clauses)) if where_clauses else ""

    # match_bm25 yields NULL for non-matching rows, so `score IS NOT NULL`
    # keeps only actual hits. Referencing the SELECT alias `score` inside
    # WHERE relies on DuckDB's column-alias reuse; standard SQL does not
    # allow it (NOTE(review): DuckDB-specific — re-verify on engine upgrades).
    rows = db.execute(
        f"""
        SELECT p.paper_id, p.title, p.authors, p.abstract, p.year, p.venue,
               p.conference, p.arxiv_id, p.pdf_url, p.rating_avg, p.keywords,
               p.tldr, p.primary_area,
               fts_main_papers.match_bm25(paper_id, ?) AS score
        FROM papers p
        WHERE score IS NOT NULL {extra_where}
        ORDER BY score DESC
        LIMIT ? OFFSET ?
        """,
        [query] + params + [limit, offset],
    ).fetchall()

    columns = [
        "paper_id", "title", "authors", "abstract", "year", "venue",
        "conference", "arxiv_id", "pdf_url", "rating_avg", "keywords",
        "tldr", "primary_area", "score",
    ]

    papers = []
    for row in rows:
        paper = dict(zip(columns, row))
        # authors/keywords are stored as JSON-encoded strings in the Parquet.
        paper["authors"] = json.loads(paper["authors"]) if paper["authors"] else []
        paper["keywords"] = json.loads(paper["keywords"]) if paper["keywords"] else []
        papers.append(paper)

    return papers
385
+
386
+
387
def _search_simple(query, conferences, start_year, end_year, limit, offset):
    """Fallback ILIKE-based search."""
    # Substring match across the three text columns; title hits sort first.
    pattern = f"%{query}%"
    where_clauses = ["(p.title ILIKE ? OR p.abstract ILIKE ? OR p.tldr ILIKE ?)"]
    params = [pattern] * 3

    if conferences:
        placeholders = ",".join("?" * len(conferences))
        where_clauses.append(f"p.conference IN ({placeholders})")
        params += list(conferences)
    if start_year is not None:
        where_clauses.append("p.year >= ?")
        params.append(start_year)
    if end_year is not None:
        where_clauses.append("p.year <= ?")
        params.append(end_year)

    where_sql = " AND ".join(where_clauses)

    rows = db.execute(
        f"""
        SELECT p.paper_id, p.title, p.authors, p.abstract, p.year, p.venue,
               p.conference, p.arxiv_id, p.pdf_url, p.rating_avg, p.keywords,
               p.tldr, p.primary_area
        FROM papers p
        WHERE {where_sql}
        ORDER BY
            CASE WHEN p.title ILIKE ? THEN 0 ELSE 1 END,
            p.rating_avg DESC NULLS LAST,
            p.year DESC NULLS LAST
        LIMIT ? OFFSET ?
        """,
        params + [pattern, limit, offset],
    ).fetchall()

    columns = [
        "paper_id", "title", "authors", "abstract", "year", "venue",
        "conference", "arxiv_id", "pdf_url", "rating_avg", "keywords",
        "tldr", "primary_area",
    ]

    results = []
    for values in rows:
        record = dict(zip(columns, values))
        # authors/keywords are JSON-encoded strings in the dataset.
        for field in ("authors", "keywords"):
            raw = record[field]
            record[field] = json.loads(raw) if raw else []
        results.append(record)

    return results
436
+
437
+
438
# =============================================================================
# Main
# =============================================================================

if __name__ == "__main__":
    # Local development entry point; in the Docker image the container CMD
    # runs uvicorn directly against this module's `app`.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Web framework + ASGI server (server is invoked by the Dockerfile CMD)
fastapi==0.115.0
uvicorn[standard]==0.30.0
# In-memory analytical database; also provides the FTS extension used for search
duckdb==1.1.0
# Downloads data/papers.parquet from the HF Hub at startup
huggingface_hub>=0.23.0
# Parquet support
pyarrow>=15.0.0