siddhm11 committed on
Commit
12d7e78
·
1 Parent(s): b4d17db

Phase 3.5: Wire Turso DB for metadata (2.9x faster, includes citations)

Browse files
app/config.py CHANGED
@@ -20,6 +20,10 @@ ARXIV_API_URL = "https://export.arxiv.org/api/query"
20
  ARXIV_MAX_RESULTS = 10 # results per search page
21
  METADATA_CACHE_TTL_DAYS = 30 # re-fetch metadata after this many days
22
 
 
 
 
 
23
  # ── Recommendation settings ───────────────────────────────────────────────────
24
  REC_LIMIT = 10 # how many recommendations to show
25
  REC_POSITIVE_LIMIT = 20 # max positive examples sent to Qdrant
 
20
  ARXIV_MAX_RESULTS = 10 # results per search page
21
  METADATA_CACHE_TTL_DAYS = 30 # re-fetch metadata after this many days
22
 
23
+ # ── Turso (libSQL) — arXiv metadata DB — Phase 3.5 ───────────────────────────
24
+ TURSO_URL = os.getenv("TURSO_URL", "")
25
+ TURSO_DB_TOKEN = os.getenv("TURSO_DB_TOKEN", "")
26
+
27
  # ── Recommendation settings ───────────────────────────────────────────────────
28
  REC_LIMIT = 10 # how many recommendations to show
29
  REC_POSITIVE_LIMIT = 20 # max positive examples sent to Qdrant
app/routers/search.py CHANGED
@@ -7,11 +7,14 @@ GET /search?q=<query>
7
 
8
  Phase 3 replaces the arXiv keyword API with:
9
  LLM rewrite → BGE-M3 encode → Qdrant dense + Zilliz sparse → RRF → rerank
 
 
 
10
  """
11
  import uuid
12
  from fastapi import APIRouter, Request, Cookie
13
  from fastapi.responses import HTMLResponse
14
- from app import arxiv_svc, user_state as us, hybrid_search_svc
15
  from app.config import COOKIE_NAME, ARXIV_MAX_RESULTS
16
  from app.templates_env import templates
17
 
@@ -34,18 +37,27 @@ async def search(
34
  arxiv_ids = []
35
 
36
  if arxiv_ids:
37
- # Fetch metadata for the ranked results
38
  try:
39
- meta = await arxiv_svc.fetch_metadata_batch(arxiv_ids)
40
- # Preserve ranking order from hybrid search
41
- papers = [meta[aid] for aid in arxiv_ids if aid in meta]
42
  except Exception as e:
43
- # arXiv API timeout fall back to keyword search
44
- print(f"[search] Metadata fetch failed ({e}), falling back to arXiv API")
45
- papers = []
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  if not papers and q.strip():
48
- # Fallback: arXiv keyword API if hybrid returns nothing or metadata failed
49
  try:
50
  papers = await arxiv_svc.search(q.strip())
51
  except Exception as e:
 
7
 
8
  Phase 3 replaces the arXiv keyword API with:
9
  LLM rewrite → BGE-M3 encode → Qdrant dense + Zilliz sparse → RRF → rerank
10
+
11
+ Phase 3.5: Metadata now fetched from Turso cloud DB (fast, includes citations)
12
+ with arXiv API as fallback for papers not in Turso.
13
  """
14
  import uuid
15
  from fastapi import APIRouter, Request, Cookie
16
  from fastapi.responses import HTMLResponse
17
+ from app import arxiv_svc, turso_svc, user_state as us, hybrid_search_svc
18
  from app.config import COOKIE_NAME, ARXIV_MAX_RESULTS
19
  from app.templates_env import templates
20
 
 
37
  arxiv_ids = []
38
 
39
  if arxiv_ids:
40
+ # Phase 3.5: Fetch metadata from Turso DB first (fast, ~50ms)
41
  try:
42
+ meta = await turso_svc.fetch_metadata_batch(arxiv_ids)
 
 
43
  except Exception as e:
44
+ print(f"[search] Turso metadata fetch failed: {e}")
45
+ meta = {}
46
+
47
+ # Fallback: fetch any missing IDs from arXiv API
48
+ missing = [aid for aid in arxiv_ids if aid not in meta]
49
+ if missing:
50
+ try:
51
+ arxiv_meta = await arxiv_svc.fetch_metadata_batch(missing)
52
+ meta.update(arxiv_meta)
53
+ except Exception as e:
54
+ print(f"[search] arXiv fallback for {len(missing)} IDs failed: {e}")
55
+
56
+ # Preserve ranking order from hybrid search
57
+ papers = [meta[aid] for aid in arxiv_ids if aid in meta]
58
 
59
  if not papers and q.strip():
60
+ # Fallback: arXiv keyword API if hybrid returns nothing
61
  try:
62
  papers = await arxiv_svc.search(q.strip())
63
  except Exception as e:
app/turso_svc.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Turso (libSQL) metadata service — Phase 3.5.
3
+
4
+ Replaces arxiv_svc.fetch_metadata_batch() with direct Turso DB lookups.
5
+ Uses Turso's HTTP pipeline API — no additional Python dependencies needed
6
+ (just httpx, already installed).
7
+
8
+ The DB contains ~1.6M arXiv papers with metadata + citation counts from
9
+ Semantic Scholar, bulk-loaded from Kaggle.
10
+
11
+ Connection: TURSO_URL + TURSO_DB_TOKEN (env vars)
12
+ Table: papers (arxiv_id UNIQUE INDEX)
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import time
18
+
19
+ import httpx
20
+
21
+ from app import config
22
+
23
+
24
+ # ── Public API ───────────────────────────────────────────────────────────────
25
+
26
async def fetch_metadata(arxiv_id: str) -> dict | None:
    """
    Look up a single paper in Turso by its arXiv ID.

    Delegates to fetch_metadata_batch() with a one-element list and returns
    the paper dict stored under that ID, or None when the batch result has
    no entry for it.
    """
    papers_by_id = await fetch_metadata_batch([arxiv_id])
    return papers_by_id.get(arxiv_id)
30
+
31
+
32
def _pipeline_base_url(url: str) -> str:
    """Normalise a configured Turso URL to the https:// base used by the HTTP API."""
    base = url.rstrip("/")
    # Drop whatever scheme was configured (libsql://, http://, https://) by
    # splitting on the separator. str.lstrip() takes a *set of characters*,
    # not a prefix, so stripping "https://" that way would also eat leading
    # h/t/p/s letters of the hostname (e.g. "turso.example" -> "urso.example").
    if "://" in base:
        base = base.split("://", 1)[1]
    return "https://" + base


async def fetch_metadata_batch(arxiv_ids: list[str]) -> dict[str, dict]:
    """
    Fetch metadata for multiple papers from Turso DB.

    Sends one parameterised ``SELECT ... WHERE arxiv_id IN (...)`` statement
    to ``{base}/v2/pipeline`` in a single HTTP request covering all IDs.

    Args:
        arxiv_ids: arXiv IDs to look up.

    Returns:
        {arxiv_id: paper_dict} for every ID that came back from the query.
        Each paper dict is built by _to_paper_dict() and has keys: arxiv_id,
        title, abstract, authors, category, published, year, citation_count,
        influential_citations.

        An empty dict is returned (after logging via print) when the input
        is empty, when TURSO_URL / TURSO_DB_TOKEN are not configured, when
        the HTTP request fails, when Turso reports a query error, or when
        the response cannot be parsed. This function does not raise for
        those cases so callers can fall back to another metadata source.
    """
    if not arxiv_ids:
        return {}

    url = config.TURSO_URL
    token = config.TURSO_DB_TOKEN

    if not url or not token:
        print("[turso] TURSO_URL or TURSO_DB_TOKEN not configured, skipping")
        return {}

    # One "?" placeholder per ID; the values travel separately as typed args,
    # so no ID text is ever interpolated into the SQL string.
    placeholders = ", ".join("?" for _ in arxiv_ids)
    sql = (
        "SELECT arxiv_id, title, authors, categories, primary_topic, "
        "update_date, abstract_preview, citation_count, influential_citations "
        f"FROM papers WHERE arxiv_id IN ({placeholders})"
    )
    args = [{"type": "text", "value": aid} for aid in arxiv_ids]

    pipeline_url = _pipeline_base_url(url)

    payload = {
        "requests": [
            {
                "type": "execute",
                "stmt": {"sql": sql, "args": args},
            },
            {"type": "close"},
        ]
    }

    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }

    t0 = time.perf_counter()

    try:
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.post(
                f"{pipeline_url}/v2/pipeline",
                json=payload,
                headers=headers,
            )
            resp.raise_for_status()
    except Exception as e:
        # Best-effort boundary: any transport/status failure means "no data".
        print(f"[turso] HTTP request failed: {e}")
        return {}

    elapsed_ms = (time.perf_counter() - t0) * 1000
    print(f"[turso] Fetched metadata for {len(arxiv_ids)} IDs in {elapsed_ms:.0f}ms")

    try:
        data = resp.json()
        results = data.get("results", [])
        if not results:
            return {}

        # First result is our execute response
        execute_result = results[0]
        if execute_result.get("type") == "error":
            print(f"[turso] Query error: {execute_result.get('error')}")
            return {}

        response = execute_result.get("response", {})
        result_data = response.get("result", {})
        cols = [c["name"] for c in result_data.get("cols", [])]
        rows = result_data.get("rows", [])

        # Convert rows to paper dicts matching the expected format.
        # Each row is a list of {"type": "text"|"integer"|"null", "value": ...}
        output: dict[str, dict] = {}
        for row in rows:
            values = {}
            # zip() pairs cells with column names and tolerates a short row
            # instead of raising IndexError.
            for col, cell in zip(cols, row):
                if cell.get("type") == "null":
                    values[col] = None
                else:
                    values[col] = cell.get("value", "")

            paper = _to_paper_dict(values)
            if paper:
                output[paper["arxiv_id"]] = paper

    except (KeyError, IndexError, TypeError, ValueError, AttributeError) as e:
        # ValueError covers a non-JSON body from resp.json(); AttributeError
        # covers a payload whose pieces are not the dicts we expect.
        print(f"[turso] Response parsing error: {e}")
        return {}

    return output
136
+
137
+
138
+ def _to_paper_dict(row: dict) -> dict | None:
139
+ """
140
+ Convert a Turso row into the paper dict format expected by templates.
141
+
142
+ Template expects:
143
+ arxiv_id, title, abstract, authors (JSON string), category, published, year
144
+ Turso provides:
145
+ arxiv_id, title, authors (comma-sep), categories, primary_topic,
146
+ update_date, abstract_preview, citation_count, influential_citations
147
+ """
148
+ arxiv_id = row.get("arxiv_id")
149
+ if not arxiv_id:
150
+ return None
151
+
152
+ # Convert authors from comma-separated to JSON array string
153
+ authors_raw = row.get("authors") or ""
154
+ if authors_raw.startswith("["):
155
+ # Already JSON — leave as is
156
+ authors_json = authors_raw
157
+ else:
158
+ # Comma-separated → JSON array (take first 5)
159
+ author_list = [a.strip() for a in authors_raw.split(",") if a.strip()][:5]
160
+ authors_json = json.dumps(author_list)
161
+
162
+ # Use primary_topic as category, fall back to first in categories list
163
+ category = row.get("primary_topic") or ""
164
+ if not category:
165
+ cats = row.get("categories") or ""
166
+ category = cats.split()[0] if cats else ""
167
+
168
+ # Extract year from update_date (YYYY-MM-DD format)
169
+ update_date = row.get("update_date") or ""
170
+ year = 0
171
+ if len(update_date) >= 4:
172
+ try:
173
+ year = int(update_date[:4])
174
+ except ValueError:
175
+ pass
176
+
177
+ # Citation count (bonus data from Semantic Scholar)
178
+ citation_count = 0
179
+ try:
180
+ citation_count = int(row.get("citation_count") or 0)
181
+ except (ValueError, TypeError):
182
+ pass
183
+
184
+ influential = 0
185
+ try:
186
+ influential = int(row.get("influential_citations") or 0)
187
+ except (ValueError, TypeError):
188
+ pass
189
+
190
+ return {
191
+ "arxiv_id": arxiv_id,
192
+ "title": (row.get("title") or "").replace("\n", " "),
193
+ "abstract": (row.get("abstract_preview") or "").replace("\n", " "),
194
+ "authors": authors_json,
195
+ "category": category,
196
+ "published": update_date,
197
+ "year": year,
198
+ "citation_count": citation_count,
199
+ "influential_citations": influential,
200
+ }
tests/test_turso_timing.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test script: Compare Turso DB vs arXiv API metadata fetch times.
3
+ Run: python -m tests.test_turso_timing
4
+ """
5
+ import asyncio
6
+ import time
7
+ import sys
8
+ import os
9
+
10
+ # Ensure app module is importable
11
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
12
+
13
+ from app import turso_svc, arxiv_svc
14
+
15
+ # Sample arxiv IDs (known papers from our vector DBs)
16
+ TEST_IDS = [
17
+ "1706.03762", # Attention Is All You Need
18
+ "2206.03003", # Transformer attention medical
19
+ "2209.15001", # Dilated Neighborhood Attention Transformer
20
+ "1809.04281", # Music Transformer
21
+ "2010.11929", # ViT - Vision Transformer
22
+ "1810.04805", # BERT
23
+ "2005.14165", # GPT-3
24
+ "2302.13971", # LLaMA
25
+ "1512.03385", # ResNet
26
+ "2103.00020", # CLIP
27
+ ]
28
+
29
+
30
async def test_turso():
    """
    Time a single-paper and a batch metadata fetch against Turso.

    Prints the fields of the first TEST_IDS paper (or a not-found notice),
    then one summary line per paper in the batch. Returns a tuple of
    (batch fetch time in milliseconds, batch result dict).
    """
    banner = "=" * 60
    print(banner)
    print("TURSO DB METADATA FETCH TEST")
    print(banner)

    # Single paper
    first_id = TEST_IDS[0]
    started = time.perf_counter()
    single = await turso_svc.fetch_metadata(first_id)
    finished = time.perf_counter()
    single_ms = (finished - started) * 1000
    print(f"\n[Single] {first_id} -> {single_ms:.0f}ms")
    if not single:
        print(" NOT FOUND in Turso DB")
    else:
        print(f" Title: {single['title'][:80]}")
        print(f" Authors: {single['authors'][:80]}")
        print(f" Category: {single['category']}")
        print(f" Published: {single['published']}")
        print(f" Year: {single['year']}")
        print(f" Citations: {single.get('citation_count', 'N/A')}")
        print(f" Influential: {single.get('influential_citations', 'N/A')}")

    # Batch of 10
    started = time.perf_counter()
    batch = await turso_svc.fetch_metadata_batch(TEST_IDS)
    finished = time.perf_counter()
    turso_time = (finished - started) * 1000
    total = len(TEST_IDS)
    print(f"\n[Batch of {total}] -> {turso_time:.0f}ms")
    print(f" Found: {len(batch)}/{total}")
    for paper_id, info in batch.items():
        print(
            f" {paper_id}: {info['title'][:60]}... "
            f"[{info['category']}] (cites: {info.get('citation_count', 0)})"
        )

    return turso_time, batch
63
+
64
+
65
async def test_arxiv():
    """
    Time a batch metadata fetch against the arXiv API, for comparison.

    Prints one summary line per returned paper and returns a tuple of
    (batch fetch time in milliseconds, batch result dict).
    """
    banner = "=" * 60
    print("\n" + banner)
    print("ARXIV API METADATA FETCH TEST (for comparison)")
    print(banner)

    started = time.perf_counter()
    batch = await arxiv_svc.fetch_metadata_batch(TEST_IDS)
    arxiv_time = (time.perf_counter() - started) * 1000

    total = len(TEST_IDS)
    print(f"\n[Batch of {total}] -> {arxiv_time:.0f}ms")
    print(f" Found: {len(batch)}/{total}")
    for paper_id, info in batch.items():
        print(f" {paper_id}: {info['title'][:60]}... [{info['category']}]")

    return arxiv_time, batch
80
+
81
+
82
async def main():
    """
    Run both fetch tests, print a timing comparison, then check titles.

    For each ID in TEST_IDS the title check prints one of:
        [OK]      both sources returned the paper and the first 30
                  characters of the lower-cased titles agree
        [DIFF]    both sources returned the paper but the titles differ
        [MISSING] at least one source did not return the paper
    """
    turso_time, turso_batch = await test_turso()
    arxiv_time, arxiv_batch = await test_arxiv()

    print("\n" + "=" * 60)
    print("TIMING COMPARISON")
    print("=" * 60)
    print(f" Turso DB: {turso_time:>8.0f}ms ({len(turso_batch)} papers)")
    print(f" arXiv API: {arxiv_time:>8.0f}ms ({len(arxiv_batch)} papers)")
    # Guard the division: a zero elapsed time is reported as infinite speedup.
    speedup = arxiv_time / turso_time if turso_time > 0 else float("inf")
    print(f" Speedup: {speedup:.1f}x faster with Turso")
    print()

    # Verify data quality: compare titles
    print("DATA QUALITY CHECK (title match):")
    for aid in TEST_IDS:
        turso_paper = turso_batch.get(aid)
        arxiv_paper = arxiv_batch.get(aid)
        if not turso_paper or not arxiv_paper:
            # Without this branch a paper absent from both sources would
            # compare placeholder with placeholder and be reported as a match.
            match = "MISSING"
        else:
            t_title = (turso_paper.get("title") or "")[:50]
            a_title = (arxiv_paper.get("title") or "")[:50]
            match = "OK" if t_title.lower()[:30] == a_title.lower()[:30] else "DIFF"
        print(f" [{match}] {aid}")
102
+
103
+
104
+ if __name__ == "__main__":
105
+ asyncio.run(main())