Spaces:

siddhm11
/

ResearchIT

Sleeping

App Files Files Community

siddhm11 committed on Apr 20

Commit

12d7e78

1 Parent(s): b4d17db

Phase 3.5: Wire Turso DB for metadata (2.9x faster, includes citations)

Browse files

Files changed (4) hide show

app/config.py +4 -0
app/routers/search.py +21 -9
app/turso_svc.py +200 -0
tests/test_turso_timing.py +105 -0

app/config.py CHANGED Viewed

@@ -20,6 +20,10 @@ ARXIV_API_URL = "https://export.arxiv.org/api/query"
 ARXIV_MAX_RESULTS = 10          # results per search page
 METADATA_CACHE_TTL_DAYS = 30    # re-fetch metadata after this many days
 # ── Recommendation settings ───────────────────────────────────────────────────
 REC_LIMIT = 10                  # how many recommendations to show
 REC_POSITIVE_LIMIT = 20         # max positive examples sent to Qdrant

 ARXIV_MAX_RESULTS = 10          # results per search page
 METADATA_CACHE_TTL_DAYS = 30    # re-fetch metadata after this many days
+# ── Turso (libSQL) — arXiv metadata DB — Phase 3.5 ───────────────────────────
+TURSO_URL = os.getenv("TURSO_URL", "")
+TURSO_DB_TOKEN = os.getenv("TURSO_DB_TOKEN", "")
 # ── Recommendation settings ───────────────────────────────────────────────────
 REC_LIMIT = 10                  # how many recommendations to show
 REC_POSITIVE_LIMIT = 20         # max positive examples sent to Qdrant

app/routers/search.py CHANGED Viewed

@@ -7,11 +7,14 @@ GET /search?q=<query>
 Phase 3 replaces the arXiv keyword API with:
   LLM rewrite → BGE-M3 encode → Qdrant dense + Zilliz sparse → RRF → rerank
 """
 import uuid
 from fastapi import APIRouter, Request, Cookie
 from fastapi.responses import HTMLResponse
-from app import arxiv_svc, user_state as us, hybrid_search_svc
 from app.config import COOKIE_NAME, ARXIV_MAX_RESULTS
 from app.templates_env import templates
@@ -34,18 +37,27 @@ async def search(
             arxiv_ids = []
         if arxiv_ids:
-            # Fetch metadata for the ranked results
             try:
-                meta = await arxiv_svc.fetch_metadata_batch(arxiv_ids)
-                # Preserve ranking order from hybrid search
-                papers = [meta[aid] for aid in arxiv_ids if aid in meta]
             except Exception as e:
-                # arXiv API timeout — fall back to keyword search
-                print(f"[search] Metadata fetch failed ({e}), falling back to arXiv API")
-                papers = []
         if not papers and q.strip():
-            # Fallback: arXiv keyword API if hybrid returns nothing or metadata failed
             try:
                 papers = await arxiv_svc.search(q.strip())
             except Exception as e:

 Phase 3 replaces the arXiv keyword API with:
   LLM rewrite → BGE-M3 encode → Qdrant dense + Zilliz sparse → RRF → rerank
+Phase 3.5: Metadata now fetched from Turso cloud DB (fast, includes citations)
+  with arXiv API as fallback for papers not in Turso.
 """
 import uuid
 from fastapi import APIRouter, Request, Cookie
 from fastapi.responses import HTMLResponse
+from app import arxiv_svc, turso_svc, user_state as us, hybrid_search_svc
 from app.config import COOKIE_NAME, ARXIV_MAX_RESULTS
 from app.templates_env import templates
             arxiv_ids = []
         if arxiv_ids:
+            # Phase 3.5: Fetch metadata from Turso DB first (fast, ~50ms)
             try:
+                meta = await turso_svc.fetch_metadata_batch(arxiv_ids)
             except Exception as e:
+                print(f"[search] Turso metadata fetch failed: {e}")
+                meta = {}
+            # Fallback: fetch any missing IDs from arXiv API
+            missing = [aid for aid in arxiv_ids if aid not in meta]
+            if missing:
+                try:
+                    arxiv_meta = await arxiv_svc.fetch_metadata_batch(missing)
+                    meta.update(arxiv_meta)
+                except Exception as e:
+                    print(f"[search] arXiv fallback for {len(missing)} IDs failed: {e}")
+            # Preserve ranking order from hybrid search
+            papers = [meta[aid] for aid in arxiv_ids if aid in meta]
         if not papers and q.strip():
+            # Fallback: arXiv keyword API if hybrid returns nothing
             try:
                 papers = await arxiv_svc.search(q.strip())
             except Exception as e:

app/turso_svc.py ADDED Viewed

	@@ -0,0 +1,200 @@

+"""
+Turso (libSQL) metadata service — Phase 3.5.
+Replaces arxiv_svc.fetch_metadata_batch() with direct Turso DB lookups.
+Uses Turso's HTTP pipeline API — no additional Python dependencies needed
+(just httpx, already installed).
+The DB contains ~1.6M arXiv papers with metadata + citation counts from
+Semantic Scholar, bulk-loaded from Kaggle.
+Connection: TURSO_URL + TURSO_DB_TOKEN (env vars)
+Table:      papers (arxiv_id UNIQUE INDEX)
+"""
+from __future__ import annotations
+import json
+import time
+import httpx
+from app import config
+# ── Public API ───────────────────────────────────────────────────────────────
async def fetch_metadata(arxiv_id: str) -> dict | None:
    """
    Look up a single paper in Turso.

    Delegates to fetch_metadata_batch() with a one-element list and returns
    the entry stored under ``arxiv_id``, or None when the batch result has
    no such key.
    """
    papers_by_id = await fetch_metadata_batch([arxiv_id])
    return papers_by_id.get(arxiv_id, None)
def _pipeline_base_url(url: str) -> str:
    """Return the ``https://`` base URL of Turso's HTTP API for a configured DB URL."""
    base = url.strip().rstrip("/")
    # Drop whatever scheme was configured (libsql://, http://, https://, ...)
    # by splitting on the separator. str.lstrip("https://") must not be used
    # for this: it strips a *set of characters*, so a host such as
    # "test-db.turso.io" would silently lose its leading "t".
    if "://" in base:
        base = base.split("://", 1)[1]
    return "https://" + base


def _decode_rows(data: dict) -> list[dict]:
    """Turn the first (execute) result of a pipeline response into {column: value} dicts."""
    results = data.get("results") or []
    if not results:
        return []
    # First result is our execute response (the second one is the "close").
    execute_result = results[0]
    if execute_result.get("type") == "error":
        print(f"[turso] Query error: {execute_result.get('error')}")
        return []
    result_data = execute_result.get("response", {}).get("result", {})
    cols = [c["name"] for c in result_data.get("cols", [])]
    decoded = []
    for row in result_data.get("rows", []):
        # Each cell is {"type": "text"|"integer"|"null", "value": ...}.
        # zip() tolerates a row that is shorter than the column list.
        decoded.append(
            {
                col: None if cell.get("type") == "null" else cell.get("value", "")
                for col, cell in zip(cols, row)
            }
        )
    return decoded


async def fetch_metadata_batch(arxiv_ids: list[str]) -> dict[str, dict]:
    """
    Fetch metadata for multiple papers from Turso DB.

    Returns {arxiv_id: paper_dict} for all IDs found; IDs that are not in
    the DB are simply absent from the result. Paper dict has keys:
    arxiv_id, title, abstract, authors, category, published, year,
    citation_count, influential_citations.

    Uses Turso HTTP pipeline API — single HTTP request for all IDs.
    Never raises for transport or parsing problems: every failure is logged
    with a "[turso]" prefix and reported as an empty dict so the caller can
    fall back to another metadata source.
    """
    if not arxiv_ids:
        return {}
    url = config.TURSO_URL
    token = config.TURSO_DB_TOKEN
    if not url or not token:
        print("[turso] TURSO_URL or TURSO_DB_TOKEN not configured, skipping")
        return {}

    # De-duplicate (order preserved) so repeated IDs do not inflate the
    # IN (...) list and the number of bound parameters.
    unique_ids = list(dict.fromkeys(arxiv_ids))

    # Build parameterised query with placeholders
    placeholders = ", ".join("?" for _ in unique_ids)
    sql = (
        "SELECT arxiv_id, title, authors, categories, primary_topic, "
        "update_date, abstract_preview, citation_count, influential_citations "
        f"FROM papers WHERE arxiv_id IN ({placeholders})"
    )
    args = [{"type": "text", "value": aid} for aid in unique_ids]

    payload = {
        "requests": [
            {
                "type": "execute",
                "stmt": {"sql": sql, "args": args},
            },
            {"type": "close"},
        ]
    }
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }

    t0 = time.perf_counter()
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.post(
                f"{_pipeline_base_url(url)}/v2/pipeline",
                json=payload,
                headers=headers,
            )
            resp.raise_for_status()
    except Exception as e:
        print(f"[turso] HTTP request failed: {e}")
        return {}
    elapsed_ms = (time.perf_counter() - t0) * 1000
    print(f"[turso] Fetched metadata for {len(unique_ids)} IDs in {elapsed_ms:.0f}ms")

    try:
        # resp.json() raises ValueError on a non-JSON body; a JSON body that
        # is not shaped like a pipeline response surfaces as one of the others.
        rows = _decode_rows(resp.json())
    except (ValueError, AttributeError, KeyError, IndexError, TypeError) as e:
        print(f"[turso] Response parsing error: {e}")
        return {}

    # Convert rows to paper dicts matching the expected format
    output: dict[str, dict] = {}
    for values in rows:
        paper = _to_paper_dict(values)
        if paper:
            output[paper["arxiv_id"]] = paper
    return output
def _safe_int(value) -> int:
    """Coerce a Turso cell value to int; None, blanks and junk become 0."""
    try:
        return int(value or 0)
    except (ValueError, TypeError):
        pass
    try:
        # Numeric text such as "12.0" is rejected by int() but is still a count.
        return int(float(value))
    except (ValueError, TypeError, OverflowError):
        return 0


def _to_paper_dict(row: dict) -> dict | None:
    """
    Convert a Turso row into the paper dict format expected by templates.

    Template expects:
      arxiv_id, title, abstract, authors (JSON string), category, published, year
    Turso provides:
      arxiv_id, title, authors (comma-sep), categories, primary_topic,
      update_date, abstract_preview, citation_count, influential_citations

    Returns None when the row has no arxiv_id. Every other field degrades to
    an empty string / 0 instead of raising, so one odd row cannot break a
    whole batch.
    """
    arxiv_id = row.get("arxiv_id")
    if not arxiv_id:
        return None

    # Authors: keep a value that already is a valid JSON array, otherwise
    # treat it as comma-separated text (first 5 names).
    authors_raw = str(row.get("authors") or "").strip()
    authors_json = None
    if authors_raw.startswith("["):
        try:
            if isinstance(json.loads(authors_raw), list):
                authors_json = authors_raw
        except ValueError:
            # Looks like JSON but is not — fall through to the text path so
            # the result is still a parseable JSON string.
            pass
    if authors_json is None:
        author_list = [a.strip() for a in authors_raw.split(",") if a.strip()][:5]
        authors_json = json.dumps(author_list)

    # Use primary_topic as category, fall back to first in categories list.
    # split() on a whitespace-only value yields [], so guard before indexing.
    category = str(row.get("primary_topic") or "").strip()
    if not category:
        cats = str(row.get("categories") or "").split()
        category = cats[0] if cats else ""

    # Extract year from update_date (YYYY-MM-DD format)
    update_date = str(row.get("update_date") or "")
    year = 0
    if len(update_date) >= 4:
        try:
            year = int(update_date[:4])
        except ValueError:
            pass

    return {
        "arxiv_id": arxiv_id,
        # Titles/abstracts may be hard-wrapped ("\n  ", "\r\n"); collapse
        # every whitespace run to a single space.
        "title": " ".join(str(row.get("title") or "").split()),
        "abstract": " ".join(str(row.get("abstract_preview") or "").split()),
        "authors": authors_json,
        "category": category,
        "published": update_date,
        "year": year,
        # Citation counts (bonus data from Semantic Scholar)
        "citation_count": _safe_int(row.get("citation_count")),
        "influential_citations": _safe_int(row.get("influential_citations")),
    }

tests/test_turso_timing.py ADDED Viewed

	@@ -0,0 +1,105 @@

+"""
+Test script: Compare Turso DB vs arXiv API metadata fetch times.
+Run: python -m tests.test_turso_timing
+"""
+import asyncio
+import time
+import sys
+import os
+# Ensure app module is importable
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from app import turso_svc, arxiv_svc
+# Sample arxiv IDs (known papers from our vector DBs)
+TEST_IDS = [
+    "1706.03762",   # Attention Is All You Need
+    "2206.03003",   # Transformer attention medical
+    "2209.15001",   # Dilated Neighborhood Attention Transformer
+    "1809.04281",   # Music Transformer
+    "2010.11929",   # ViT - Vision Transformer
+    "1810.04805",   # BERT
+    "2005.14165",   # GPT-3
+    "2302.13971",   # LLaMA
+    "1512.03385",   # ResNet
+    "2103.00020",   # CLIP
+]
async def test_turso():
    """
    Time Turso lookups and print what came back.

    Runs one single-ID fetch (first entry of TEST_IDS) and one batch fetch of
    all TEST_IDS. Returns ``(batch_elapsed_ms, batch_result)``.
    """
    banner = "=" * 60
    print(banner)
    print("TURSO DB METADATA FETCH TEST")
    print(banner)

    # --- single paper ------------------------------------------------------
    first_id = TEST_IDS[0]
    started = time.perf_counter()
    result = await turso_svc.fetch_metadata(first_id)
    finished = time.perf_counter()
    print(f"\n[Single] {first_id} -> {(finished-started)*1000:.0f}ms")
    if not result:
        print("  NOT FOUND in Turso DB")
    else:
        print(f"  Title:     {result['title'][:80]}")
        print(f"  Authors:   {result['authors'][:80]}")
        print(f"  Category:  {result['category']}")
        print(f"  Published: {result['published']}")
        print(f"  Year:      {result['year']}")
        print(f"  Citations: {result.get('citation_count', 'N/A')}")
        print(f"  Influential: {result.get('influential_citations', 'N/A')}")

    # --- whole batch -------------------------------------------------------
    started = time.perf_counter()
    batch = await turso_svc.fetch_metadata_batch(TEST_IDS)
    finished = time.perf_counter()
    turso_time = (finished - started) * 1000
    print(f"\n[Batch of {len(TEST_IDS)}] -> {turso_time:.0f}ms")
    print(f"  Found: {len(batch)}/{len(TEST_IDS)}")
    for aid in batch:
        paper = batch[aid]
        cites = paper.get("citation_count", 0)
        print(f"  {aid}: {paper['title'][:60]}... [{paper['category']}] (cites: {cites})")
    return turso_time, batch
async def test_arxiv():
    """
    Time one arXiv API batch fetch of TEST_IDS, for comparison with Turso.

    Prints the elapsed time, the hit count and one line per returned paper.
    Returns ``(elapsed_ms, batch_result)``.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("ARXIV API METADATA FETCH TEST (for comparison)")
    print(banner)

    started = time.perf_counter()
    batch = await arxiv_svc.fetch_metadata_batch(TEST_IDS)
    finished = time.perf_counter()
    arxiv_time = (finished - started) * 1000

    print(f"\n[Batch of {len(TEST_IDS)}] -> {arxiv_time:.0f}ms")
    print(f"  Found: {len(batch)}/{len(TEST_IDS)}")
    for aid in batch:
        paper = batch[aid]
        print(f"  {aid}: {paper['title'][:60]}... [{paper['category']}]")
    return arxiv_time, batch
def _title_status(turso_title, arxiv_title) -> str:
    """Classify a title comparison as "OK", "DIFF" or "MISSING"."""
    # A paper absent from either source must not count as a match; comparing
    # two placeholder strings would otherwise report a false "OK".
    if not turso_title or not arxiv_title:
        return "MISSING"
    same = turso_title[:50].lower()[:30] == arxiv_title[:50].lower()[:30]
    return "OK" if same else "DIFF"


async def main():
    """
    Run the Turso and arXiv timing checks, then print a comparison.

    Prints the batch timings, the arXiv/Turso speedup (only when both sources
    actually returned papers) and a per-ID title check across TEST_IDS.
    """
    turso_time, turso_batch = await test_turso()
    arxiv_time, arxiv_batch = await test_arxiv()
    print("\n" + "=" * 60)
    print("TIMING COMPARISON")
    print("=" * 60)
    print(f"  Turso DB:   {turso_time:>8.0f}ms ({len(turso_batch)} papers)")
    print(f"  arXiv API:  {arxiv_time:>8.0f}ms ({len(arxiv_batch)} papers)")
    if turso_batch and arxiv_batch and turso_time > 0:
        speedup = arxiv_time / turso_time
        print(f"  Speedup:    {speedup:.1f}x faster with Turso")
    else:
        # An empty result returns almost instantly (e.g. Turso not
        # configured), which would make the ratio meaningless.
        print("  Speedup:    n/a (a source returned no papers)")
    print()
    # Verify data quality: compare titles
    print("DATA QUALITY CHECK (title match):")
    for aid in TEST_IDS:
        t_title = turso_batch.get(aid, {}).get("title")
        a_title = arxiv_batch.get(aid, {}).get("title")
        print(f"  [{_title_status(t_title, a_title)}] {aid}")


if __name__ == "__main__":
    asyncio.run(main())