unknown Claude Opus 4.6 committed on
Commit
854be79
·
1 Parent(s): 0c39e68

Add entity list API, author fetching, and search source decoupling

Browse files

Backend changes for the UI update:

- sqlite_db: add get_authors_for_papers() batch method, get_entity_list()
for browsing all entities by type, fix get_enrichment_stats() to use
COUNT(DISTINCT) for accurate unique entity counts
- models: add EntityListItem, authors + used_in_answer to SourcePaper,
authors to PaperSummary
- routes_analytics: add GET /analytics/{type}/list endpoint (methods,
datasets, tasks, topics) with limit param
- routes_papers: batch-fetch authors for browse results
- routes_search: batch-fetch authors for sources, use source_top_k=20
to return more relevant papers while keeping LLM context at top_k=5
- rag_engine: add source_top_k parameter to decouple generation context
from source list, mark sources with used_in_answer flag

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

frontend/tsconfig.tsbuildinfo DELETED
@@ -1 +0,0 @@
1
- {"root":["./src/app.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/components/cooccurrencetable.tsx","./src/components/emptystate.tsx","./src/components/erroralert.tsx","./src/components/growthchart.tsx","./src/components/layout.tsx","./src/components/spinner.tsx","./src/components/statcard.tsx","./src/components/topentitieschart.tsx","./src/components/trendexplorer.tsx","./src/components/venuechart.tsx","./src/lib/api.ts","./src/lib/hooks.ts","./src/lib/types.ts","./src/lib/utils.ts","./src/pages/browsepage.tsx","./src/pages/dashboardpage.tsx","./src/pages/paperpage.tsx","./src/pages/searchpage.tsx"],"version":"5.7.3"}
 
 
src/api/models.py CHANGED
@@ -46,6 +46,8 @@ class SourcePaper(BaseModel):
46
  year: int
47
  venue: str | None
48
  chunk_type: str
 
 
49
 
50
 
51
  class SearchResponse(BaseModel):
@@ -80,6 +82,7 @@ class PaperSummary(BaseModel):
80
  year: int | None
81
  venue: str | None
82
  url: str | None
 
83
 
84
 
85
  class PaperListResponse(BaseModel):
@@ -142,6 +145,13 @@ class GrowthPoint(BaseModel):
142
  growth_pct: float | None
143
 
144
 
 
 
 
 
 
 
 
145
  class HealthResponse(BaseModel):
146
  """System health status."""
147
 
 
46
  year: int
47
  venue: str | None
48
  chunk_type: str
49
+ authors: list[str] = Field(default_factory=list)
50
+ used_in_answer: bool = False
51
 
52
 
53
  class SearchResponse(BaseModel):
 
82
  year: int | None
83
  venue: str | None
84
  url: str | None
85
+ authors: list[str] = Field(default_factory=list)
86
 
87
 
88
  class PaperListResponse(BaseModel):
 
145
  growth_pct: float | None
146
 
147
 
148
class EntityListItem(BaseModel):
    """One distinct entity name together with the number of papers mentioning it."""

    # Entity display name (a method, dataset, task, or topic string).
    name: str
    # How many papers reference this entity.
    count: int
  class HealthResponse(BaseModel):
156
  """System health status."""
157
 
src/api/routes_analytics.py CHANGED
@@ -8,6 +8,7 @@ from src.api.deps import get_db
8
  from src.api.models import (
9
  CooccurrenceRow,
10
  EnrichmentStatsResponse,
 
11
  GrowthPoint,
12
  RankedEntity,
13
  TrendPoint,
@@ -119,6 +120,25 @@ def top_topics(
119
  ]
120
 
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  # ── Co-occurrence endpoints ──────────────────────────────────────────
123
 
124
 
 
8
  from src.api.models import (
9
  CooccurrenceRow,
10
  EnrichmentStatsResponse,
11
+ EntityListItem,
12
  GrowthPoint,
13
  RankedEntity,
14
  TrendPoint,
 
120
  ]
121
 
122
 
123
# ── Entity list endpoints ────────────────────────────────────────────

VALID_ENTITY_TYPES = {"methods", "datasets", "tasks", "topics"}


@router.get("/{entity_type}/list", response_model=list[EntityListItem])
def entity_list(
    entity_type: str,
    limit: int = Query(default=500, ge=1, le=1000),
    db: SQLiteDB = Depends(get_db),
):
    """List all unique entities of a type with their paper counts.

    Args:
        entity_type: One of "methods", "datasets", "tasks", "topics".
        limit: Maximum number of entities to return (1-1000, default 500).

    Raises:
        HTTPException: 404 when entity_type is not a recognized type.
    """
    if entity_type not in VALID_ENTITY_TYPES:
        # Imported locally: only part of this module is visible here, so we
        # cannot assume HTTPException is imported at the top of the file.
        from fastapi import HTTPException

        raise HTTPException(status_code=404, detail=f"Unknown entity type: {entity_type}")
    try:
        rows = db.get_entity_list(entity_type, limit=limit)
    except ValueError as exc:
        # Defensive: if VALID_ENTITY_TYPES ever drifts from the DB-side
        # whitelist (_ENTITY_TABLE_MAP), surface a 404 rather than a 500.
        from fastapi import HTTPException

        raise HTTPException(status_code=404, detail=str(exc)) from exc
    return [EntityListItem(**r) for r in rows]
140
+
141
+
142
  # ── Co-occurrence endpoints ──────────────────────────────────────────
143
 
144
 
src/api/routes_papers.py CHANGED
@@ -41,6 +41,13 @@ def browse_papers(
41
  )
42
  total = db.count_papers(**filter_kwargs)
43
  rows = db.browse_papers(**filter_kwargs, limit=req.limit, offset=req.offset)
 
 
 
 
 
 
 
44
  papers = [PaperSummary(**r) for r in rows]
45
  return PaperListResponse(
46
  papers=papers, count=total, limit=req.limit, offset=req.offset,
 
41
  )
42
  total = db.count_papers(**filter_kwargs)
43
  rows = db.browse_papers(**filter_kwargs, limit=req.limit, offset=req.offset)
44
+
45
+ # Batch-fetch authors for all papers in this page
46
+ paper_ids = [r["id"] for r in rows]
47
+ authors_map = db.get_authors_for_papers(paper_ids)
48
+ for r in rows:
49
+ r["authors"] = authors_map.get(r["id"], [])
50
+
51
  papers = [PaperSummary(**r) for r in rows]
52
  return PaperListResponse(
53
  papers=papers, count=total, limit=req.limit, offset=req.offset,
src/api/routes_search.py CHANGED
@@ -4,10 +4,11 @@ import logging
4
 
5
  from fastapi import APIRouter, Depends, Request
6
 
7
- from src.api.deps import get_rag_engine
8
  from src.api.models import SearchRequest, SearchResponse, SourcePaper
9
  from src.api.rate_limit import search_limiter
10
  from src.generation.rag_engine import RAGEngine
 
11
 
12
  logger = logging.getLogger(__name__)
13
 
@@ -19,6 +20,7 @@ def search(
19
  request: Request,
20
  req: SearchRequest,
21
  engine: RAGEngine = Depends(get_rag_engine),
 
22
  ):
23
  """Answer a research question using RAG over the paper corpus."""
24
  search_limiter.check(request)
@@ -43,7 +45,15 @@ def search(
43
  if where is None and filters:
44
  where = filters
45
 
46
- response = engine.query(question=req.query, top_k=req.top_k, where=where)
 
 
 
 
 
 
 
 
47
 
48
  return SearchResponse(
49
  answer=response.answer,
 
4
 
5
  from fastapi import APIRouter, Depends, Request
6
 
7
+ from src.api.deps import get_db, get_rag_engine
8
  from src.api.models import SearchRequest, SearchResponse, SourcePaper
9
  from src.api.rate_limit import search_limiter
10
  from src.generation.rag_engine import RAGEngine
11
+ from src.storage.sqlite_db import SQLiteDB
12
 
13
  logger = logging.getLogger(__name__)
14
 
 
20
  request: Request,
21
  req: SearchRequest,
22
  engine: RAGEngine = Depends(get_rag_engine),
23
+ db: SQLiteDB = Depends(get_db),
24
  ):
25
  """Answer a research question using RAG over the paper corpus."""
26
  search_limiter.check(request)
 
45
  if where is None and filters:
46
  where = filters
47
 
48
+ response = engine.query(
49
+ question=req.query, top_k=5, source_top_k=20, where=where,
50
+ )
51
+
52
+ # Batch-fetch authors for all source papers
53
+ paper_ids = [s["paper_id"] for s in response.sources]
54
+ authors_map = db.get_authors_for_papers(paper_ids)
55
+ for s in response.sources:
56
+ s["authors"] = authors_map.get(s["paper_id"], [])
57
 
58
  return SearchResponse(
59
  answer=response.answer,
src/generation/rag_engine.py CHANGED
@@ -74,26 +74,33 @@ class RAGEngine:
74
  self,
75
  question: str,
76
  top_k: int = 5,
 
77
  where: dict | None = None,
78
  ) -> RAGResponse:
79
  """Answer a question using retrieval-augmented generation.
80
 
81
  Args:
82
  question: The user's natural-language question.
83
- top_k: Number of chunks to retrieve as context.
 
 
84
  where: Optional metadata filter for retrieval (e.g., year, venue).
85
 
86
  Returns:
87
  RAGResponse with the answer, source papers, and metadata.
88
  """
89
- logger.info("RAG query: %r (top_k=%d)", question, top_k)
90
 
91
- # Step 1: Retrieve relevant chunks
92
- results = self.pipeline.search(query=question, top_k=top_k, where=where)
93
  logger.info("Retrieved %d chunks", len(results))
94
 
95
- # Step 2: Format context
96
- context = format_context(results)
 
 
 
 
97
 
98
  # Step 3: Build prompt and generate
99
  prompt = build_prompt(question, context)
@@ -116,7 +123,7 @@ class RAGEngine:
116
  usage={},
117
  )
118
 
119
- # Step 4: Build source list (deduplicated by paper_id)
120
  seen_papers: set[str] = set()
121
  sources = []
122
  for r in results:
@@ -128,6 +135,7 @@ class RAGEngine:
128
  "year": r.year,
129
  "venue": r.venue,
130
  "chunk_type": r.chunk_type,
 
131
  })
132
 
133
  return RAGResponse(
 
74
  self,
75
  question: str,
76
  top_k: int = 5,
77
+ source_top_k: int = 20,
78
  where: dict | None = None,
79
  ) -> RAGResponse:
80
  """Answer a question using retrieval-augmented generation.
81
 
82
  Args:
83
  question: The user's natural-language question.
84
+ top_k: Number of chunks used as LLM generation context.
85
+ source_top_k: Number of chunks to retrieve for the source list
86
+ (returns more papers than used for generation).
87
  where: Optional metadata filter for retrieval (e.g., year, venue).
88
 
89
  Returns:
90
  RAGResponse with the answer, source papers, and metadata.
91
  """
92
+ logger.info("RAG query: %r (top_k=%d, source_top_k=%d)", question, top_k, source_top_k)
93
 
94
+ # Step 1: Retrieve relevant chunks (more than needed for generation)
95
+ results = self.pipeline.search(query=question, top_k=source_top_k, where=where)
96
  logger.info("Retrieved %d chunks", len(results))
97
 
98
+ # Step 2: Format context from top_k chunks only (for LLM prompt)
99
+ context_results = results[:top_k]
100
+ context = format_context(context_results)
101
+
102
+ # Track which papers were used for generation context
103
+ context_paper_ids = {r.paper_id for r in context_results}
104
 
105
  # Step 3: Build prompt and generate
106
  prompt = build_prompt(question, context)
 
123
  usage={},
124
  )
125
 
126
+ # Step 4: Build source list from ALL results (deduplicated by paper_id)
127
  seen_papers: set[str] = set()
128
  sources = []
129
  for r in results:
 
135
  "year": r.year,
136
  "venue": r.venue,
137
  "chunk_type": r.chunk_type,
138
+ "used_in_answer": r.paper_id in context_paper_ids,
139
  })
140
 
141
  return RAGResponse(
src/storage/sqlite_db.py CHANGED
@@ -537,14 +537,22 @@ class SQLiteDB:
537
  conn.close()
538
 
539
  def get_enrichment_stats(self) -> dict:
540
- """Get counts of enriched entities."""
541
  conn = self.get_connection()
542
  try:
543
  paper_count = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0]
544
- method_count = conn.execute("SELECT COUNT(*) FROM methods").fetchone()[0]
545
- dataset_count = conn.execute("SELECT COUNT(*) FROM datasets").fetchone()[0]
546
- task_count = conn.execute("SELECT COUNT(*) FROM tasks").fetchone()[0]
547
- topic_count = conn.execute("SELECT COUNT(*) FROM topics").fetchone()[0]
 
 
 
 
 
 
 
 
548
  papers_with_methods = conn.execute(
549
  "SELECT COUNT(DISTINCT paper_id) FROM methods"
550
  ).fetchone()[0]
@@ -559,6 +567,63 @@ class SQLiteDB:
559
  finally:
560
  conn.close()
561
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
  # ── Co-occurrence analytics ───────────────────────────────────────
563
 
564
  def method_dataset_cooccurrence(self, top_n: int = 20) -> list[dict]:
 
537
  conn.close()
538
 
539
  def get_enrichment_stats(self) -> dict:
540
+ """Get counts of unique enriched entities (distinct names)."""
541
  conn = self.get_connection()
542
  try:
543
  paper_count = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0]
544
+ method_count = conn.execute(
545
+ "SELECT COUNT(DISTINCT method_name) FROM methods"
546
+ ).fetchone()[0]
547
+ dataset_count = conn.execute(
548
+ "SELECT COUNT(DISTINCT dataset_name) FROM datasets"
549
+ ).fetchone()[0]
550
+ task_count = conn.execute(
551
+ "SELECT COUNT(DISTINCT task_name) FROM tasks"
552
+ ).fetchone()[0]
553
+ topic_count = conn.execute(
554
+ "SELECT COUNT(DISTINCT topic_name) FROM topics"
555
+ ).fetchone()[0]
556
  papers_with_methods = conn.execute(
557
  "SELECT COUNT(DISTINCT paper_id) FROM methods"
558
  ).fetchone()[0]
 
567
  finally:
568
  conn.close()
569
 
570
def get_authors_for_papers(self, paper_ids: list[str]) -> dict[str, list[str]]:
    """Batch-fetch authors for multiple papers.

    Args:
        paper_ids: Paper ids to look up; duplicates are tolerated.

    Returns:
        Dict mapping paper_id -> list of author names (ordered by position).
        Papers with no author rows are absent from the result.
    """
    if not paper_ids:
        return {}
    # Deduplicate while preserving order; the result is keyed by id, so
    # duplicate ids would only waste placeholders.
    unique_ids = list(dict.fromkeys(paper_ids))
    # Chunk the IN (...) list: SQLite's default host-parameter limit is 999
    # (SQLITE_MAX_VARIABLE_NUMBER) in older builds, so a very large id list
    # would otherwise raise "too many SQL variables".
    max_vars = 900
    result: dict[str, list[str]] = {}
    conn = self.get_connection()
    try:
        for start in range(0, len(unique_ids), max_vars):
            chunk = unique_ids[start : start + max_vars]
            placeholders = ",".join("?" * len(chunk))
            rows = conn.execute(
                f"SELECT paper_id, name FROM authors "
                f"WHERE paper_id IN ({placeholders}) "
                f"ORDER BY paper_id, position",
                chunk,
            ).fetchall()
            # Rows arrive ordered by position within each paper, so appending
            # preserves author order; each paper's rows live in one chunk.
            for row in rows:
                result.setdefault(row["paper_id"], []).append(row["name"])
        return result
    finally:
        conn.close()
593
+
594
# Whitelist of browsable entity types: type -> (table name, name column).
_ENTITY_TABLE_MAP = {
    "methods": ("methods", "method_name"),
    "datasets": ("datasets", "dataset_name"),
    "tasks": ("tasks", "task_name"),
    "topics": ("topics", "topic_name"),
}

def get_entity_list(
    self, entity_type: str, limit: int = 500
) -> list[dict]:
    """Return every distinct entity name of a type with its paper count.

    Args:
        entity_type: One of "methods", "datasets", "tasks", "topics".
        limit: Maximum entries to return.

    Returns:
        List of dicts with keys: name, count. Sorted by count descending.

    Raises:
        ValueError: If entity_type is not a known type.
    """
    if entity_type not in self._ENTITY_TABLE_MAP:
        raise ValueError(f"Unknown entity type: {entity_type}")
    table, col = self._ENTITY_TABLE_MAP[entity_type]
    # Table and column names come from the whitelist above, so the f-string
    # interpolation below cannot inject arbitrary SQL.
    query = (
        f"SELECT {col} AS name, COUNT(*) AS count "
        f"FROM {table} GROUP BY {col} ORDER BY count DESC LIMIT ?"
    )
    conn = self.get_connection()
    try:
        fetched = conn.execute(query, (limit,)).fetchall()
        return list(map(dict, fetched))
    finally:
        conn.close()
626
+
627
  # ── Co-occurrence analytics ───────────────────────────────────────
628
 
629
  def method_dataset_cooccurrence(self, top_n: int = 20) -> list[dict]: