unknown Claude Opus 4.6 committed on
Commit
854be79
·
1 Parent(s): 0c39e68

Add entity list API, author fetching, and search source decoupling

Browse files

Backend changes for the UI update:

- sqlite_db: add get_authors_for_papers() batch method, get_entity_list()
for browsing all entities by type, fix get_enrichment_stats() to use
COUNT(DISTINCT) for accurate unique entity counts
- models: add EntityListItem, authors + used_in_answer to SourcePaper,
authors to PaperSummary
- routes_analytics: add GET /analytics/{type}/list endpoint (methods,
datasets, tasks, topics) with limit param
- routes_papers: batch-fetch authors for browse results
- routes_search: batch-fetch authors for sources, use source_top_k=20
to return more relevant papers while keeping LLM context at top_k=5
- rag_engine: add source_top_k parameter to decouple generation context
from source list, mark sources with used_in_answer flag

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

frontend/tsconfig.tsbuildinfo DELETED
@@ -1 +0,0 @@
1
- {"root":["./src/app.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/components/cooccurrencetable.tsx","./src/components/emptystate.tsx","./src/components/erroralert.tsx","./src/components/growthchart.tsx","./src/components/layout.tsx","./src/components/spinner.tsx","./src/components/statcard.tsx","./src/components/topentitieschart.tsx","./src/components/trendexplorer.tsx","./src/components/venuechart.tsx","./src/lib/api.ts","./src/lib/hooks.ts","./src/lib/types.ts","./src/lib/utils.ts","./src/pages/browsepage.tsx","./src/pages/dashboardpage.tsx","./src/pages/paperpage.tsx","./src/pages/searchpage.tsx"],"version":"5.7.3"}
 
 
src/api/models.py CHANGED
@@ -46,6 +46,8 @@ class SourcePaper(BaseModel):
46
  year: int
47
  venue: str | None
48
  chunk_type: str
 
 
49
 
50
 
51
  class SearchResponse(BaseModel):
@@ -80,6 +82,7 @@ class PaperSummary(BaseModel):
80
  year: int | None
81
  venue: str | None
82
  url: str | None
 
83
 
84
 
85
  class PaperListResponse(BaseModel):
@@ -142,6 +145,13 @@ class GrowthPoint(BaseModel):
142
  growth_pct: float | None
143
 
144
 
 
 
 
 
 
 
 
145
  class HealthResponse(BaseModel):
146
  """System health status."""
147
 
 
46
  year: int
47
  venue: str | None
48
  chunk_type: str
49
+ authors: list[str] = Field(default_factory=list)
50
+ used_in_answer: bool = False
51
 
52
 
53
  class SearchResponse(BaseModel):
 
82
  year: int | None
83
  venue: str | None
84
  url: str | None
85
+ authors: list[str] = Field(default_factory=list)
86
 
87
 
88
  class PaperListResponse(BaseModel):
 
145
  growth_pct: float | None
146
 
147
 
148
class EntityListItem(BaseModel):
    """One distinct entity name together with the number of papers mentioning it."""

    # Entity display name (a method, dataset, task, or topic string).
    name: str
    # How many papers reference this entity.
    count: int
  class HealthResponse(BaseModel):
156
  """System health status."""
157
 
src/api/routes_analytics.py CHANGED
@@ -8,6 +8,7 @@ from src.api.deps import get_db
8
  from src.api.models import (
9
  CooccurrenceRow,
10
  EnrichmentStatsResponse,
 
11
  GrowthPoint,
12
  RankedEntity,
13
  TrendPoint,
@@ -119,6 +120,25 @@ def top_topics(
119
  ]
120
 
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  # ── Co-occurrence endpoints ──────────────────────────────────────────
123
 
124
 
 
8
  from src.api.models import (
9
  CooccurrenceRow,
10
  EnrichmentStatsResponse,
11
+ EntityListItem,
12
  GrowthPoint,
13
  RankedEntity,
14
  TrendPoint,
 
120
  ]
121
 
122
 
123
# ── Entity list endpoints ────────────────────────────────────────────

VALID_ENTITY_TYPES = {"methods", "datasets", "tasks", "topics"}


@router.get("/{entity_type}/list", response_model=list[EntityListItem])
def entity_list(
    entity_type: str,
    limit: int = Query(default=500, ge=1, le=1000),
    db: SQLiteDB = Depends(get_db),
):
    """List all unique entities of a type with their paper counts.

    Args:
        entity_type: One of "methods", "datasets", "tasks", "topics".
        limit: Maximum number of entities to return (1-1000, default 500).

    Raises:
        HTTPException: 404 when entity_type is not a recognized type.
    """
    if entity_type not in VALID_ENTITY_TYPES:
        # Imported locally: only part of this module is visible here, so we
        # cannot assume HTTPException is imported at the top of the file.
        from fastapi import HTTPException

        raise HTTPException(status_code=404, detail=f"Unknown entity type: {entity_type}")
    try:
        rows = db.get_entity_list(entity_type, limit=limit)
    except ValueError as exc:
        # Defensive: if VALID_ENTITY_TYPES ever drifts from the DB-side
        # whitelist (_ENTITY_TABLE_MAP), surface a 404 rather than a 500.
        from fastapi import HTTPException

        raise HTTPException(status_code=404, detail=str(exc)) from exc
    return [EntityListItem(**r) for r in rows]
140
+
141
+
142
  # ── Co-occurrence endpoints ──────────────────────────────────────────
143
 
144
 
src/api/routes_papers.py CHANGED
@@ -41,6 +41,13 @@ def browse_papers(
41
  )
42
  total = db.count_papers(**filter_kwargs)
43
  rows = db.browse_papers(**filter_kwargs, limit=req.limit, offset=req.offset)
 
 
 
 
 
 
 
44
  papers = [PaperSummary(**r) for r in rows]
45
  return PaperListResponse(
46
  papers=papers, count=total, limit=req.limit, offset=req.offset,
 
41
  )
42
  total = db.count_papers(**filter_kwargs)
43
  rows = db.browse_papers(**filter_kwargs, limit=req.limit, offset=req.offset)
44
+
45
+ # Batch-fetch authors for all papers in this page
46
+ paper_ids = [r["id"] for r in rows]
47
+ authors_map = db.get_authors_for_papers(paper_ids)
48
+ for r in rows:
49
+ r["authors"] = authors_map.get(r["id"], [])
50
+
51
  papers = [PaperSummary(**r) for r in rows]
52
  return PaperListResponse(
53
  papers=papers, count=total, limit=req.limit, offset=req.offset,
src/api/routes_search.py CHANGED
@@ -4,10 +4,11 @@ import logging
4
 
5
  from fastapi import APIRouter, Depends, Request
6
 
7
- from src.api.deps import get_rag_engine
8
  from src.api.models import SearchRequest, SearchResponse, SourcePaper
9
  from src.api.rate_limit import search_limiter
10
  from src.generation.rag_engine import RAGEngine
 
11
 
12
  logger = logging.getLogger(__name__)
13
 
@@ -19,6 +20,7 @@ def search(
19
  request: Request,
20
  req: SearchRequest,
21
  engine: RAGEngine = Depends(get_rag_engine),
 
22
  ):
23
  """Answer a research question using RAG over the paper corpus."""
24
  search_limiter.check(request)
@@ -43,7 +45,15 @@ def search(
43
  if where is None and filters:
44
  where = filters
45
 
46
- response = engine.query(question=req.query, top_k=req.top_k, where=where)
 
 
 
 
 
 
 
 
47
 
48
  return SearchResponse(
49
  answer=response.answer,
 
4
 
5
  from fastapi import APIRouter, Depends, Request
6
 
7
+ from src.api.deps import get_db, get_rag_engine
8
  from src.api.models import SearchRequest, SearchResponse, SourcePaper
9
  from src.api.rate_limit import search_limiter
10
  from src.generation.rag_engine import RAGEngine
11
+ from src.storage.sqlite_db import SQLiteDB
12
 
13
  logger = logging.getLogger(__name__)
14
 
 
20
  request: Request,
21
  req: SearchRequest,
22
  engine: RAGEngine = Depends(get_rag_engine),
23
+ db: SQLiteDB = Depends(get_db),
24
  ):
25
  """Answer a research question using RAG over the paper corpus."""
26
  search_limiter.check(request)
 
45
  if where is None and filters:
46
  where = filters
47
 
48
+ response = engine.query(
49
+ question=req.query, top_k=5, source_top_k=20, where=where,
50
+ )
51
+
52
+ # Batch-fetch authors for all source papers
53
+ paper_ids = [s["paper_id"] for s in response.sources]
54
+ authors_map = db.get_authors_for_papers(paper_ids)
55
+ for s in response.sources:
56
+ s["authors"] = authors_map.get(s["paper_id"], [])
57
 
58
  return SearchResponse(
59
  answer=response.answer,
src/generation/rag_engine.py CHANGED
@@ -74,26 +74,33 @@ class RAGEngine:
74
  self,
75
  question: str,
76
  top_k: int = 5,
 
77
  where: dict | None = None,
78
  ) -> RAGResponse:
79
  """Answer a question using retrieval-augmented generation.
80
 
81
  Args:
82
  question: The user's natural-language question.
83
- top_k: Number of chunks to retrieve as context.
 
 
84
  where: Optional metadata filter for retrieval (e.g., year, venue).
85
 
86
  Returns:
87
  RAGResponse with the answer, source papers, and metadata.
88
  """
89
- logger.info("RAG query: %r (top_k=%d)", question, top_k)
90
 
91
- # Step 1: Retrieve relevant chunks
92
- results = self.pipeline.search(query=question, top_k=top_k, where=where)
93
  logger.info("Retrieved %d chunks", len(results))
94
 
95
- # Step 2: Format context
96
- context = format_context(results)
 
 
 
 
97
 
98
  # Step 3: Build prompt and generate
99
  prompt = build_prompt(question, context)
@@ -116,7 +123,7 @@ class RAGEngine:
116
  usage={},
117
  )
118
 
119
- # Step 4: Build source list (deduplicated by paper_id)
120
  seen_papers: set[str] = set()
121
  sources = []
122
  for r in results:
@@ -128,6 +135,7 @@ class RAGEngine:
128
  "year": r.year,
129
  "venue": r.venue,
130
  "chunk_type": r.chunk_type,
 
131
  })
132
 
133
  return RAGResponse(
 
74
  self,
75
  question: str,
76
  top_k: int = 5,
77
+ source_top_k: int = 20,
78
  where: dict | None = None,
79
  ) -> RAGResponse:
80
  """Answer a question using retrieval-augmented generation.
81
 
82
  Args:
83
  question: The user's natural-language question.
84
+ top_k: Number of chunks used as LLM generation context.
85
+ source_top_k: Number of chunks to retrieve for the source list
86
+ (returns more papers than used for generation).
87
  where: Optional metadata filter for retrieval (e.g., year, venue).
88
 
89
  Returns:
90
  RAGResponse with the answer, source papers, and metadata.
91
  """
92
+ logger.info("RAG query: %r (top_k=%d, source_top_k=%d)", question, top_k, source_top_k)
93
 
94
+ # Step 1: Retrieve relevant chunks (more than needed for generation)
95
+ results = self.pipeline.search(query=question, top_k=source_top_k, where=where)
96
  logger.info("Retrieved %d chunks", len(results))
97
 
98
+ # Step 2: Format context from top_k chunks only (for LLM prompt)
99
+ context_results = results[:top_k]
100
+ context = format_context(context_results)
101
+
102
+ # Track which papers were used for generation context
103
+ context_paper_ids = {r.paper_id for r in context_results}
104
 
105
  # Step 3: Build prompt and generate
106
  prompt = build_prompt(question, context)
 
123
  usage={},
124
  )
125
 
126
+ # Step 4: Build source list from ALL results (deduplicated by paper_id)
127
  seen_papers: set[str] = set()
128
  sources = []
129
  for r in results:
 
135
  "year": r.year,
136
  "venue": r.venue,
137
  "chunk_type": r.chunk_type,
138
+ "used_in_answer": r.paper_id in context_paper_ids,
139
  })
140
 
141
  return RAGResponse(
src/storage/sqlite_db.py CHANGED
@@ -537,14 +537,22 @@ class SQLiteDB:
537
  conn.close()
538
 
539
  def get_enrichment_stats(self) -> dict:
540
- """Get counts of enriched entities."""
541
  conn = self.get_connection()
542
  try:
543
  paper_count = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0]
544
- method_count = conn.execute("SELECT COUNT(*) FROM methods").fetchone()[0]
545
- dataset_count = conn.execute("SELECT COUNT(*) FROM datasets").fetchone()[0]
546
- task_count = conn.execute("SELECT COUNT(*) FROM tasks").fetchone()[0]
547
- topic_count = conn.execute("SELECT COUNT(*) FROM topics").fetchone()[0]
 
 
 
 
 
 
 
 
548
  papers_with_methods = conn.execute(
549
  "SELECT COUNT(DISTINCT paper_id) FROM methods"
550
  ).fetchone()[0]
@@ -559,6 +567,63 @@ class SQLiteDB:
559
  finally:
560
  conn.close()
561
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
  # ── Co-occurrence analytics ───────────────────────────────────────
563
 
564
  def method_dataset_cooccurrence(self, top_n: int = 20) -> list[dict]:
 
537
  conn.close()
538
 
539
  def get_enrichment_stats(self) -> dict:
540
+ """Get counts of unique enriched entities (distinct names)."""
541
  conn = self.get_connection()
542
  try:
543
  paper_count = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0]
544
+ method_count = conn.execute(
545
+ "SELECT COUNT(DISTINCT method_name) FROM methods"
546
+ ).fetchone()[0]
547
+ dataset_count = conn.execute(
548
+ "SELECT COUNT(DISTINCT dataset_name) FROM datasets"
549
+ ).fetchone()[0]
550
+ task_count = conn.execute(
551
+ "SELECT COUNT(DISTINCT task_name) FROM tasks"
552
+ ).fetchone()[0]
553
+ topic_count = conn.execute(
554
+ "SELECT COUNT(DISTINCT topic_name) FROM topics"
555
+ ).fetchone()[0]
556
  papers_with_methods = conn.execute(
557
  "SELECT COUNT(DISTINCT paper_id) FROM methods"
558
  ).fetchone()[0]
 
567
  finally:
568
  conn.close()
569
 
570
def get_authors_for_papers(self, paper_ids: list[str]) -> dict[str, list[str]]:
    """Batch-fetch authors for multiple papers.

    Args:
        paper_ids: Paper ids to look up; duplicates are tolerated.

    Returns:
        Dict mapping paper_id -> list of author names (ordered by position).
        Papers with no author rows are absent from the result.
    """
    if not paper_ids:
        return {}
    # Deduplicate while preserving order; the result is keyed by id, so
    # duplicate ids would only waste placeholders.
    unique_ids = list(dict.fromkeys(paper_ids))
    # Chunk the IN (...) list: SQLite's default host-parameter limit is 999
    # (SQLITE_MAX_VARIABLE_NUMBER) in older builds, so a very large id list
    # would otherwise raise "too many SQL variables".
    max_vars = 900
    result: dict[str, list[str]] = {}
    conn = self.get_connection()
    try:
        for start in range(0, len(unique_ids), max_vars):
            chunk = unique_ids[start : start + max_vars]
            placeholders = ",".join("?" * len(chunk))
            rows = conn.execute(
                f"SELECT paper_id, name FROM authors "
                f"WHERE paper_id IN ({placeholders}) "
                f"ORDER BY paper_id, position",
                chunk,
            ).fetchall()
            # Rows arrive ordered by position within each paper, so appending
            # preserves author order; each paper's rows live in one chunk.
            for row in rows:
                result.setdefault(row["paper_id"], []).append(row["name"])
        return result
    finally:
        conn.close()
593
+
594
# Whitelist of browsable entity types: type -> (table name, name column).
_ENTITY_TABLE_MAP = {
    "methods": ("methods", "method_name"),
    "datasets": ("datasets", "dataset_name"),
    "tasks": ("tasks", "task_name"),
    "topics": ("topics", "topic_name"),
}

def get_entity_list(
    self, entity_type: str, limit: int = 500
) -> list[dict]:
    """Return every distinct entity name of a type with its paper count.

    Args:
        entity_type: One of "methods", "datasets", "tasks", "topics".
        limit: Maximum entries to return.

    Returns:
        List of dicts with keys: name, count. Sorted by count descending.

    Raises:
        ValueError: If entity_type is not a known type.
    """
    if entity_type not in self._ENTITY_TABLE_MAP:
        raise ValueError(f"Unknown entity type: {entity_type}")
    table, col = self._ENTITY_TABLE_MAP[entity_type]
    # Table and column names come from the whitelist above, so the f-string
    # interpolation below cannot inject arbitrary SQL.
    query = (
        f"SELECT {col} AS name, COUNT(*) AS count "
        f"FROM {table} GROUP BY {col} ORDER BY count DESC LIMIT ?"
    )
    conn = self.get_connection()
    try:
        fetched = conn.execute(query, (limit,)).fetchall()
        return list(map(dict, fetched))
    finally:
        conn.close()
626
+
627
  # ── Co-occurrence analytics ───────────────────────────────────────
628
 
629
  def method_dataset_cooccurrence(self, top_n: int = 20) -> list[dict]: