Spaces:
Running
Add entity list API, author fetching, and search source decoupling
Browse files
Backend changes for the UI update:
- sqlite_db: add get_authors_for_papers() batch method, get_entity_list()
for browsing all entities by type, fix get_enrichment_stats() to use
COUNT(DISTINCT) for accurate unique entity counts
- models: add EntityListItem, authors + used_in_answer to SourcePaper,
authors to PaperSummary
- routes_analytics: add GET /analytics/{type}/list endpoint (methods,
datasets, tasks, topics) with limit param
- routes_papers: batch-fetch authors for browse results
- routes_search: batch-fetch authors for sources, use source_top_k=20
to return more relevant papers while keeping LLM context at top_k=5
- rag_engine: add source_top_k parameter to decouple generation context
from source list, mark sources with used_in_answer flag
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- frontend/tsconfig.tsbuildinfo +0 -1
- src/api/models.py +10 -0
- src/api/routes_analytics.py +20 -0
- src/api/routes_papers.py +7 -0
- src/api/routes_search.py +12 -2
- src/generation/rag_engine.py +15 -7
- src/storage/sqlite_db.py +70 -5
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"root":["./src/app.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/components/cooccurrencetable.tsx","./src/components/emptystate.tsx","./src/components/erroralert.tsx","./src/components/growthchart.tsx","./src/components/layout.tsx","./src/components/spinner.tsx","./src/components/statcard.tsx","./src/components/topentitieschart.tsx","./src/components/trendexplorer.tsx","./src/components/venuechart.tsx","./src/lib/api.ts","./src/lib/hooks.ts","./src/lib/types.ts","./src/lib/utils.ts","./src/pages/browsepage.tsx","./src/pages/dashboardpage.tsx","./src/pages/paperpage.tsx","./src/pages/searchpage.tsx"],"version":"5.7.3"}
|
|
|
|
|
|
|
@@ -46,6 +46,8 @@ class SourcePaper(BaseModel):
|
|
| 46 |
year: int
|
| 47 |
venue: str | None
|
| 48 |
chunk_type: str
|
|
|
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
class SearchResponse(BaseModel):
|
|
@@ -80,6 +82,7 @@ class PaperSummary(BaseModel):
|
|
| 80 |
year: int | None
|
| 81 |
venue: str | None
|
| 82 |
url: str | None
|
|
|
|
| 83 |
|
| 84 |
|
| 85 |
class PaperListResponse(BaseModel):
|
|
@@ -142,6 +145,13 @@ class GrowthPoint(BaseModel):
|
|
| 142 |
growth_pct: float | None
|
| 143 |
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
class HealthResponse(BaseModel):
|
| 146 |
"""System health status."""
|
| 147 |
|
|
|
|
| 46 |
year: int
|
| 47 |
venue: str | None
|
| 48 |
chunk_type: str
|
| 49 |
+
authors: list[str] = Field(default_factory=list)
|
| 50 |
+
used_in_answer: bool = False
|
| 51 |
|
| 52 |
|
| 53 |
class SearchResponse(BaseModel):
|
|
|
|
| 82 |
year: int | None
|
| 83 |
venue: str | None
|
| 84 |
url: str | None
|
| 85 |
+
authors: list[str] = Field(default_factory=list)
|
| 86 |
|
| 87 |
|
| 88 |
class PaperListResponse(BaseModel):
|
|
|
|
| 145 |
growth_pct: float | None
|
| 146 |
|
| 147 |
|
| 148 |
+
class EntityListItem(BaseModel):
    """One distinct entity name together with the number of papers it appears in."""

    # Entity display name and how many papers reference it.
    name: str
    count: int
|
| 153 |
+
|
| 154 |
+
|
| 155 |
class HealthResponse(BaseModel):
|
| 156 |
"""System health status."""
|
| 157 |
|
|
@@ -8,6 +8,7 @@ from src.api.deps import get_db
|
|
| 8 |
from src.api.models import (
|
| 9 |
CooccurrenceRow,
|
| 10 |
EnrichmentStatsResponse,
|
|
|
|
| 11 |
GrowthPoint,
|
| 12 |
RankedEntity,
|
| 13 |
TrendPoint,
|
|
@@ -119,6 +120,25 @@ def top_topics(
|
|
| 119 |
]
|
| 120 |
|
| 121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
# ββ Co-occurrence endpoints ββββββββββββββββββββββββββββββββββββββββββ
|
| 123 |
|
| 124 |
|
|
|
|
| 8 |
from src.api.models import (
|
| 9 |
CooccurrenceRow,
|
| 10 |
EnrichmentStatsResponse,
|
| 11 |
+
EntityListItem,
|
| 12 |
GrowthPoint,
|
| 13 |
RankedEntity,
|
| 14 |
TrendPoint,
|
|
|
|
| 120 |
]
|
| 121 |
|
| 122 |
|
| 123 |
+
# ββ Entity list endpoints ββββββββββββββββββββββββββββββββββββββββββββ
|
| 124 |
+
|
| 125 |
+
VALID_ENTITY_TYPES = {"methods", "datasets", "tasks", "topics"}
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
@router.get("/{entity_type}/list", response_model=list[EntityListItem])
|
| 129 |
+
def entity_list(
|
| 130 |
+
entity_type: str,
|
| 131 |
+
limit: int = Query(default=500, ge=1, le=1000),
|
| 132 |
+
db: SQLiteDB = Depends(get_db),
|
| 133 |
+
):
|
| 134 |
+
"""List all unique entities of a type with their paper counts."""
|
| 135 |
+
if entity_type not in VALID_ENTITY_TYPES:
|
| 136 |
+
from fastapi import HTTPException
|
| 137 |
+
raise HTTPException(status_code=404, detail=f"Unknown entity type: {entity_type}")
|
| 138 |
+
rows = db.get_entity_list(entity_type, limit=limit)
|
| 139 |
+
return [EntityListItem(**r) for r in rows]
|
| 140 |
+
|
| 141 |
+
|
| 142 |
# ββ Co-occurrence endpoints ββββββββββββββββββββββββββββββββββββββββββ
|
| 143 |
|
| 144 |
|
|
@@ -41,6 +41,13 @@ def browse_papers(
|
|
| 41 |
)
|
| 42 |
total = db.count_papers(**filter_kwargs)
|
| 43 |
rows = db.browse_papers(**filter_kwargs, limit=req.limit, offset=req.offset)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
papers = [PaperSummary(**r) for r in rows]
|
| 45 |
return PaperListResponse(
|
| 46 |
papers=papers, count=total, limit=req.limit, offset=req.offset,
|
|
|
|
| 41 |
)
|
| 42 |
total = db.count_papers(**filter_kwargs)
|
| 43 |
rows = db.browse_papers(**filter_kwargs, limit=req.limit, offset=req.offset)
|
| 44 |
+
|
| 45 |
+
# Batch-fetch authors for all papers in this page
|
| 46 |
+
paper_ids = [r["id"] for r in rows]
|
| 47 |
+
authors_map = db.get_authors_for_papers(paper_ids)
|
| 48 |
+
for r in rows:
|
| 49 |
+
r["authors"] = authors_map.get(r["id"], [])
|
| 50 |
+
|
| 51 |
papers = [PaperSummary(**r) for r in rows]
|
| 52 |
return PaperListResponse(
|
| 53 |
papers=papers, count=total, limit=req.limit, offset=req.offset,
|
|
@@ -4,10 +4,11 @@ import logging
|
|
| 4 |
|
| 5 |
from fastapi import APIRouter, Depends, Request
|
| 6 |
|
| 7 |
-
from src.api.deps import get_rag_engine
|
| 8 |
from src.api.models import SearchRequest, SearchResponse, SourcePaper
|
| 9 |
from src.api.rate_limit import search_limiter
|
| 10 |
from src.generation.rag_engine import RAGEngine
|
|
|
|
| 11 |
|
| 12 |
logger = logging.getLogger(__name__)
|
| 13 |
|
|
@@ -19,6 +20,7 @@ def search(
|
|
| 19 |
request: Request,
|
| 20 |
req: SearchRequest,
|
| 21 |
engine: RAGEngine = Depends(get_rag_engine),
|
|
|
|
| 22 |
):
|
| 23 |
"""Answer a research question using RAG over the paper corpus."""
|
| 24 |
search_limiter.check(request)
|
|
@@ -43,7 +45,15 @@ def search(
|
|
| 43 |
if where is None and filters:
|
| 44 |
where = filters
|
| 45 |
|
| 46 |
-
response = engine.query(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
return SearchResponse(
|
| 49 |
answer=response.answer,
|
|
|
|
| 4 |
|
| 5 |
from fastapi import APIRouter, Depends, Request
|
| 6 |
|
| 7 |
+
from src.api.deps import get_db, get_rag_engine
|
| 8 |
from src.api.models import SearchRequest, SearchResponse, SourcePaper
|
| 9 |
from src.api.rate_limit import search_limiter
|
| 10 |
from src.generation.rag_engine import RAGEngine
|
| 11 |
+
from src.storage.sqlite_db import SQLiteDB
|
| 12 |
|
| 13 |
logger = logging.getLogger(__name__)
|
| 14 |
|
|
|
|
| 20 |
request: Request,
|
| 21 |
req: SearchRequest,
|
| 22 |
engine: RAGEngine = Depends(get_rag_engine),
|
| 23 |
+
db: SQLiteDB = Depends(get_db),
|
| 24 |
):
|
| 25 |
"""Answer a research question using RAG over the paper corpus."""
|
| 26 |
search_limiter.check(request)
|
|
|
|
| 45 |
if where is None and filters:
|
| 46 |
where = filters
|
| 47 |
|
| 48 |
+
response = engine.query(
|
| 49 |
+
question=req.query, top_k=5, source_top_k=20, where=where,
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
# Batch-fetch authors for all source papers
|
| 53 |
+
paper_ids = [s["paper_id"] for s in response.sources]
|
| 54 |
+
authors_map = db.get_authors_for_papers(paper_ids)
|
| 55 |
+
for s in response.sources:
|
| 56 |
+
s["authors"] = authors_map.get(s["paper_id"], [])
|
| 57 |
|
| 58 |
return SearchResponse(
|
| 59 |
answer=response.answer,
|
|
@@ -74,26 +74,33 @@ class RAGEngine:
|
|
| 74 |
self,
|
| 75 |
question: str,
|
| 76 |
top_k: int = 5,
|
|
|
|
| 77 |
where: dict | None = None,
|
| 78 |
) -> RAGResponse:
|
| 79 |
"""Answer a question using retrieval-augmented generation.
|
| 80 |
|
| 81 |
Args:
|
| 82 |
question: The user's natural-language question.
|
| 83 |
-
top_k: Number of chunks
|
|
|
|
|
|
|
| 84 |
where: Optional metadata filter for retrieval (e.g., year, venue).
|
| 85 |
|
| 86 |
Returns:
|
| 87 |
RAGResponse with the answer, source papers, and metadata.
|
| 88 |
"""
|
| 89 |
-
logger.info("RAG query: %r (top_k=%d)", question, top_k)
|
| 90 |
|
| 91 |
-
# Step 1: Retrieve relevant chunks
|
| 92 |
-
results = self.pipeline.search(query=question, top_k=
|
| 93 |
logger.info("Retrieved %d chunks", len(results))
|
| 94 |
|
| 95 |
-
# Step 2: Format context
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
# Step 3: Build prompt and generate
|
| 99 |
prompt = build_prompt(question, context)
|
|
@@ -116,7 +123,7 @@ class RAGEngine:
|
|
| 116 |
usage={},
|
| 117 |
)
|
| 118 |
|
| 119 |
-
# Step 4: Build source list (deduplicated by paper_id)
|
| 120 |
seen_papers: set[str] = set()
|
| 121 |
sources = []
|
| 122 |
for r in results:
|
|
@@ -128,6 +135,7 @@ class RAGEngine:
|
|
| 128 |
"year": r.year,
|
| 129 |
"venue": r.venue,
|
| 130 |
"chunk_type": r.chunk_type,
|
|
|
|
| 131 |
})
|
| 132 |
|
| 133 |
return RAGResponse(
|
|
|
|
| 74 |
self,
|
| 75 |
question: str,
|
| 76 |
top_k: int = 5,
|
| 77 |
+
source_top_k: int = 20,
|
| 78 |
where: dict | None = None,
|
| 79 |
) -> RAGResponse:
|
| 80 |
"""Answer a question using retrieval-augmented generation.
|
| 81 |
|
| 82 |
Args:
|
| 83 |
question: The user's natural-language question.
|
| 84 |
+
top_k: Number of chunks used as LLM generation context.
|
| 85 |
+
source_top_k: Number of chunks to retrieve for the source list
|
| 86 |
+
(returns more papers than used for generation).
|
| 87 |
where: Optional metadata filter for retrieval (e.g., year, venue).
|
| 88 |
|
| 89 |
Returns:
|
| 90 |
RAGResponse with the answer, source papers, and metadata.
|
| 91 |
"""
|
| 92 |
+
logger.info("RAG query: %r (top_k=%d, source_top_k=%d)", question, top_k, source_top_k)
|
| 93 |
|
| 94 |
+
# Step 1: Retrieve relevant chunks (more than needed for generation)
|
| 95 |
+
results = self.pipeline.search(query=question, top_k=source_top_k, where=where)
|
| 96 |
logger.info("Retrieved %d chunks", len(results))
|
| 97 |
|
| 98 |
+
# Step 2: Format context from top_k chunks only (for LLM prompt)
|
| 99 |
+
context_results = results[:top_k]
|
| 100 |
+
context = format_context(context_results)
|
| 101 |
+
|
| 102 |
+
# Track which papers were used for generation context
|
| 103 |
+
context_paper_ids = {r.paper_id for r in context_results}
|
| 104 |
|
| 105 |
# Step 3: Build prompt and generate
|
| 106 |
prompt = build_prompt(question, context)
|
|
|
|
| 123 |
usage={},
|
| 124 |
)
|
| 125 |
|
| 126 |
+
# Step 4: Build source list from ALL results (deduplicated by paper_id)
|
| 127 |
seen_papers: set[str] = set()
|
| 128 |
sources = []
|
| 129 |
for r in results:
|
|
|
|
| 135 |
"year": r.year,
|
| 136 |
"venue": r.venue,
|
| 137 |
"chunk_type": r.chunk_type,
|
| 138 |
+
"used_in_answer": r.paper_id in context_paper_ids,
|
| 139 |
})
|
| 140 |
|
| 141 |
return RAGResponse(
|
|
@@ -537,14 +537,22 @@ class SQLiteDB:
|
|
| 537 |
conn.close()
|
| 538 |
|
| 539 |
def get_enrichment_stats(self) -> dict:
|
| 540 |
-
"""Get counts of enriched entities."""
|
| 541 |
conn = self.get_connection()
|
| 542 |
try:
|
| 543 |
paper_count = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0]
|
| 544 |
-
method_count = conn.execute(
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
papers_with_methods = conn.execute(
|
| 549 |
"SELECT COUNT(DISTINCT paper_id) FROM methods"
|
| 550 |
).fetchone()[0]
|
|
@@ -559,6 +567,63 @@ class SQLiteDB:
|
|
| 559 |
finally:
|
| 560 |
conn.close()
|
| 561 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
# ββ Co-occurrence analytics βββββββββββββββββββββββββββββββββββββββ
|
| 563 |
|
| 564 |
def method_dataset_cooccurrence(self, top_n: int = 20) -> list[dict]:
|
|
|
|
| 537 |
conn.close()
|
| 538 |
|
| 539 |
def get_enrichment_stats(self) -> dict:
|
| 540 |
+
"""Get counts of unique enriched entities (distinct names)."""
|
| 541 |
conn = self.get_connection()
|
| 542 |
try:
|
| 543 |
paper_count = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0]
|
| 544 |
+
method_count = conn.execute(
|
| 545 |
+
"SELECT COUNT(DISTINCT method_name) FROM methods"
|
| 546 |
+
).fetchone()[0]
|
| 547 |
+
dataset_count = conn.execute(
|
| 548 |
+
"SELECT COUNT(DISTINCT dataset_name) FROM datasets"
|
| 549 |
+
).fetchone()[0]
|
| 550 |
+
task_count = conn.execute(
|
| 551 |
+
"SELECT COUNT(DISTINCT task_name) FROM tasks"
|
| 552 |
+
).fetchone()[0]
|
| 553 |
+
topic_count = conn.execute(
|
| 554 |
+
"SELECT COUNT(DISTINCT topic_name) FROM topics"
|
| 555 |
+
).fetchone()[0]
|
| 556 |
papers_with_methods = conn.execute(
|
| 557 |
"SELECT COUNT(DISTINCT paper_id) FROM methods"
|
| 558 |
).fetchone()[0]
|
|
|
|
| 567 |
finally:
|
| 568 |
conn.close()
|
| 569 |
|
| 570 |
+
def get_authors_for_papers(self, paper_ids: list[str]) -> dict[str, list[str]]:
    """Batch-fetch author names for a set of papers in a single query.

    Args:
        paper_ids: Paper identifiers to look up; may be empty.

    Returns:
        Dict mapping paper_id → list of author names (ordered by position).
        Papers with no stored authors are simply absent from the dict.
    """
    # Nothing to look up — avoid issuing a query with an empty IN () clause.
    if not paper_ids:
        return {}
    conn = self.get_connection()
    try:
        # One "?" placeholder per id; only the markers are interpolated,
        # the values themselves are bound — safe from SQL injection.
        marks = ",".join("?" * len(paper_ids))
        sql = (
            f"SELECT paper_id, name FROM authors "
            f"WHERE paper_id IN ({marks}) "
            f"ORDER BY paper_id, position"
        )
        authors_by_paper: dict[str, list[str]] = {}
        for rec in conn.execute(sql, paper_ids).fetchall():
            authors_by_paper.setdefault(rec["paper_id"], []).append(rec["name"])
        return authors_by_paper
    finally:
        conn.close()
|
| 593 |
+
|
| 594 |
+
_ENTITY_TABLE_MAP = {
|
| 595 |
+
"methods": ("methods", "method_name"),
|
| 596 |
+
"datasets": ("datasets", "dataset_name"),
|
| 597 |
+
"tasks": ("tasks", "task_name"),
|
| 598 |
+
"topics": ("topics", "topic_name"),
|
| 599 |
+
}
|
| 600 |
+
|
| 601 |
+
def get_entity_list(
|
| 602 |
+
self, entity_type: str, limit: int = 500
|
| 603 |
+
) -> list[dict]:
|
| 604 |
+
"""Get all unique entity names with their paper counts.
|
| 605 |
+
|
| 606 |
+
Args:
|
| 607 |
+
entity_type: One of "methods", "datasets", "tasks", "topics".
|
| 608 |
+
limit: Maximum entries to return.
|
| 609 |
+
|
| 610 |
+
Returns:
|
| 611 |
+
List of dicts with keys: name, count. Sorted by count descending.
|
| 612 |
+
"""
|
| 613 |
+
if entity_type not in self._ENTITY_TABLE_MAP:
|
| 614 |
+
raise ValueError(f"Unknown entity type: {entity_type}")
|
| 615 |
+
table, col = self._ENTITY_TABLE_MAP[entity_type]
|
| 616 |
+
conn = self.get_connection()
|
| 617 |
+
try:
|
| 618 |
+
rows = conn.execute(
|
| 619 |
+
f"SELECT {col} AS name, COUNT(*) AS count "
|
| 620 |
+
f"FROM {table} GROUP BY {col} ORDER BY count DESC LIMIT ?",
|
| 621 |
+
(limit,),
|
| 622 |
+
).fetchall()
|
| 623 |
+
return [dict(row) for row in rows]
|
| 624 |
+
finally:
|
| 625 |
+
conn.close()
|
| 626 |
+
|
| 627 |
# ββ Co-occurrence analytics βββββββββββββββββββββββββββββββββββββββ
|
| 628 |
|
| 629 |
def method_dataset_cooccurrence(self, top_n: int = 20) -> list[dict]:
|