Spaces:
Sleeping
Sleeping
XQ commited on
Commit ·
0a7ef90
1
Parent(s): 1f01595
Fix pipeline details displaying and routing and searching logic
Browse files- src/agent/react_router.py +17 -6
- src/agent/router.py +1 -58
- src/agent/tools.py +2 -2
- src/api/routes.py +12 -1
- src/ingestion/chunker.py +21 -13
- src/ingestion/pipeline.py +3 -1
- src/models.py +5 -4
- src/retrieval/bm25_search.py +2 -2
- src/retrieval/vector_store.py +3 -2
- src/ui/app.py +79 -39
src/agent/react_router.py
CHANGED
|
@@ -145,7 +145,7 @@ class ReActRouter:
|
|
| 145 |
confidence=confidence,
|
| 146 |
pipeline_details=PipelineDetails(
|
| 147 |
original_query=query,
|
| 148 |
-
retrieval_query=", ".join(q for
|
| 149 |
dense_results=store.dense_results,
|
| 150 |
sparse_results=store.sparse_results,
|
| 151 |
fused_results=store.fused_results,
|
|
@@ -175,6 +175,7 @@ class ReActRouter:
|
|
| 175 |
graph = self._make_graph(store)
|
| 176 |
|
| 177 |
all_messages: list = []
|
|
|
|
| 178 |
|
| 179 |
for chunk in graph.stream(
|
| 180 |
{
|
|
@@ -194,20 +195,30 @@ class ReActRouter:
|
|
| 194 |
for msg in node_messages:
|
| 195 |
if isinstance(msg, AIMessage):
|
| 196 |
for tc in getattr(msg, "tool_calls", []):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
yield {
|
| 198 |
"step": "tool_call",
|
| 199 |
"tool": tc.get("name", ""),
|
| 200 |
-
"query":
|
| 201 |
}
|
| 202 |
if msg.content and not getattr(msg, "tool_calls", None):
|
| 203 |
yield {"step": "generate"}
|
| 204 |
|
| 205 |
elif isinstance(msg, ToolMessage):
|
|
|
|
|
|
|
| 206 |
yield {
|
| 207 |
"step": "tool_result",
|
| 208 |
-
"tool":
|
| 209 |
-
"result_count":
|
|
|
|
| 210 |
}
|
|
|
|
| 211 |
|
| 212 |
answer = self._extract_answer(all_messages)
|
| 213 |
sources = store.retrieved[:top_k]
|
|
@@ -222,8 +233,8 @@ class ReActRouter:
|
|
| 222 |
"confidence": confidence,
|
| 223 |
"pipeline_details": {
|
| 224 |
"original_query": query,
|
| 225 |
-
"retrieval_query": ", ".join(q for
|
| 226 |
-
"detected_language": "
|
| 227 |
"translated": False,
|
| 228 |
"dense_results": [r.to_dict(include_text=False) for r in store.dense_results],
|
| 229 |
"sparse_results": [r.to_dict(include_text=False) for r in store.sparse_results],
|
|
|
|
| 145 |
confidence=confidence,
|
| 146 |
pipeline_details=PipelineDetails(
|
| 147 |
original_query=query,
|
| 148 |
+
retrieval_query=", ".join(q for name, q in store.tool_calls if name == "hybrid_search") or query,
|
| 149 |
dense_results=store.dense_results,
|
| 150 |
sparse_results=store.sparse_results,
|
| 151 |
fused_results=store.fused_results,
|
|
|
|
| 175 |
graph = self._make_graph(store)
|
| 176 |
|
| 177 |
all_messages: list = []
|
| 178 |
+
prev_retrieved_count = 0
|
| 179 |
|
| 180 |
for chunk in graph.stream(
|
| 181 |
{
|
|
|
|
| 195 |
for msg in node_messages:
|
| 196 |
if isinstance(msg, AIMessage):
|
| 197 |
for tc in getattr(msg, "tool_calls", []):
|
| 198 |
+
tc_args = tc.get("args", {})
|
| 199 |
+
# Extract the most relevant argument for display
|
| 200 |
+
tc_detail = (
|
| 201 |
+
tc_args.get("query", "")
|
| 202 |
+
or tc_args.get("document_id", "")
|
| 203 |
+
)
|
| 204 |
yield {
|
| 205 |
"step": "tool_call",
|
| 206 |
"tool": tc.get("name", ""),
|
| 207 |
+
"query": tc_detail,
|
| 208 |
}
|
| 209 |
if msg.content and not getattr(msg, "tool_calls", None):
|
| 210 |
yield {"step": "generate"}
|
| 211 |
|
| 212 |
elif isinstance(msg, ToolMessage):
|
| 213 |
+
tool_name = getattr(msg, "name", "")
|
| 214 |
+
current_count = len(store.retrieved)
|
| 215 |
yield {
|
| 216 |
"step": "tool_result",
|
| 217 |
+
"tool": tool_name,
|
| 218 |
+
"result_count": current_count - prev_retrieved_count,
|
| 219 |
+
"total_count": current_count,
|
| 220 |
}
|
| 221 |
+
prev_retrieved_count = current_count
|
| 222 |
|
| 223 |
answer = self._extract_answer(all_messages)
|
| 224 |
sources = store.retrieved[:top_k]
|
|
|
|
| 233 |
"confidence": confidence,
|
| 234 |
"pipeline_details": {
|
| 235 |
"original_query": query,
|
| 236 |
+
"retrieval_query": ", ".join(q for name, q in store.tool_calls if name == "hybrid_search") or query,
|
| 237 |
+
"detected_language": "",
|
| 238 |
"translated": False,
|
| 239 |
"dense_results": [r.to_dict(include_text=False) for r in store.dense_results],
|
| 240 |
"sparse_results": [r.to_dict(include_text=False) for r in store.sparse_results],
|
src/agent/router.py
CHANGED
|
@@ -11,7 +11,6 @@ explicit and testable without hand-rolled flags or callback plumbing.
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
import logging
|
| 14 |
-
import unicodedata
|
| 15 |
from collections.abc import Generator
|
| 16 |
from typing import TypedDict
|
| 17 |
|
|
@@ -123,76 +122,20 @@ class QueryRouter:
|
|
| 123 |
self._translate_query_enabled = translate_query
|
| 124 |
self._graph = self._build_graph()
|
| 125 |
|
| 126 |
-
@staticmethod
|
| 127 |
-
def _detect_script(text: str) -> str | None:
|
| 128 |
-
"""Detect language from Unicode script for non-Latin text.
|
| 129 |
-
|
| 130 |
-
Returns a language name (e.g. "Chinese") if the script is
|
| 131 |
-
unambiguously identifiable, or None to fall back to LLM detection.
|
| 132 |
-
"""
|
| 133 |
-
script_counts: dict[str, int] = {}
|
| 134 |
-
for ch in text:
|
| 135 |
-
if ch.isspace() or ch in ".,!?;:\"'()[]{}":
|
| 136 |
-
continue
|
| 137 |
-
try:
|
| 138 |
-
name = unicodedata.name(ch, "")
|
| 139 |
-
except ValueError:
|
| 140 |
-
continue
|
| 141 |
-
if name.startswith("CJK") or name.startswith("KANGXI"):
|
| 142 |
-
script_counts["CJK"] = script_counts.get("CJK", 0) + 1
|
| 143 |
-
elif name.startswith("HIRAGANA") or name.startswith("KATAKANA"):
|
| 144 |
-
script_counts["Japanese"] = script_counts.get("Japanese", 0) + 1
|
| 145 |
-
elif name.startswith("HANGUL"):
|
| 146 |
-
script_counts["Korean"] = script_counts.get("Korean", 0) + 1
|
| 147 |
-
elif name.startswith("ARABIC"):
|
| 148 |
-
script_counts["Arabic"] = script_counts.get("Arabic", 0) + 1
|
| 149 |
-
elif name.startswith("DEVANAGARI"):
|
| 150 |
-
script_counts["Hindi"] = script_counts.get("Hindi", 0) + 1
|
| 151 |
-
elif name.startswith("THAI"):
|
| 152 |
-
script_counts["Thai"] = script_counts.get("Thai", 0) + 1
|
| 153 |
-
elif name.startswith("CYRILLIC"):
|
| 154 |
-
script_counts["Russian"] = script_counts.get("Russian", 0) + 1
|
| 155 |
-
|
| 156 |
-
if not script_counts:
|
| 157 |
-
return None
|
| 158 |
-
|
| 159 |
-
dominant = max(script_counts, key=lambda k: script_counts[k])
|
| 160 |
-
# CJK characters alone -> Chinese; if mixed with Hiragana/Katakana -> Japanese
|
| 161 |
-
if dominant == "CJK" and "Japanese" in script_counts:
|
| 162 |
-
return "Japanese"
|
| 163 |
-
if dominant == "CJK":
|
| 164 |
-
return "Chinese"
|
| 165 |
-
return dominant
|
| 166 |
-
|
| 167 |
def _detect_language_and_intent(self, query: str) -> tuple[str, IntentType]:
|
| 168 |
"""Detect the query language and classify intent in a single LLM call.
|
| 169 |
|
| 170 |
-
Uses Unicode script detection first for non-Latin scripts. For
|
| 171 |
-
Latin-script text, a single LLM call returns both language and intent,
|
| 172 |
-
saving one full round-trip compared to two separate calls.
|
| 173 |
-
|
| 174 |
Args:
|
| 175 |
query: The user's original query.
|
| 176 |
|
| 177 |
Returns:
|
| 178 |
Tuple of (detected_language, intent).
|
| 179 |
"""
|
| 180 |
-
# Fast path: detect non-Latin scripts via Unicode
|
| 181 |
-
script_language = self._detect_script(query)
|
| 182 |
-
|
| 183 |
-
if script_language is not None:
|
| 184 |
-
# Language is known; still need intent from LLM
|
| 185 |
-
intent = self._intent_classifier.classify(query)
|
| 186 |
-
logger.info("Detected query language: %s", script_language)
|
| 187 |
-
logger.info("Classified intent: %s", intent.value)
|
| 188 |
-
return script_language, intent
|
| 189 |
-
|
| 190 |
-
# Latin-script text — combine language detection + intent classification
|
| 191 |
valid_intents = "factual, summary, comparison, procedural, unknown"
|
| 192 |
prompt = (
|
| 193 |
"You are given a user query. Do TWO things:\n"
|
| 194 |
"1. Detect the language of the query (reply with the language name in English, "
|
| 195 |
-
"e.g. 'Danish', 'English', 'German').\n"
|
| 196 |
"2. Classify the intent into exactly one of: "
|
| 197 |
f"{valid_intents}.\n\n"
|
| 198 |
"Reply with EXACTLY two lines, nothing else:\n"
|
|
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
import logging
|
|
|
|
| 14 |
from collections.abc import Generator
|
| 15 |
from typing import TypedDict
|
| 16 |
|
|
|
|
| 122 |
self._translate_query_enabled = translate_query
|
| 123 |
self._graph = self._build_graph()
|
| 124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
def _detect_language_and_intent(self, query: str) -> tuple[str, IntentType]:
|
| 126 |
"""Detect the query language and classify intent in a single LLM call.
|
| 127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
Args:
|
| 129 |
query: The user's original query.
|
| 130 |
|
| 131 |
Returns:
|
| 132 |
Tuple of (detected_language, intent).
|
| 133 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
valid_intents = "factual, summary, comparison, procedural, unknown"
|
| 135 |
prompt = (
|
| 136 |
"You are given a user query. Do TWO things:\n"
|
| 137 |
"1. Detect the language of the query (reply with the language name in English, "
|
| 138 |
+
"e.g. 'Danish', 'English', 'German', 'Chinese', 'Japanese').\n"
|
| 139 |
"2. Classify the intent into exactly one of: "
|
| 140 |
f"{valid_intents}.\n\n"
|
| 141 |
"Reply with EXACTLY two lines, nothing else:\n"
|
src/agent/tools.py
CHANGED
|
@@ -161,8 +161,8 @@ def make_retrieval_tools(
|
|
| 161 |
f"(Document not found. Use list_documents to see available IDs.)"
|
| 162 |
)
|
| 163 |
|
| 164 |
-
# Sort chunks by
|
| 165 |
-
chunks.sort(key=lambda c: c.
|
| 166 |
|
| 167 |
# Register chunks as QueryResult so confidence and sources are surfaced in the UI.
|
| 168 |
# Score 1.0 indicates a direct full-document fetch (no ranking involved).
|
|
|
|
| 161 |
f"(Document not found. Use list_documents to see available IDs.)"
|
| 162 |
)
|
| 163 |
|
| 164 |
+
# Sort chunks by chunk_index to preserve document order
|
| 165 |
+
chunks.sort(key=lambda c: c.metadata.get("chunk_index", 0))
|
| 166 |
|
| 167 |
# Register chunks as QueryResult so confidence and sources are surfaced in the UI.
|
| 168 |
# Score 1.0 indicates a direct full-document fetch (no ranking involved).
|
src/api/routes.py
CHANGED
|
@@ -90,11 +90,22 @@ class PipelineDetailsResponse(BaseModel):
|
|
| 90 |
reranked_results: list[PipelineResultItem] = []
|
| 91 |
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
class QueryResponse(BaseModel):
|
| 94 |
"""Response body for the query endpoint."""
|
| 95 |
|
| 96 |
answer: str
|
| 97 |
-
sources: list[
|
| 98 |
intent: str
|
| 99 |
confidence: float
|
| 100 |
pipeline_details: PipelineDetailsResponse = PipelineDetailsResponse()
|
|
|
|
| 90 |
reranked_results: list[PipelineResultItem] = []
|
| 91 |
|
| 92 |
|
| 93 |
+
class SourceItem(BaseModel):
|
| 94 |
+
"""A single source item in the query response."""
|
| 95 |
+
|
| 96 |
+
chunk_id: str
|
| 97 |
+
document_id: str
|
| 98 |
+
score: float
|
| 99 |
+
source: str
|
| 100 |
+
text: str = ""
|
| 101 |
+
metadata: dict[str, str | int] = {}
|
| 102 |
+
|
| 103 |
+
|
| 104 |
class QueryResponse(BaseModel):
|
| 105 |
"""Response body for the query endpoint."""
|
| 106 |
|
| 107 |
answer: str
|
| 108 |
+
sources: list[SourceItem]
|
| 109 |
intent: str
|
| 110 |
confidence: float
|
| 111 |
pipeline_details: PipelineDetailsResponse = PipelineDetailsResponse()
|
src/ingestion/chunker.py
CHANGED
|
@@ -32,7 +32,8 @@ class BaseChunker:
|
|
| 32 |
self.chunk_overlap = chunk_overlap
|
| 33 |
|
| 34 |
def chunk(
|
| 35 |
-
self, text: str, document_id: str, metadata: dict[str, str | int]
|
|
|
|
| 36 |
) -> list[DocumentChunk]:
|
| 37 |
"""Split text into chunks.
|
| 38 |
|
|
@@ -40,6 +41,7 @@ class BaseChunker:
|
|
| 40 |
text: The full text to chunk.
|
| 41 |
document_id: Identifier of the source document.
|
| 42 |
metadata: Metadata to attach to each chunk.
|
|
|
|
| 43 |
|
| 44 |
Returns:
|
| 45 |
List of DocumentChunk objects.
|
|
@@ -51,7 +53,8 @@ class FixedSizeChunker(BaseChunker):
|
|
| 51 |
"""Splits text into fixed-size character chunks with overlap."""
|
| 52 |
|
| 53 |
def chunk(
|
| 54 |
-
self, text: str, document_id: str, metadata: dict[str, str | int]
|
|
|
|
| 55 |
) -> list[DocumentChunk]:
|
| 56 |
"""Split text into fixed-size chunks using LangChain CharacterTextSplitter.
|
| 57 |
|
|
@@ -59,6 +62,7 @@ class FixedSizeChunker(BaseChunker):
|
|
| 59 |
text: The full text to chunk.
|
| 60 |
document_id: Identifier of the source document.
|
| 61 |
metadata: Metadata to attach to each chunk.
|
|
|
|
| 62 |
|
| 63 |
Returns:
|
| 64 |
List of DocumentChunk with strategy=FIXED_SIZE.
|
|
@@ -73,13 +77,13 @@ class FixedSizeChunker(BaseChunker):
|
|
| 73 |
texts = splitter.split_text(text)
|
| 74 |
chunks = [
|
| 75 |
DocumentChunk(
|
| 76 |
-
chunk_id=_make_chunk_id(document_id,
|
| 77 |
document_id=document_id,
|
| 78 |
text=chunk_text,
|
| 79 |
-
metadata={**metadata, "chunk_index":
|
| 80 |
strategy=ChunkStrategy.FIXED_SIZE,
|
| 81 |
)
|
| 82 |
-
for
|
| 83 |
]
|
| 84 |
logger.debug("FixedSizeChunker produced %d chunks for %s", len(chunks), document_id)
|
| 85 |
return chunks
|
|
@@ -89,7 +93,8 @@ class RecursiveChunker(BaseChunker):
|
|
| 89 |
"""Recursively splits text using LangChain's RecursiveCharacterTextSplitter."""
|
| 90 |
|
| 91 |
def chunk(
|
| 92 |
-
self, text: str, document_id: str, metadata: dict[str, str | int]
|
|
|
|
| 93 |
) -> list[DocumentChunk]:
|
| 94 |
"""Split text using recursive character splitting.
|
| 95 |
|
|
@@ -97,6 +102,7 @@ class RecursiveChunker(BaseChunker):
|
|
| 97 |
text: The full text to chunk.
|
| 98 |
document_id: Identifier of the source document.
|
| 99 |
metadata: Metadata to attach to each chunk.
|
|
|
|
| 100 |
|
| 101 |
Returns:
|
| 102 |
List of DocumentChunk with strategy=RECURSIVE.
|
|
@@ -107,12 +113,12 @@ class RecursiveChunker(BaseChunker):
|
|
| 107 |
)
|
| 108 |
texts = splitter.split_text(text)
|
| 109 |
chunks: list[DocumentChunk] = []
|
| 110 |
-
for
|
| 111 |
chunks.append(DocumentChunk(
|
| 112 |
-
chunk_id=_make_chunk_id(document_id,
|
| 113 |
document_id=document_id,
|
| 114 |
text=chunk_text,
|
| 115 |
-
metadata={**metadata, "chunk_index":
|
| 116 |
strategy=ChunkStrategy.RECURSIVE,
|
| 117 |
))
|
| 118 |
logger.debug("RecursiveChunker produced %d chunks for %s", len(chunks), document_id)
|
|
@@ -136,7 +142,8 @@ class SemanticChunker(BaseChunker):
|
|
| 136 |
self._embeddings = embeddings
|
| 137 |
|
| 138 |
def chunk(
|
| 139 |
-
self, text: str, document_id: str, metadata: dict[str, str | int]
|
|
|
|
| 140 |
) -> list[DocumentChunk]:
|
| 141 |
"""Split text at semantic boundaries.
|
| 142 |
|
|
@@ -144,6 +151,7 @@ class SemanticChunker(BaseChunker):
|
|
| 144 |
text: The full text to chunk.
|
| 145 |
document_id: Identifier of the source document.
|
| 146 |
metadata: Metadata to attach to each chunk.
|
|
|
|
| 147 |
|
| 148 |
Returns:
|
| 149 |
List of DocumentChunk with strategy=SEMANTIC.
|
|
@@ -151,12 +159,12 @@ class SemanticChunker(BaseChunker):
|
|
| 151 |
splitter = LCSemanticChunker(embeddings=self._embeddings)
|
| 152 |
docs = splitter.create_documents([text])
|
| 153 |
chunks: list[DocumentChunk] = []
|
| 154 |
-
for
|
| 155 |
chunks.append(DocumentChunk(
|
| 156 |
-
chunk_id=_make_chunk_id(document_id,
|
| 157 |
document_id=document_id,
|
| 158 |
text=doc.page_content,
|
| 159 |
-
metadata={**metadata, "chunk_index":
|
| 160 |
strategy=ChunkStrategy.SEMANTIC,
|
| 161 |
))
|
| 162 |
logger.debug("SemanticChunker produced %d chunks for %s", len(chunks), document_id)
|
|
|
|
| 32 |
self.chunk_overlap = chunk_overlap
|
| 33 |
|
| 34 |
def chunk(
|
| 35 |
+
self, text: str, document_id: str, metadata: dict[str, str | int],
|
| 36 |
+
start_index: int = 0,
|
| 37 |
) -> list[DocumentChunk]:
|
| 38 |
"""Split text into chunks.
|
| 39 |
|
|
|
|
| 41 |
text: The full text to chunk.
|
| 42 |
document_id: Identifier of the source document.
|
| 43 |
metadata: Metadata to attach to each chunk.
|
| 44 |
+
start_index: Starting chunk index for globally unique IDs.
|
| 45 |
|
| 46 |
Returns:
|
| 47 |
List of DocumentChunk objects.
|
|
|
|
| 53 |
"""Splits text into fixed-size character chunks with overlap."""
|
| 54 |
|
| 55 |
def chunk(
|
| 56 |
+
self, text: str, document_id: str, metadata: dict[str, str | int],
|
| 57 |
+
start_index: int = 0,
|
| 58 |
) -> list[DocumentChunk]:
|
| 59 |
"""Split text into fixed-size chunks using LangChain CharacterTextSplitter.
|
| 60 |
|
|
|
|
| 62 |
text: The full text to chunk.
|
| 63 |
document_id: Identifier of the source document.
|
| 64 |
metadata: Metadata to attach to each chunk.
|
| 65 |
+
start_index: Starting chunk index for globally unique IDs.
|
| 66 |
|
| 67 |
Returns:
|
| 68 |
List of DocumentChunk with strategy=FIXED_SIZE.
|
|
|
|
| 77 |
texts = splitter.split_text(text)
|
| 78 |
chunks = [
|
| 79 |
DocumentChunk(
|
| 80 |
+
chunk_id=_make_chunk_id(document_id, start_index + i),
|
| 81 |
document_id=document_id,
|
| 82 |
text=chunk_text,
|
| 83 |
+
metadata={**metadata, "chunk_index": start_index + i},
|
| 84 |
strategy=ChunkStrategy.FIXED_SIZE,
|
| 85 |
)
|
| 86 |
+
for i, chunk_text in enumerate(texts)
|
| 87 |
]
|
| 88 |
logger.debug("FixedSizeChunker produced %d chunks for %s", len(chunks), document_id)
|
| 89 |
return chunks
|
|
|
|
| 93 |
"""Recursively splits text using LangChain's RecursiveCharacterTextSplitter."""
|
| 94 |
|
| 95 |
def chunk(
|
| 96 |
+
self, text: str, document_id: str, metadata: dict[str, str | int],
|
| 97 |
+
start_index: int = 0,
|
| 98 |
) -> list[DocumentChunk]:
|
| 99 |
"""Split text using recursive character splitting.
|
| 100 |
|
|
|
|
| 102 |
text: The full text to chunk.
|
| 103 |
document_id: Identifier of the source document.
|
| 104 |
metadata: Metadata to attach to each chunk.
|
| 105 |
+
start_index: Starting chunk index for globally unique IDs.
|
| 106 |
|
| 107 |
Returns:
|
| 108 |
List of DocumentChunk with strategy=RECURSIVE.
|
|
|
|
| 113 |
)
|
| 114 |
texts = splitter.split_text(text)
|
| 115 |
chunks: list[DocumentChunk] = []
|
| 116 |
+
for i, chunk_text in enumerate(texts):
|
| 117 |
chunks.append(DocumentChunk(
|
| 118 |
+
chunk_id=_make_chunk_id(document_id, start_index + i),
|
| 119 |
document_id=document_id,
|
| 120 |
text=chunk_text,
|
| 121 |
+
metadata={**metadata, "chunk_index": start_index + i},
|
| 122 |
strategy=ChunkStrategy.RECURSIVE,
|
| 123 |
))
|
| 124 |
logger.debug("RecursiveChunker produced %d chunks for %s", len(chunks), document_id)
|
|
|
|
| 142 |
self._embeddings = embeddings
|
| 143 |
|
| 144 |
def chunk(
|
| 145 |
+
self, text: str, document_id: str, metadata: dict[str, str | int],
|
| 146 |
+
start_index: int = 0,
|
| 147 |
) -> list[DocumentChunk]:
|
| 148 |
"""Split text at semantic boundaries.
|
| 149 |
|
|
|
|
| 151 |
text: The full text to chunk.
|
| 152 |
document_id: Identifier of the source document.
|
| 153 |
metadata: Metadata to attach to each chunk.
|
| 154 |
+
start_index: Starting chunk index for globally unique IDs.
|
| 155 |
|
| 156 |
Returns:
|
| 157 |
List of DocumentChunk with strategy=SEMANTIC.
|
|
|
|
| 159 |
splitter = LCSemanticChunker(embeddings=self._embeddings)
|
| 160 |
docs = splitter.create_documents([text])
|
| 161 |
chunks: list[DocumentChunk] = []
|
| 162 |
+
for i, doc in enumerate(docs):
|
| 163 |
chunks.append(DocumentChunk(
|
| 164 |
+
chunk_id=_make_chunk_id(document_id, start_index + i),
|
| 165 |
document_id=document_id,
|
| 166 |
text=doc.page_content,
|
| 167 |
+
metadata={**metadata, "chunk_index": start_index + i},
|
| 168 |
strategy=ChunkStrategy.SEMANTIC,
|
| 169 |
))
|
| 170 |
logger.debug("SemanticChunker produced %d chunks for %s", len(chunks), document_id)
|
src/ingestion/pipeline.py
CHANGED
|
@@ -77,6 +77,7 @@ class IngestionPipeline:
|
|
| 77 |
|
| 78 |
pages = self.parser.parse(file_path)
|
| 79 |
all_chunks: list[DocumentChunk] = []
|
|
|
|
| 80 |
|
| 81 |
for page in pages:
|
| 82 |
raw_text = str(page["text"])
|
|
@@ -88,8 +89,9 @@ class IngestionPipeline:
|
|
| 88 |
"source": str(page["source"]),
|
| 89 |
"page_number": int(page["page_number"]),
|
| 90 |
}
|
| 91 |
-
chunks = self.chunker.chunk(cleaned, document_id, metadata)
|
| 92 |
all_chunks.extend(chunks)
|
|
|
|
| 93 |
|
| 94 |
logger.info("Ingested %d chunks from %s", len(all_chunks), file_path)
|
| 95 |
return all_chunks
|
|
|
|
| 77 |
|
| 78 |
pages = self.parser.parse(file_path)
|
| 79 |
all_chunks: list[DocumentChunk] = []
|
| 80 |
+
chunk_offset = 0
|
| 81 |
|
| 82 |
for page in pages:
|
| 83 |
raw_text = str(page["text"])
|
|
|
|
| 89 |
"source": str(page["source"]),
|
| 90 |
"page_number": int(page["page_number"]),
|
| 91 |
}
|
| 92 |
+
chunks = self.chunker.chunk(cleaned, document_id, metadata, start_index=chunk_offset)
|
| 93 |
all_chunks.extend(chunks)
|
| 94 |
+
chunk_offset += len(chunks)
|
| 95 |
|
| 96 |
logger.info("Ingested %d chunks from %s", len(all_chunks), file_path)
|
| 97 |
return all_chunks
|
src/models.py
CHANGED
|
@@ -56,21 +56,22 @@ class QueryResult:
|
|
| 56 |
score: float
|
| 57 |
source: str
|
| 58 |
|
| 59 |
-
def to_dict(self, *, include_text: bool = True) -> dict
|
| 60 |
"""Serialise to a JSON-safe dictionary.
|
| 61 |
|
| 62 |
Args:
|
| 63 |
include_text: Whether to include the chunk text (default True).
|
| 64 |
|
| 65 |
Returns:
|
| 66 |
-
Dictionary with chunk_id, document_id, score, source,
|
| 67 |
-
optionally text.
|
| 68 |
"""
|
| 69 |
-
d: dict
|
| 70 |
"chunk_id": self.chunk.chunk_id,
|
| 71 |
"document_id": self.chunk.document_id,
|
| 72 |
"score": self.score,
|
| 73 |
"source": self.source,
|
|
|
|
| 74 |
}
|
| 75 |
if include_text:
|
| 76 |
d["text"] = self.chunk.text
|
|
|
|
| 56 |
score: float
|
| 57 |
source: str
|
| 58 |
|
| 59 |
+
def to_dict(self, *, include_text: bool = True) -> dict:
|
| 60 |
"""Serialise to a JSON-safe dictionary.
|
| 61 |
|
| 62 |
Args:
|
| 63 |
include_text: Whether to include the chunk text (default True).
|
| 64 |
|
| 65 |
Returns:
|
| 66 |
+
Dictionary with chunk_id, document_id, score, source, metadata,
|
| 67 |
+
and optionally text.
|
| 68 |
"""
|
| 69 |
+
d: dict = {
|
| 70 |
"chunk_id": self.chunk.chunk_id,
|
| 71 |
"document_id": self.chunk.document_id,
|
| 72 |
"score": self.score,
|
| 73 |
"source": self.source,
|
| 74 |
+
"metadata": self.chunk.metadata,
|
| 75 |
}
|
| 76 |
if include_text:
|
| 77 |
d["text"] = self.chunk.text
|
src/retrieval/bm25_search.py
CHANGED
|
@@ -45,7 +45,8 @@ class BM25Search:
|
|
| 45 |
tokenized_query = self._tokenize(query)
|
| 46 |
scores = self._index.get_scores(tokenized_query)
|
| 47 |
|
| 48 |
-
|
|
|
|
| 49 |
|
| 50 |
results = [
|
| 51 |
QueryResult(
|
|
@@ -54,7 +55,6 @@ class BM25Search:
|
|
| 54 |
source="bm25",
|
| 55 |
)
|
| 56 |
for i in ranked_indices
|
| 57 |
-
if scores[i] > 0.0
|
| 58 |
]
|
| 59 |
logger.debug("BM25 search returned %d results for query: %s", len(results), query)
|
| 60 |
return results
|
|
|
|
| 45 |
tokenized_query = self._tokenize(query)
|
| 46 |
scores = self._index.get_scores(tokenized_query)
|
| 47 |
|
| 48 |
+
positive_indices = [i for i in range(len(scores)) if scores[i] > 0.0]
|
| 49 |
+
ranked_indices = sorted(positive_indices, key=lambda i: scores[i], reverse=True)[:top_k]
|
| 50 |
|
| 51 |
results = [
|
| 52 |
QueryResult(
|
|
|
|
| 55 |
source="bm25",
|
| 56 |
)
|
| 57 |
for i in ranked_indices
|
|
|
|
| 58 |
]
|
| 59 |
logger.debug("BM25 search returned %d results for query: %s", len(results), query)
|
| 60 |
return results
|
src/retrieval/vector_store.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
"""Qdrant vector store for dense retrieval."""
|
| 2 |
|
|
|
|
| 3 |
import json
|
| 4 |
import logging
|
| 5 |
|
|
@@ -77,7 +78,7 @@ class VectorStore:
|
|
| 77 |
|
| 78 |
points = [
|
| 79 |
PointStruct(
|
| 80 |
-
id=
|
| 81 |
vector=embedding,
|
| 82 |
payload={
|
| 83 |
"chunk_id": chunk.chunk_id,
|
|
@@ -87,7 +88,7 @@ class VectorStore:
|
|
| 87 |
"strategy": chunk.strategy.value,
|
| 88 |
},
|
| 89 |
)
|
| 90 |
-
for
|
| 91 |
]
|
| 92 |
|
| 93 |
self._client.upsert(collection_name=self._collection_name, points=points)
|
|
|
|
| 1 |
"""Qdrant vector store for dense retrieval."""
|
| 2 |
|
| 3 |
+
import hashlib
|
| 4 |
import json
|
| 5 |
import logging
|
| 6 |
|
|
|
|
| 78 |
|
| 79 |
points = [
|
| 80 |
PointStruct(
|
| 81 |
+
id=int(hashlib.sha256(chunk.chunk_id.encode()).hexdigest()[:15], 16),
|
| 82 |
vector=embedding,
|
| 83 |
payload={
|
| 84 |
"chunk_id": chunk.chunk_id,
|
|
|
|
| 88 |
"strategy": chunk.strategy.value,
|
| 89 |
},
|
| 90 |
)
|
| 91 |
+
for chunk, embedding in zip(chunks, embeddings)
|
| 92 |
]
|
| 93 |
|
| 94 |
self._client.upsert(collection_name=self._collection_name, points=points)
|
src/ui/app.py
CHANGED
|
@@ -4,6 +4,7 @@ Calls the FastAPI backend at http://localhost:8000.
|
|
| 4 |
Single-page document search interface with clean sans-serif design.
|
| 5 |
"""
|
| 6 |
|
|
|
|
| 7 |
import json
|
| 8 |
import os
|
| 9 |
import random
|
|
@@ -721,10 +722,35 @@ if search_clicked and question.strip():
|
|
| 721 |
|
| 722 |
elif _step == "tool_result":
|
| 723 |
_rc = _event.get("result_count", 0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 724 |
st.write(
|
| 725 |
-
(f"
|
| 726 |
if lang == "da"
|
| 727 |
-
else (f"
|
| 728 |
)
|
| 729 |
|
| 730 |
elif _step == "generate":
|
|
@@ -783,7 +809,7 @@ if search_clicked and question.strip():
|
|
| 783 |
|
| 784 |
# -- Answer --
|
| 785 |
answer = data.get("answer", t["no_answer"])
|
| 786 |
-
st.markdown(
|
| 787 |
|
| 788 |
# -- Sources --
|
| 789 |
sources = data.get("sources", [])
|
|
@@ -795,17 +821,17 @@ if search_clicked and question.strip():
|
|
| 795 |
score = src.get("score", 0.0)
|
| 796 |
retrieval_source = src.get("source", "")
|
| 797 |
metadata = src.get("metadata", {})
|
| 798 |
-
page = metadata.get("
|
| 799 |
|
| 800 |
page_info = f' · {t["page_label"]} {page}' if page else ""
|
| 801 |
score_display = f"{score:.3f}"
|
| 802 |
|
| 803 |
st.markdown(
|
| 804 |
f'<div class="source-card">'
|
| 805 |
-
f'<div class="source-card-title">{doc_name}{page_info}</div>'
|
| 806 |
-
f'<div class="source-card-text">{text[:500]}</div>'
|
| 807 |
f'<div class="source-card-meta">'
|
| 808 |
-
f"Score: {score_display} · {retrieval_source}"
|
| 809 |
f"</div>"
|
| 810 |
f"</div>",
|
| 811 |
unsafe_allow_html=True,
|
|
@@ -844,48 +870,62 @@ if search_clicked and question.strip():
|
|
| 844 |
)
|
| 845 |
st.markdown(f"{header}\n{rows}")
|
| 846 |
|
| 847 |
-
|
| 848 |
-
|
|
|
|
| 849 |
|
| 850 |
-
|
|
|
|
|
|
|
| 851 |
|
| 852 |
-
|
| 853 |
-
_render_result_table(pd.get("dense_results", []), t["pipeline_dense"])
|
| 854 |
|
| 855 |
-
|
|
|
|
|
|
|
|
|
|
| 856 |
|
| 857 |
-
|
| 858 |
-
|
| 859 |
|
| 860 |
-
|
| 861 |
|
| 862 |
-
# 5) Reranked
|
| 863 |
reranked = pd.get("reranked_results", [])
|
| 864 |
st.markdown(f'**{t["pipeline_reranked"]}**')
|
| 865 |
if reranked:
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
-
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
|
| 886 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 887 |
)
|
| 888 |
-
|
| 889 |
else:
|
| 890 |
st.caption(t["pipeline_no_results"])
|
| 891 |
|
|
|
|
| 4 |
Single-page document search interface with clean sans-serif design.
|
| 5 |
"""
|
| 6 |
|
| 7 |
+
import html
|
| 8 |
import json
|
| 9 |
import os
|
| 10 |
import random
|
|
|
|
| 722 |
|
| 723 |
elif _step == "tool_result":
|
| 724 |
_rc = _event.get("result_count", 0)
|
| 725 |
+
_tool_name = _event.get("tool", "")
|
| 726 |
+
if _tool_name == "list_documents":
|
| 727 |
+
# list_documents returns doc list in its text,
|
| 728 |
+
# parse count from the tool output or show generic
|
| 729 |
+
st.write(
|
| 730 |
+
"Dokumentliste hentet"
|
| 731 |
+
if lang == "da"
|
| 732 |
+
else "Document list retrieved"
|
| 733 |
+
)
|
| 734 |
+
elif _tool_name == "fetch_document":
|
| 735 |
+
st.write(
|
| 736 |
+
(f"Hentet dokument (**{_rc}** afsnit)")
|
| 737 |
+
if lang == "da"
|
| 738 |
+
else (f"Fetched document (**{_rc}** chunks)")
|
| 739 |
+
)
|
| 740 |
+
else:
|
| 741 |
+
st.write(
|
| 742 |
+
(f"Fandt **{_rc}** relevante passager")
|
| 743 |
+
if lang == "da"
|
| 744 |
+
else (f"Found **{_rc}** relevant passages")
|
| 745 |
+
)
|
| 746 |
+
|
| 747 |
+
elif _step == "broaden_query":
|
| 748 |
+
_retry = _event.get("retry_count", 1)
|
| 749 |
+
_rq = _event.get("retrieval_query", "")
|
| 750 |
st.write(
|
| 751 |
+
(f"Lav konfidensgrad – forsøg {_retry} med udvidet søgning: _{_rq}_")
|
| 752 |
if lang == "da"
|
| 753 |
+
else (f"Low confidence – retry {_retry} with broadened query: _{_rq}_")
|
| 754 |
)
|
| 755 |
|
| 756 |
elif _step == "generate":
|
|
|
|
| 809 |
|
| 810 |
# -- Answer --
|
| 811 |
answer = data.get("answer", t["no_answer"])
|
| 812 |
+
st.markdown(answer)
|
| 813 |
|
| 814 |
# -- Sources --
|
| 815 |
sources = data.get("sources", [])
|
|
|
|
| 821 |
score = src.get("score", 0.0)
|
| 822 |
retrieval_source = src.get("source", "")
|
| 823 |
metadata = src.get("metadata", {})
|
| 824 |
+
page = metadata.get("page_number", "") if isinstance(metadata, dict) else ""
|
| 825 |
|
| 826 |
page_info = f' · {t["page_label"]} {page}' if page else ""
|
| 827 |
score_display = f"{score:.3f}"
|
| 828 |
|
| 829 |
st.markdown(
|
| 830 |
f'<div class="source-card">'
|
| 831 |
+
f'<div class="source-card-title">{html.escape(doc_name)}{page_info}</div>'
|
| 832 |
+
f'<div class="source-card-text">{html.escape(text[:500])}</div>'
|
| 833 |
f'<div class="source-card-meta">'
|
| 834 |
+
f"Score: {score_display} · {html.escape(retrieval_source)}"
|
| 835 |
f"</div>"
|
| 836 |
f"</div>",
|
| 837 |
unsafe_allow_html=True,
|
|
|
|
| 870 |
)
|
| 871 |
st.markdown(f"{header}\n{rows}")
|
| 872 |
|
| 873 |
+
_has_retrieval = bool(
|
| 874 |
+
pd.get("dense_results") or pd.get("sparse_results") or pd.get("fused_results")
|
| 875 |
+
)
|
| 876 |
|
| 877 |
+
if _has_retrieval:
|
| 878 |
+
# 2) BM25 results
|
| 879 |
+
_render_result_table(pd.get("sparse_results", []), t["pipeline_bm25"])
|
| 880 |
|
| 881 |
+
st.markdown("---")
|
|
|
|
| 882 |
|
| 883 |
+
# 3) Vector search results
|
| 884 |
+
_render_result_table(pd.get("dense_results", []), t["pipeline_dense"])
|
| 885 |
+
|
| 886 |
+
st.markdown("---")
|
| 887 |
|
| 888 |
+
# 4) RRF fused ranking
|
| 889 |
+
_render_result_table(pd.get("fused_results", []), t["pipeline_fused"])
|
| 890 |
|
| 891 |
+
st.markdown("---")
|
| 892 |
|
| 893 |
+
# 5) Reranked / fetched results
|
| 894 |
reranked = pd.get("reranked_results", [])
|
| 895 |
st.markdown(f'**{t["pipeline_reranked"]}**')
|
| 896 |
if reranked:
|
| 897 |
+
if _has_retrieval:
|
| 898 |
+
# Show score change from RRF → reranking
|
| 899 |
+
fused_scores: dict[str, float] = {
|
| 900 |
+
r.get("chunk_id", ""): r.get("score", 0.0)
|
| 901 |
+
for r in pd.get("fused_results", [])
|
| 902 |
+
}
|
| 903 |
+
header = (
|
| 904 |
+
f'| {t["pipeline_rank"]} | {t["pipeline_doc"]} | '
|
| 905 |
+
f'{t["pipeline_score"]} | {t["pipeline_score_change"]} |\n'
|
| 906 |
+
f"|---|---|---|---|"
|
| 907 |
+
)
|
| 908 |
+
rows_list = []
|
| 909 |
+
for i, r in enumerate(reranked):
|
| 910 |
+
cid = r.get("chunk_id", "")
|
| 911 |
+
new_score = r.get("score", 0.0)
|
| 912 |
+
old_score = fused_scores.get(cid)
|
| 913 |
+
if old_score is not None:
|
| 914 |
+
change = f"RRF {old_score:.4f} -> {new_score:.4f}"
|
| 915 |
+
else:
|
| 916 |
+
change = "-"
|
| 917 |
+
rows_list.append(
|
| 918 |
+
f'| {i + 1} | {_truncate_doc(r.get("document_id", ""))} | {new_score:.4f} | {change} |'
|
| 919 |
+
)
|
| 920 |
+
st.markdown(f"{header}\n" + "\n".join(rows_list))
|
| 921 |
+
else:
|
| 922 |
+
# No hybrid search was used (e.g. fetch_document only) — simple table
|
| 923 |
+
header = f'| {t["pipeline_rank"]} | {t["pipeline_doc"]} | {t["pipeline_score"]} |\n|---|---|---|'
|
| 924 |
+
rows = "\n".join(
|
| 925 |
+
f'| {i + 1} | {_truncate_doc(r.get("document_id", ""))} | {r.get("score", 0):.4f} |'
|
| 926 |
+
for i, r in enumerate(reranked)
|
| 927 |
)
|
| 928 |
+
st.markdown(f"{header}\n{rows}")
|
| 929 |
else:
|
| 930 |
st.caption(t["pipeline_no_results"])
|
| 931 |
|