Spaces:
Running
Running
GitHub Actions committed on
Commit ·
e007166
1
Parent(s): 385ac95
Deploy 0e8fb42
Browse files
app/core/quality.py
CHANGED
|
@@ -35,6 +35,35 @@ _HEDGE_PHRASES: tuple[str, ...] = (
|
|
| 35 |
)
|
| 36 |
|
| 37 |
_RAW_TAG_RE = re.compile(r"</?[a-zA-Z][^>]*>")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
|
|
@@ -55,6 +84,10 @@ def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
|
|
| 55 |
return True
|
| 56 |
if chunks and not re.search(r"\[\d+\]", answer):
|
| 57 |
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
if complexity == "complex" and len(answer.split()) < 30:
|
| 59 |
return True
|
| 60 |
return False
|
|
|
|
| 35 |
)
|
| 36 |
|
| 37 |
_RAW_TAG_RE = re.compile(r"</?[a-zA-Z][^>]*>")
|
| 38 |
+
_CITATION_RE = re.compile(r"\[\d+\]")
|
| 39 |
+
_WORD_RE = re.compile(r"[a-zA-Z0-9]+")
|
| 40 |
+
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
|
| 41 |
+
|
| 42 |
+
_MIN_FACT_SENTENCE_WORDS = 6
|
| 43 |
+
_MIN_CITATION_COVERAGE = 0.70
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _is_fact_like_sentence(sentence: str) -> bool:
|
| 47 |
+
stripped = sentence.strip()
|
| 48 |
+
if not stripped:
|
| 49 |
+
return False
|
| 50 |
+
# Skip lightweight list headers and short connective lines.
|
| 51 |
+
if re.match(r"^\d+\.\s", stripped):
|
| 52 |
+
return False
|
| 53 |
+
return len(_WORD_RE.findall(stripped)) >= _MIN_FACT_SENTENCE_WORDS
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _citation_coverage(answer: str) -> tuple[int, int]:
|
| 57 |
+
"""Return (cited_fact_sentences, total_fact_sentences)."""
|
| 58 |
+
total = 0
|
| 59 |
+
cited = 0
|
| 60 |
+
for sentence in _SENTENCE_SPLIT_RE.split(answer):
|
| 61 |
+
if not _is_fact_like_sentence(sentence):
|
| 62 |
+
continue
|
| 63 |
+
total += 1
|
| 64 |
+
if _CITATION_RE.search(sentence):
|
| 65 |
+
cited += 1
|
| 66 |
+
return cited, total
|
| 67 |
|
| 68 |
|
| 69 |
def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
|
|
|
|
| 84 |
return True
|
| 85 |
if chunks and not re.search(r"\[\d+\]", answer):
|
| 86 |
return True
|
| 87 |
+
if chunks:
|
| 88 |
+
cited_count, fact_count = _citation_coverage(answer)
|
| 89 |
+
if fact_count >= 2 and (cited_count / fact_count) < _MIN_CITATION_COVERAGE:
|
| 90 |
+
return True
|
| 91 |
if complexity == "complex" and len(answer.split()) < 30:
|
| 92 |
return True
|
| 93 |
return False
|
app/pipeline/nodes/retrieve.py
CHANGED
|
@@ -41,6 +41,22 @@ _SIBLING_EXPAND_TOP_N: int = 10 # rank depth to consider for expansion
|
|
| 41 |
_SIBLING_FETCH_LIMIT: int = 20 # max chunks fetched via Qdrant doc_id query
|
| 42 |
_SIBLING_TOTAL_CAP: int = 15 # max new chunks to inject before reranker
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
# Keywords that imply the visitor wants depth from a specific source type.
|
| 45 |
# Values are the source_type values set by ingest (ChunkMetadata.source_type).
|
| 46 |
_FOCUS_KEYWORDS: dict[frozenset[str], str] = {
|
|
@@ -226,6 +242,14 @@ def _is_capability_query(query: str) -> bool:
|
|
| 226 |
return bool(tokens & _CAPABILITY_QUERY_HINTS)
|
| 227 |
|
| 228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
def make_retrieve_node(
|
| 230 |
vector_store: VectorStore, embedder: Embedder, reranker: Reranker
|
| 231 |
) -> Callable[[PipelineState], dict]:
|
|
@@ -283,20 +307,22 @@ def make_retrieve_node(
|
|
| 283 |
dense_results.append(chunks)
|
| 284 |
|
| 285 |
# ── Split dense hits into leaf candidates and navigation nodes ─────────
|
| 286 |
-
#
|
| 287 |
-
#
|
| 288 |
-
|
| 289 |
-
|
| 290 |
nav_expansion_ids: set[str] = set()
|
| 291 |
|
| 292 |
for hit_list in dense_results:
|
|
|
|
|
|
|
| 293 |
for chunk in hit_list:
|
| 294 |
ct = chunk["metadata"].get("chunk_type", "leaf")
|
| 295 |
if ct == "leaf":
|
| 296 |
fp = f"{chunk['metadata']['doc_id']}::{chunk['metadata']['section']}"
|
| 297 |
-
if fp not in
|
| 298 |
-
|
| 299 |
-
|
| 300 |
elif ct == "raptor_summary":
|
| 301 |
for uid in (chunk["metadata"].get("child_leaf_ids") or []):
|
| 302 |
nav_expansion_ids.add(uid)
|
|
@@ -304,17 +330,17 @@ def make_retrieve_node(
|
|
| 304 |
uid = chunk["metadata"].get("parent_leaf_id", "")
|
| 305 |
if uid:
|
| 306 |
nav_expansion_ids.add(uid)
|
|
|
|
|
|
|
| 307 |
|
| 308 |
-
|
| 309 |
if nav_expansion_ids:
|
| 310 |
expanded_leaves = vector_store.fetch_by_point_ids(list(nav_expansion_ids))
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
logger.debug("UUID expansion: +%d leaves from %d nav node UUIDs.",
|
| 317 |
-
len(expanded_leaves), len(nav_expansion_ids))
|
| 318 |
|
| 319 |
# ── Query Normalization & Alias Generation ─────────────────────────────
|
| 320 |
# If the user asks for "xsilica", generate "x silica" and "x-silica".
|
|
@@ -338,17 +364,28 @@ def make_retrieve_node(
|
|
| 338 |
normalized_forms.add(retrieval_query.replace("-", ""))
|
| 339 |
normalized_forms.add(retrieval_query.replace("-", " "))
|
| 340 |
|
| 341 |
-
# ── Exact Keyword Filter Search (
|
| 342 |
-
# Runs a MatchAny query on Qdrant's `keywords` payload
|
|
|
|
| 343 |
keyword_results: list[Chunk] = []
|
| 344 |
-
extracted_keywords
|
| 345 |
-
|
| 346 |
-
|
|
|
|
|
|
|
|
|
|
| 347 |
for norm in normalized_forms:
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
|
| 350 |
# Only query strong >= 4 char keywords to avoid noise matching
|
| 351 |
-
strong_keywords =
|
| 352 |
if strong_keywords:
|
| 353 |
keyword_results = vector_store.keyword_filter_search(strong_keywords, top_k=15)
|
| 354 |
|
|
@@ -360,19 +397,32 @@ def make_retrieve_node(
|
|
| 360 |
sparse_results = vector_store.search_sparse(indices, values, top_k=20)
|
| 361 |
|
| 362 |
# ── Reciprocal Rank Fusion ─────────────────────────────────────────────
|
| 363 |
-
# Merge dense (per variant) + sparse + keyword into one ranked list.
|
| 364 |
-
# Dynamic Weighting: Explicit keyword entity matches get a 1.5x boost
|
| 365 |
-
# over semantic proximity in the RRF formula.
|
| 366 |
all_ranked_lists: list[tuple[float, list[Chunk]]] = []
|
| 367 |
-
for dense_res in
|
| 368 |
all_ranked_lists.append((1.0, dense_res))
|
|
|
|
|
|
|
|
|
|
| 369 |
|
| 370 |
if sparse_results:
|
| 371 |
-
all_ranked_lists.append((
|
| 372 |
|
| 373 |
if keyword_results:
|
| 374 |
-
|
|
|
|
|
|
|
|
|
|
| 375 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
fused: list[Chunk] = _rrf_merge(all_ranked_lists)
|
| 377 |
|
| 378 |
# ── Reading events — one per unique source document ────────────────────
|
|
@@ -451,12 +501,16 @@ def make_retrieve_node(
|
|
| 451 |
if sibling_count >= _SIBLING_TOTAL_CAP:
|
| 452 |
break
|
| 453 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
try:
|
| 455 |
-
reranked = await reranker.rerank(retrieval_query,
|
| 456 |
except (Exception, asyncio.CancelledError) as exc:
|
| 457 |
logger.error("retrieve: reranker failed (%s); falling back to base retrieval scores.", exc)
|
| 458 |
writer({"type": "status", "label": "Reranker offline; using base retrieval scores..."})
|
| 459 |
-
reranked =
|
| 460 |
# mock top_score so relevance gate allows it through if unique_chunks exist
|
| 461 |
if reranked:
|
| 462 |
reranked[0]["metadata"]["rerank_score"] = 1.0
|
|
|
|
| 41 |
_SIBLING_FETCH_LIMIT: int = 20 # max chunks fetched via Qdrant doc_id query
|
| 42 |
_SIBLING_TOTAL_CAP: int = 15 # max new chunks to inject before reranker
|
| 43 |
|
| 44 |
+
# Leaf chunks expanded from navigation-node UUID edges should influence rank,
|
| 45 |
+
# but with less weight than direct dense matches.
|
| 46 |
+
_EXPANDED_LEAF_RRF_WEIGHT: float = 0.55
|
| 47 |
+
|
| 48 |
+
# Sparse lexical retrieval is the primary lexical signal.
|
| 49 |
+
_SPARSE_RRF_WEIGHT: float = 1.1
|
| 50 |
+
|
| 51 |
+
# Keyword payload filtering is only an entity recall assist; do not let it
|
| 52 |
+
# dominate ranking (BM25 already covers lexical matching semantics).
|
| 53 |
+
_KEYWORD_RRF_WEIGHT_WITH_SPARSE: float = 0.25
|
| 54 |
+
_KEYWORD_RRF_WEIGHT_NO_SPARSE: float = 0.75
|
| 55 |
+
|
| 56 |
+
# Minimum token count for rerank candidates to avoid low-information lines
|
| 57 |
+
# (e.g., contact headers) consuming top reranker slots.
|
| 58 |
+
_MIN_RERANK_WORDS: int = 8
|
| 59 |
+
|
| 60 |
# Keywords that imply the visitor wants depth from a specific source type.
|
| 61 |
# Values are the source_type values set by ingest (ChunkMetadata.source_type).
|
| 62 |
_FOCUS_KEYWORDS: dict[frozenset[str], str] = {
|
|
|
|
| 242 |
return bool(tokens & _CAPABILITY_QUERY_HINTS)
|
| 243 |
|
| 244 |
|
| 245 |
+
def _is_informative_chunk(chunk: Chunk) -> bool:
    """Return True when the chunk text has enough lexical content for reranking.

    The ``contextualised_text`` field is used when it is present and non-empty;
    otherwise the ``text`` field is used. The chosen text must contain at least
    ``_MIN_RERANK_WORDS`` alphanumeric tokens.
    """
    # Both fields are read with .get() so a chunk lacking "text" is treated as
    # empty (not informative) instead of raising KeyError.
    text = (chunk.get("contextualised_text") or chunk.get("text") or "").strip()
    if not text:
        return False
    return len(re.findall(r"[a-zA-Z0-9]+", text)) >= _MIN_RERANK_WORDS
|
| 251 |
+
|
| 252 |
+
|
| 253 |
def make_retrieve_node(
|
| 254 |
vector_store: VectorStore, embedder: Embedder, reranker: Reranker
|
| 255 |
) -> Callable[[PipelineState], dict]:
|
|
|
|
| 307 |
dense_results.append(chunks)
|
| 308 |
|
| 309 |
# ── Split dense hits into leaf candidates and navigation nodes ─────────
|
| 310 |
+
# Dense retrieval may return navigation nodes (raptor_summary/question_proxy).
|
| 311 |
+
# Keep per-query leaf-only rankings for RRF and expand nav UUID edges to
|
| 312 |
+
# supplemental leaf candidates in a lower-weight RRF list.
|
| 313 |
+
dense_leaf_results: list[list[Chunk]] = []
|
| 314 |
nav_expansion_ids: set[str] = set()
|
| 315 |
|
| 316 |
for hit_list in dense_results:
|
| 317 |
+
per_query_leaf: list[Chunk] = []
|
| 318 |
+
per_query_seen: set[str] = set()
|
| 319 |
for chunk in hit_list:
|
| 320 |
ct = chunk["metadata"].get("chunk_type", "leaf")
|
| 321 |
if ct == "leaf":
|
| 322 |
fp = f"{chunk['metadata']['doc_id']}::{chunk['metadata']['section']}"
|
| 323 |
+
if fp not in per_query_seen:
|
| 324 |
+
per_query_seen.add(fp)
|
| 325 |
+
per_query_leaf.append(chunk)
|
| 326 |
elif ct == "raptor_summary":
|
| 327 |
for uid in (chunk["metadata"].get("child_leaf_ids") or []):
|
| 328 |
nav_expansion_ids.add(uid)
|
|
|
|
| 330 |
uid = chunk["metadata"].get("parent_leaf_id", "")
|
| 331 |
if uid:
|
| 332 |
nav_expansion_ids.add(uid)
|
| 333 |
+
if per_query_leaf:
|
| 334 |
+
dense_leaf_results.append(per_query_leaf)
|
| 335 |
|
| 336 |
+
expanded_leaves: list[Chunk] = []
|
| 337 |
if nav_expansion_ids:
|
| 338 |
expanded_leaves = vector_store.fetch_by_point_ids(list(nav_expansion_ids))
|
| 339 |
+
logger.debug(
|
| 340 |
+
"UUID expansion: +%d leaves from %d nav node UUIDs.",
|
| 341 |
+
len(expanded_leaves),
|
| 342 |
+
len(nav_expansion_ids),
|
| 343 |
+
)
|
|
|
|
|
|
|
| 344 |
|
| 345 |
# ── Query Normalization & Alias Generation ─────────────────────────────
|
| 346 |
# If the user asks for "xsilica", generate "x silica" and "x-silica".
|
|
|
|
| 364 |
normalized_forms.add(retrieval_query.replace("-", ""))
|
| 365 |
normalized_forms.add(retrieval_query.replace("-", " "))
|
| 366 |
|
| 367 |
+
# ── Exact Keyword Filter Search (entity recall assist) ─────────────────
|
| 368 |
+
# Runs a MatchAny query on Qdrant's `keywords` payload index.
|
| 369 |
+
# This should complement sparse BM25, not override it.
|
| 370 |
keyword_results: list[Chunk] = []
|
| 371 |
+
extracted_keywords: set[str] = set()
|
| 372 |
+
|
| 373 |
+
for word in re.findall(r"[a-z0-9-]+", retrieval_query.lower()):
|
| 374 |
+
if len(word) >= 5 and word not in _STOPWORDS and word not in _CAPABILITY_QUERY_HINTS:
|
| 375 |
+
extracted_keywords.add(word)
|
| 376 |
+
|
| 377 |
for norm in normalized_forms:
|
| 378 |
+
norm_clean = norm.strip().lower()
|
| 379 |
+
if " " not in norm_clean and 4 <= len(norm_clean) <= 40 and norm_clean not in _STOPWORDS:
|
| 380 |
+
extracted_keywords.add(norm_clean)
|
| 381 |
+
|
| 382 |
+
for canonical in canonical_forms:
|
| 383 |
+
canonical_clean = canonical.strip().lower()
|
| 384 |
+
if " " not in canonical_clean and 4 <= len(canonical_clean) <= 40:
|
| 385 |
+
extracted_keywords.add(canonical_clean)
|
| 386 |
|
| 387 |
# Only query strong >= 4 char keywords to avoid noise matching
|
| 388 |
+
strong_keywords = sorted(extracted_keywords)
|
| 389 |
if strong_keywords:
|
| 390 |
keyword_results = vector_store.keyword_filter_search(strong_keywords, top_k=15)
|
| 391 |
|
|
|
|
| 397 |
sparse_results = vector_store.search_sparse(indices, values, top_k=20)
|
| 398 |
|
| 399 |
# ── Reciprocal Rank Fusion ─────────────────────────────────────────────
|
| 400 |
+
# Merge dense (per variant) + sparse + keyword-assist into one ranked list.
|
|
|
|
|
|
|
| 401 |
all_ranked_lists: list[tuple[float, list[Chunk]]] = []
|
| 402 |
+
for dense_res in dense_leaf_results:
|
| 403 |
all_ranked_lists.append((1.0, dense_res))
|
| 404 |
+
|
| 405 |
+
if expanded_leaves:
|
| 406 |
+
all_ranked_lists.append((_EXPANDED_LEAF_RRF_WEIGHT, expanded_leaves))
|
| 407 |
|
| 408 |
if sparse_results:
|
| 409 |
+
all_ranked_lists.append((_SPARSE_RRF_WEIGHT, sparse_results))
|
| 410 |
|
| 411 |
if keyword_results:
|
| 412 |
+
keyword_weight = (
|
| 413 |
+
_KEYWORD_RRF_WEIGHT_WITH_SPARSE if sparse_results else _KEYWORD_RRF_WEIGHT_NO_SPARSE
|
| 414 |
+
)
|
| 415 |
+
all_ranked_lists.append((keyword_weight, keyword_results))
|
| 416 |
|
| 417 |
+
if not all_ranked_lists:
|
| 418 |
+
return {
|
| 419 |
+
"answer": "",
|
| 420 |
+
"retrieved_chunks": [],
|
| 421 |
+
"reranked_chunks": [],
|
| 422 |
+
"retrieval_attempts": attempts + 1,
|
| 423 |
+
"top_rerank_score": None,
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
fused: list[Chunk] = _rrf_merge(all_ranked_lists)
|
| 427 |
|
| 428 |
# ── Reading events — one per unique source document ────────────────────
|
|
|
|
| 501 |
if sibling_count >= _SIBLING_TOTAL_CAP:
|
| 502 |
break
|
| 503 |
|
| 504 |
+
rerank_candidates = [chunk for chunk in unique_chunks if _is_informative_chunk(chunk)]
|
| 505 |
+
if not rerank_candidates:
|
| 506 |
+
rerank_candidates = unique_chunks
|
| 507 |
+
|
| 508 |
try:
|
| 509 |
+
reranked = await reranker.rerank(retrieval_query, rerank_candidates, top_k=10) # RC-5: raised from 7
|
| 510 |
except (Exception, asyncio.CancelledError) as exc:
|
| 511 |
logger.error("retrieve: reranker failed (%s); falling back to base retrieval scores.", exc)
|
| 512 |
writer({"type": "status", "label": "Reranker offline; using base retrieval scores..."})
|
| 513 |
+
reranked = rerank_candidates[:10]
|
| 514 |
# mock top_score so relevance gate allows it through if unique_chunks exist
|
| 515 |
if reranked:
|
| 516 |
reranked[0]["metadata"]["rerank_score"] = 1.0
|
tests/test_quality_gate_citation_coverage.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.core.quality import is_low_trust
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def test_low_trust_when_citation_coverage_is_too_low() -> None:
    """An answer where only one of three long sentences is cited is flagged."""
    evidence = [{"text": "resume evidence", "metadata": {}}]
    sentences = (
        "He worked at Xsilica and built payment-testing workflows. ",
        "The role improved throughput and reduced defects [1]. ",
        "He also collaborated across release cycles with API testing.",
    )

    verdict = is_low_trust("".join(sentences), evidence, complexity="simple")

    assert verdict is True
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_not_low_trust_when_most_fact_sentences_are_cited() -> None:
    """An answer whose three long sentences each carry a citation is not flagged."""
    evidence = [{"text": "resume evidence", "metadata": {}}]
    sentences = (
        "He worked at Xsilica as a QA intern [1]. ",
        "The role increased throughput under load tests [1]. ",
        "It also reduced post-release defects across releases [1].",
    )

    verdict = is_low_trust("".join(sentences), evidence, complexity="simple")

    assert verdict is False
|
tests/test_retrieve_chunk_quality_filter.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.pipeline.nodes.retrieve import _is_informative_chunk
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def _chunk(text: str) -> dict:
|
| 5 |
+
return {
|
| 6 |
+
"text": text,
|
| 7 |
+
"metadata": {
|
| 8 |
+
"doc_id": "resume",
|
| 9 |
+
"section": "Experience",
|
| 10 |
+
"source_title": "Resume",
|
| 11 |
+
"source_type": "resume",
|
| 12 |
+
},
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_informative_chunk_filter_rejects_low_information_lines() -> None:
    """A bare date-and-location line is rejected by the informative-chunk filter."""
    date_line = _chunk("Apr 2023 - Oct 2023 Hyderabad India")

    verdict = _is_informative_chunk(date_line)

    assert verdict is False
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_informative_chunk_filter_accepts_contentful_passages() -> None:
    """A full descriptive sentence is accepted by the informative-chunk filter."""
    passage = _chunk(
        "Reduced post-release defects by 40 percent across four releases by executing 250 test cases."
    )

    verdict = _is_informative_chunk(passage)

    assert verdict is True
|