GitHub Actions committed on
Commit
e007166
·
1 Parent(s): 385ac95

Deploy 0e8fb42

Browse files
app/core/quality.py CHANGED
@@ -35,6 +35,35 @@ _HEDGE_PHRASES: tuple[str, ...] = (
35
  )
36
 
37
  _RAW_TAG_RE = re.compile(r"</?[a-zA-Z][^>]*>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
  def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
@@ -55,6 +84,10 @@ def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
55
  return True
56
  if chunks and not re.search(r"\[\d+\]", answer):
57
  return True
 
 
 
 
58
  if complexity == "complex" and len(answer.split()) < 30:
59
  return True
60
  return False
 
35
  )
36
 
37
  _RAW_TAG_RE = re.compile(r"</?[a-zA-Z][^>]*>")
38
+ _CITATION_RE = re.compile(r"\[\d+\]")
39
+ _WORD_RE = re.compile(r"[a-zA-Z0-9]+")
40
+ _SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
41
+
42
+ _MIN_FACT_SENTENCE_WORDS = 6
43
+ _MIN_CITATION_COVERAGE = 0.70
44
+
45
+
46
+ def _is_fact_like_sentence(sentence: str) -> bool:
47
+ stripped = sentence.strip()
48
+ if not stripped:
49
+ return False
50
+ # Skip lightweight list headers and short connective lines.
51
+ if re.match(r"^\d+\.\s", stripped):
52
+ return False
53
+ return len(_WORD_RE.findall(stripped)) >= _MIN_FACT_SENTENCE_WORDS
54
+
55
+
56
+ def _citation_coverage(answer: str) -> tuple[int, int]:
57
+ """Return (cited_fact_sentences, total_fact_sentences)."""
58
+ total = 0
59
+ cited = 0
60
+ for sentence in _SENTENCE_SPLIT_RE.split(answer):
61
+ if not _is_fact_like_sentence(sentence):
62
+ continue
63
+ total += 1
64
+ if _CITATION_RE.search(sentence):
65
+ cited += 1
66
+ return cited, total
67
 
68
 
69
  def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
 
84
  return True
85
  if chunks and not re.search(r"\[\d+\]", answer):
86
  return True
87
+ if chunks:
88
+ cited_count, fact_count = _citation_coverage(answer)
89
+ if fact_count >= 2 and (cited_count / fact_count) < _MIN_CITATION_COVERAGE:
90
+ return True
91
  if complexity == "complex" and len(answer.split()) < 30:
92
  return True
93
  return False
app/pipeline/nodes/retrieve.py CHANGED
@@ -41,6 +41,22 @@ _SIBLING_EXPAND_TOP_N: int = 10 # rank depth to consider for expansion
41
  _SIBLING_FETCH_LIMIT: int = 20 # max chunks fetched via Qdrant doc_id query
42
  _SIBLING_TOTAL_CAP: int = 15 # max new chunks to inject before reranker
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  # Keywords that imply the visitor wants depth from a specific source type.
45
  # Values are the source_type values set by ingest (ChunkMetadata.source_type).
46
  _FOCUS_KEYWORDS: dict[frozenset[str], str] = {
@@ -226,6 +242,14 @@ def _is_capability_query(query: str) -> bool:
226
  return bool(tokens & _CAPABILITY_QUERY_HINTS)
227
 
228
 
 
 
 
 
 
 
 
 
229
  def make_retrieve_node(
230
  vector_store: VectorStore, embedder: Embedder, reranker: Reranker
231
  ) -> Callable[[PipelineState], dict]:
@@ -283,20 +307,22 @@ def make_retrieve_node(
283
  dense_results.append(chunks)
284
 
285
  # ── Split dense hits into leaf candidates and navigation nodes ─────────
286
- # raptor_summary and question_proxy are navigation-only; they are expanded
287
- # to their real leaf pages via Qdrant point UUID lookups.
288
- leaf_candidates: list[Chunk] = []
289
- leaf_fps_seen: set[str] = set()
290
  nav_expansion_ids: set[str] = set()
291
 
292
  for hit_list in dense_results:
 
 
293
  for chunk in hit_list:
294
  ct = chunk["metadata"].get("chunk_type", "leaf")
295
  if ct == "leaf":
296
  fp = f"{chunk['metadata']['doc_id']}::{chunk['metadata']['section']}"
297
- if fp not in leaf_fps_seen:
298
- leaf_fps_seen.add(fp)
299
- leaf_candidates.append(chunk)
300
  elif ct == "raptor_summary":
301
  for uid in (chunk["metadata"].get("child_leaf_ids") or []):
302
  nav_expansion_ids.add(uid)
@@ -304,17 +330,17 @@ def make_retrieve_node(
304
  uid = chunk["metadata"].get("parent_leaf_id", "")
305
  if uid:
306
  nav_expansion_ids.add(uid)
 
 
307
 
308
- # Expand nav nodes to their leaf pages in a single Qdrant retrieve call.
309
  if nav_expansion_ids:
310
  expanded_leaves = vector_store.fetch_by_point_ids(list(nav_expansion_ids))
311
- for leaf in expanded_leaves:
312
- fp = f"{leaf['metadata']['doc_id']}::{leaf['metadata']['section']}"
313
- if fp not in leaf_fps_seen:
314
- leaf_fps_seen.add(fp)
315
- leaf_candidates.append(leaf)
316
- logger.debug("UUID expansion: +%d leaves from %d nav node UUIDs.",
317
- len(expanded_leaves), len(nav_expansion_ids))
318
 
319
  # ── Query Normalization & Alias Generation ─────────────────────────────
320
  # If the user asks for "xsilica", generate "x silica" and "x-silica".
@@ -338,17 +364,28 @@ def make_retrieve_node(
338
  normalized_forms.add(retrieval_query.replace("-", ""))
339
  normalized_forms.add(retrieval_query.replace("-", " "))
340
 
341
- # ── Exact Keyword Filter Search (Database hit) ─────────────────────────
342
- # Runs a MatchAny query on Qdrant's `keywords` payload payload.
 
343
  keyword_results: list[Chunk] = []
344
- extracted_keywords = []
345
- for word in retrieval_query.lower().split():
346
- extracted_keywords.append(word)
 
 
 
347
  for norm in normalized_forms:
348
- extracted_keywords.append(norm)
 
 
 
 
 
 
 
349
 
350
  # Only query strong >= 4 char keywords to avoid noise matching
351
- strong_keywords = [k for k in extracted_keywords if len(k) >= 4 and k not in _STOPWORDS]
352
  if strong_keywords:
353
  keyword_results = vector_store.keyword_filter_search(strong_keywords, top_k=15)
354
 
@@ -360,19 +397,32 @@ def make_retrieve_node(
360
  sparse_results = vector_store.search_sparse(indices, values, top_k=20)
361
 
362
  # ── Reciprocal Rank Fusion ─────────────────────────────────────────────
363
- # Merge dense (per variant) + sparse + keyword into one ranked list.
364
- # Dynamic Weighting: Explicit keyword entity matches get a 1.5x boost
365
- # over semantic proximity in the RRF formula.
366
  all_ranked_lists: list[tuple[float, list[Chunk]]] = []
367
- for dense_res in dense_results:
368
  all_ranked_lists.append((1.0, dense_res))
 
 
 
369
 
370
  if sparse_results:
371
- all_ranked_lists.append((1.0, sparse_results))
372
 
373
  if keyword_results:
374
- all_ranked_lists.append((1.5, keyword_results))
 
 
 
375
 
 
 
 
 
 
 
 
 
 
376
  fused: list[Chunk] = _rrf_merge(all_ranked_lists)
377
 
378
  # ── Reading events — one per unique source document ────────────────────
@@ -451,12 +501,16 @@ def make_retrieve_node(
451
  if sibling_count >= _SIBLING_TOTAL_CAP:
452
  break
453
 
 
 
 
 
454
  try:
455
- reranked = await reranker.rerank(retrieval_query, unique_chunks, top_k=10) # RC-5: raised from 7
456
  except (Exception, asyncio.CancelledError) as exc:
457
  logger.error("retrieve: reranker failed (%s); falling back to base retrieval scores.", exc)
458
  writer({"type": "status", "label": "Reranker offline; using base retrieval scores..."})
459
- reranked = unique_chunks[:10]
460
  # mock top_score so relevance gate allows it through if unique_chunks exist
461
  if reranked:
462
  reranked[0]["metadata"]["rerank_score"] = 1.0
 
41
  _SIBLING_FETCH_LIMIT: int = 20 # max chunks fetched via Qdrant doc_id query
42
  _SIBLING_TOTAL_CAP: int = 15 # max new chunks to inject before reranker
43
 
44
+ # Leaf chunks expanded from navigation-node UUID edges should influence rank,
45
+ # but with less weight than direct dense matches.
46
+ _EXPANDED_LEAF_RRF_WEIGHT: float = 0.55
47
+
48
+ # Sparse lexical retrieval is the primary lexical signal.
49
+ _SPARSE_RRF_WEIGHT: float = 1.1
50
+
51
+ # Keyword payload filtering is only an entity recall assist; do not let it
52
+ # dominate ranking (BM25 already covers lexical matching semantics).
53
+ _KEYWORD_RRF_WEIGHT_WITH_SPARSE: float = 0.25
54
+ _KEYWORD_RRF_WEIGHT_NO_SPARSE: float = 0.75
55
+
56
+ # Minimum token count for rerank candidates to avoid low-information lines
57
+ # (e.g., contact headers) consuming top reranker slots.
58
+ _MIN_RERANK_WORDS: int = 8
59
+
60
  # Keywords that imply the visitor wants depth from a specific source type.
61
  # Values are the source_type values set by ingest (ChunkMetadata.source_type).
62
  _FOCUS_KEYWORDS: dict[frozenset[str], str] = {
 
242
  return bool(tokens & _CAPABILITY_QUERY_HINTS)
243
 
244
 
245
+ def _is_informative_chunk(chunk: Chunk) -> bool:
246
+ """True when chunk text has enough lexical content for cross-encoder reranking."""
247
+ text = (chunk.get("contextualised_text") or chunk["text"] or "").strip()
248
+ if not text:
249
+ return False
250
+ return len(re.findall(r"[a-zA-Z0-9]+", text)) >= _MIN_RERANK_WORDS
251
+
252
+
253
  def make_retrieve_node(
254
  vector_store: VectorStore, embedder: Embedder, reranker: Reranker
255
  ) -> Callable[[PipelineState], dict]:
 
307
  dense_results.append(chunks)
308
 
309
  # ── Split dense hits into leaf candidates and navigation nodes ─────────
310
+ # Dense retrieval may return navigation nodes (raptor_summary/question_proxy).
311
+ # Keep per-query leaf-only rankings for RRF and expand nav UUID edges to
312
+ # supplemental leaf candidates in a lower-weight RRF list.
313
+ dense_leaf_results: list[list[Chunk]] = []
314
  nav_expansion_ids: set[str] = set()
315
 
316
  for hit_list in dense_results:
317
+ per_query_leaf: list[Chunk] = []
318
+ per_query_seen: set[str] = set()
319
  for chunk in hit_list:
320
  ct = chunk["metadata"].get("chunk_type", "leaf")
321
  if ct == "leaf":
322
  fp = f"{chunk['metadata']['doc_id']}::{chunk['metadata']['section']}"
323
+ if fp not in per_query_seen:
324
+ per_query_seen.add(fp)
325
+ per_query_leaf.append(chunk)
326
  elif ct == "raptor_summary":
327
  for uid in (chunk["metadata"].get("child_leaf_ids") or []):
328
  nav_expansion_ids.add(uid)
 
330
  uid = chunk["metadata"].get("parent_leaf_id", "")
331
  if uid:
332
  nav_expansion_ids.add(uid)
333
+ if per_query_leaf:
334
+ dense_leaf_results.append(per_query_leaf)
335
 
336
+ expanded_leaves: list[Chunk] = []
337
  if nav_expansion_ids:
338
  expanded_leaves = vector_store.fetch_by_point_ids(list(nav_expansion_ids))
339
+ logger.debug(
340
+ "UUID expansion: +%d leaves from %d nav node UUIDs.",
341
+ len(expanded_leaves),
342
+ len(nav_expansion_ids),
343
+ )
 
 
344
 
345
  # ── Query Normalization & Alias Generation ─────────────────────────────
346
  # If the user asks for "xsilica", generate "x silica" and "x-silica".
 
364
  normalized_forms.add(retrieval_query.replace("-", ""))
365
  normalized_forms.add(retrieval_query.replace("-", " "))
366
 
367
+ # ── Exact Keyword Filter Search (entity recall assist) ─────────────────
368
+ # Runs a MatchAny query on Qdrant's `keywords` payload index.
369
+ # This should complement sparse BM25, not override it.
370
  keyword_results: list[Chunk] = []
371
+ extracted_keywords: set[str] = set()
372
+
373
+ for word in re.findall(r"[a-z0-9-]+", retrieval_query.lower()):
374
+ if len(word) >= 5 and word not in _STOPWORDS and word not in _CAPABILITY_QUERY_HINTS:
375
+ extracted_keywords.add(word)
376
+
377
  for norm in normalized_forms:
378
+ norm_clean = norm.strip().lower()
379
+ if " " not in norm_clean and 4 <= len(norm_clean) <= 40 and norm_clean not in _STOPWORDS:
380
+ extracted_keywords.add(norm_clean)
381
+
382
+ for canonical in canonical_forms:
383
+ canonical_clean = canonical.strip().lower()
384
+ if " " not in canonical_clean and 4 <= len(canonical_clean) <= 40:
385
+ extracted_keywords.add(canonical_clean)
386
 
387
  # Only query strong >= 4 char keywords to avoid noise matching
388
+ strong_keywords = sorted(extracted_keywords)
389
  if strong_keywords:
390
  keyword_results = vector_store.keyword_filter_search(strong_keywords, top_k=15)
391
 
 
397
  sparse_results = vector_store.search_sparse(indices, values, top_k=20)
398
 
399
  # ── Reciprocal Rank Fusion ─────────────────────────────────────────────
400
+ # Merge dense (per variant) + sparse + keyword-assist into one ranked list.
 
 
401
  all_ranked_lists: list[tuple[float, list[Chunk]]] = []
402
+ for dense_res in dense_leaf_results:
403
  all_ranked_lists.append((1.0, dense_res))
404
+
405
+ if expanded_leaves:
406
+ all_ranked_lists.append((_EXPANDED_LEAF_RRF_WEIGHT, expanded_leaves))
407
 
408
  if sparse_results:
409
+ all_ranked_lists.append((_SPARSE_RRF_WEIGHT, sparse_results))
410
 
411
  if keyword_results:
412
+ keyword_weight = (
413
+ _KEYWORD_RRF_WEIGHT_WITH_SPARSE if sparse_results else _KEYWORD_RRF_WEIGHT_NO_SPARSE
414
+ )
415
+ all_ranked_lists.append((keyword_weight, keyword_results))
416
 
417
+ if not all_ranked_lists:
418
+ return {
419
+ "answer": "",
420
+ "retrieved_chunks": [],
421
+ "reranked_chunks": [],
422
+ "retrieval_attempts": attempts + 1,
423
+ "top_rerank_score": None,
424
+ }
425
+
426
  fused: list[Chunk] = _rrf_merge(all_ranked_lists)
427
 
428
  # ── Reading events — one per unique source document ────────────────────
 
501
  if sibling_count >= _SIBLING_TOTAL_CAP:
502
  break
503
 
504
+ rerank_candidates = [chunk for chunk in unique_chunks if _is_informative_chunk(chunk)]
505
+ if not rerank_candidates:
506
+ rerank_candidates = unique_chunks
507
+
508
  try:
509
+ reranked = await reranker.rerank(retrieval_query, rerank_candidates, top_k=10) # RC-5: raised from 7
510
  except (Exception, asyncio.CancelledError) as exc:
511
  logger.error("retrieve: reranker failed (%s); falling back to base retrieval scores.", exc)
512
  writer({"type": "status", "label": "Reranker offline; using base retrieval scores..."})
513
+ reranked = rerank_candidates[:10]
514
  # mock top_score so relevance gate allows it through if unique_chunks exist
515
  if reranked:
516
  reranked[0]["metadata"]["rerank_score"] = 1.0
tests/test_quality_gate_citation_coverage.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.core.quality import is_low_trust
2
+
3
+
4
def test_low_trust_when_citation_coverage_is_too_low() -> None:
    """Only one of the three sentences carries a citation, so the answer should be flagged."""
    evidence = [{"text": "resume evidence", "metadata": {}}]
    reply = (
        "He worked at Xsilica and built payment-testing workflows. "
        "The role improved throughput and reduced defects [1]. "
        "He also collaborated across release cycles with API testing."
    )

    verdict = is_low_trust(reply, evidence, complexity="simple")

    assert verdict is True
13
+
14
+
15
def test_not_low_trust_when_most_fact_sentences_are_cited() -> None:
    """Every sentence carries a citation, so the answer should not be flagged."""
    evidence = [{"text": "resume evidence", "metadata": {}}]
    reply = (
        "He worked at Xsilica as a QA intern [1]. "
        "The role increased throughput under load tests [1]. "
        "It also reduced post-release defects across releases [1]."
    )

    verdict = is_low_trust(reply, evidence, complexity="simple")

    assert verdict is False
tests/test_retrieve_chunk_quality_filter.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.pipeline.nodes.retrieve import _is_informative_chunk
2
+
3
+
4
+ def _chunk(text: str) -> dict:
5
+ return {
6
+ "text": text,
7
+ "metadata": {
8
+ "doc_id": "resume",
9
+ "section": "Experience",
10
+ "source_title": "Resume",
11
+ "source_type": "resume",
12
+ },
13
+ }
14
+
15
+
16
def test_informative_chunk_filter_rejects_low_information_lines() -> None:
    """A bare date-and-location line is expected to be rejected by the filter."""
    date_line = _chunk("Apr 2023 - Oct 2023 Hyderabad India")

    outcome = _is_informative_chunk(date_line)

    assert outcome is False
20
+
21
+
22
def test_informative_chunk_filter_accepts_contentful_passages() -> None:
    """A full descriptive sentence is expected to pass the filter."""
    passage = _chunk(
        "Reduced post-release defects by 40 percent across four releases by executing 250 test cases."
    )

    outcome = _is_informative_chunk(passage)

    assert outcome is True