Spaces:

1337XCode
/

personabot-api

Running

App Files Community

GitHub Actions committed on Apr 20

Commit

e007166

1 Parent(s): 385ac95

Deploy 0e8fb42

Browse files

Files changed (4) hide show

app/core/quality.py +33 -0
app/pipeline/nodes/retrieve.py +84 -30
tests/test_quality_gate_citation_coverage.py +23 -0
tests/test_retrieve_chunk_quality_filter.py +27 -0

app/core/quality.py CHANGED Viewed

@@ -35,6 +35,35 @@ _HEDGE_PHRASES: tuple[str, ...] = (
 )
 _RAW_TAG_RE = re.compile(r"</?[a-zA-Z][^>]*>")
 def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
@@ -55,6 +84,10 @@ def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
         return True
     if chunks and not re.search(r"\[\d+\]", answer):
         return True
     if complexity == "complex" and len(answer.split()) < 30:
         return True
     return False

 )
 _RAW_TAG_RE = re.compile(r"</?[a-zA-Z][^>]*>")
+_CITATION_RE = re.compile(r"\[\d+\]")
+_WORD_RE = re.compile(r"[a-zA-Z0-9]+")
+_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
+_MIN_FACT_SENTENCE_WORDS = 6
+_MIN_CITATION_COVERAGE = 0.70
+def _is_fact_like_sentence(sentence: str) -> bool:
+    stripped = sentence.strip()
+    if not stripped:
+        return False
+    # Skip lightweight list headers and short connective lines.
+    if re.match(r"^\d+\.\s", stripped):
+        return False
+    return len(_WORD_RE.findall(stripped)) >= _MIN_FACT_SENTENCE_WORDS
+def _citation_coverage(answer: str) -> tuple[int, int]:
+    """Return (cited_fact_sentences, total_fact_sentences)."""
+    total = 0
+    cited = 0
+    for sentence in _SENTENCE_SPLIT_RE.split(answer):
+        if not _is_fact_like_sentence(sentence):
+            continue
+        total += 1
+        if _CITATION_RE.search(sentence):
+            cited += 1
+    return cited, total
 def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
         return True
     if chunks and not re.search(r"\[\d+\]", answer):
         return True
+    if chunks:
+        cited_count, fact_count = _citation_coverage(answer)
+        if fact_count >= 2 and (cited_count / fact_count) < _MIN_CITATION_COVERAGE:
+            return True
     if complexity == "complex" and len(answer.split()) < 30:
         return True
     return False

app/pipeline/nodes/retrieve.py CHANGED Viewed

@@ -41,6 +41,22 @@ _SIBLING_EXPAND_TOP_N: int = 10  # rank depth to consider for expansion
 _SIBLING_FETCH_LIMIT: int = 20   # max chunks fetched via Qdrant doc_id query
 _SIBLING_TOTAL_CAP: int = 15     # max new chunks to inject before reranker
 # Keywords that imply the visitor wants depth from a specific source type.
 # Values are the source_type values set by ingest (ChunkMetadata.source_type).
 _FOCUS_KEYWORDS: dict[frozenset[str], str] = {
@@ -226,6 +242,14 @@ def _is_capability_query(query: str) -> bool:
     return bool(tokens & _CAPABILITY_QUERY_HINTS)
 def make_retrieve_node(
     vector_store: VectorStore, embedder: Embedder, reranker: Reranker
 ) -> Callable[[PipelineState], dict]:
@@ -283,20 +307,22 @@ def make_retrieve_node(
             dense_results.append(chunks)
         # ── Split dense hits into leaf candidates and navigation nodes ─────────
-        # raptor_summary and question_proxy are navigation-only; they are expanded
-        # to their real leaf pages via Qdrant point UUID lookups.
-        leaf_candidates: list[Chunk] = []
-        leaf_fps_seen: set[str] = set()
         nav_expansion_ids: set[str] = set()
         for hit_list in dense_results:
             for chunk in hit_list:
                 ct = chunk["metadata"].get("chunk_type", "leaf")
                 if ct == "leaf":
                     fp = f"{chunk['metadata']['doc_id']}::{chunk['metadata']['section']}"
-                    if fp not in leaf_fps_seen:
-                        leaf_fps_seen.add(fp)
-                        leaf_candidates.append(chunk)
                 elif ct == "raptor_summary":
                     for uid in (chunk["metadata"].get("child_leaf_ids") or []):
                         nav_expansion_ids.add(uid)
@@ -304,17 +330,17 @@ def make_retrieve_node(
                     uid = chunk["metadata"].get("parent_leaf_id", "")
                     if uid:
                         nav_expansion_ids.add(uid)
-        # Expand nav nodes to their leaf pages in a single Qdrant retrieve call.
         if nav_expansion_ids:
             expanded_leaves = vector_store.fetch_by_point_ids(list(nav_expansion_ids))
-            for leaf in expanded_leaves:
-                fp = f"{leaf['metadata']['doc_id']}::{leaf['metadata']['section']}"
-                if fp not in leaf_fps_seen:
-                    leaf_fps_seen.add(fp)
-                    leaf_candidates.append(leaf)
-            logger.debug("UUID expansion: +%d leaves from %d nav node UUIDs.",
-                         len(expanded_leaves), len(nav_expansion_ids))
         # ── Query Normalization & Alias Generation ─────────────────────────────
         # If the user asks for "xsilica", generate "x silica" and "x-silica".
@@ -338,17 +364,28 @@ def make_retrieve_node(
             normalized_forms.add(retrieval_query.replace("-", ""))
             normalized_forms.add(retrieval_query.replace("-", " "))
-        # ── Exact Keyword Filter Search (Database hit) ─────────────────────────
-        # Runs a MatchAny query on Qdrant's `keywords` payload payload.
         keyword_results: list[Chunk] = []
-        extracted_keywords = []
-        for word in retrieval_query.lower().split():
-            extracted_keywords.append(word)
         for norm in normalized_forms:
-            extracted_keywords.append(norm)
         # Only query strong >= 4 char keywords to avoid noise matching
-        strong_keywords = [k for k in extracted_keywords if len(k) >= 4 and k not in _STOPWORDS]
         if strong_keywords:
             keyword_results = vector_store.keyword_filter_search(strong_keywords, top_k=15)
@@ -360,19 +397,32 @@ def make_retrieve_node(
             sparse_results = vector_store.search_sparse(indices, values, top_k=20)
         # ── Reciprocal Rank Fusion ─────────────────────────────────────────────
-        # Merge dense (per variant) + sparse + keyword into one ranked list.
-        # Dynamic Weighting: Explicit keyword entity matches get a 1.5x boost
-        # over semantic proximity in the RRF formula.
         all_ranked_lists: list[tuple[float, list[Chunk]]] = []
-        for dense_res in dense_results:
             all_ranked_lists.append((1.0, dense_res))
         if sparse_results:
-            all_ranked_lists.append((1.0, sparse_results))
         if keyword_results:
-            all_ranked_lists.append((1.5, keyword_results))
         fused: list[Chunk] = _rrf_merge(all_ranked_lists)
         # ── Reading events — one per unique source document ────────────────────
@@ -451,12 +501,16 @@ def make_retrieve_node(
                         if sibling_count >= _SIBLING_TOTAL_CAP:
                             break
         try:
-            reranked = await reranker.rerank(retrieval_query, unique_chunks, top_k=10)  # RC-5: raised from 7
         except (Exception, asyncio.CancelledError) as exc:
             logger.error("retrieve: reranker failed (%s); falling back to base retrieval scores.", exc)
             writer({"type": "status", "label": "Reranker offline; using base retrieval scores..."})
-            reranked = unique_chunks[:10]
             # mock top_score so relevance gate allows it through if unique_chunks exist
             if reranked:
                 reranked[0]["metadata"]["rerank_score"] = 1.0

 _SIBLING_FETCH_LIMIT: int = 20   # max chunks fetched via Qdrant doc_id query
 _SIBLING_TOTAL_CAP: int = 15     # max new chunks to inject before reranker
+# Leaf chunks expanded from navigation-node UUID edges should influence rank,
+# but with less weight than direct dense matches.
+_EXPANDED_LEAF_RRF_WEIGHT: float = 0.55
+# Sparse lexical retrieval is the primary lexical signal.
+_SPARSE_RRF_WEIGHT: float = 1.1
+# Keyword payload filtering is only an entity recall assist; do not let it
+# dominate ranking (BM25 already covers lexical matching semantics).
+_KEYWORD_RRF_WEIGHT_WITH_SPARSE: float = 0.25
+_KEYWORD_RRF_WEIGHT_NO_SPARSE: float = 0.75
+# Minimum token count for rerank candidates to avoid low-information lines
+# (e.g., contact headers) consuming top reranker slots.
+_MIN_RERANK_WORDS: int = 8
 # Keywords that imply the visitor wants depth from a specific source type.
 # Values are the source_type values set by ingest (ChunkMetadata.source_type).
 _FOCUS_KEYWORDS: dict[frozenset[str], str] = {
     return bool(tokens & _CAPABILITY_QUERY_HINTS)
+def _is_informative_chunk(chunk: Chunk) -> bool:
+    """True when chunk text has enough lexical content for cross-encoder reranking."""
+    text = (chunk.get("contextualised_text") or chunk["text"] or "").strip()
+    if not text:
+        return False
+    return len(re.findall(r"[a-zA-Z0-9]+", text)) >= _MIN_RERANK_WORDS
 def make_retrieve_node(
     vector_store: VectorStore, embedder: Embedder, reranker: Reranker
 ) -> Callable[[PipelineState], dict]:
             dense_results.append(chunks)
         # ── Split dense hits into leaf candidates and navigation nodes ─────────
+        # Dense retrieval may return navigation nodes (raptor_summary/question_proxy).
+        # Keep per-query leaf-only rankings for RRF and expand nav UUID edges to
+        # supplemental leaf candidates in a lower-weight RRF list.
+        dense_leaf_results: list[list[Chunk]] = []
         nav_expansion_ids: set[str] = set()
         for hit_list in dense_results:
+            per_query_leaf: list[Chunk] = []
+            per_query_seen: set[str] = set()
             for chunk in hit_list:
                 ct = chunk["metadata"].get("chunk_type", "leaf")
                 if ct == "leaf":
                     fp = f"{chunk['metadata']['doc_id']}::{chunk['metadata']['section']}"
+                    if fp not in per_query_seen:
+                        per_query_seen.add(fp)
+                        per_query_leaf.append(chunk)
                 elif ct == "raptor_summary":
                     for uid in (chunk["metadata"].get("child_leaf_ids") or []):
                         nav_expansion_ids.add(uid)
                     uid = chunk["metadata"].get("parent_leaf_id", "")
                     if uid:
                         nav_expansion_ids.add(uid)
+            if per_query_leaf:
+                dense_leaf_results.append(per_query_leaf)
+        expanded_leaves: list[Chunk] = []
         if nav_expansion_ids:
             expanded_leaves = vector_store.fetch_by_point_ids(list(nav_expansion_ids))
+            logger.debug(
+                "UUID expansion: +%d leaves from %d nav node UUIDs.",
+                len(expanded_leaves),
+                len(nav_expansion_ids),
+            )
         # ── Query Normalization & Alias Generation ─────────────────────────────
         # If the user asks for "xsilica", generate "x silica" and "x-silica".
             normalized_forms.add(retrieval_query.replace("-", ""))
             normalized_forms.add(retrieval_query.replace("-", " "))
+        # ── Exact Keyword Filter Search (entity recall assist) ─────────────────
+        # Runs a MatchAny query on Qdrant's `keywords` payload index.
+        # This should complement sparse BM25, not override it.
         keyword_results: list[Chunk] = []
+        extracted_keywords: set[str] = set()
+        for word in re.findall(r"[a-z0-9-]+", retrieval_query.lower()):
+            if len(word) >= 5 and word not in _STOPWORDS and word not in _CAPABILITY_QUERY_HINTS:
+                extracted_keywords.add(word)
         for norm in normalized_forms:
+            norm_clean = norm.strip().lower()
+            if " " not in norm_clean and 4 <= len(norm_clean) <= 40 and norm_clean not in _STOPWORDS:
+                extracted_keywords.add(norm_clean)
+        for canonical in canonical_forms:
+            canonical_clean = canonical.strip().lower()
+            if " " not in canonical_clean and 4 <= len(canonical_clean) <= 40:
+                extracted_keywords.add(canonical_clean)
         # Only query strong >= 4 char keywords to avoid noise matching
+        strong_keywords = sorted(extracted_keywords)
         if strong_keywords:
             keyword_results = vector_store.keyword_filter_search(strong_keywords, top_k=15)
             sparse_results = vector_store.search_sparse(indices, values, top_k=20)
         # ── Reciprocal Rank Fusion ─────────────────────────────────────────────
+        # Merge dense (per variant) + sparse + keyword-assist into one ranked list.
         all_ranked_lists: list[tuple[float, list[Chunk]]] = []
+        for dense_res in dense_leaf_results:
             all_ranked_lists.append((1.0, dense_res))
+        if expanded_leaves:
+            all_ranked_lists.append((_EXPANDED_LEAF_RRF_WEIGHT, expanded_leaves))
         if sparse_results:
+            all_ranked_lists.append((_SPARSE_RRF_WEIGHT, sparse_results))
         if keyword_results:
+            keyword_weight = (
+                _KEYWORD_RRF_WEIGHT_WITH_SPARSE if sparse_results else _KEYWORD_RRF_WEIGHT_NO_SPARSE
+            )
+            all_ranked_lists.append((keyword_weight, keyword_results))
+        if not all_ranked_lists:
+            return {
+                "answer": "",
+                "retrieved_chunks": [],
+                "reranked_chunks": [],
+                "retrieval_attempts": attempts + 1,
+                "top_rerank_score": None,
+            }
         fused: list[Chunk] = _rrf_merge(all_ranked_lists)
         # ── Reading events — one per unique source document ────────────────────
                         if sibling_count >= _SIBLING_TOTAL_CAP:
                             break
+        rerank_candidates = [chunk for chunk in unique_chunks if _is_informative_chunk(chunk)]
+        if not rerank_candidates:
+            rerank_candidates = unique_chunks
         try:
+            reranked = await reranker.rerank(retrieval_query, rerank_candidates, top_k=10)  # RC-5: raised from 7
         except (Exception, asyncio.CancelledError) as exc:
             logger.error("retrieve: reranker failed (%s); falling back to base retrieval scores.", exc)
             writer({"type": "status", "label": "Reranker offline; using base retrieval scores..."})
+            reranked = rerank_candidates[:10]
             # mock top_score so relevance gate allows it through if unique_chunks exist
             if reranked:
                 reranked[0]["metadata"]["rerank_score"] = 1.0

tests/test_quality_gate_citation_coverage.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from app.core.quality import is_low_trust
+def test_low_trust_when_citation_coverage_is_too_low() -> None:
+    answer = (
+        "He worked at Xsilica and built payment-testing workflows. "
+        "The role improved throughput and reduced defects [1]. "
+        "He also collaborated across release cycles with API testing."
+    )
+    chunks = [{"text": "resume evidence", "metadata": {}}]
+    assert is_low_trust(answer, chunks, complexity="simple") is True
+def test_not_low_trust_when_most_fact_sentences_are_cited() -> None:
+    answer = (
+        "He worked at Xsilica as a QA intern [1]. "
+        "The role increased throughput under load tests [1]. "
+        "It also reduced post-release defects across releases [1]."
+    )
+    chunks = [{"text": "resume evidence", "metadata": {}}]
+    assert is_low_trust(answer, chunks, complexity="simple") is False

tests/test_retrieve_chunk_quality_filter.py ADDED Viewed

	@@ -0,0 +1,27 @@

+from app.pipeline.nodes.retrieve import _is_informative_chunk
+def _chunk(text: str) -> dict:
+    return {
+        "text": text,
+        "metadata": {
+            "doc_id": "resume",
+            "section": "Experience",
+            "source_title": "Resume",
+            "source_type": "resume",
+        },
+    }
+def test_informative_chunk_filter_rejects_low_information_lines() -> None:
+    chunk = _chunk("Apr 2023 - Oct 2023 Hyderabad India")
+    assert _is_informative_chunk(chunk) is False
+def test_informative_chunk_filter_accepts_contentful_passages() -> None:
+    chunk = _chunk(
+        "Reduced post-release defects by 40 percent across four releases by executing 250 test cases."
+    )
+    assert _is_informative_chunk(chunk) is True