Spaces:
Running
Running
GitHub Actions committed on
Commit ·
8fdc5ad
1
Parent(s): d1766f7
Deploy 5383798
Browse files
- app/core/config.py +3 -5
- app/models/pipeline.py +8 -0
- app/pipeline/nodes/gemini_fast.py +9 -15
- app/pipeline/nodes/generate.py +37 -20
- app/pipeline/nodes/log_eval.py +7 -2
- app/pipeline/nodes/retrieve.py +17 -2
- app/services/gemini_context.toon +23 -17
- app/services/reranker.py +4 -1
app/core/config.py
CHANGED
|
@@ -44,12 +44,10 @@ class Settings(BaseSettings):
|
|
| 44 |
# HF Spaces persistent volume mounts at /data. Local dev uses a relative path.
|
| 45 |
DB_PATH: str = "sqlite.db"
|
| 46 |
|
| 47 |
-
# Gemini fast-path —
|
| 48 |
-
#
|
| 49 |
-
#
|
| 50 |
-
# script (refresh_gemini_context.py) and MUST NOT appear in any chat logs.
|
| 51 |
GEMINI_API_KEY: Optional[str] = None
|
| 52 |
-
GEMINI_PROCESSING_API_KEY: Optional[str] = None
|
| 53 |
GEMINI_MODEL: str = "gemini-2.5-flash-lite"
|
| 54 |
GEMINI_CONTEXT_PATH: str = "backend/app/services/gemini_context.toon"
|
| 55 |
|
|
|
|
| 44 |
# HF Spaces persistent volume mounts at /data. Local dev uses a relative path.
|
| 45 |
DB_PATH: str = "sqlite.db"
|
| 46 |
|
| 47 |
+
# Gemini fast-path — live query traffic only.
|
| 48 |
+
# GEMINI_CONTEXT_PATH points to the manually maintained context file.
|
| 49 |
+
# Edit backend/app/services/gemini_context.toon to update fast-path context.
|
|
|
|
| 50 |
GEMINI_API_KEY: Optional[str] = None
|
|
|
|
| 51 |
GEMINI_MODEL: str = "gemini-2.5-flash-lite"
|
| 52 |
GEMINI_CONTEXT_PATH: str = "backend/app/services/gemini_context.toon"
|
| 53 |
|
app/models/pipeline.py
CHANGED
|
@@ -30,6 +30,11 @@ class ChunkMetadata(TypedDict, total=False):
|
|
| 30 |
# ingestion time. Used for Qdrant keyword payload filter at query time so
|
| 31 |
# canonical name variants ("XSilica", "XSILICA") all match.
|
| 32 |
keywords: list[str]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
# ── raptor_summary-only fields ────────────────────────────────────────────
|
| 34 |
# Qdrant point UUIDs of the leaf chunks that were summarised to produce
|
| 35 |
# this cluster node. Used at query time to expand relevant cluster hits
|
|
@@ -105,3 +110,6 @@ class PipelineState(TypedDict):
|
|
| 105 |
# proper nouns in the query). Fed into the BM25 query as a union so the sparse
|
| 106 |
# component scores positively across "XSilica", "XSILICA", "xsilica", etc.
|
| 107 |
query_canonical_forms: list[str]
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
# ingestion time. Used for Qdrant keyword payload filter at query time so
|
| 31 |
# canonical name variants ("XSilica", "XSILICA") all match.
|
| 32 |
keywords: list[str]
|
| 33 |
+
# ── Positional ordering field ─────────────────────────────────────────────
|
| 34 |
+
# 0-based position of this chunk within its source document, set at ingestion
|
| 35 |
+
# time by heading_chunker. Used for ordered sibling expansion at query time
|
| 36 |
+
# so retrieve.py can prefer adjacent chunks over arbitrary doc members.
|
| 37 |
+
chunk_index: int
|
| 38 |
# ── raptor_summary-only fields ────────────────────────────────────────────
|
| 39 |
# Qdrant point UUIDs of the leaf chunks that were summarised to produce
|
| 40 |
# this cluster node. Used at query time to expand relevant cluster hits
|
|
|
|
| 110 |
# proper nouns in the query). Fed into the BM25 query as a union so the sparse
|
| 111 |
# component scores positively across "XSilica", "XSILICA", "xsilica", etc.
|
| 112 |
query_canonical_forms: list[str]
|
| 113 |
+
# RC-13: retrieval diagnostics logged per turn.
|
| 114 |
+
sibling_expansion_count: Optional[int] # chunks added via sibling expansion
|
| 115 |
+
focused_source_type: Optional[str] # e.g. "cv", "project", "blog", None
|
app/pipeline/nodes/gemini_fast.py
CHANGED
|
@@ -98,23 +98,17 @@ def _is_trivial(query: str) -> bool:
|
|
| 98 |
"""
|
| 99 |
True when the query is pure navigation — safe for Gemini fast-path.
|
| 100 |
|
| 101 |
-
A query is trivial when
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
RAG by default. RAG calls Qdrant. Qdrant has the data.
|
| 110 |
"""
|
| 111 |
stripped = query.strip().rstrip("?!.")
|
| 112 |
-
|
| 113 |
-
return True
|
| 114 |
-
words = query.split()
|
| 115 |
-
if len(words) < 4 and not _NE_RE.search(query):
|
| 116 |
-
return True
|
| 117 |
-
return False
|
| 118 |
|
| 119 |
|
| 120 |
def _is_complex(query: str) -> bool:
|
|
|
|
| 98 |
"""
|
| 99 |
True when the query is pure navigation — safe for Gemini fast-path.
|
| 100 |
|
| 101 |
+
A query is trivial ONLY when its stripped form exactly matches a known
|
| 102 |
+
navigation phrase from _TRIVIAL_PHRASES. All other queries — including
|
| 103 |
+
short career/skills/internship questions — route to full RAG so they
|
| 104 |
+
receive citations backed by Qdrant evidence, not the stale TOON summary.
|
| 105 |
+
|
| 106 |
+
Removing the <4-word bypass (RC-10): queries like "his skills?" or
|
| 107 |
+
"any internships?" previously hit the TOON fast-path and could return
|
| 108 |
+
un-cited, outdated answers. They now always go through retrieval.
|
|
|
|
| 109 |
"""
|
| 110 |
stripped = query.strip().rstrip("?!.")
|
| 111 |
+
return stripped.lower() in _TRIVIAL_PHRASES
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
|
| 114 |
def _is_complex(query: str) -> bool:
|
app/pipeline/nodes/generate.py
CHANGED
|
@@ -162,31 +162,48 @@ def _format_history(state: "PipelineState") -> str:
|
|
| 162 |
|
| 163 |
def _merge_by_source(chunks: list) -> list[dict]:
|
| 164 |
"""
|
| 165 |
-
Collapse chunks that share the same source_url (or source_title
|
| 166 |
-
absent) into a single merged chunk.
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
|
|
|
|
|
|
| 173 |
"""
|
| 174 |
-
|
|
|
|
| 175 |
order: list[str] = []
|
| 176 |
for chunk in chunks:
|
| 177 |
meta = chunk["metadata"]
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
|
|
|
| 185 |
else:
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
|
| 191 |
|
| 192 |
def _dedup_sources(source_refs: list[SourceRef], limit: int | None = None) -> list[SourceRef]:
|
|
|
|
| 162 |
|
| 163 |
def _merge_by_source(chunks: list) -> list[dict]:
|
| 164 |
"""
|
| 165 |
+
Collapse chunks that share the same source_url + section (or source_title
|
| 166 |
+
when both URL and section are absent) into a single merged chunk.
|
| 167 |
+
|
| 168 |
+
RC-3 fix: keying by URL alone collapsed ALL resume chunks into one [1] blob
|
| 169 |
+
because every resume chunk has the same PDF URL. Keying by URL::section
|
| 170 |
+
gives each section its own [N] number, so Work Experience, Education, and
|
| 171 |
+
Skills are separately citable.
|
| 172 |
+
|
| 173 |
+
Chunks within each group are sorted by chunk_index (document order) before
|
| 174 |
+
concatenation so the LLM reads sections top-to-bottom, not in RRF score order.
|
| 175 |
"""
|
| 176 |
+
# Collect all chunks per group key, preserving insertion order of groups.
|
| 177 |
+
groups: dict[str, list] = {}
|
| 178 |
order: list[str] = []
|
| 179 |
for chunk in chunks:
|
| 180 |
meta = chunk["metadata"]
|
| 181 |
+
url = (meta.get("source_url") or "").strip()
|
| 182 |
+
section = (meta.get("section") or "").strip()
|
| 183 |
+
# Use url::section when both are available, url alone when section is
|
| 184 |
+
# empty, title alone when URL is also empty.
|
| 185 |
+
if url and section:
|
| 186 |
+
key = f"{url}::{section}"
|
| 187 |
+
elif url:
|
| 188 |
+
key = url
|
| 189 |
else:
|
| 190 |
+
key = meta.get("source_title", "")
|
| 191 |
+
if key not in groups:
|
| 192 |
+
groups[key] = []
|
| 193 |
+
order.append(key)
|
| 194 |
+
groups[key].append(chunk)
|
| 195 |
+
|
| 196 |
+
merged: list[dict] = []
|
| 197 |
+
for key in order:
|
| 198 |
+
group = groups[key]
|
| 199 |
+
# Sort by chunk_index so we read the document top-to-bottom.
|
| 200 |
+
group.sort(key=lambda c: c["metadata"].get("chunk_index", 0))
|
| 201 |
+
# Use first chunk's metadata as canonical; deep-copy so we don't mutate state.
|
| 202 |
+
canonical_meta = dict(group[0]["metadata"])
|
| 203 |
+
text_parts = [c["text"] for c in group]
|
| 204 |
+
merged_text = "\n\n[...continued from same source...]\n\n".join(text_parts)
|
| 205 |
+
merged.append({"text": merged_text, "metadata": canonical_meta})
|
| 206 |
+
return merged
|
| 207 |
|
| 208 |
|
| 209 |
def _dedup_sources(source_refs: list[SourceRef], limit: int | None = None) -> list[SourceRef]:
|
app/pipeline/nodes/log_eval.py
CHANGED
|
@@ -84,6 +84,9 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
|
|
| 84 |
("critic_quality", "TEXT"),
|
| 85 |
# Fix 1: enumeration classifier flag
|
| 86 |
("is_enumeration_query", "BOOLEAN DEFAULT 0"),
|
|
|
|
|
|
|
|
|
|
| 87 |
]:
|
| 88 |
try:
|
| 89 |
conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
|
|
@@ -96,8 +99,8 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
|
|
| 96 |
(timestamp, session_id, query, answer, chunks_used, rerank_scores,
|
| 97 |
reranked_chunks_json, latency_ms, cached, path,
|
| 98 |
critic_groundedness, critic_completeness, critic_specificity, critic_quality,
|
| 99 |
-
is_enumeration_query)
|
| 100 |
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 101 |
""",
|
| 102 |
(
|
| 103 |
datetime.now(tz=timezone.utc).isoformat(),
|
|
@@ -115,6 +118,8 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
|
|
| 115 |
state.get("critic_specificity"),
|
| 116 |
state.get("critic_quality"),
|
| 117 |
state.get("is_enumeration_query", False),
|
|
|
|
|
|
|
| 118 |
),
|
| 119 |
)
|
| 120 |
return cursor.lastrowid # type: ignore[return-value]
|
|
|
|
| 84 |
("critic_quality", "TEXT"),
|
| 85 |
# Fix 1: enumeration classifier flag
|
| 86 |
("is_enumeration_query", "BOOLEAN DEFAULT 0"),
|
| 87 |
+
# RC-13: retrieval diagnostics
|
| 88 |
+
("sibling_expansion_count", "INTEGER"),
|
| 89 |
+
("focused_source_type", "TEXT"),
|
| 90 |
]:
|
| 91 |
try:
|
| 92 |
conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
|
|
|
|
| 99 |
(timestamp, session_id, query, answer, chunks_used, rerank_scores,
|
| 100 |
reranked_chunks_json, latency_ms, cached, path,
|
| 101 |
critic_groundedness, critic_completeness, critic_specificity, critic_quality,
|
| 102 |
+
is_enumeration_query, sibling_expansion_count, focused_source_type)
|
| 103 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 104 |
""",
|
| 105 |
(
|
| 106 |
datetime.now(tz=timezone.utc).isoformat(),
|
|
|
|
| 118 |
state.get("critic_specificity"),
|
| 119 |
state.get("critic_quality"),
|
| 120 |
state.get("is_enumeration_query", False),
|
| 121 |
+
state.get("sibling_expansion_count"),
|
| 122 |
+
state.get("focused_source_type"),
|
| 123 |
),
|
| 124 |
)
|
| 125 |
return cursor.lastrowid # type: ignore[return-value]
|
app/pipeline/nodes/retrieve.py
CHANGED
|
@@ -272,8 +272,21 @@ def make_retrieve_node(
|
|
| 272 |
if sibling_count >= _SIBLING_TOTAL_CAP:
|
| 273 |
break
|
| 274 |
doc_id = seed["metadata"]["doc_id"]
|
|
|
|
| 275 |
siblings = vector_store.fetch_by_doc_id(doc_id, limit=_SIBLING_FETCH_LIMIT)
|
| 276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
fp = f"{sib['metadata']['doc_id']}::{sib['metadata']['section']}"
|
| 278 |
if fp not in sibling_fps:
|
| 279 |
sibling_fps.add(fp)
|
|
@@ -282,7 +295,7 @@ def make_retrieve_node(
|
|
| 282 |
if sibling_count >= _SIBLING_TOTAL_CAP:
|
| 283 |
break
|
| 284 |
|
| 285 |
-
reranked = await reranker.rerank(retrieval_query, unique_chunks, top_k=
|
| 286 |
|
| 287 |
# Guard: assert all reranker inputs were leaf chunks.
|
| 288 |
# Non-leaf nodes (raptor_summary / question_proxy) reaching the reranker
|
|
@@ -368,6 +381,8 @@ def make_retrieve_node(
|
|
| 368 |
"reranked_chunks": diverse_chunks,
|
| 369 |
"retrieval_attempts": attempts + 1,
|
| 370 |
"top_rerank_score": top_score,
|
|
|
|
|
|
|
| 371 |
}
|
| 372 |
|
| 373 |
return retrieve_node
|
|
|
|
| 272 |
if sibling_count >= _SIBLING_TOTAL_CAP:
|
| 273 |
break
|
| 274 |
doc_id = seed["metadata"]["doc_id"]
|
| 275 |
+
seed_idx = seed["metadata"].get("chunk_index", -1)
|
| 276 |
siblings = vector_store.fetch_by_doc_id(doc_id, limit=_SIBLING_FETCH_LIMIT)
|
| 277 |
+
# RC-2 fix: sort by chunk_index so we can prefer adjacent chunks.
|
| 278 |
+
siblings.sort(key=lambda c: c["metadata"].get("chunk_index", 0))
|
| 279 |
+
# If seed position is known, prefer adjacent indices (±2) first,
|
| 280 |
+
# then fall through to remaining siblings in document order.
|
| 281 |
+
if seed_idx >= 0:
|
| 282 |
+
adjacent = [s for s in siblings
|
| 283 |
+
if abs(s["metadata"].get("chunk_index", -999) - seed_idx) <= 2]
|
| 284 |
+
rest = [s for s in siblings
|
| 285 |
+
if abs(s["metadata"].get("chunk_index", -999) - seed_idx) > 2]
|
| 286 |
+
ordered_siblings = adjacent + rest
|
| 287 |
+
else:
|
| 288 |
+
ordered_siblings = siblings
|
| 289 |
+
for sib in ordered_siblings:
|
| 290 |
fp = f"{sib['metadata']['doc_id']}::{sib['metadata']['section']}"
|
| 291 |
if fp not in sibling_fps:
|
| 292 |
sibling_fps.add(fp)
|
|
|
|
| 295 |
if sibling_count >= _SIBLING_TOTAL_CAP:
|
| 296 |
break
|
| 297 |
|
| 298 |
+
reranked = await reranker.rerank(retrieval_query, unique_chunks, top_k=10) # RC-5: raised from 7
|
| 299 |
|
| 300 |
# Guard: assert all reranker inputs were leaf chunks.
|
| 301 |
# Non-leaf nodes (raptor_summary / question_proxy) reaching the reranker
|
|
|
|
| 381 |
"reranked_chunks": diverse_chunks,
|
| 382 |
"retrieval_attempts": attempts + 1,
|
| 383 |
"top_rerank_score": top_score,
|
| 384 |
+
"sibling_expansion_count": sibling_count if unique_chunks else 0, # RC-13
|
| 385 |
+
"focused_source_type": focused_type, # RC-13
|
| 386 |
}
|
| 387 |
|
| 388 |
return retrieve_node
|
app/services/gemini_context.toon
CHANGED
|
@@ -1,18 +1,24 @@
|
|
| 1 |
-
#
|
| 2 |
-
#
|
| 3 |
-
#
|
| 4 |
-
#
|
| 5 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
textops,Engineered polyglot microservices text editor with custom Go API gateway achieving >5ms latency overhead. Migrated to serverless architecture reducing operational costs by 94% while maintaining 99.9% uptime SLOs.,"['Kubernetes', 'Docker', 'AWS', 'GCP', 'Prometheus', 'GitLab CI/CD']",/projects/textops,""
|
| 16 |
-
blogs[2]{title,summary,url,tags}:
|
| 17 |
-
60 FPS Object Detection on Android using YOLOv8,"How I built a realtime Android vision loop with YOLO + NCNN, IOU tracking, and distance-adaptive PID control all running at 60 FPS.",/blog/assistive-vision,"['Computer Vision', 'Real-Time Systems', 'Android']"
|
| 18 |
-
Mongo Tom is back with GPT-5,"How I used JSON-structured prompts with fictional character framing to bypass safety guardrails in GPT-5, Claude, Gemini, and Grok.",/blog/prompt-engineering-jailbreak,"['Prompt Engineering', 'LLMs', 'AI Safety']"
|
|
|
|
| 1 |
+
# PersonaBot — Gemini fast-path context
|
| 2 |
+
# Manually maintained. Edit this file whenever your portfolio changes.
|
| 3 |
+
# This file is loaded once at startup and passed to Gemini as primary context
|
| 4 |
+
# for conversational/trivial questions. Evidence-based questions always go
|
| 5 |
+
# through full RAG (Qdrant retrieval) regardless of what is written here.
|
| 6 |
+
#
|
| 7 |
+
# Format: free text. Write naturally — Gemini reads this as a context block.
|
| 8 |
+
# Keep it concise; ~500-800 words is ideal. Longer = more tokens per request.
|
| 9 |
+
#
|
| 10 |
+
# HINT: Only include things that are genuinely fast-path-safe:
|
| 11 |
+
# - Who Darshan is (1-2 sentences)
|
| 12 |
+
# - Overview of the portfolio site
|
| 13 |
+
# - What topics the bot can answer questions about
|
| 14 |
+
# Do NOT put internship details, tech stacks, or project specifics here —
|
| 15 |
+
# those answers must come from Qdrant, fully cited.
|
| 16 |
|
| 17 |
+
Darshan Chheda is a software engineer and CS student whose portfolio covers
|
| 18 |
+
projects in systems programming, distributed systems, RAG/LLM systems,
|
| 19 |
+
computer vision, and full-stack web development.
|
| 20 |
+
|
| 21 |
+
This portfolio chatbot can answer questions about his projects, blog posts,
|
| 22 |
+
technical skills, work experience, education, hackathons, and general background.
|
| 23 |
+
For any specific factual question, it searches his actual portfolio content
|
| 24 |
+
and provides cited answers.
|
|
|
|
|
|
|
|
|
|
|
|
app/services/reranker.py
CHANGED
|
@@ -37,7 +37,10 @@ class Reranker:
|
|
| 37 |
self._min_score = 0.0
|
| 38 |
return []
|
| 39 |
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
if self._remote:
|
| 43 |
async with httpx.AsyncClient(timeout=30.0) as client:
|
|
|
|
| 37 |
self._min_score = 0.0
|
| 38 |
return []
|
| 39 |
|
| 40 |
+
# RC-12: prefer contextualised_text (doc title + section prefix) so the
|
| 41 |
+
# cross-encoder sees the same enriched text as the dense retriever.
|
| 42 |
+
# Falls back to raw chunk text for old points that pre-date contextualisation.
|
| 43 |
+
texts = [chunk.get("contextualised_text") or chunk["text"] for chunk in chunks]
|
| 44 |
|
| 45 |
if self._remote:
|
| 46 |
async with httpx.AsyncClient(timeout=30.0) as client:
|