GitHub Actions committed on
Commit
8fdc5ad
·
1 Parent(s): d1766f7

Deploy 5383798

Browse files
app/core/config.py CHANGED
@@ -44,12 +44,10 @@ class Settings(BaseSettings):
44
  # HF Spaces persistent volume mounts at /data. Local dev uses a relative path.
45
  DB_PATH: str = "sqlite.db"
46
 
47
- # Gemini fast-path — separate keys by concern.
48
- # GEMINI_API_KEY handles live query traffic only.
49
- # GEMINI_PROCESSING_API_KEY is used exclusively in the offline weekly refresh
50
- # script (refresh_gemini_context.py) and MUST NOT appear in any chat logs.
51
  GEMINI_API_KEY: Optional[str] = None
52
- GEMINI_PROCESSING_API_KEY: Optional[str] = None
53
  GEMINI_MODEL: str = "gemini-2.5-flash-lite"
54
  GEMINI_CONTEXT_PATH: str = "backend/app/services/gemini_context.toon"
55
 
 
44
  # HF Spaces persistent volume mounts at /data. Local dev uses a relative path.
45
  DB_PATH: str = "sqlite.db"
46
 
47
+ # Gemini fast-path — live query traffic only.
48
+ # GEMINI_CONTEXT_PATH points to the manually maintained context file.
49
+ # Edit backend/app/services/gemini_context.toon to update fast-path context.
 
50
  GEMINI_API_KEY: Optional[str] = None
 
51
  GEMINI_MODEL: str = "gemini-2.5-flash-lite"
52
  GEMINI_CONTEXT_PATH: str = "backend/app/services/gemini_context.toon"
53
 
app/models/pipeline.py CHANGED
@@ -30,6 +30,11 @@ class ChunkMetadata(TypedDict, total=False):
30
  # ingestion time. Used for Qdrant keyword payload filter at query time so
31
  # canonical name variants ("XSilica", "XSILICA") all match.
32
  keywords: list[str]
 
 
 
 
 
33
  # ── raptor_summary-only fields ────────────────────────────────────────────
34
  # Qdrant point UUIDs of the leaf chunks that were summarised to produce
35
  # this cluster node. Used at query time to expand relevant cluster hits
@@ -105,3 +110,6 @@ class PipelineState(TypedDict):
105
  # proper nouns in the query). Fed into the BM25 query as a union so the sparse
106
  # component scores positively across "XSilica", "XSILICA", "xsilica", etc.
107
  query_canonical_forms: list[str]
 
 
 
 
30
  # ingestion time. Used for Qdrant keyword payload filter at query time so
31
  # canonical name variants ("XSilica", "XSILICA") all match.
32
  keywords: list[str]
33
+ # ── Positional ordering field ─────────────────────────────────────────────
34
+ # 0-based position of this chunk within its source document, set at ingestion
35
+ # time by heading_chunker. Used for ordered sibling expansion at query time
36
+ # so retrieve.py can prefer adjacent chunks over arbitrary doc members.
37
+ chunk_index: int
38
  # ── raptor_summary-only fields ────────────────────────────────────────────
39
  # Qdrant point UUIDs of the leaf chunks that were summarised to produce
40
  # this cluster node. Used at query time to expand relevant cluster hits
 
110
  # proper nouns in the query). Fed into the BM25 query as a union so the sparse
111
  # component scores positively across "XSilica", "XSILICA", "xsilica", etc.
112
  query_canonical_forms: list[str]
113
+ # RC-13: retrieval diagnostics logged per turn.
114
+ sibling_expansion_count: Optional[int] # chunks added via sibling expansion
115
+ focused_source_type: Optional[str] # e.g. "cv", "project", "blog", None
app/pipeline/nodes/gemini_fast.py CHANGED
@@ -98,23 +98,17 @@ def _is_trivial(query: str) -> bool:
98
  """
99
  True when the query is pure navigation — safe for Gemini fast-path.
100
 
101
- A query is trivial when:
102
- - Its stripped form exactly matches a known navigation phrase, OR
103
- - It is fewer than 4 words AND contains no named entity after the
104
- first word (short queries without NEs are clarifications, not career
105
- or project questions).
106
-
107
- Everything else including all career, project, skills, education,
108
- hackathon, and biographical questions is NOT trivial and routes to
109
- RAG by default. RAG calls Qdrant. Qdrant has the data.
110
  """
111
  stripped = query.strip().rstrip("?!.")
112
- if stripped.lower() in _TRIVIAL_PHRASES:
113
- return True
114
- words = query.split()
115
- if len(words) < 4 and not _NE_RE.search(query):
116
- return True
117
- return False
118
 
119
 
120
  def _is_complex(query: str) -> bool:
 
98
  """
99
  True when the query is pure navigation — safe for Gemini fast-path.
100
 
101
+ A query is trivial ONLY when its stripped form exactly matches a known
102
+ navigation phrase from _TRIVIAL_PHRASES. All other queries including
103
+ short career/skills/internship questions route to full RAG so they
104
+ receive citations backed by Qdrant evidence, not the stale TOON summary.
105
+
106
+ Removing the <4-word bypass (RC-10): queries like "his skills?" or
107
+ "any internships?" previously hit the TOON fast-path and could return
108
+ un-cited, outdated answers. They now always go through retrieval.
 
109
  """
110
  stripped = query.strip().rstrip("?!.")
111
+ return stripped.lower() in _TRIVIAL_PHRASES
 
 
 
 
 
112
 
113
 
114
  def _is_complex(query: str) -> bool:
app/pipeline/nodes/generate.py CHANGED
@@ -162,31 +162,48 @@ def _format_history(state: "PipelineState") -> str:
162
 
163
  def _merge_by_source(chunks: list) -> list[dict]:
164
  """
165
- Collapse chunks that share the same source_url (or source_title when URL is
166
- absent) into a single merged chunk. Insertion order is preserved so the
167
- highest-scoring chunk's source appears first in the numbered context block.
168
-
169
- This is the correct fix for duplicate citations: if two chunks both come from
170
- TextOps, they become one numbered passage [N] instead of two separate [N][M]
171
- passages that make Groq cite the same document twice in the same sentence.
172
- Text from subsequent chunks is appended with a separator so no content is lost.
 
 
173
  """
174
- seen: dict[str, dict] = {}
 
175
  order: list[str] = []
176
  for chunk in chunks:
177
  meta = chunk["metadata"]
178
- # Prefer URL as dedup key; fall back to title so untitled chunks aren't
179
- # collapsed with each other when they come from different documents.
180
- key = (meta.get("source_url") or "").strip() or meta.get("source_title", "")
181
- if key not in seen:
182
- # Deep-copy metadata so the mutation below doesn't affect original state.
183
- seen[key] = {"text": chunk["text"], "metadata": dict(meta)}
184
- order.append(key)
 
185
  else:
186
- # Append additional context from the same source document. The separator
187
- # helps the LLM understand these are different excerpts, not one paragraph.
188
- seen[key]["text"] += "\n\n[...continued from same source...]\n\n" + chunk["text"]
189
- return [seen[k] for k in order]
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
 
192
  def _dedup_sources(source_refs: list[SourceRef], limit: int | None = None) -> list[SourceRef]:
 
162
 
163
  def _merge_by_source(chunks: list) -> list[dict]:
164
  """
165
+ Collapse chunks that share the same source_url + section (or source_title
166
+ when both URL and section are absent) into a single merged chunk.
167
+
168
+ RC-3 fix: keying by URL alone collapsed ALL resume chunks into one [1] blob
169
+ because every resume chunk has the same PDF URL. Keying by URL::section
170
+ gives each section its own [N] number, so Work Experience, Education, and
171
+ Skills are separately citable.
172
+
173
+ Chunks within each group are sorted by chunk_index (document order) before
174
+ concatenation so the LLM reads sections top-to-bottom, not in RRF score order.
175
  """
176
+ # Collect all chunks per group key, preserving insertion order of groups.
177
+ groups: dict[str, list] = {}
178
  order: list[str] = []
179
  for chunk in chunks:
180
  meta = chunk["metadata"]
181
+ url = (meta.get("source_url") or "").strip()
182
+ section = (meta.get("section") or "").strip()
183
+ # Use url::section when both are available, url alone when section is
184
+ # empty, title alone when URL is also empty.
185
+ if url and section:
186
+ key = f"{url}::{section}"
187
+ elif url:
188
+ key = url
189
  else:
190
+ key = meta.get("source_title", "")
191
+ if key not in groups:
192
+ groups[key] = []
193
+ order.append(key)
194
+ groups[key].append(chunk)
195
+
196
+ merged: list[dict] = []
197
+ for key in order:
198
+ group = groups[key]
199
+ # Sort by chunk_index so we read the document top-to-bottom.
200
+ group.sort(key=lambda c: c["metadata"].get("chunk_index", 0))
201
+ # Use first chunk's metadata as canonical; deep-copy so we don't mutate state.
202
+ canonical_meta = dict(group[0]["metadata"])
203
+ text_parts = [c["text"] for c in group]
204
+ merged_text = "\n\n[...continued from same source...]\n\n".join(text_parts)
205
+ merged.append({"text": merged_text, "metadata": canonical_meta})
206
+ return merged
207
 
208
 
209
  def _dedup_sources(source_refs: list[SourceRef], limit: int | None = None) -> list[SourceRef]:
app/pipeline/nodes/log_eval.py CHANGED
@@ -84,6 +84,9 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
84
  ("critic_quality", "TEXT"),
85
  # Fix 1: enumeration classifier flag
86
  ("is_enumeration_query", "BOOLEAN DEFAULT 0"),
 
 
 
87
  ]:
88
  try:
89
  conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
@@ -96,8 +99,8 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
96
  (timestamp, session_id, query, answer, chunks_used, rerank_scores,
97
  reranked_chunks_json, latency_ms, cached, path,
98
  critic_groundedness, critic_completeness, critic_specificity, critic_quality,
99
- is_enumeration_query)
100
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
101
  """,
102
  (
103
  datetime.now(tz=timezone.utc).isoformat(),
@@ -115,6 +118,8 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
115
  state.get("critic_specificity"),
116
  state.get("critic_quality"),
117
  state.get("is_enumeration_query", False),
 
 
118
  ),
119
  )
120
  return cursor.lastrowid # type: ignore[return-value]
 
84
  ("critic_quality", "TEXT"),
85
  # Fix 1: enumeration classifier flag
86
  ("is_enumeration_query", "BOOLEAN DEFAULT 0"),
87
+ # RC-13: retrieval diagnostics
88
+ ("sibling_expansion_count", "INTEGER"),
89
+ ("focused_source_type", "TEXT"),
90
  ]:
91
  try:
92
  conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
 
99
  (timestamp, session_id, query, answer, chunks_used, rerank_scores,
100
  reranked_chunks_json, latency_ms, cached, path,
101
  critic_groundedness, critic_completeness, critic_specificity, critic_quality,
102
+ is_enumeration_query, sibling_expansion_count, focused_source_type)
103
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
104
  """,
105
  (
106
  datetime.now(tz=timezone.utc).isoformat(),
 
118
  state.get("critic_specificity"),
119
  state.get("critic_quality"),
120
  state.get("is_enumeration_query", False),
121
+ state.get("sibling_expansion_count"),
122
+ state.get("focused_source_type"),
123
  ),
124
  )
125
  return cursor.lastrowid # type: ignore[return-value]
app/pipeline/nodes/retrieve.py CHANGED
@@ -272,8 +272,21 @@ def make_retrieve_node(
272
  if sibling_count >= _SIBLING_TOTAL_CAP:
273
  break
274
  doc_id = seed["metadata"]["doc_id"]
 
275
  siblings = vector_store.fetch_by_doc_id(doc_id, limit=_SIBLING_FETCH_LIMIT)
276
- for sib in siblings:
 
 
 
 
 
 
 
 
 
 
 
 
277
  fp = f"{sib['metadata']['doc_id']}::{sib['metadata']['section']}"
278
  if fp not in sibling_fps:
279
  sibling_fps.add(fp)
@@ -282,7 +295,7 @@ def make_retrieve_node(
282
  if sibling_count >= _SIBLING_TOTAL_CAP:
283
  break
284
 
285
- reranked = await reranker.rerank(retrieval_query, unique_chunks, top_k=7)
286
 
287
  # Guard: assert all reranker inputs were leaf chunks.
288
  # Non-leaf nodes (raptor_summary / question_proxy) reaching the reranker
@@ -368,6 +381,8 @@ def make_retrieve_node(
368
  "reranked_chunks": diverse_chunks,
369
  "retrieval_attempts": attempts + 1,
370
  "top_rerank_score": top_score,
 
 
371
  }
372
 
373
  return retrieve_node
 
272
  if sibling_count >= _SIBLING_TOTAL_CAP:
273
  break
274
  doc_id = seed["metadata"]["doc_id"]
275
+ seed_idx = seed["metadata"].get("chunk_index", -1)
276
  siblings = vector_store.fetch_by_doc_id(doc_id, limit=_SIBLING_FETCH_LIMIT)
277
+ # RC-2 fix: sort by chunk_index so we can prefer adjacent chunks.
278
+ siblings.sort(key=lambda c: c["metadata"].get("chunk_index", 0))
279
+ # If seed position is known, prefer adjacent indices (±2) first,
280
+ # then fall through to remaining siblings in document order.
281
+ if seed_idx >= 0:
282
+ adjacent = [s for s in siblings
283
+ if abs(s["metadata"].get("chunk_index", -999) - seed_idx) <= 2]
284
+ rest = [s for s in siblings
285
+ if abs(s["metadata"].get("chunk_index", -999) - seed_idx) > 2]
286
+ ordered_siblings = adjacent + rest
287
+ else:
288
+ ordered_siblings = siblings
289
+ for sib in ordered_siblings:
290
  fp = f"{sib['metadata']['doc_id']}::{sib['metadata']['section']}"
291
  if fp not in sibling_fps:
292
  sibling_fps.add(fp)
 
295
  if sibling_count >= _SIBLING_TOTAL_CAP:
296
  break
297
 
298
+ reranked = await reranker.rerank(retrieval_query, unique_chunks, top_k=10) # RC-5: raised from 7
299
 
300
  # Guard: assert all reranker inputs were leaf chunks.
301
  # Non-leaf nodes (raptor_summary / question_proxy) reaching the reranker
 
381
  "reranked_chunks": diverse_chunks,
382
  "retrieval_attempts": attempts + 1,
383
  "top_rerank_score": top_score,
384
+ "sibling_expansion_count": sibling_count if unique_chunks else 0, # RC-13
385
+ "focused_source_type": focused_type, # RC-13
386
  }
387
 
388
  return retrieve_node
app/services/gemini_context.toon CHANGED
@@ -1,18 +1,24 @@
1
- # doc-hashes: {"src/content/posts/prompt-engineering-jailbreak/index.mdx":"5820b126e93a97eb","src/content/posts/assistive-vision/index.mdx":"0b27e26824cd8542","src/content/projects/donut-asm/index.mdx":"bf34dff12224679b","src/content/projects/echo-echo/index.mdx":"c112959f32f7b9cc","src/content/projects/localhost/index.mdx":"c7fa4b0ef8668353","src/content/projects/save-the-planet/index.mdx":"e825b0597f56c3e8","src/content/projects/sorting-demo/index.mdx":"6282b97a72b92874","src/content/projects/student-management-system/index.mdx":"f022589b3256fdda","src/content/projects/sysphus/index.mdx":"16c55970ad3e8ab3","src/content/projects/textops/index.mdx":"1a8f0ae804865956"}
2
- # doc-summaries: {}
3
- # PersonaBot Gemini fast-path context (TOON format)
4
- # Auto-generated by scripts/refresh_gemini_context.py do not hand-edit.
5
- # Refreshed weekly via GitHub Actions (refresh_context.yml).
 
 
 
 
 
 
 
 
 
 
6
 
7
- projects[8]{name,description,technologies,url,github}:
8
- donut-asm,A fully working implementation of donut.c in Assembly x86.,"['Assembly x86', 'Calculus']",/projects/donut-asm,""
9
- echo-echo,"A decentralized, anonymous, peer-to-peer chat application featuring historic message sync, serverless architecture, and end-to-end encryption, built using Kademlia DHT and WebRTC.","['KademliaDHT', 'WebRTC', 'VanillaJS', 'Decentralized', 'E2EE']",/projects/echo-echo,""
10
- localhost,"A production-grade, decentralized portfolio hosting system using Tor hidden service and Kademlia DHT-based P2P CDN.","['TOR', 'Kademlia DHT', 'WebRTC', 'Termux']",/projects/localhost,""
11
- save-the-planet,A console based text game in Java with an environmental awareness theme.,"['Java', 'JUnit', 'OOP', 'TDD']",/projects/save-the-planet,""
12
- sorting-demo,A visual demonstration of basic sorting algorithms.,"['Algorithms', 'Sorting', 'VanillaJS']",/projects/sorting-demo,""
13
- student-management-system,"A web-based application for monitoring student progression, featuring both Admin and Student dashboards.","['Node.js', 'EJS', 'JWT', 'SQL', 'ORM', 'OWASP Top 10']",/projects/student-management-system,""
14
- sysphus,"A task management application inspired by Jira, featuring Markdown support for creating and organizing tasks efficiently.","['HTML5', 'CSS3', 'VanillaJS']",/projects/sysphus,""
15
- textops,Engineered polyglot microservices text editor with custom Go API gateway achieving >5ms latency overhead. Migrated to serverless architecture reducing operational costs by 94% while maintaining 99.9% uptime SLOs.,"['Kubernetes', 'Docker', 'AWS', 'GCP', 'Prometheus', 'GitLab CI/CD']",/projects/textops,""
16
- blogs[2]{title,summary,url,tags}:
17
- 60 FPS Object Detection on Android using YOLOv8,"How I built a realtime Android vision loop with YOLO + NCNN, IOU tracking, and distance-adaptive PID control all running at 60 FPS.",/blog/assistive-vision,"['Computer Vision', 'Real-Time Systems', 'Android']"
18
- Mongo Tom is back with GPT-5,"How I used JSON-structured prompts with fictional character framing to bypass safety guardrails in GPT-5, Claude, Gemini, and Grok.",/blog/prompt-engineering-jailbreak,"['Prompt Engineering', 'LLMs', 'AI Safety']"
 
1
+ # PersonaBot — Gemini fast-path context
2
+ # Manually maintained. Edit this file whenever your portfolio changes.
3
+ # This file is loaded once at startup and passed to Gemini as primary context
4
+ # for conversational/trivial questions. Evidence-based questions always go
5
+ # through full RAG (Qdrant retrieval) regardless of what is written here.
6
+ #
7
+ # Format: free text. Write naturally — Gemini reads this as a context block.
8
+ # Keep it concise; ~500-800 words is ideal. Longer = more tokens per request.
9
+ #
10
+ # HINT: Only include things that are genuinely fast-path-safe:
11
+ # - Who Darshan is (1-2 sentences)
12
+ # - Overview of the portfolio site
13
+ # - What topics the bot can answer questions about
14
+ # Do NOT put internship details, tech stacks, or project specifics here —
15
+ # those answers must come from Qdrant, fully cited.
16
 
17
+ Darshan Chheda is a software engineer and CS student whose portfolio covers
18
+ projects in systems programming, distributed systems, RAG/LLM systems,
19
+ computer vision, and full-stack web development.
20
+
21
+ This portfolio chatbot can answer questions about his projects, blog posts,
22
+ technical skills, work experience, education, hackathons, and general background.
23
+ For any specific factual question, it searches his actual portfolio content
24
+ and provides cited answers.
 
 
 
 
app/services/reranker.py CHANGED
@@ -37,7 +37,10 @@ class Reranker:
37
  self._min_score = 0.0
38
  return []
39
 
40
- texts = [chunk["text"] for chunk in chunks]
 
 
 
41
 
42
  if self._remote:
43
  async with httpx.AsyncClient(timeout=30.0) as client:
 
37
  self._min_score = 0.0
38
  return []
39
 
40
+ # RC-12: prefer contextualised_text (doc title + section prefix) so the
41
+ # cross-encoder sees the same enriched text as the dense retriever.
42
+ # Falls back to raw chunk text for old points that pre-date contextualisation.
43
+ texts = [chunk.get("contextualised_text") or chunk["text"] for chunk in chunks]
44
 
45
  if self._remote:
46
  async with httpx.AsyncClient(timeout=30.0) as client: