GitHub Actions committed on
Commit
05bea13
·
1 Parent(s): 5fed3ee

Deploy c526388

Browse files
app/pipeline/nodes/generate.py CHANGED
@@ -151,6 +151,62 @@ FORMATTING RULES:
151
  7. No apologies, no padding.
152
  """.format(topics=_TOPIC_SUGGESTIONS)
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
  def _format_history(state: "PipelineState") -> str:
156
  """
@@ -362,6 +418,7 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
362
  writer = get_stream_writer()
363
  query = state["query"]
364
  complexity = state.get("query_complexity", "simple")
 
365
  reranked_chunks = state.get("reranked_chunks", [])
366
  if len(reranked_chunks) > 12:
367
  logger.warning("generate: unusually large reranked chunk set (%d); truncating to 12.", len(reranked_chunks))
@@ -457,7 +514,7 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
457
  # (Llama 3.1 8B on simple queries), we switch to direct emission with no wait.
458
  stream = llm_client.complete_with_complexity(
459
  prompt=prompt,
460
- system=build_system_prompt(),
461
  stream=True,
462
  complexity=complexity,
463
  )
@@ -569,6 +626,12 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
569
  # collapse repeated document references into one source card.
570
  full_answer, cited_sources = _reindex_citations_and_sources(full_answer, source_refs)
571
 
 
 
 
 
 
 
572
  # ── Stage 3: SELF-RAG critic ──────────────────────────────────────────
573
  # Runs after answer is fully streamed — zero latency impact on first token.
574
  # Scores groundedness (stays in passages), completeness (covers the query),
 
151
  7. No apologies, no padding.
152
  """.format(topics=_TOPIC_SUGGESTIONS)
153
 
154
+ # ── Voice / audio-mode system prompt ─────────────────────────────────────────
155
+ # When the user speaks their question, the response is read aloud via TTS.
156
+ # Citation markers ([1], [2]) sound terrible spoken aloud.
157
+ # Internal RAG language ("passages", "based on the passages") must be forbidden.
158
+ # Markdown (**, ##, ---) must be absent since TTS reads it literally.
159
+ # Keep answers short and conversational — 2-4 spoken sentences max.
160
+ _VOICE_SYSTEM_PROMPT = """\
161
+ You are the voice assistant on Darshan Chheda's portfolio website.
162
+ You have retrieved factual information about Darshan from his portfolio.
163
+ Answer concisely and naturally, as if speaking aloud.
164
+
165
+ RULES (voice output — these are non-negotiable):
166
+ 1. NO citation markers. Never write [1], [2], or any [N] references β€” they sound broken when spoken.
167
+ 2. NO markdown. No **, no ##, no bullet dashes, no numbered lists with dots β€” the TTS engine reads them literally.
168
+ 3. NO internal language. Never say "based on the passages", "the passages say", "from the retrieved content". Speak as if you simply know the answer.
169
+ 4. Be conversational and direct. 2–4 spoken sentences. No padding, no preamble.
170
+ 5. Only state facts you have information about. If something is unknown, say so in one sentence and suggest a related topic.
171
+ 6. Never make negative or false claims about Darshan.
172
+
173
+ EXAMPLE (bad): "Based on the passages [1][2], Darshan has experience with **Python** and Docker."
174
+ EXAMPLE (good): "Darshan works with Python, Docker, and Kubernetes, with production deployments on AWS and Google Cloud."
175
+ """
176
+
177
+ # Citation + markdown stripping for voice output.
178
+ _CITATION_RE = re.compile(r"\[\d+\]")
179
+ _MARKDOWN_RE = re.compile(
180
+ r"(?:\*\*|__)(.*?)(?:\*\*|__)" # bold
181
+ r"|(?:\*|_)(.*?)(?:\*|_)" # italic
182
+ r"|#{1,6}\s+" # headings
183
+ r"|[-*+]\s+" # unordered list bullets
184
+ r"|`{1,3}[^`]*`{1,3}" # inline/block code
185
+ r"|\[([^\]]+)\]\([^)]+\)", # markdown links → keep link text
186
+ re.DOTALL,
187
+ )
188
+
189
+
190
+ def _strip_for_voice(text: str) -> str:
191
+ """Remove citation markers and markdown syntax from text destined for TTS."""
192
+ # Expand markdown links to just their visible text.
193
+ text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
194
+ # Remove all remaining [N] citation markers.
195
+ text = _CITATION_RE.sub("", text)
196
+ # Remove bold/italic markers, keeping their inner text.
197
+ text = re.sub(r"(?:\*\*|__)([^*_]+?)(?:\*\*|__)", r"\1", text)
198
+ text = re.sub(r"(?:\*|_)([^*_]+?)(?:\*|_)", r"\1", text)
199
+ # Strip heading markers.
200
+ text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
201
+ # Strip unordered list bullets, keeping the text.
202
+ text = re.sub(r"^[-*+]\s+", "", text, flags=re.MULTILINE)
203
+ # Strip inline code backticks.
204
+ text = re.sub(r"`+([^`]+)`+", r"\1", text)
205
+ # Collapse excess whitespace.
206
+ text = re.sub(r"\n{2,}", " ", text)
207
+ text = re.sub(r"\s+", " ", text)
208
+ return text.strip()
209
+
210
 
211
  def _format_history(state: "PipelineState") -> str:
212
  """
 
418
  writer = get_stream_writer()
419
  query = state["query"]
420
  complexity = state.get("query_complexity", "simple")
421
+ is_audio_mode = state.get("is_audio_mode", False)
422
  reranked_chunks = state.get("reranked_chunks", [])
423
  if len(reranked_chunks) > 12:
424
  logger.warning("generate: unusually large reranked chunk set (%d); truncating to 12.", len(reranked_chunks))
 
514
  # (Llama 3.1 8B on simple queries), we switch to direct emission with no wait.
515
  stream = llm_client.complete_with_complexity(
516
  prompt=prompt,
517
+ system=_VOICE_SYSTEM_PROMPT if is_audio_mode else build_system_prompt(),
518
  stream=True,
519
  complexity=complexity,
520
  )
 
626
  # collapse repeated document references into one source card.
627
  full_answer, cited_sources = _reindex_citations_and_sources(full_answer, source_refs)
628
 
629
+ # ── Voice mode: strip all citation markers and markdown ───────────────
630
+ # TTS reads "[1]" and "**bold**" literally. Strip them from the spoken
631
+ # answer but keep cited_sources so the frontend can still render pills.
632
+ if is_audio_mode:
633
+ full_answer = _strip_for_voice(full_answer)
634
+
635
  # ── Stage 3: SELF-RAG critic ──────────────────────────────────────────
636
  # Runs after answer is fully streamed — zero latency impact on first token.
637
  # Scores groundedness (stays in passages), completeness (covers the query),
app/services/transcriber.py CHANGED
@@ -44,7 +44,7 @@ class GroqTranscriber:
44
 
45
  @retry(
46
  stop=stop_after_attempt(2),
47
- wait=wait_fixed(0.8),
48
  retry=retry_if_exception_type((httpx.RequestError, httpx.TimeoutException)),
49
  )
50
  async def transcribe(
@@ -65,20 +65,21 @@ class GroqTranscriber:
65
  model=self._model,
66
  temperature=0,
67
  language=target_language,
68
- prompt="PersonaBot, Darshan, RAG, portfolio, software engineering",
 
 
 
 
 
69
  )
70
  text = getattr(response, "text", None)
71
  if isinstance(text, str) and text.strip():
72
  cleaned = _normalise_transcript_text(text, self._replacements)
73
- if len(cleaned) < 3:
74
- raise GenerationError("Transcription too short to be valid")
75
  return cleaned
76
  if isinstance(response, dict):
77
  value = response.get("text")
78
  if isinstance(value, str) and value.strip():
79
  cleaned = _normalise_transcript_text(value, self._replacements)
80
- if len(cleaned) < 3:
81
- raise GenerationError("Transcription too short to be valid")
82
  return cleaned
83
  raise GenerationError("Transcription response did not contain text")
84
 
 
44
 
45
  @retry(
46
  stop=stop_after_attempt(2),
47
+ wait=wait_fixed(0.3),
48
  retry=retry_if_exception_type((httpx.RequestError, httpx.TimeoutException)),
49
  )
50
  async def transcribe(
 
65
  model=self._model,
66
  temperature=0,
67
  language=target_language,
68
+ # Domain-specific prompt biases Whisper toward correct spellings
69
+ # and prevents hallucination of generic words for quiet/short clips.
70
+ prompt=(
71
+ "PersonaBot, Darshan Chheda, RAG, Qdrant, LangGraph, FastAPI, "
72
+ "TypeScript, portfolio, software engineering, HuggingFace, Groq"
73
+ ),
74
  )
75
  text = getattr(response, "text", None)
76
  if isinstance(text, str) and text.strip():
77
  cleaned = _normalise_transcript_text(text, self._replacements)
 
 
78
  return cleaned
79
  if isinstance(response, dict):
80
  value = response.get("text")
81
  if isinstance(value, str) and value.strip():
82
  cleaned = _normalise_transcript_text(value, self._replacements)
 
 
83
  return cleaned
84
  raise GenerationError("Transcription response did not contain text")
85
 
app/services/tts_client.py CHANGED
@@ -64,10 +64,16 @@ class TTSClient:
64
  text = text.strip()
65
  if not text:
66
  raise GenerationError("TTS request text is empty")
67
-
68
  if not self.is_configured:
69
  raise GenerationError("TTS client is not configured")
70
 
 
 
 
 
 
 
71
  try:
72
  async with self._http.stream(
73
  "POST",
@@ -76,8 +82,15 @@ class TTSClient:
76
  headers={"Content-Type": "application/json"},
77
  ) as response:
78
  response.raise_for_status()
79
- async for chunk in response.aiter_bytes():
80
- yield chunk
 
 
 
 
 
 
 
81
  except httpx.TimeoutException as exc:
82
  raise GenerationError("TTS request timed out") from exc
83
  except httpx.HTTPStatusError as exc:
 
64
  text = text.strip()
65
  if not text:
66
  raise GenerationError("TTS request text is empty")
67
+
68
  if not self.is_configured:
69
  raise GenerationError("TTS client is not configured")
70
 
71
+ # Minimum bytes to buffer before yielding each chunk.
72
+ # Sub-kilobyte chunks cause micro-stutter in browsers because each
73
+ # Web Audio decode call has fixed overhead; 4KB is a safe floor for
74
+ # 22kHz mono PCM without adding noticeable latency.
75
+ _MIN_CHUNK_BYTES = 4096
76
+
77
  try:
78
  async with self._http.stream(
79
  "POST",
 
82
  headers={"Content-Type": "application/json"},
83
  ) as response:
84
  response.raise_for_status()
85
+ buf = b""
86
+ async for chunk in response.aiter_bytes(chunk_size=8192):
87
+ buf += chunk
88
+ while len(buf) >= _MIN_CHUNK_BYTES:
89
+ yield buf[:_MIN_CHUNK_BYTES]
90
+ buf = buf[_MIN_CHUNK_BYTES:]
91
+ # Flush remainder (always — this carries the last PCM samples).
92
+ if buf:
93
+ yield buf
94
  except httpx.TimeoutException as exc:
95
  raise GenerationError("TTS request timed out") from exc
96
  except httpx.HTTPStatusError as exc: