Spaces:

1337XCode
/

personabot-api

Running

App Files Files Community

GitHub Actions committed 20 days ago

Commit

05bea13

1 Parent(s): 5fed3ee

Deploy c526388

Browse files

Files changed (3) hide show

app/pipeline/nodes/generate.py +64 -1
app/services/transcriber.py +7 -6
app/services/tts_client.py +16 -3

app/pipeline/nodes/generate.py CHANGED Viewed

@@ -151,6 +151,62 @@ FORMATTING RULES:
 7. No apologies, no padding.
 """.format(topics=_TOPIC_SUGGESTIONS)
 def _format_history(state: "PipelineState") -> str:
     """
@@ -362,6 +418,7 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
         writer = get_stream_writer()
         query = state["query"]
         complexity = state.get("query_complexity", "simple")
         reranked_chunks = state.get("reranked_chunks", [])
         if len(reranked_chunks) > 12:
             logger.warning("generate: unusually large reranked chunk set (%d); truncating to 12.", len(reranked_chunks))
@@ -457,7 +514,7 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
         # (Llama 3.1 8B on simple queries), we switch to direct emission with no wait.
         stream = llm_client.complete_with_complexity(
             prompt=prompt,
-            system=build_system_prompt(),
             stream=True,
             complexity=complexity,
         )
@@ -569,6 +626,12 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
         # collapse repeated document references into one source card.
         full_answer, cited_sources = _reindex_citations_and_sources(full_answer, source_refs)
         # ── Stage 3: SELF-RAG critic ──────────────────────────────────────────
         # Runs after answer is fully streamed — zero latency impact on first token.
         # Scores groundedness (stays in passages), completeness (covers the query),

 7. No apologies, no padding.
 """.format(topics=_TOPIC_SUGGESTIONS)
+# ── Voice / audio-mode system prompt ─────────────────────────────────────────
+# When the user speaks their question, the response is read aloud via TTS.
+# Citation markers ([1], [2]) sound terrible spoken aloud.
+# Internal RAG language ("passages", "based on the passages") must be forbidden.
+# Markdown (**, ##, ---) must be absent since TTS reads it literally.
+# Keep answers short and conversational — 2-4 spoken sentences max.
+_VOICE_SYSTEM_PROMPT = """\
+You are the voice assistant on Darshan Chheda's portfolio website.
+You have retrieved factual information about Darshan from his portfolio.
+Answer concisely and naturally, as if speaking aloud.
+RULES (voice output — these are non-negotiable):
+1. NO citation markers. Never write [1], [2], or any [N] references — they sound broken when spoken.
+2. NO markdown. No **, no ##, no bullet dashes, no numbered lists with dots — the TTS engine reads them literally.
+3. NO internal language. Never say "based on the passages", "the passages say", "from the retrieved content". Speak as if you simply know the answer.
+4. Be conversational and direct. 2–4 spoken sentences. No padding, no preamble.
+5. Only state facts you have information about. If something is unknown, say so in one sentence and suggest a related topic.
+6. Never make negative or false claims about Darshan.
+EXAMPLE (bad): "Based on the passages [1][2], Darshan has experience with **Python** and Docker."
+EXAMPLE (good): "Darshan works with Python, Docker, and Kubernetes, with production deployments on AWS and Google Cloud."
+"""
+# Citation + markdown stripping for voice output.
+_CITATION_RE = re.compile(r"\[\d+\]")
+_MARKDOWN_RE = re.compile(
+    r"(?:\*\*|__)(.*?)(?:\*\*|__)"  # bold
+    r"|(?:\*|_)(.*?)(?:\*|_)"       # italic
+    r"|#{1,6}\s+"                   # headings
+    r"|[-*+]\s+"                    # unordered list bullets
+    r"|`{1,3}[^`]*`{1,3}"          # inline/block code
+    r"|\[([^\]]+)\]\([^)]+\)",     # markdown links → keep link text
+    re.DOTALL,
+)
+def _strip_for_voice(text: str) -> str:
+    """Remove citation markers and markdown syntax from text destined for TTS."""
+    # Expand markdown links to just their visible text.
+    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
+    # Remove all remaining [N] citation markers.
+    text = _CITATION_RE.sub("", text)
+    # Remove bold/italic markers, keeping their inner text.
+    text = re.sub(r"(?:\*\*|__)([^*_]+?)(?:\*\*|__)", r"\1", text)
+    text = re.sub(r"(?:\*|_)([^*_]+?)(?:\*|_)", r"\1", text)
+    # Strip heading markers.
+    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
+    # Strip unordered list bullets, keeping the text.
+    text = re.sub(r"^[-*+]\s+", "", text, flags=re.MULTILINE)
+    # Strip inline code backticks.
+    text = re.sub(r"`+([^`]+)`+", r"\1", text)
+    # Collapse excess whitespace.
+    text = re.sub(r"\n{2,}", " ", text)
+    text = re.sub(r"\s+", " ", text)
+    return text.strip()
 def _format_history(state: "PipelineState") -> str:
     """
         writer = get_stream_writer()
         query = state["query"]
         complexity = state.get("query_complexity", "simple")
+        is_audio_mode = state.get("is_audio_mode", False)
         reranked_chunks = state.get("reranked_chunks", [])
         if len(reranked_chunks) > 12:
             logger.warning("generate: unusually large reranked chunk set (%d); truncating to 12.", len(reranked_chunks))
         # (Llama 3.1 8B on simple queries), we switch to direct emission with no wait.
         stream = llm_client.complete_with_complexity(
             prompt=prompt,
+            system=_VOICE_SYSTEM_PROMPT if is_audio_mode else build_system_prompt(),
             stream=True,
             complexity=complexity,
         )
         # collapse repeated document references into one source card.
         full_answer, cited_sources = _reindex_citations_and_sources(full_answer, source_refs)
+        # ── Voice mode: strip all citation markers and markdown ───────────────
+        # TTS reads "[1]" and "**bold**" literally. Strip them from the spoken
+        # answer but keep cited_sources so the frontend can still render pills.
+        if is_audio_mode:
+            full_answer = _strip_for_voice(full_answer)
         # ── Stage 3: SELF-RAG critic ──────────────────────────────────────────
         # Runs after answer is fully streamed — zero latency impact on first token.
         # Scores groundedness (stays in passages), completeness (covers the query),

app/services/transcriber.py CHANGED Viewed

@@ -44,7 +44,7 @@ class GroqTranscriber:
     @retry(
         stop=stop_after_attempt(2),
-        wait=wait_fixed(0.8),
         retry=retry_if_exception_type((httpx.RequestError, httpx.TimeoutException)),
     )
     async def transcribe(
@@ -65,20 +65,21 @@ class GroqTranscriber:
                 model=self._model,
                 temperature=0,
                 language=target_language,
-                prompt="PersonaBot, Darshan, RAG, portfolio, software engineering",
             )
             text = getattr(response, "text", None)
             if isinstance(text, str) and text.strip():
                 cleaned = _normalise_transcript_text(text, self._replacements)
-                if len(cleaned) < 3:
-                    raise GenerationError("Transcription too short to be valid")
                 return cleaned
             if isinstance(response, dict):
                 value = response.get("text")
                 if isinstance(value, str) and value.strip():
                     cleaned = _normalise_transcript_text(value, self._replacements)
-                    if len(cleaned) < 3:
-                        raise GenerationError("Transcription too short to be valid")
                     return cleaned
             raise GenerationError("Transcription response did not contain text")

     @retry(
         stop=stop_after_attempt(2),
+        wait=wait_fixed(0.3),
         retry=retry_if_exception_type((httpx.RequestError, httpx.TimeoutException)),
     )
     async def transcribe(
                 model=self._model,
                 temperature=0,
                 language=target_language,
+                # Domain-specific prompt biases Whisper toward correct spellings
+                # and prevents hallucination of generic words for quiet/short clips.
+                prompt=(
+                    "PersonaBot, Darshan Chheda, RAG, Qdrant, LangGraph, FastAPI, "
+                    "TypeScript, portfolio, software engineering, HuggingFace, Groq"
+                ),
             )
             text = getattr(response, "text", None)
             if isinstance(text, str) and text.strip():
                 cleaned = _normalise_transcript_text(text, self._replacements)
                 return cleaned
             if isinstance(response, dict):
                 value = response.get("text")
                 if isinstance(value, str) and value.strip():
                     cleaned = _normalise_transcript_text(value, self._replacements)
                     return cleaned
             raise GenerationError("Transcription response did not contain text")

app/services/tts_client.py CHANGED Viewed

@@ -64,10 +64,16 @@ class TTSClient:
         text = text.strip()
         if not text:
             raise GenerationError("TTS request text is empty")
         if not self.is_configured:
             raise GenerationError("TTS client is not configured")
         try:
             async with self._http.stream(
                 "POST",
@@ -76,8 +82,15 @@ class TTSClient:
                 headers={"Content-Type": "application/json"},
             ) as response:
                 response.raise_for_status()
-                async for chunk in response.aiter_bytes():
-                    yield chunk
         except httpx.TimeoutException as exc:
             raise GenerationError("TTS request timed out") from exc
         except httpx.HTTPStatusError as exc:

         text = text.strip()
         if not text:
             raise GenerationError("TTS request text is empty")
         if not self.is_configured:
             raise GenerationError("TTS client is not configured")
+        # Minimum bytes to buffer before yielding each chunk.
+        # Sub-kilobyte chunks cause micro-stutter in browsers because each
+        # Web Audio decode call has fixed overhead; 4KB is a safe floor for
+        # 22kHz mono PCM without adding noticeable latency.
+        _MIN_CHUNK_BYTES = 4096
         try:
             async with self._http.stream(
                 "POST",
                 headers={"Content-Type": "application/json"},
             ) as response:
                 response.raise_for_status()
+                buf = b""
+                async for chunk in response.aiter_bytes(chunk_size=8192):
+                    buf += chunk
+                    while len(buf) >= _MIN_CHUNK_BYTES:
+                        yield buf[:_MIN_CHUNK_BYTES]
+                        buf = buf[_MIN_CHUNK_BYTES:]
+                # Flush remainder (always — this carries the last PCM samples).
+                if buf:
+                    yield buf
         except httpx.TimeoutException as exc:
             raise GenerationError("TTS request timed out") from exc
         except httpx.HTTPStatusError as exc: