Spaces:
Running
Running
GitHub Actions committed on
Commit ·
05bea13
1
Parent(s): 5fed3ee
Deploy c526388
Browse files
- app/pipeline/nodes/generate.py +64 -1
- app/services/transcriber.py +7 -6
- app/services/tts_client.py +16 -3
app/pipeline/nodes/generate.py
CHANGED
|
@@ -151,6 +151,62 @@ FORMATTING RULES:
|
|
| 151 |
7. No apologies, no padding.
|
| 152 |
""".format(topics=_TOPIC_SUGGESTIONS)
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
def _format_history(state: "PipelineState") -> str:
|
| 156 |
"""
|
|
@@ -362,6 +418,7 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
|
|
| 362 |
writer = get_stream_writer()
|
| 363 |
query = state["query"]
|
| 364 |
complexity = state.get("query_complexity", "simple")
|
|
|
|
| 365 |
reranked_chunks = state.get("reranked_chunks", [])
|
| 366 |
if len(reranked_chunks) > 12:
|
| 367 |
logger.warning("generate: unusually large reranked chunk set (%d); truncating to 12.", len(reranked_chunks))
|
|
@@ -457,7 +514,7 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
|
|
| 457 |
# (Llama 3.1 8B on simple queries), we switch to direct emission with no wait.
|
| 458 |
stream = llm_client.complete_with_complexity(
|
| 459 |
prompt=prompt,
|
| 460 |
-
system=build_system_prompt(),
|
| 461 |
stream=True,
|
| 462 |
complexity=complexity,
|
| 463 |
)
|
|
@@ -569,6 +626,12 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
|
|
| 569 |
# collapse repeated document references into one source card.
|
| 570 |
full_answer, cited_sources = _reindex_citations_and_sources(full_answer, source_refs)
|
| 571 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
# ── Stage 3: SELF-RAG critic ──────────────────────────────────────────
|
| 573 |
# Runs after answer is fully streamed — zero latency impact on first token.
|
| 574 |
# Scores groundedness (stays in passages), completeness (covers the query),
|
|
|
|
| 151 |
7. No apologies, no padding.
|
| 152 |
""".format(topics=_TOPIC_SUGGESTIONS)
|
| 153 |
|
| 154 |
+
# ── Voice / audio-mode system prompt ─────────────────────────────────────────
|
| 155 |
+
# When the user speaks their question, the response is read aloud via TTS.
|
| 156 |
+
# Citation markers ([1], [2]) sound terrible spoken aloud.
|
| 157 |
+
# Internal RAG language ("passages", "based on the passages") must be forbidden.
|
| 158 |
+
# Markdown (**, ##, ---) must be absent since TTS reads it literally.
|
| 159 |
+
# Keep answers short and conversational — 2-4 spoken sentences max.
|
| 160 |
+
_VOICE_SYSTEM_PROMPT = """\
|
| 161 |
+
You are the voice assistant on Darshan Chheda's portfolio website.
|
| 162 |
+
You have retrieved factual information about Darshan from his portfolio.
|
| 163 |
+
Answer concisely and naturally, as if speaking aloud.
|
| 164 |
+
|
| 165 |
+
RULES (voice output — these are non-negotiable):
|
| 166 |
+
1. NO citation markers. Never write [1], [2], or any [N] references β they sound broken when spoken.
|
| 167 |
+
2. NO markdown. No **, no ##, no bullet dashes, no numbered lists with dots β the TTS engine reads them literally.
|
| 168 |
+
3. NO internal language. Never say "based on the passages", "the passages say", "from the retrieved content". Speak as if you simply know the answer.
|
| 169 |
+
4. Be conversational and direct. 2–4 spoken sentences. No padding, no preamble.
|
| 170 |
+
5. Only state facts you have information about. If something is unknown, say so in one sentence and suggest a related topic.
|
| 171 |
+
6. Never make negative or false claims about Darshan.
|
| 172 |
+
|
| 173 |
+
EXAMPLE (bad): "Based on the passages [1][2], Darshan has experience with **Python** and Docker."
|
| 174 |
+
EXAMPLE (good): "Darshan works with Python, Docker, and Kubernetes, with production deployments on AWS and Google Cloud."
|
| 175 |
+
"""
|
| 176 |
+
|
| 177 |
+
# Citation + markdown stripping for voice output.
|
| 178 |
+
_CITATION_RE = re.compile(r"\[\d+\]")
|
| 179 |
+
_MARKDOWN_RE = re.compile(
|
| 180 |
+
r"(?:\*\*|__)(.*?)(?:\*\*|__)" # bold
|
| 181 |
+
r"|(?:\*|_)(.*?)(?:\*|_)" # italic
|
| 182 |
+
r"|#{1,6}\s+" # headings
|
| 183 |
+
r"|[-*+]\s+" # unordered list bullets
|
| 184 |
+
r"|`{1,3}[^`]*`{1,3}" # inline/block code
|
| 185 |
+
r"|\[([^\]]+)\]\([^)]+\)", # markdown links — keep link text
|
| 186 |
+
re.DOTALL,
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def _strip_for_voice(text: str) -> str:
|
| 191 |
+
"""Remove citation markers and markdown syntax from text destined for TTS."""
|
| 192 |
+
# Expand markdown links to just their visible text.
|
| 193 |
+
text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
|
| 194 |
+
# Remove all remaining [N] citation markers.
|
| 195 |
+
text = _CITATION_RE.sub("", text)
|
| 196 |
+
# Remove bold/italic markers, keeping their inner text.
|
| 197 |
+
text = re.sub(r"(?:\*\*|__)([^*_]+?)(?:\*\*|__)", r"\1", text)
|
| 198 |
+
text = re.sub(r"(?:\*|_)([^*_]+?)(?:\*|_)", r"\1", text)
|
| 199 |
+
# Strip heading markers.
|
| 200 |
+
text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
|
| 201 |
+
# Strip unordered list bullets, keeping the text.
|
| 202 |
+
text = re.sub(r"^[-*+]\s+", "", text, flags=re.MULTILINE)
|
| 203 |
+
# Strip inline code backticks.
|
| 204 |
+
text = re.sub(r"`+([^`]+)`+", r"\1", text)
|
| 205 |
+
# Collapse excess whitespace.
|
| 206 |
+
text = re.sub(r"\n{2,}", " ", text)
|
| 207 |
+
text = re.sub(r"\s+", " ", text)
|
| 208 |
+
return text.strip()
|
| 209 |
+
|
| 210 |
|
| 211 |
def _format_history(state: "PipelineState") -> str:
|
| 212 |
"""
|
|
|
|
| 418 |
writer = get_stream_writer()
|
| 419 |
query = state["query"]
|
| 420 |
complexity = state.get("query_complexity", "simple")
|
| 421 |
+
is_audio_mode = state.get("is_audio_mode", False)
|
| 422 |
reranked_chunks = state.get("reranked_chunks", [])
|
| 423 |
if len(reranked_chunks) > 12:
|
| 424 |
logger.warning("generate: unusually large reranked chunk set (%d); truncating to 12.", len(reranked_chunks))
|
|
|
|
| 514 |
# (Llama 3.1 8B on simple queries), we switch to direct emission with no wait.
|
| 515 |
stream = llm_client.complete_with_complexity(
|
| 516 |
prompt=prompt,
|
| 517 |
+
system=_VOICE_SYSTEM_PROMPT if is_audio_mode else build_system_prompt(),
|
| 518 |
stream=True,
|
| 519 |
complexity=complexity,
|
| 520 |
)
|
|
|
|
| 626 |
# collapse repeated document references into one source card.
|
| 627 |
full_answer, cited_sources = _reindex_citations_and_sources(full_answer, source_refs)
|
| 628 |
|
| 629 |
+
# ── Voice mode: strip all citation markers and markdown ───────────────
|
| 630 |
+
# TTS reads "[1]" and "**bold**" literally. Strip them from the spoken
|
| 631 |
+
# answer but keep cited_sources so the frontend can still render pills.
|
| 632 |
+
if is_audio_mode:
|
| 633 |
+
full_answer = _strip_for_voice(full_answer)
|
| 634 |
+
|
| 635 |
# ── Stage 3: SELF-RAG critic ──────────────────────────────────────────
|
| 636 |
# Runs after answer is fully streamed — zero latency impact on first token.
|
| 637 |
# Scores groundedness (stays in passages), completeness (covers the query),
|
app/services/transcriber.py
CHANGED
|
@@ -44,7 +44,7 @@ class GroqTranscriber:
|
|
| 44 |
|
| 45 |
@retry(
|
| 46 |
stop=stop_after_attempt(2),
|
| 47 |
-
wait=wait_fixed(0.
|
| 48 |
retry=retry_if_exception_type((httpx.RequestError, httpx.TimeoutException)),
|
| 49 |
)
|
| 50 |
async def transcribe(
|
|
@@ -65,20 +65,21 @@ class GroqTranscriber:
|
|
| 65 |
model=self._model,
|
| 66 |
temperature=0,
|
| 67 |
language=target_language,
|
| 68 |
-
prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
)
|
| 70 |
text = getattr(response, "text", None)
|
| 71 |
if isinstance(text, str) and text.strip():
|
| 72 |
cleaned = _normalise_transcript_text(text, self._replacements)
|
| 73 |
-
if len(cleaned) < 3:
|
| 74 |
-
raise GenerationError("Transcription too short to be valid")
|
| 75 |
return cleaned
|
| 76 |
if isinstance(response, dict):
|
| 77 |
value = response.get("text")
|
| 78 |
if isinstance(value, str) and value.strip():
|
| 79 |
cleaned = _normalise_transcript_text(value, self._replacements)
|
| 80 |
-
if len(cleaned) < 3:
|
| 81 |
-
raise GenerationError("Transcription too short to be valid")
|
| 82 |
return cleaned
|
| 83 |
raise GenerationError("Transcription response did not contain text")
|
| 84 |
|
|
|
|
| 44 |
|
| 45 |
@retry(
|
| 46 |
stop=stop_after_attempt(2),
|
| 47 |
+
wait=wait_fixed(0.3),
|
| 48 |
retry=retry_if_exception_type((httpx.RequestError, httpx.TimeoutException)),
|
| 49 |
)
|
| 50 |
async def transcribe(
|
|
|
|
| 65 |
model=self._model,
|
| 66 |
temperature=0,
|
| 67 |
language=target_language,
|
| 68 |
+
# Domain-specific prompt biases Whisper toward correct spellings
|
| 69 |
+
# and prevents hallucination of generic words for quiet/short clips.
|
| 70 |
+
prompt=(
|
| 71 |
+
"PersonaBot, Darshan Chheda, RAG, Qdrant, LangGraph, FastAPI, "
|
| 72 |
+
"TypeScript, portfolio, software engineering, HuggingFace, Groq"
|
| 73 |
+
),
|
| 74 |
)
|
| 75 |
text = getattr(response, "text", None)
|
| 76 |
if isinstance(text, str) and text.strip():
|
| 77 |
cleaned = _normalise_transcript_text(text, self._replacements)
|
|
|
|
|
|
|
| 78 |
return cleaned
|
| 79 |
if isinstance(response, dict):
|
| 80 |
value = response.get("text")
|
| 81 |
if isinstance(value, str) and value.strip():
|
| 82 |
cleaned = _normalise_transcript_text(value, self._replacements)
|
|
|
|
|
|
|
| 83 |
return cleaned
|
| 84 |
raise GenerationError("Transcription response did not contain text")
|
| 85 |
|
app/services/tts_client.py
CHANGED
|
@@ -64,10 +64,16 @@ class TTSClient:
|
|
| 64 |
text = text.strip()
|
| 65 |
if not text:
|
| 66 |
raise GenerationError("TTS request text is empty")
|
| 67 |
-
|
| 68 |
if not self.is_configured:
|
| 69 |
raise GenerationError("TTS client is not configured")
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
try:
|
| 72 |
async with self._http.stream(
|
| 73 |
"POST",
|
|
@@ -76,8 +82,15 @@ class TTSClient:
|
|
| 76 |
headers={"Content-Type": "application/json"},
|
| 77 |
) as response:
|
| 78 |
response.raise_for_status()
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
except httpx.TimeoutException as exc:
|
| 82 |
raise GenerationError("TTS request timed out") from exc
|
| 83 |
except httpx.HTTPStatusError as exc:
|
|
|
|
| 64 |
text = text.strip()
|
| 65 |
if not text:
|
| 66 |
raise GenerationError("TTS request text is empty")
|
| 67 |
+
|
| 68 |
if not self.is_configured:
|
| 69 |
raise GenerationError("TTS client is not configured")
|
| 70 |
|
| 71 |
+
# Minimum bytes to buffer before yielding each chunk.
|
| 72 |
+
# Sub-kilobyte chunks cause micro-stutter in browsers because each
|
| 73 |
+
# Web Audio decode call has fixed overhead; 4KB is a safe floor for
|
| 74 |
+
# 22kHz mono PCM without adding noticeable latency.
|
| 75 |
+
_MIN_CHUNK_BYTES = 4096
|
| 76 |
+
|
| 77 |
try:
|
| 78 |
async with self._http.stream(
|
| 79 |
"POST",
|
|
|
|
| 82 |
headers={"Content-Type": "application/json"},
|
| 83 |
) as response:
|
| 84 |
response.raise_for_status()
|
| 85 |
+
buf = b""
|
| 86 |
+
async for chunk in response.aiter_bytes(chunk_size=8192):
|
| 87 |
+
buf += chunk
|
| 88 |
+
while len(buf) >= _MIN_CHUNK_BYTES:
|
| 89 |
+
yield buf[:_MIN_CHUNK_BYTES]
|
| 90 |
+
buf = buf[_MIN_CHUNK_BYTES:]
|
| 91 |
+
# Flush remainder (always — this carries the last PCM samples).
|
| 92 |
+
if buf:
|
| 93 |
+
yield buf
|
| 94 |
except httpx.TimeoutException as exc:
|
| 95 |
raise GenerationError("TTS request timed out") from exc
|
| 96 |
except httpx.HTTPStatusError as exc:
|