# Saathi / modules / voice_notes.py
# Author: Pushpraj
# Fixed distortion (commit 2bbcf98)
"""Module 6 - Voice Notes: browser dictation + Saathi response.
Streamlit 1.32 does not include a native microphone-to-text widget. To stay
dependency-light for HF Spaces, this module uses the browser Web Speech API
inside a local HTML component. The transcript stays in the browser until the
user pastes it into the Streamlit text area.
"""
from __future__ import annotations
import json
from typing import Dict, List
import streamlit as st
import streamlit.components.v1 as components
from backend.claude_client import chat
from backend.i18n import claude_language_name, t
from backend.safeguards import check_crisis, render_crisis_banner
# Identifier sent to the backend chat client for prompt routing.
MODULE_NAME = "voice_notes"

# Session-state slot names used by this module.
NOTE_KEY = "voice_note"
RESPONSE_KEY = "voice_response"
HISTORY_KEY = "voice_history"

# Each supported UI language maps to its Indian-region BCP-47 speech tag
# (fed to the browser's SpeechRecognition.lang).
VOICE_LANG_TAGS = {code: f"{code}-IN" for code in ("en", "hi", "bn", "ta", "te", "mr", "ur")}
def _init_state() -> None:
    """Ensure this module's session-state slots exist before first use."""
    defaults = (
        (NOTE_KEY, ""),
        (RESPONSE_KEY, ""),
        (HISTORY_KEY, []),
    )
    for key, initial in defaults:
        if key not in st.session_state:
            st.session_state[key] = initial
def _append_history(role: str, content: str) -> None:
    """Append one chat turn to the session history.

    Rebinds a fresh list into session state (rather than mutating in place)
    so Streamlit registers the update.
    """
    entry = {"role": role, "content": content}
    st.session_state[HISTORY_KEY] = [*st.session_state.get(HISTORY_KEY, []), entry]
def _render_voice_component(lang: str) -> None:
    """Embed a browser-side dictation widget built on the Web Speech API.

    The component runs entirely in the user's browser: speech is transcribed
    locally by the browser, and the user copies the transcript into the
    Streamlit text area manually (no data flows back through Streamlit).

    Args:
        lang: Two-letter UI language code; selects both the recognition
            language tag and the localised button labels.
    """
    # Localised labels plus the BCP-47 recognition tag; unknown language
    # codes fall back to Indian English ("en-IN").
    cfg = {
        "lang": VOICE_LANG_TAGS.get(lang, "en-IN"),
        "start": t("voice_start_button", lang),
        "stop": t("voice_stop_button", lang),
        "copy": t("voice_copy_button", lang),
        "clear": t("voice_clear_button", lang),
        "transcript": t("voice_transcript_label", lang),
        "unsupported": t("voice_not_supported", lang),
    }
    # ensure_ascii=False keeps Devanagari/Bengali/etc. label text readable
    # when embedded directly into the JS below.
    cfg_json = json.dumps(cfg, ensure_ascii=False)
    # NOTE: doubled braces ({{ }}) inside the f-string are escapes for
    # literal JS braces; only {cfg_json} is interpolated by Python.
    components.html(
        f"""
        <div style="font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;">
          <div id="voiceStatus" style="margin-bottom: 8px; color: #374151;"></div>
          <div style="display: flex; flex-wrap: wrap; gap: 8px; margin-bottom: 10px;">
            <button id="startBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #4f46e5; background: #4f46e5; color: white;">Start</button>
            <button id="stopBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #6b7280; background: white; color: #111827;">Stop</button>
            <button id="copyBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #6b7280; background: white; color: #111827;">Copy</button>
            <button id="clearBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #6b7280; background: white; color: #111827;">Clear</button>
          </div>
          <label for="transcriptBox" style="display:block; font-weight: 600; margin-bottom: 6px;">Transcript</label>
          <textarea id="transcriptBox" style="width: 100%; min-height: 150px; border: 1px solid #d1d5db; border-radius: 8px; padding: 10px; font-size: 15px; line-height: 1.45;"></textarea>
        </div>
        <script>
        const cfg = {cfg_json};
        const statusEl = document.getElementById("voiceStatus");
        const transcriptEl = document.getElementById("transcriptBox");
        const startBtn = document.getElementById("startBtn");
        const stopBtn = document.getElementById("stopBtn");
        const copyBtn = document.getElementById("copyBtn");
        const clearBtn = document.getElementById("clearBtn");
        const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
        startBtn.textContent = cfg.start;
        stopBtn.textContent = cfg.stop;
        copyBtn.textContent = cfg.copy;
        clearBtn.textContent = cfg.clear;
        document.querySelector("label[for='transcriptBox']").textContent = cfg.transcript;
        let recognition = null;
        let finalTranscript = "";
        if (!SpeechRecognition) {{
            statusEl.textContent = cfg.unsupported;
            startBtn.disabled = true;
            stopBtn.disabled = true;
        }} else {{
            recognition = new SpeechRecognition();
            recognition.lang = cfg.lang;
            recognition.continuous = true;
            recognition.interimResults = true;
            recognition.onstart = () => {{
                statusEl.textContent = "Listening...";
            }};
            recognition.onerror = (event) => {{
                statusEl.textContent = "Voice capture stopped: " + event.error;
            }};
            recognition.onend = () => {{
                statusEl.textContent = "Stopped. Copy the transcript, then paste it below.";
            }};
            recognition.onresult = (event) => {{
                let interim = "";
                for (let i = event.resultIndex; i < event.results.length; i++) {{
                    const piece = event.results[i][0].transcript;
                    if (event.results[i].isFinal) {{
                        finalTranscript += piece + " ";
                    }} else {{
                        interim += piece;
                    }}
                }}
                transcriptEl.value = (finalTranscript + interim).trim();
            }};
        }}
        startBtn.onclick = () => {{
            if (recognition) recognition.start();
        }};
        stopBtn.onclick = () => {{
            if (recognition) recognition.stop();
        }};
        copyBtn.onclick = async () => {{
            try {{
                await navigator.clipboard.writeText(transcriptEl.value);
                statusEl.textContent = "Copied. Paste it into the Streamlit box below.";
            }} catch (err) {{
                transcriptEl.select();
                document.execCommand("copy");
                statusEl.textContent = "Copied. Paste it into the Streamlit box below.";
            }}
        }};
        clearBtn.onclick = () => {{
            finalTranscript = "";
            transcriptEl.value = "";
            statusEl.textContent = "";
        }};
        </script>
        """,
        height=310,
    )
def render(lang: str) -> None:
    """Render the Voice Notes page.

    Flow: browser dictation component -> user pastes transcript into a text
    area -> "save" stores it locally, "ask" also sends it to the model
    (after a crisis-keyword check) and shows the reply plus recent history.

    Args:
        lang: Two-letter UI language code used for all localised strings.
    """
    _init_state()
    st.header(t("voice_header", lang))
    st.caption(t("voice_sub", lang))
    st.info(t("voice_copy_note", lang))
    _render_voice_component(lang)
    note = st.text_area(
        t("voice_note_label", lang),
        value=st.session_state[NOTE_KEY],
        placeholder=t("voice_paste_placeholder", lang),
        height=140,
        key="voice_note_input",
    )
    # Hoisted: the original recomputed note.strip() up to five times.
    cleaned = note.strip()
    cols = st.columns([1, 1, 3])
    with cols[0]:
        save_clicked = st.button(t("voice_save_button", lang), key="voice_save_button", type="secondary")
    with cols[1]:
        ask_clicked = st.button(t("voice_ask_button", lang), key="voice_ask_button", type="primary")
    if save_clicked and cleaned:
        st.session_state[NOTE_KEY] = cleaned
        _append_history("user", cleaned)
        st.success(t("voice_saved", lang))
    if ask_clicked and cleaned:
        # Crisis check intentionally runs on the raw note (not the stripped
        # copy) to match the original behavior; on a hit we show the banner
        # and skip the model call entirely.
        if check_crisis(note):
            render_crisis_banner(lang)
            return
        st.session_state[NOTE_KEY] = cleaned
        _append_history("user", cleaned)
        with st.spinner("..."):
            try:
                response = chat(
                    module=MODULE_NAME,
                    user_text=cleaned,
                    language_name=claude_language_name(lang),
                    max_tokens=1200,
                )
            except Exception as e:  # boundary: degrade to an inline notice rather than crash the page
                response = f"(Could not reach the model right now: {e})"
        st.session_state[RESPONSE_KEY] = response
        _append_history("assistant", response)
    if st.session_state[RESPONSE_KEY]:
        st.markdown(st.session_state[RESPONSE_KEY])
    st.markdown(f"##### {t('voice_history_heading', lang)}")
    history: List[Dict[str, str]] = st.session_state.get(HISTORY_KEY, [])
    if not history:
        st.caption(t("voice_no_history", lang))
    # Only the six most recent turns are shown to keep the page compact.
    for msg in history[-6:]:
        with st.chat_message(msg.get("role", "assistant")):
            st.markdown(msg.get("content", ""))