Spaces:
Sleeping
Sleeping
File size: 8,084 Bytes
"""Module 6 - Voice Notes: browser dictation + Saathi response.
Streamlit 1.32 does not include a native microphone-to-text widget. To stay
dependency-light for HF Spaces, this module uses the browser Web Speech API
inside a local HTML component. The transcript stays in the browser until the
user pastes it into the Streamlit text area.
"""
from __future__ import annotations
import json
from typing import Dict, List
import streamlit as st
import streamlit.components.v1 as components
from backend.claude_client import chat
from backend.i18n import claude_language_name, t
from backend.safeguards import check_crisis, render_crisis_banner
# Module identifier handed to the chat backend as ``module=``.
MODULE_NAME = "voice_notes"

# Keys under which this module stores its data in ``st.session_state``.
NOTE_KEY = "voice_note"
RESPONSE_KEY = "voice_response"
HISTORY_KEY = "voice_history"

# App language code -> language tag given to the browser speech recognizer.
# Every language listed here uses the "-IN" regional variant.
VOICE_LANG_TAGS = {
    code: f"{code}-IN"
    for code in ("en", "hi", "bn", "ta", "te", "mr", "ur")
}
def _init_state() -> None:
    """Seed this module's session-state slots the first time they are needed.

    The note and the response start as empty strings and the history starts
    as an empty list. Slots that already hold a value are left untouched.
    """
    state = st.session_state
    # Factories rather than shared instances, so the history list is always
    # a fresh object.
    slot_factories = (
        (NOTE_KEY, str),
        (RESPONSE_KEY, str),
        (HISTORY_KEY, list),
    )
    for slot, make_default in slot_factories:
        if slot in state:
            continue
        state[slot] = make_default()
def _append_history(role: str, content: str) -> None:
    """Add one ``{"role", "content"}`` message to the stored history.

    A new list is built and written back to session state; the list that
    was stored before is not mutated in place.
    """
    entry = {"role": role, "content": content}
    previous = st.session_state.get(HISTORY_KEY, [])
    st.session_state[HISTORY_KEY] = [*previous, entry]
def _render_voice_component(lang: str) -> None:
    """Render the in-browser dictation widget as an embedded HTML component.

    The widget uses the browser's Web Speech API (``SpeechRecognition`` or
    the ``webkit`` prefixed variant) to fill a textarea with the spoken
    transcript and offers Start / Stop / Copy / Clear buttons. Nothing is
    sent back to Python from here: the user copies the transcript and pastes
    it into the Streamlit text area rendered by the caller.

    Args:
        lang: App language code. Selects the translated button labels and,
            through ``VOICE_LANG_TAGS``, the recognition language
            (falling back to ``"en-IN"`` for unknown codes).
    """
    cfg = {
        "lang": VOICE_LANG_TAGS.get(lang, "en-IN"),
        "start": t("voice_start_button", lang),
        "stop": t("voice_stop_button", lang),
        "copy": t("voice_copy_button", lang),
        "clear": t("voice_clear_button", lang),
        "transcript": t("voice_transcript_label", lang),
        "unsupported": t("voice_not_supported", lang),
    }
    # The JSON is pasted into an inline <script>. Escaping "<" keeps a label
    # that contains "</script>" or "<!--" from terminating or corrupting the
    # script; "\u003c" is still a valid JSON / JavaScript string escape.
    cfg_json = json.dumps(cfg, ensure_ascii=False).replace("<", "\\u003c")
    components.html(
        f"""
        <div style="font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;">
          <div id="voiceStatus" style="margin-bottom: 8px; color: #374151;"></div>
          <div style="display: flex; flex-wrap: wrap; gap: 8px; margin-bottom: 10px;">
            <button id="startBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #4f46e5; background: #4f46e5; color: white;">Start</button>
            <button id="stopBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #6b7280; background: white; color: #111827;">Stop</button>
            <button id="copyBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #6b7280; background: white; color: #111827;">Copy</button>
            <button id="clearBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #6b7280; background: white; color: #111827;">Clear</button>
          </div>
          <label for="transcriptBox" style="display:block; font-weight: 600; margin-bottom: 6px;">Transcript</label>
          <textarea id="transcriptBox" style="width: 100%; min-height: 150px; border: 1px solid #d1d5db; border-radius: 8px; padding: 10px; font-size: 15px; line-height: 1.45;"></textarea>
        </div>
        <script>
          const cfg = {cfg_json};
          const statusEl = document.getElementById("voiceStatus");
          const transcriptEl = document.getElementById("transcriptBox");
          const startBtn = document.getElementById("startBtn");
          const stopBtn = document.getElementById("stopBtn");
          const copyBtn = document.getElementById("copyBtn");
          const clearBtn = document.getElementById("clearBtn");
          const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
          startBtn.textContent = cfg.start;
          stopBtn.textContent = cfg.stop;
          copyBtn.textContent = cfg.copy;
          clearBtn.textContent = cfg.clear;
          document.querySelector("label[for='transcriptBox']").textContent = cfg.transcript;
          let recognition = null;
          let finalTranscript = "";
          let listening = false;
          let lastError = "";
          if (!SpeechRecognition) {{
            statusEl.textContent = cfg.unsupported;
            startBtn.disabled = true;
            stopBtn.disabled = true;
          }} else {{
            recognition = new SpeechRecognition();
            recognition.lang = cfg.lang;
            recognition.continuous = true;
            recognition.interimResults = true;
            recognition.onstart = () => {{
              listening = true;
              lastError = "";
              statusEl.textContent = "Listening...";
            }};
            recognition.onerror = (event) => {{
              lastError = event.error;
              statusEl.textContent = "Voice capture stopped: " + event.error;
            }};
            recognition.onend = () => {{
              listening = false;
              // "end" fires right after "error"; keep the error text visible
              // instead of replacing it with the normal stop message.
              if (!lastError) {{
                statusEl.textContent = "Stopped. Copy the transcript, then paste it below.";
              }}
            }};
            recognition.onresult = (event) => {{
              let interim = "";
              for (let i = event.resultIndex; i < event.results.length; i++) {{
                const piece = event.results[i][0].transcript;
                if (event.results[i].isFinal) {{
                  finalTranscript += piece + " ";
                }} else {{
                  interim += piece;
                }}
              }}
              transcriptEl.value = (finalTranscript + interim).trim();
            }};
          }}
          startBtn.onclick = () => {{
            if (!recognition || listening) return;
            lastError = "";
            try {{
              recognition.start();
            }} catch (err) {{
              // start() throws InvalidStateError when a session is already
              // starting (second click before "start" fired); ignore it.
            }}
          }};
          stopBtn.onclick = () => {{
            if (recognition) recognition.stop();
          }};
          copyBtn.onclick = async () => {{
            try {{
              await navigator.clipboard.writeText(transcriptEl.value);
              statusEl.textContent = "Copied. Paste it into the Streamlit box below.";
            }} catch (err) {{
              // Clipboard API unavailable or blocked: fall back to the legacy
              // command and only claim success when it reports success.
              transcriptEl.select();
              const copied = document.execCommand("copy");
              statusEl.textContent = copied
                ? "Copied. Paste it into the Streamlit box below."
                : "Could not copy automatically. Select the transcript and copy it manually.";
            }}
          }};
          clearBtn.onclick = () => {{
            finalTranscript = "";
            transcriptEl.value = "";
            statusEl.textContent = "";
          }};
        </script>
        """,
        height=310,
    )
def _record_user_note(text: str) -> None:
    """Add *text* to the history as a user message unless it is already the latest entry.

    Saving a note and then asking about it (or pressing Save repeatedly)
    would otherwise stack identical user messages and push older messages
    out of the short history window shown to the user.
    """
    history = st.session_state.get(HISTORY_KEY, [])
    latest = history[-1] if history else {}
    if latest.get("role") == "user" and latest.get("content") == text:
        return
    _append_history("user", text)


def render(lang: str) -> None:
    """Render the Voice Notes page.

    Shows the dictation widget, a text area for the pasted transcript, and
    Save / Ask buttons. Save stores the note; Ask first runs the crisis
    check and, if it does not trigger, sends the note to the chat backend
    and stores the reply. The latest reply and the six most recent history
    messages are displayed below.

    Args:
        lang: App language code used for all translated UI strings and for
            the language the model is asked to answer in.
    """
    _init_state()
    st.header(t("voice_header", lang))
    st.caption(t("voice_sub", lang))
    st.info(t("voice_copy_note", lang))
    _render_voice_component(lang)
    note = st.text_area(
        t("voice_note_label", lang),
        value=st.session_state[NOTE_KEY],
        placeholder=t("voice_paste_placeholder", lang),
        height=140,
        key="voice_note_input",
    )
    cols = st.columns([1, 1, 3])
    with cols[0]:
        save_clicked = st.button(t("voice_save_button", lang), key="voice_save_button", type="secondary")
    with cols[1]:
        ask_clicked = st.button(t("voice_ask_button", lang), key="voice_ask_button", type="primary")

    # Whitespace-only input counts as "no note" for both buttons.
    text = note.strip()

    if save_clicked and text:
        st.session_state[NOTE_KEY] = text
        _record_user_note(text)
        st.success(t("voice_saved", lang))

    if ask_clicked and text:
        # The crisis check runs before anything is stored or sent to the
        # model; when it triggers, only the banner is shown for this run.
        if check_crisis(note):
            render_crisis_banner(lang)
            return
        st.session_state[NOTE_KEY] = text
        _record_user_note(text)
        with st.spinner("..."):
            try:
                response = chat(
                    module=MODULE_NAME,
                    user_text=text,
                    language_name=claude_language_name(lang),
                    max_tokens=1200,
                )
            except Exception as e:
                # UI boundary: show the failure in place of a reply instead
                # of crashing the page.
                response = f"(Could not reach the model right now: {e})"
        st.session_state[RESPONSE_KEY] = response
        _append_history("assistant", response)

    if st.session_state[RESPONSE_KEY]:
        st.markdown(st.session_state[RESPONSE_KEY])

    st.markdown(f"##### {t('voice_history_heading', lang)}")
    history: List[Dict[str, str]] = st.session_state.get(HISTORY_KEY, [])
    if not history:
        st.caption(t("voice_no_history", lang))
    # Only the six most recent messages are displayed.
    for msg in history[-6:]:
        with st.chat_message(msg.get("role", "assistant")):
            st.markdown(msg.get("content", ""))
|