# Saathi / modules / voice_notes.py
# Author: Pushpraj
# Fixed distortion (commit 2bbcf98)
"""Module 6 - Voice Notes: browser dictation + Saathi response.
Streamlit 1.32 does not include a native microphone-to-text widget. To stay
dependency-light for HF Spaces, this module uses the browser Web Speech API
inside a local HTML component. The transcript stays in the browser until the
user pastes it into the Streamlit text area.
"""
from __future__ import annotations
import json
from typing import Dict, List
import streamlit as st
import streamlit.components.v1 as components
from backend.claude_client import chat
from backend.i18n import claude_language_name, t
from backend.safeguards import check_crisis, render_crisis_banner
# Identifier sent to the backend chat client for prompt routing.
MODULE_NAME = "voice_notes"

# Session-state slot names used by this module.
NOTE_KEY = "voice_note"
RESPONSE_KEY = "voice_response"
HISTORY_KEY = "voice_history"

# Each supported UI language maps to its Indian-region BCP-47 speech tag
# (fed to the browser's SpeechRecognition.lang).
VOICE_LANG_TAGS = {code: f"{code}-IN" for code in ("en", "hi", "bn", "ta", "te", "mr", "ur")}
def _init_state() -> None:
    """Ensure this module's session-state slots exist before first use."""
    defaults = (
        (NOTE_KEY, ""),
        (RESPONSE_KEY, ""),
        (HISTORY_KEY, []),
    )
    for key, initial in defaults:
        if key not in st.session_state:
            st.session_state[key] = initial
def _append_history(role: str, content: str) -> None:
    """Append one chat turn to the session history.

    Rebinds a fresh list into session state (rather than mutating in place)
    so Streamlit registers the update.
    """
    entry = {"role": role, "content": content}
    st.session_state[HISTORY_KEY] = [*st.session_state.get(HISTORY_KEY, []), entry]
def _render_voice_component(lang: str) -> None:
    """Embed a browser-side dictation widget built on the Web Speech API.

    The component runs entirely in the user's browser: speech is transcribed
    locally by the browser, and the user copies the transcript into the
    Streamlit text area manually (no data flows back through Streamlit).

    Args:
        lang: Two-letter UI language code; selects both the recognition
            language tag and the localised button labels.
    """
    # Localised labels plus the BCP-47 recognition tag; unknown language
    # codes fall back to Indian English ("en-IN").
    cfg = {
        "lang": VOICE_LANG_TAGS.get(lang, "en-IN"),
        "start": t("voice_start_button", lang),
        "stop": t("voice_stop_button", lang),
        "copy": t("voice_copy_button", lang),
        "clear": t("voice_clear_button", lang),
        "transcript": t("voice_transcript_label", lang),
        "unsupported": t("voice_not_supported", lang),
    }
    # ensure_ascii=False keeps Devanagari/Bengali/etc. label text readable
    # when embedded directly into the JS below.
    cfg_json = json.dumps(cfg, ensure_ascii=False)
    # NOTE: doubled braces ({{ }}) inside the f-string are escapes for
    # literal JS braces; only {cfg_json} is interpolated by Python.
    components.html(
        f"""
        <div style="font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;">
          <div id="voiceStatus" style="margin-bottom: 8px; color: #374151;"></div>
          <div style="display: flex; flex-wrap: wrap; gap: 8px; margin-bottom: 10px;">
            <button id="startBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #4f46e5; background: #4f46e5; color: white;">Start</button>
            <button id="stopBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #6b7280; background: white; color: #111827;">Stop</button>
            <button id="copyBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #6b7280; background: white; color: #111827;">Copy</button>
            <button id="clearBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #6b7280; background: white; color: #111827;">Clear</button>
          </div>
          <label for="transcriptBox" style="display:block; font-weight: 600; margin-bottom: 6px;">Transcript</label>
          <textarea id="transcriptBox" style="width: 100%; min-height: 150px; border: 1px solid #d1d5db; border-radius: 8px; padding: 10px; font-size: 15px; line-height: 1.45;"></textarea>
        </div>
        <script>
        const cfg = {cfg_json};
        const statusEl = document.getElementById("voiceStatus");
        const transcriptEl = document.getElementById("transcriptBox");
        const startBtn = document.getElementById("startBtn");
        const stopBtn = document.getElementById("stopBtn");
        const copyBtn = document.getElementById("copyBtn");
        const clearBtn = document.getElementById("clearBtn");
        const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
        startBtn.textContent = cfg.start;
        stopBtn.textContent = cfg.stop;
        copyBtn.textContent = cfg.copy;
        clearBtn.textContent = cfg.clear;
        document.querySelector("label[for='transcriptBox']").textContent = cfg.transcript;
        let recognition = null;
        let finalTranscript = "";
        if (!SpeechRecognition) {{
            statusEl.textContent = cfg.unsupported;
            startBtn.disabled = true;
            stopBtn.disabled = true;
        }} else {{
            recognition = new SpeechRecognition();
            recognition.lang = cfg.lang;
            recognition.continuous = true;
            recognition.interimResults = true;
            recognition.onstart = () => {{
                statusEl.textContent = "Listening...";
            }};
            recognition.onerror = (event) => {{
                statusEl.textContent = "Voice capture stopped: " + event.error;
            }};
            recognition.onend = () => {{
                statusEl.textContent = "Stopped. Copy the transcript, then paste it below.";
            }};
            recognition.onresult = (event) => {{
                let interim = "";
                for (let i = event.resultIndex; i < event.results.length; i++) {{
                    const piece = event.results[i][0].transcript;
                    if (event.results[i].isFinal) {{
                        finalTranscript += piece + " ";
                    }} else {{
                        interim += piece;
                    }}
                }}
                transcriptEl.value = (finalTranscript + interim).trim();
            }};
        }}
        startBtn.onclick = () => {{
            if (recognition) recognition.start();
        }};
        stopBtn.onclick = () => {{
            if (recognition) recognition.stop();
        }};
        copyBtn.onclick = async () => {{
            try {{
                await navigator.clipboard.writeText(transcriptEl.value);
                statusEl.textContent = "Copied. Paste it into the Streamlit box below.";
            }} catch (err) {{
                transcriptEl.select();
                document.execCommand("copy");
                statusEl.textContent = "Copied. Paste it into the Streamlit box below.";
            }}
        }};
        clearBtn.onclick = () => {{
            finalTranscript = "";
            transcriptEl.value = "";
            statusEl.textContent = "";
        }};
        </script>
        """,
        height=310,
    )
def render(lang: str) -> None:
    """Render the Voice Notes page.

    Flow: browser dictation component -> user pastes transcript into a text
    area -> "save" stores it locally, "ask" also sends it to the model
    (after a crisis-keyword check) and shows the reply plus recent history.

    Args:
        lang: Two-letter UI language code used for all localised strings.
    """
    _init_state()
    st.header(t("voice_header", lang))
    st.caption(t("voice_sub", lang))
    st.info(t("voice_copy_note", lang))
    _render_voice_component(lang)
    note = st.text_area(
        t("voice_note_label", lang),
        value=st.session_state[NOTE_KEY],
        placeholder=t("voice_paste_placeholder", lang),
        height=140,
        key="voice_note_input",
    )
    # Hoisted: the original recomputed note.strip() up to five times.
    cleaned = note.strip()
    cols = st.columns([1, 1, 3])
    with cols[0]:
        save_clicked = st.button(t("voice_save_button", lang), key="voice_save_button", type="secondary")
    with cols[1]:
        ask_clicked = st.button(t("voice_ask_button", lang), key="voice_ask_button", type="primary")
    if save_clicked and cleaned:
        st.session_state[NOTE_KEY] = cleaned
        _append_history("user", cleaned)
        st.success(t("voice_saved", lang))
    if ask_clicked and cleaned:
        # Crisis check intentionally runs on the raw note (not the stripped
        # copy) to match the original behavior; on a hit we show the banner
        # and skip the model call entirely.
        if check_crisis(note):
            render_crisis_banner(lang)
            return
        st.session_state[NOTE_KEY] = cleaned
        _append_history("user", cleaned)
        with st.spinner("..."):
            try:
                response = chat(
                    module=MODULE_NAME,
                    user_text=cleaned,
                    language_name=claude_language_name(lang),
                    max_tokens=1200,
                )
            except Exception as e:  # boundary: degrade to an inline notice rather than crash the page
                response = f"(Could not reach the model right now: {e})"
        st.session_state[RESPONSE_KEY] = response
        _append_history("assistant", response)
    if st.session_state[RESPONSE_KEY]:
        st.markdown(st.session_state[RESPONSE_KEY])
    st.markdown(f"##### {t('voice_history_heading', lang)}")
    history: List[Dict[str, str]] = st.session_state.get(HISTORY_KEY, [])
    if not history:
        st.caption(t("voice_no_history", lang))
    # Only the six most recent turns are shown to keep the page compact.
    for msg in history[-6:]:
        with st.chat_message(msg.get("role", "assistant")):
            st.markdown(msg.get("content", ""))