Spaces:

samarth1999
/

Saathi

Sleeping

File size: 8,084 Bytes

"""Module 6 - Voice Notes: browser dictation + Saathi response.

Streamlit 1.32 does not include a native microphone-to-text widget. To stay
dependency-light for HF Spaces, this module uses the browser Web Speech API
inside a local HTML component. The transcript stays in the browser until the
user pastes it into the Streamlit text area.
"""
from __future__ import annotations

import json
from typing import Dict, List

import streamlit as st
import streamlit.components.v1 as components

from backend.claude_client import chat
from backend.i18n import claude_language_name, t
from backend.safeguards import check_crisis, render_crisis_banner

# Identifier passed to the chat backend as its ``module`` argument.
MODULE_NAME = "voice_notes"

# ``st.session_state`` keys owned by this module.
NOTE_KEY = "voice_note"
RESPONSE_KEY = "voice_response"
HISTORY_KEY = "voice_history"

# UI language code -> locale tag assigned to the browser recogniser's ``lang``.
# Every supported language uses its India ("-IN") regional variant.
VOICE_LANG_TAGS = {
    code: f"{code}-IN" for code in ("en", "hi", "bn", "ta", "te", "mr", "ur")
}


def _init_state() -> None:
    """Seed this module's session-state slots if they are missing.

    The note and response slots start as empty strings and the history slot
    as an empty list. Keys that are already present are left untouched.
    """
    # Factories instead of literal values, so the history slot always gets
    # its own newly created list.
    slot_factories = ((NOTE_KEY, str), (RESPONSE_KEY, str), (HISTORY_KEY, list))
    for slot, make_default in slot_factories:
        if slot not in st.session_state:
            st.session_state[slot] = make_default()


def _append_history(role: str, content: str) -> None:
    """Add one message to the end of the stored voice-note history.

    Args:
        role: Value stored under the entry's ``"role"`` key.
        content: Value stored under the entry's ``"content"`` key.
    """
    entry = {"role": role, "content": content}
    # Build a new list and reassign it rather than mutating the stored list
    # in place.
    st.session_state[HISTORY_KEY] = [*st.session_state.get(HISTORY_KEY, []), entry]


def _render_voice_component(lang: str) -> None:
    """Render the in-browser dictation widget as a static HTML component.

    The widget shows Start/Stop/Copy/Clear buttons and a transcript box driven
    by the browser's ``SpeechRecognition`` API. Captions come from ``t(...)``
    for ``lang``; the recognition locale comes from ``VOICE_LANG_TAGS`` and
    falls back to ``"en-IN"``. The component's return value is not used: the
    user copies the transcript and pastes it into the Streamlit text area.

    Args:
        lang: UI language code used for caption lookup and recognition locale.
    """
    cfg = {
        "lang": VOICE_LANG_TAGS.get(lang, "en-IN"),
        "start": t("voice_start_button", lang),
        "stop": t("voice_stop_button", lang),
        "copy": t("voice_copy_button", lang),
        "clear": t("voice_clear_button", lang),
        "transcript": t("voice_transcript_label", lang),
        "unsupported": t("voice_not_supported", lang),
    }
    # The JSON is inlined into a <script> element, where a literal "</script>"
    # or "<!--" inside any caption would terminate or corrupt the script.
    # "<" can only occur inside JSON strings, so escaping it is lossless.
    cfg_json = json.dumps(cfg, ensure_ascii=False).replace("<", "\\u003c")
    components.html(
        f"""
        <div style="font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;">
          <div id="voiceStatus" style="margin-bottom: 8px; color: #374151;"></div>
          <div style="display: flex; flex-wrap: wrap; gap: 8px; margin-bottom: 10px;">
            <button id="startBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #4f46e5; background: #4f46e5; color: white;">Start</button>
            <button id="stopBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #6b7280; background: white; color: #111827;">Stop</button>
            <button id="copyBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #6b7280; background: white; color: #111827;">Copy</button>
            <button id="clearBtn" style="border-radius: 8px; padding: 8px 12px; border: 1px solid #6b7280; background: white; color: #111827;">Clear</button>
          </div>
          <label for="transcriptBox" style="display:block; font-weight: 600; margin-bottom: 6px;">Transcript</label>
          <textarea id="transcriptBox" style="width: 100%; min-height: 150px; border: 1px solid #d1d5db; border-radius: 8px; padding: 10px; font-size: 15px; line-height: 1.45;"></textarea>
        </div>
        <script>
        const cfg = {cfg_json};
        const statusEl = document.getElementById("voiceStatus");
        const transcriptEl = document.getElementById("transcriptBox");
        const startBtn = document.getElementById("startBtn");
        const stopBtn = document.getElementById("stopBtn");
        const copyBtn = document.getElementById("copyBtn");
        const clearBtn = document.getElementById("clearBtn");
        const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;

        startBtn.textContent = cfg.start;
        stopBtn.textContent = cfg.stop;
        copyBtn.textContent = cfg.copy;
        clearBtn.textContent = cfg.clear;
        document.querySelector("label[for='transcriptBox']").textContent = cfg.transcript;

        let recognition = null;
        let finalTranscript = "";
        // True from a successful start() call until the session's end event.
        let listening = false;
        // Set when the current session reported an error.
        let sessionErrored = false;

        if (!SpeechRecognition) {{
          statusEl.textContent = cfg.unsupported;
          startBtn.disabled = true;
          stopBtn.disabled = true;
        }} else {{
          recognition = new SpeechRecognition();
          recognition.lang = cfg.lang;
          recognition.continuous = true;
          recognition.interimResults = true;

          recognition.onstart = () => {{
            statusEl.textContent = "Listening...";
          }};
          recognition.onerror = (event) => {{
            sessionErrored = true;
            statusEl.textContent = "Voice capture stopped: " + event.error;
          }};
          recognition.onend = () => {{
            listening = false;
            // An end event follows every error; keep the error text visible
            // instead of replacing it with the generic "stopped" hint.
            if (!sessionErrored) {{
              statusEl.textContent = "Stopped. Copy the transcript, then paste it below.";
            }}
          }};
          recognition.onresult = (event) => {{
            let interim = "";
            for (let i = event.resultIndex; i < event.results.length; i++) {{
              const piece = event.results[i][0].transcript;
              if (event.results[i].isFinal) {{
                finalTranscript += piece + " ";
              }} else {{
                interim += piece;
              }}
            }}
            transcriptEl.value = (finalTranscript + interim).trim();
          }};
        }}

        startBtn.onclick = () => {{
          // start() throws InvalidStateError while a session is already active,
          // so repeated clicks on Start are ignored until the session ends.
          if (!recognition || listening) return;
          sessionErrored = false;
          try {{
            recognition.start();
            listening = true;
          }} catch (err) {{
            statusEl.textContent = "Voice capture stopped: " + err.name;
          }}
        }};
        stopBtn.onclick = () => {{
          if (recognition) recognition.stop();
        }};
        copyBtn.onclick = async () => {{
          try {{
            await navigator.clipboard.writeText(transcriptEl.value);
            statusEl.textContent = "Copied. Paste it into the Streamlit box below.";
          }} catch (err) {{
            // Fallback for contexts where the async Clipboard API is unavailable.
            transcriptEl.select();
            document.execCommand("copy");
            statusEl.textContent = "Copied. Paste it into the Streamlit box below.";
          }}
        }};
        clearBtn.onclick = () => {{
          finalTranscript = "";
          transcriptEl.value = "";
          statusEl.textContent = "";
        }};
        </script>
        """,
        height=310,
    )


def render(lang: str) -> None:
    """Draw the Voice Notes page: dictation widget, note box, actions, history.

    Saving stores the stripped note and logs it as a user message. Asking does
    the same, then sends the note to ``chat`` and logs the reply. If
    ``check_crisis`` flags the note, the crisis banner is rendered and the
    function returns without storing or sending anything.

    Args:
        lang: UI language code passed to every ``t(...)`` lookup, to the
            dictation widget, and to ``claude_language_name``.
    """
    _init_state()
    state = st.session_state

    st.header(t("voice_header", lang))
    st.caption(t("voice_sub", lang))
    st.info(t("voice_copy_note", lang))
    _render_voice_component(lang)

    note = st.text_area(
        t("voice_note_label", lang),
        key="voice_note_input",
        value=state[NOTE_KEY],
        height=140,
        placeholder=t("voice_paste_placeholder", lang),
    )

    save_col, ask_col, _spacer = st.columns([1, 1, 3])
    with save_col:
        save_clicked = st.button(
            t("voice_save_button", lang),
            key="voice_save_button",
            type="secondary",
        )
    with ask_col:
        ask_clicked = st.button(
            t("voice_ask_button", lang),
            key="voice_ask_button",
            type="primary",
        )

    # The note is only inspected when one of the buttons was pressed.
    text = note.strip() if (save_clicked or ask_clicked) else ""

    if save_clicked and text:
        state[NOTE_KEY] = text
        _append_history("user", text)
        st.success(t("voice_saved", lang))

    if ask_clicked and text:
        # Crisis screening sees the raw note and runs before anything is
        # stored or sent to the model.
        if check_crisis(note):
            render_crisis_banner(lang)
            return

        state[NOTE_KEY] = text
        _append_history("user", text)
        with st.spinner("..."):
            try:
                reply = chat(
                    module=MODULE_NAME,
                    user_text=text,
                    language_name=claude_language_name(lang),
                    max_tokens=1200,
                )
            except Exception as exc:
                # A failure is shown (and logged to history) in place of a reply.
                reply = f"(Could not reach the model right now: {exc})"
        state[RESPONSE_KEY] = reply
        _append_history("assistant", reply)

    latest_reply = state[RESPONSE_KEY]
    if latest_reply:
        st.markdown(latest_reply)

    st.markdown(f"##### {t('voice_history_heading', lang)}")
    history: List[Dict[str, str]] = state.get(HISTORY_KEY, [])
    if history:
        # Only the six most recent messages are displayed.
        for entry in history[-6:]:
            with st.chat_message(entry.get("role", "assistant")):
                st.markdown(entry.get("content", ""))
    else:
        st.caption(t("voice_no_history", lang))