update app file
app.py CHANGED
@@ -1,167 +1,167 @@
 import gradio as gr
 import subprocess, json, os, io, tempfile
 from faster_whisper import WhisperModel
 from ollama import Client as OllamaClient
 
 # ---- CONFIG ----
 LLM_MODEL = "llama3.2:3b"  # or "mistral:7b", "qwen2.5:3b"
 WHISPER_SIZE = "small"     # "base", "small", "medium"
 USE_SILERO = True          # set False to use Coqui XTTS v2
 
 import os
 USE_REMOTE_OLLAMA = bool(os.getenv("OLLAMA_HOST"))
 
 if not USE_REMOTE_OLLAMA:
     # Transformers fallback for Spaces (CPU-friendly small instruct model)
     from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
     HF_CHAT_MODEL = os.getenv("HF_CHAT_MODEL", "google/gemma-2-2b-it")  # small instruct model that runs on CPU
     _tok = AutoTokenizer.from_pretrained(HF_CHAT_MODEL)
     _mdl = AutoModelForCausalLM.from_pretrained(HF_CHAT_MODEL, torch_dtype="auto", device_map="auto")
     gen = pipeline("text-generation", model=_mdl, tokenizer=_tok, max_new_tokens=256)
 
 
 # ---- STT (faster-whisper) ----
 # Run on GPU if available: compute_type="float16", device="cuda"
 stt_model = WhisperModel(WHISPER_SIZE, device="cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu",
                          compute_type="float16" if os.environ.get("CUDA_VISIBLE_DEVICES") else "int8")
 
 def speech_to_text(audio_path: str) -> str:
     segments, info = stt_model.transcribe(audio_path, beam_size=1, vad_filter=True)
     text = "".join(seg.text for seg in segments).strip()
     return text
 
 # ---- LLM (Ollama) ----
-ollama = OllamaClient(host="http://127.0.0.1:11434")
+# ollama = OllamaClient(host="http://127.0.0.1:11434")
 
 SYSTEM_PROMPT = """You are a friendly conversational English coach and voice assistant.
 - First, understand the user's utterance.
 - If there are mistakes (grammar/word choice/tense), provide a brief corrected sentence first, prefixed with "Correction:".
 - In 1 short line, explain the key fix, prefixed with "Why:".
 - Then continue the conversation naturally in one or two sentences.
 - Be concise, supportive, and avoid long lectures.
 Format:
 Correction: <corrected sentence or "None">
 Why: <very brief reason, or "N/A">
 Reply: <your friendly response to keep the conversation going>"""
 
 def chat_with_llm(history_messages, user_text):
     if USE_REMOTE_OLLAMA:
         messages = [{"role": "system", "content": SYSTEM_PROMPT}]
         for m in (history_messages or []):
             if m.get("role") in ("user", "assistant") and m.get("content"):
                 messages.append({"role": m["role"], "content": m["content"]})
         messages.append({"role": "user", "content": user_text})
         resp = ollama.chat(model=LLM_MODEL, messages=messages)
         return resp["message"]["content"]
     else:
         # Simple prompt stitching for the fallback pipeline
         history_text = "\n".join(
             [f"User: {m['content']}" if m["role"]=="user" else f"Assistant: {m['content']}"
              for m in (history_messages or [])]
         )
         prompt = f"{SYSTEM_PROMPT}\n{history_text}\nUser: {user_text}\nAssistant:"
         out = gen(prompt)[0]["generated_text"]
         # Return only the new assistant chunk after the prompt
         return out.split("Assistant:", 1)[-1].strip()
 
 
 
 # ---- TTS ----
 def tts_silero(text: str) -> str:
     """
     Return path to a WAV file synthesized by Silero (CPU-friendly).
     Works across recent torch.hub return signatures.
     """
     import torch, tempfile
     import soundfile as sf
 
     # Newer torch.hub supports "trust_repo"; set to True or 'check'
     obj = torch.hub.load(
         repo_or_dir="snakers4/silero-models",
         model="silero_tts",
         language="en",
         speaker="v3_en",
         trust_repo=True  # or 'check' to be prompted the first time
     )
 
     # Handle both cases: either a single model, or a (model, something) tuple
     model = obj[0] if isinstance(obj, (list, tuple)) else obj
 
     sample_rate = 48000
     speaker = "en_0"  # valid default voice in v3_en pack
     audio = model.apply_tts(text=text, speaker=speaker, sample_rate=sample_rate)
 
     out_wav = tempfile.mktemp(suffix=".wav")
     sf.write(out_wav, audio, sample_rate)
     return out_wav
 
 
 def tts_coqui_xtts(text: str) -> str:
     """
     Returns path to a WAV file synthesized by Coqui XTTS v2 (higher quality; GPU-friendly).
     """
     from TTS.api import TTS
     tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
     out_wav = tempfile.mktemp(suffix=".wav")
     tts.tts_to_file(text=text, file_path=out_wav, speaker="female-en-5", language="en")
     return out_wav
 
 def text_to_speech(text: str) -> str:
     if USE_SILERO:
         return tts_silero(text)
     else:
         return tts_coqui_xtts(text)
 
 # ---- Gradio pipeline ----
 def pipeline(audio, history):
     # audio is (sample_rate, np.array) OR a filepath (depends on Gradio version)
     # Normalize to a temp wav file
     if audio is None:
         return history, None, "Please speak something."
 
     if isinstance(audio, tuple):
         # (sr, data) -> write wav
         import soundfile as sf, numpy as np, tempfile
         sr, data = audio
         tmp_in = tempfile.mktemp(suffix=".wav")
         sf.write(tmp_in, data.astype("float32"), sr)
         audio_path = tmp_in
     else:
         audio_path = audio  # path already
 
     user_text = speech_to_text(audio_path)
     if not user_text:
         return history, None, "Didn't catch that—could you repeat?"
 
     reply = chat_with_llm(history, user_text)
 
     # Extract the "Reply:" line for TTS; speak only the conversational reply
     speak_text = reply
     for tag in ["Reply:", "Correction:", "Why:"]:
         # Try to find "Reply:" block
         if "Reply:" in reply:
             speak_text = reply.split("Reply:", 1)[1].strip()
             break
 
     wav_path = text_to_speech(speak_text)
     updated = (history or []) + [
         {"role": "user", "content": user_text},
         {"role": "assistant", "content": reply},
     ]
     return updated, wav_path, ""
 
 with gr.Blocks(title="Voice Coach") as demo:
     gr.Markdown("## 🎙️ Interactive Voice Chat (with on-the-fly sentence correction)")
     with gr.Row():
         audio_in = gr.Audio(sources=["microphone"], type="filepath", label="Speak")
         audio_out = gr.Audio(label="Assistant (TTS)", autoplay=True)
     chatbox = gr.Chatbot(type="messages", height=300)
     status = gr.Markdown()
     btn = gr.Button("Send")
 
     # Use continuous recording or press "Send" after recording
     audio_in.change(pipeline, inputs=[audio_in, chatbox], outputs=[chatbox, audio_out, status])
     btn.click(pipeline, inputs=[audio_in, chatbox], outputs=[chatbox, audio_out, status])
 
 if __name__ == "__main__":
     demo.launch(share=True)