Spaces:

AgentsWithoutBorders
/

SherpaAI

Sleeping

App Files Community

Jatila committed on Feb 15

Commit

b79bf8c

verified ·

1 Parent(s): f6a9546

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -51

app.py CHANGED Viewed

@@ -113,31 +113,42 @@ VOICE_DESCRIPTION = (
     "The audio is very clean with no background noise."
 )
-def text_to_speech(text, lang="Español"):
     if not text:
         return None
-    try:
-        # Spanish-capable TTS, adjust for Catalan if a model exists
-        model_repo = "tts_models/es/tacotron2-DDC"
-        tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
-            model_repo, torch_dtype=torch_dtype
-        ).to(device)
-        tts_tokenizer = AutoTokenizer.from_pretrained(model_repo)
-        sampling_rate = 22050
-        input_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(device)
         with torch.no_grad():
-            generation = tts_model.generate(input_ids=input_ids)
         audio_array = generation.cpu().to(torch.float32).numpy().squeeze()
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             scipy.io.wavfile.write(f.name, rate=sampling_rate, data=audio_array)
             return f.name
     except Exception as e:
         print(f"TTS error: {e}")
         return None
 # ── LLM: HF Inference API + RAG ───────────────────────────────
 SYSTEM_PROMPT = """You are a warm, calm, and knowledgeable support assistant for caregivers of people with Alzheimer's disease.
@@ -185,31 +196,40 @@ def respond_to_message(message, history, lang="Español"):
 # ── Pipelines ─────────────────────────────────────────────────
-def voice_pipeline(audio_input, history):
     transcript = transcribe_audio(audio_input)
     if not transcript:
-        return history, None, "⚠️ Could not transcribe audio."
-    lang = detect_language(transcript)
-    reply = respond_to_message(transcript, history, lang=lang)
     history = history or []
     history.append({"role": "user", "content": transcript})
     history.append({"role": "assistant", "content": reply})
-    audio_out = text_to_speech(reply, lang=lang)
-    return history, audio_out, transcript
-def text_pipeline(text_input, history, lang):
     if not text_input.strip():
         return history, None, ""
-    reply = respond_to_message(text_input, history, lang=lang)
     history = history or []
     history.append({"role": "user", "content": text_input})
     history.append({"role": "assistant", "content": reply})
-    audio_out = text_to_speech(reply, lang=lang)
     return history, audio_out, ""
 # ── Gradio UI ─────────────────────────────────────────────────
 with gr.Blocks(
     theme=gr.themes.Soft(
@@ -299,39 +319,46 @@ with gr.Blocks(
     )
     # Update chatbot function (dummy, required for Gradio workflow)
-    def update_chatbot(history):
-        return history
-    # Button callbacks
-    voice_btn.click(
-        fn=voice_pipeline,
-        inputs=[audio_input, chat_history, lang_selector],
-        outputs=[chat_history, audio_output, transcript_display],
-    ).then(
-        fn=update_chatbot,
-        inputs=[chat_history],
-        outputs=[chatbot],
-    )
-    text_btn.click(
-        fn=text_pipeline,
-        inputs=[text_input, chat_history, lang_selector],
-        outputs=[chat_history, audio_output, transcript_display],
-    ).then(
-        fn=update_chatbot,
-        inputs=[chat_history],
-        outputs=[chatbot],
-    )
-    text_input.submit(
-        fn=text_pipeline,
-        inputs=[text_input, chat_history, lang_selector],
-        outputs=[chat_history, audio_output, transcript_display],
-    ).then(
-        fn=update_chatbot,
-        inputs=[chat_history],
-        outputs=[chatbot],
-    )
 if __name__ == "__main__":
     demo.launch()

     "The audio is very clean with no background noise."
 )
+def text_to_speech(text, lang="es"):
     if not text:
         return None
+    # Choose voice description per language
+    if lang == "ca":
+        voice_desc = (
+            "Clara speaks Catalan with a calm, clear, empathetic voice. "
+            "She speaks slowly, like a caring nurse."
+        )
+    else:  # default Spanish
+        voice_desc = (
+            "Laura speaks Spanish with a warm, clear, empathetic voice. "
+            "She speaks slowly, like a caring nurse."
+        )
+    try:
+        input_ids = tts_tokenizer(voice_desc, return_tensors="pt").input_ids.to(device)
+        prompt_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(device)
         with torch.no_grad():
+            generation = tts_model.generate(
+                input_ids=input_ids,
+                prompt_input_ids=prompt_ids,
+            )
         audio_array = generation.cpu().to(torch.float32).numpy().squeeze()
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             scipy.io.wavfile.write(f.name, rate=sampling_rate, data=audio_array)
             return f.name
     except Exception as e:
         print(f"TTS error: {e}")
         return None
 # ── LLM: HF Inference API + RAG ───────────────────────────────
 SYSTEM_PROMPT = """You are a warm, calm, and knowledgeable support assistant for caregivers of people with Alzheimer's disease.
 # ── Pipelines ─────────────────────────────────────────────────
+# ── Voice Pipeline with Language Support ─────────────────────────
+def voice_pipeline(audio_input, history, tts_lang):
+    # Transcribe audio
     transcript = transcribe_audio(audio_input)
     if not transcript:
+        return history, None, "⚠️ Could not transcribe audio. Please try again."
+    # Generate response from LLM + RAG
+    reply = respond_to_message(transcript, history)
+    # Update chat history
     history = history or []
     history.append({"role": "user", "content": transcript})
     history.append({"role": "assistant", "content": reply})
+    # Convert to speech
+    audio_out = text_to_speech(reply, tts_lang)
+    return history, audio_out, f'"{transcript}"'
+# ── Text Pipeline with Language Support ─────────────────────────
+def text_pipeline(text_input, history, tts_lang):
     if not text_input.strip():
         return history, None, ""
+    reply = respond_to_message(text_input, history)
     history = history or []
     history.append({"role": "user", "content": text_input})
     history.append({"role": "assistant", "content": reply})
+    audio_out = text_to_speech(reply, tts_lang)
     return history, audio_out, ""
 # ── Gradio UI ─────────────────────────────────────────────────
 with gr.Blocks(
     theme=gr.themes.Soft(
     )
     # Update chatbot function (dummy, required for Gradio workflow)
+# Helper to refresh chatbot UI
+def update_chatbot(history):
+    return history
+# 🎤 Voice button click
+voice_btn.click(
+    fn=voice_pipeline,
+    inputs=[audio_input, chat_history, lang_selector],
+    outputs=[chat_history, audio_output, transcript_display],
+).then(
+    fn=update_chatbot,
+    inputs=[chat_history],
+    outputs=[chatbot],
+)
+# ⌨️ Text button click
+text_btn.click(
+    fn=text_pipeline,
+    inputs=[text_input, chat_history, lang_selector],
+    outputs=[chat_history, audio_output, transcript_display],
+).then(
+    fn=update_chatbot,
+    inputs=[chat_history],
+    outputs=[chatbot],
+)
+# ⌨️ Press Enter to send text
+text_input.submit(
+    fn=text_pipeline,
+    inputs=[text_input, chat_history, lang_selector],
+    outputs=[chat_history, audio_output, transcript_display],
+).then(
+    fn=update_chatbot,
+    inputs=[chat_history],
+    outputs=[chatbot],
+)
 if __name__ == "__main__":
     demo.launch()