Spaces:

AgentsWithoutBorders
/

SherpaAI

Sleeping

App Files Files Community

Jatila committed on Feb 15

Commit

23c8db3

verified ·

1 Parent(s): e2c032e

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -97

app.py CHANGED Viewed

@@ -58,6 +58,15 @@ def retrieve_rag_context(query, k=5):
         results.append(chunk["text"])
     return "\n\n---\n\n".join(results)
 # ── STT: Distil-Whisper ───────────────────────────────────────
 print("Loading Whisper STT model...")
 stt_pipe = pipeline(
@@ -70,15 +79,21 @@ stt_pipe = pipeline(
 def transcribe_audio(audio_path):
     if audio_path is None:
         return ""
-    result = stt_pipe(
-        audio_path,
-        generate_kwargs={"task": "transcribe"},
-        return_timestamps=False,
-    )
     transcript = result["text"].strip()
-    print(f"Transcript: '{transcript}'")
     return transcript
 # ── TTS: Parler TTS mini v1 (neutral American voice) ─────────
 print("Loading Parler TTS model...")
 TTS_REPO = "parler-tts/parler-tts-mini-v1"
@@ -98,34 +113,31 @@ VOICE_DESCRIPTION = (
     "The audio is very clean with no background noise."
 )
-def text_to_speech(text):
     if not text:
         return None
     try:
-        input_ids = tts_tokenizer(
-            VOICE_DESCRIPTION, return_tensors="pt"
-        ).input_ids.to(device)
-        prompt_ids = tts_tokenizer(
-            text, return_tensors="pt"
-        ).input_ids.to(device)
         with torch.no_grad():
-            generation = tts_model.generate(
-                input_ids=input_ids,
-                prompt_input_ids=prompt_ids,
-            )
-        # ← float16 fix: convert to float32 before writing WAV
         audio_array = generation.cpu().to(torch.float32).numpy().squeeze()
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             scipy.io.wavfile.write(f.name, rate=sampling_rate, data=audio_array)
             return f.name
     except Exception as e:
         print(f"TTS error: {e}")
-        return None  # silently skip audio, text response still shows
 # ── LLM: HF Inference API + RAG ───────────────────────────────
 SYSTEM_PROMPT = """You are a warm, calm, and knowledgeable support assistant for caregivers of people with Alzheimer's disease.
@@ -140,19 +152,14 @@ If asked about local resources, ONLY reference services mentioned in the retriev
 If no relevant local services are in the context, say so honestly.
 Always remind caregivers that asking for help is a sign of strength, not weakness."""
-def respond_to_message(message, history):
     if not message.strip():
         return ""
     client = InferenceClient(token=HF_TOKEN, model="openai/gpt-oss-20b")
     rag_context = retrieve_rag_context(message)
-    full_system = (
-        f"{SYSTEM_PROMPT}\n\n"
-        f"=== RETRIEVED KNOWLEDGE BASE CONTEXT ===\n{rag_context}\n"
-        f"========================================\n"
-        f"Only use the above context for local resource recommendations."
-    )
     messages = [{"role": "system", "content": full_system}]
     for h in history[-6:]:
@@ -174,102 +181,65 @@ def respond_to_message(message, history):
         return response.strip()
     except Exception as e:
         print(f"LLM error: {e}")
-        return "I'm sorry, I had trouble generating a response. Please try again."
 # ── Pipelines ─────────────────────────────────────────────────
 def voice_pipeline(audio_input, history):
     transcript = transcribe_audio(audio_input)
     if not transcript:
-        return history, None, "⚠️ Could not transcribe audio. Please try again."
-    reply = respond_to_message(transcript, history)
     history = history or []
     history.append({"role": "user", "content": transcript})
     history.append({"role": "assistant", "content": reply})
-    audio_out = text_to_speech(reply)
-    return history, audio_out, f'"{transcript}"'
-def text_pipeline(text_input, history):
     if not text_input.strip():
         return history, None, ""
-    reply = respond_to_message(text_input, history)
     history = history or []
     history.append({"role": "user", "content": text_input})
     history.append({"role": "assistant", "content": reply})
-    audio_out = text_to_speech(reply)
     return history, audio_out, ""
 # ── Gradio UI ─────────────────────────────────────────────────
-with gr.Blocks(
-    theme=gr.themes.Soft(
-        primary_hue="green",
-        neutral_hue="slate",
-        font=gr.themes.GoogleFont("DM Sans"),
-    ),
-    title="CareCompanion",
-) as demo:
     chat_history = gr.State([])
-    gr.Markdown(
-        """
-        # SherpaAI
-        ### Smart support for AD caregivers in Barcelona
-                """
-    )
     with gr.Row():
         with gr.Column(scale=2):
-            chatbot = gr.Chatbot(
-                label="Conversation",
-                height=420,
-                type="messages",
-                show_label=False,
-                bubble_full_width=False,
-            )
-            audio_output = gr.Audio(
-                label="🔊 Voice Response",
-                autoplay=True,
-                show_download_button=False,
-            )
         with gr.Column(scale=1):
             gr.Markdown("### 🎤 Voice Input")
-            audio_input = gr.Audio(
-                sources=["microphone"],
-                type="filepath",
-                label="Record your question",
-            )
-            voice_btn = gr.Button(
-                "🎤 Send Voice Message",
-                variant="primary",
-                size="lg",
-            )
-            gr.Markdown("---")
-            gr.Markdown("### ⌨️ Text Input")
-            text_input = gr.Textbox(
-                placeholder="Or type your question here…",
-                label="",
-                lines=3,
-            )
-            text_btn = gr.Button(
-                "➤ Send Text Message",
-                variant="secondary",
-                size="lg",
-            )
-            transcript_display = gr.Textbox(
-                label="📝 What I heard",
-                interactive=False,
-                lines=2,
-                placeholder="Your transcribed speech will appear here…",
-            )
     gr.Markdown(
         """

         results.append(chunk["text"])
     return "\n\n---\n\n".join(results)
+# ── SYSTEM PROMPTS ─────────────────────────────
def get_system_prompt(lang="Español"):
    """Return the system prompt matching the selected interface language.

    Args:
        lang: Language label. ``"Català"`` selects the Catalan prompt; every
            other value (including the default) selects the Spanish prompt.

    Returns:
        The prompt text as a two-line string.
    """
    catalan_prompt = """Ets un assistent càlid i empàtic per a cuidadors de persones amb Alzheimer a Barcelona.
Proporciona orientació clara, menciona serveis locals si existeixen en el context i mantén les respostes breus i comprensibles."""
    spanish_prompt = """Eres un asistente cálido y empático para cuidadores de personas con Alzheimer en Barcelona.
Proporciona orientación clara, menciona recursos locales si existen en el contexto y mantén las respuestas breves y comprensibles."""
    # Spanish is the fallback for any label other than the Catalan one.
    return catalan_prompt if lang == "Català" else spanish_prompt
 # ── STT: Distil-Whisper ───────────────────────────────────────
 print("Loading Whisper STT model...")
 stt_pipe = pipeline(
 def transcribe_audio(audio_path):
     """Transcribe an audio file to text with the module-level STT pipeline.

     Args:
         audio_path: Audio input handed to ``stt_pipe``, or None when
             nothing was recorded.

     Returns:
         The transcript with surrounding whitespace stripped, or "" when
         ``audio_path`` is None.
     """
     if audio_path is not None:
         stt_output = stt_pipe(
             audio_path,
             generate_kwargs={"task": "transcribe"},
             return_timestamps=False,
         )
         return stt_output["text"].strip()
     # Nothing was recorded, so there is nothing to transcribe.
     return ""
def detect_language(text):
    """Map the detected language of *text* to a UI language label.

    Args:
        text: Text to classify (the voice pipeline passes the transcript).

    Returns:
        "Català" when the detector reports "ca"; "Español" otherwise,
        including for blank input and when detection raises.
    """
    # Blank input carries no signal: skip the detector and use the default.
    if not text or not text.strip():
        return "Español"
    try:
        code = detect(text)
    except Exception as e:
        # Detection is best-effort, so log and fall back to Spanish.
        # Catching Exception instead of using a bare `except` lets
        # KeyboardInterrupt and SystemExit propagate.
        print(f"Language detection error: {e}")
        return "Español"
    return "Català" if code == "ca" else "Español"
 # ── TTS: Parler TTS mini v1 (neutral American voice) ─────────
 print("Loading Parler TTS model...")
 TTS_REPO = "parler-tts/parler-tts-mini-v1"
     "The audio is very clean with no background noise."
 )
def text_to_speech(text, lang="Español"):
    """Synthesize *text* to a WAV file and return the file's path.

    Args:
        text: Text to speak.
        lang: Language label of the reply. Accepted so callers can pass it,
            but not used to select a voice or model.

    Returns:
        Path of the generated ``.wav`` file, or None when ``text`` is empty
        or synthesis fails (the error is printed, not raised).
    """
    if not text:
        return None
    try:
        # Reuse the model loaded once at startup instead of calling
        # from_pretrained() on every request.
        # NOTE(review): relies on module-level tts_model / tts_tokenizer /
        # sampling_rate / VOICE_DESCRIPTION — confirm the startup loading
        # code still defines them.
        # NOTE(review): the section header describes this model as a neutral
        # American voice — confirm quality for Spanish/Catalan text or point
        # TTS_REPO at a multilingual checkpoint.
        description_ids = tts_tokenizer(
            VOICE_DESCRIPTION, return_tensors="pt"
        ).input_ids.to(device)
        prompt_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(device)
        with torch.no_grad():
            generation = tts_model.generate(
                input_ids=description_ids,
                prompt_input_ids=prompt_ids,
            )
        # Cast to float32 before writing the WAV file.
        audio_array = generation.cpu().to(torch.float32).numpy().squeeze()
        # Reserve a path, then close the handle before scipy reopens the file
        # by name. delete=False keeps the file for the caller to use.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            wav_path = f.name
        scipy.io.wavfile.write(wav_path, rate=sampling_rate, data=audio_array)
        return wav_path
    except Exception as e:
        # Audio is optional: report the failure and return None.
        print(f"TTS error: {e}")
        return None
 # ── LLM: HF Inference API + RAG ───────────────────────────────
 SYSTEM_PROMPT = """You are a warm, calm, and knowledgeable support assistant for caregivers of people with Alzheimer's disease.
 If no relevant local services are in the context, say so honestly.
 Always remind caregivers that asking for help is a sign of strength, not weakness."""
+def respond_to_message(message, history, lang="Español"):
     if not message.strip():
         return ""
     client = InferenceClient(token=HF_TOKEN, model="openai/gpt-oss-20b")
     rag_context = retrieve_rag_context(message)
+    full_system = f"{get_system_prompt(lang)}\n\n=== RETRIEVED CONTEXT ===\n{rag_context}"
     messages = [{"role": "system", "content": full_system}]
     for h in history[-6:]:
         return response.strip()
     except Exception as e:
         print(f"LLM error: {e}")
+        return "Ho sento, no puc generar una resposta en aquest moment." if lang=="Català" else "Lo siento, no puedo generar una respuesta en este momento."
 # ── Pipelines ─────────────────────────────────────────────────
 def voice_pipeline(audio_input, history):
     """Run one voice turn: transcribe, detect language, answer, synthesize.

     Args:
         audio_input: Recorded audio passed to ``transcribe_audio``.
         history: List of ``{"role", "content"}`` message dicts, or None.

     Returns:
         A ``(history, audio_out, text)`` tuple. On success ``history`` has
         the user transcript and the assistant reply appended, ``audio_out``
         is whatever ``text_to_speech`` returned, and ``text`` is the
         transcript. When nothing could be transcribed, ``history`` is
         returned untouched with ``None`` audio and a warning message.
     """
     transcript = transcribe_audio(audio_input)
     if not transcript:
         return history, None, "⚠️ Could not transcribe audio."
     # Normalize before the first use so respond_to_message never gets None.
     history = history or []
     lang = detect_language(transcript)
     reply = respond_to_message(transcript, history, lang=lang)
     history.append({"role": "user", "content": transcript})
     history.append({"role": "assistant", "content": reply})
     audio_out = text_to_speech(reply, lang=lang)
     return history, audio_out, transcript
def text_pipeline(text_input, history, lang="Español"):
    """Run one typed turn: answer the message and synthesize the reply.

    Args:
        text_input: Message typed by the user; may be empty or None.
        history: List of ``{"role", "content"}`` message dicts, or None.
        lang: Language label forwarded to ``respond_to_message`` and
            ``text_to_speech``. Defaults to "Español", matching those
            functions' own defaults.

    Returns:
        A ``(history, audio_out, "")`` tuple. The trailing empty string
        clears the text box that the click handler lists as its third
        output. For blank input, ``history`` is returned untouched with
        ``None`` audio.
    """
    # Treat None like an empty message instead of failing on .strip().
    if not text_input or not text_input.strip():
        return history, None, ""
    # Normalize before the first use so respond_to_message never gets None.
    history = history or []
    reply = respond_to_message(text_input, history, lang=lang)
    history.append({"role": "user", "content": text_input})
    history.append({"role": "assistant", "content": reply})
    audio_out = text_to_speech(reply, lang=lang)
    return history, audio_out, ""
 # ── Gradio UI ─────────────────────────────────────────────────
+with gr.Blocks() as demo:
     chat_history = gr.State([])
+    gr.Markdown("## SherpaAI — Suport intel·ligent per a cuidadors d’Alzheimer a Barcelona")
     with gr.Row():
         with gr.Column(scale=2):
+            chatbot = gr.Chatbot(label="Conversation", height=420)
+            audio_output = gr.Audio(label="🔊 Voice Response", autoplay=True)
         with gr.Column(scale=1):
+            lang_selector = gr.Dropdown(["Español", "Català"], label="Language", value="Español")
+            text_input = gr.Textbox(placeholder="Escriu la teva pregunta aquí…", lines=3)
+            text_btn = gr.Button("Enviar / Send")
             gr.Markdown("### 🎤 Voice Input")
+            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Record your question")
+            voice_btn = gr.Button("🎤 Send Voice Message")
+    text_btn.click(
+        fn=text_pipeline,
+        inputs=[text_input, chat_history, lang_selector],
+        outputs=[chat_history, audio_output, text_input],
+    )
+    voice_btn.click(
+        fn=voice_pipeline,
+        inputs=[audio_input, chat_history],
+        outputs=[chat_history, audio_output, text_input],
+    )
     gr.Markdown(
         """