hbchiu commited on
Commit
5b34b4f
·
verified ·
1 Parent(s): 7721466

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -130
app.py CHANGED
@@ -1,184 +1,165 @@
1
  # app.py β€” CareCompanion: Alzheimer's Caregiver Voice Assistant
2
  #
3
- # Stack (all via API β€” no local model loading, fast startup):
4
- # STT: openai/whisper-large-v3 via HF Inference API
5
- # LANG: papluca/xlm-roberta-base-... via HF Inference API
6
- # LLM: openai/gpt-oss-20b + FAISS RAG via HF Inference API
7
- # TTS: facebook/mms-tts-* via HF Inference API (per language)
8
  #
9
  # Secrets needed in HF Space Settings:
10
- # HF_TOKEN β€” your Hugging Face access token (required)
11
 
12
  import os
13
- import time
14
  import faiss
15
  import pickle
16
- import tempfile
17
  import numpy as np
18
  import gradio as gr
 
 
 
19
 
20
- from huggingface_hub import InferenceClient
21
  from sentence_transformers import SentenceTransformer
 
 
 
22
 
23
  # ── Auth ───────────────────────────────────────────────────────
24
  HF_TOKEN = os.environ.get("HF_TOKEN")
25
  if not HF_TOKEN:
26
  raise ValueError("HF_TOKEN is not set. Add it in Space Settings β†’ Repository Secrets.")
27
 
28
- # Single shared API client β€” reused for all calls
29
- api_client = InferenceClient(token=HF_TOKEN)
 
 
30
 
31
- # ── RAG: FAISS + multilingual embeddings ───────────────────────
32
- print("Loading FAISS index and embedding model...")
33
  index = faiss.read_index("alzheimers_index.faiss")
34
  with open("chunks.pkl", "rb") as f:
35
  chunks = pickle.load(f)
36
 
37
- # Multilingual embeddings β€” handles English, Spanish, Catalan and 50+ others
38
- # Much better than all-MiniLM-L6-v2 for multilingual content
39
- embed_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
40
- print(f"Loaded {len(chunks)} chunks into RAG")
41
 
42
  def retrieve_rag_context(query, k=5):
43
- """Retrieve top-k relevant chunks from FAISS index."""
44
  query_embedding = embed_model.encode([query])
45
  distances, indices = index.search(np.array(query_embedding), k)
46
  results = []
47
  for i in indices[0]:
48
  chunk = chunks[i]
49
  print(f" RAG chunk: id={chunk.get('id')} topic={chunk.get('topic')} lang={chunk.get('language')}")
50
- print(f" Preview: {chunk['text'][:100]}")
51
  results.append(chunk["text"])
52
  return "\n\n---\n\n".join(results)
53
 
54
- # ── Language detection ─────────────────────────────────────────
55
- # Maps detected language codes to MMS TTS model names
56
- MMS_MODELS = {
57
- "en": "facebook/mms-tts-eng",
58
- "es": "facebook/mms-tts-spa",
59
- "ca": "facebook/mms-tts-cat", # Catalan
60
- "fr": "facebook/mms-tts-fra",
61
- "de": "facebook/mms-tts-deu",
62
- "it": "facebook/mms-tts-ita",
63
- "pt": "facebook/mms-tts-por",
64
- }
65
- DEFAULT_TTS_MODEL = "facebook/mms-tts-eng"
66
-
67
- def detect_language(text):
68
- """Detect language of text using xlm-roberta model."""
69
- try:
70
- result = api_client.text_classification(
71
- text,
72
- model="papluca/xlm-roberta-base-language-detection"
73
- )
74
- lang_code = result[0].label[:2].lower()
75
- print(f" Detected language: {lang_code} (confidence: {result[0].score:.2f})")
76
- return lang_code
77
- except Exception as e:
78
- print(f" Language detection failed: {e} β€” defaulting to English")
79
- return "en"
80
 
81
- # ── STT: Whisper via HF Inference API ─────────────────────────
82
  def transcribe_audio(audio_path):
83
- """Transcribe audio file using Whisper via HF API."""
84
  if audio_path is None:
85
- return "", "en"
86
-
87
- t0 = time.time()
88
- try:
89
- with open(audio_path, "rb") as f:
90
- result = api_client.automatic_speech_recognition(
91
- f,
92
- model="openai/whisper-large-v3"
93
- )
94
- transcript = result.text.strip()
95
- print(f"STT done in {time.time()-t0:.1f}s: '{transcript}'")
96
-
97
- # Detect language from what was spoken
98
- lang = detect_language(transcript) if transcript else "en"
99
- return transcript, lang
100
-
101
- except Exception as e:
102
- print(f"STT error: {e}")
103
- return "", "en"
104
-
105
- # ── TTS: Facebook MMS via HF Inference API ────────────────────
106
- def text_to_speech(text, language="en"):
107
- """Convert text to speech using Facebook MMS β€” proper per-language voices."""
 
 
 
 
 
 
 
 
 
108
  if not text:
109
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
- t0 = time.time()
112
- tts_model = MMS_MODELS.get(language, DEFAULT_TTS_MODEL)
113
- print(f"TTS using model: {tts_model}")
114
 
115
- try:
116
- audio_bytes = api_client.text_to_speech(
117
- text,
118
- model=tts_model
119
- )
120
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
121
- f.write(audio_bytes)
122
- print(f"TTS done in {time.time()-t0:.1f}s")
123
  return f.name
124
 
125
  except Exception as e:
126
- print(f"TTS error ({tts_model}): {e}")
127
- # Try fallback to English if language-specific model fails
128
- try:
129
- audio_bytes = api_client.text_to_speech(text, model=DEFAULT_TTS_MODEL)
130
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
131
- f.write(audio_bytes)
132
- return f.name
133
- except Exception as e2:
134
- print(f"TTS fallback also failed: {e2}")
135
- return None
136
 
137
  # ── LLM: HF Inference API + RAG ───────────────────────────────
138
  SYSTEM_PROMPT = """You are a warm, calm, and knowledgeable support assistant for caregivers of people with Alzheimer's disease.
139
 
140
  Your role is to:
141
  - Provide clear, compassionate guidance for caregiving challenges
142
- - Suggest relevant local support services when available in the retrieved context below
143
  - Give practical, actionable advice
144
- - Keep responses concise β€” under 100 words β€” so they are easy to listen to
145
  - Always be encouraging and non-judgmental
146
  - Respond in the same language the user wrote in
147
 
148
  If asked about local resources, ONLY reference services mentioned in the retrieved context. Do not invent services.
149
- If no relevant local services are found in the context, say so honestly.
150
  Always remind caregivers that asking for help is a sign of strength, not weakness."""
151
 
152
- def respond_to_message(message, history, detected_lang="en"):
153
- """Generate a response using RAG context + LLM."""
154
  if not message.strip():
155
  return ""
156
 
157
- t0 = time.time()
158
 
159
- # Retrieve relevant chunks from FAISS
160
  rag_context = retrieve_rag_context(message)
161
  full_system = (
162
  f"{SYSTEM_PROMPT}\n\n"
163
- f"User's language: {detected_lang}\n\n"
164
  f"=== RETRIEVED KNOWLEDGE BASE CONTEXT ===\n{rag_context}\n"
165
  f"========================================\n"
166
  f"Only use the above context for local resource recommendations."
167
  )
168
 
169
- # Build message history
170
  messages = [{"role": "system", "content": full_system}]
171
- for h in history[-6:]: # keep last 6 turns
172
  if isinstance(h, dict):
173
  messages.append({"role": h["role"], "content": h["content"]})
174
  messages.append({"role": "user", "content": message})
175
 
176
- # Stream response from LLM
177
  response = ""
178
  try:
179
- for chunk in api_client.chat_completion(
180
  messages,
181
- model="openai/gpt-oss-20b",
182
  max_tokens=350,
183
  stream=True,
184
  temperature=0.7,
@@ -186,53 +167,40 @@ def respond_to_message(message, history, detected_lang="en"):
186
  ):
187
  if chunk.choices and chunk.choices[0].delta.content:
188
  response += chunk.choices[0].delta.content
189
-
190
- print(f"LLM done in {time.time()-t0:.1f}s")
191
  return response.strip()
192
-
193
  except Exception as e:
194
  print(f"LLM error: {e}")
195
  return "I'm sorry, I had trouble generating a response. Please try again."
196
 
197
- # ── Voice pipeline: mic β†’ STT β†’ LLM+RAG β†’ TTS ────────────────
198
  def voice_pipeline(audio_input, history):
199
- t_start = time.time()
200
-
201
- transcript, lang = transcribe_audio(audio_input)
202
  if not transcript:
203
  return history, None, "⚠️ Could not transcribe audio. Please try again."
204
 
205
- reply = respond_to_message(transcript, history, lang)
206
 
207
  history = history or []
208
  history.append({"role": "user", "content": transcript})
209
  history.append({"role": "assistant", "content": reply})
210
 
211
- audio_out = text_to_speech(reply, language=lang)
 
212
 
213
- print(f"Total voice pipeline: {time.time()-t_start:.1f}s")
214
- return history, audio_out, f'"{transcript}" [{lang}]'
215
-
216
- # ── Text pipeline: text β†’ LLM+RAG β†’ TTS ──────────────────────
217
  def text_pipeline(text_input, history):
218
  if not text_input.strip():
219
  return history, None, ""
220
 
221
- t_start = time.time()
222
-
223
- lang = detect_language(text_input)
224
- reply = respond_to_message(text_input, history, lang)
225
 
226
  history = history or []
227
  history.append({"role": "user", "content": text_input})
228
  history.append({"role": "assistant", "content": reply})
229
 
230
- audio_out = text_to_speech(reply, language=lang)
231
-
232
- print(f"Total text pipeline: {time.time()-t_start:.1f}s")
233
  return history, audio_out, ""
234
 
235
- # ── Gradio UI ──────────────────────────────────────────────────
236
  with gr.Blocks(
237
  theme=gr.themes.Soft(
238
  primary_hue="green",
@@ -246,10 +214,9 @@ with gr.Blocks(
246
 
247
  gr.Markdown(
248
  """
249
- # Sherpa
250
- ### Smart Support for Alzheimer's Caregivers in Barcelona
251
- *Ask anything by voice or text β€” in English, Spanish, or Catalan.*
252
- *Responses draw from a curated local knowledge base.*
253
  """
254
  )
255
 
@@ -284,7 +251,7 @@ with gr.Blocks(
284
  gr.Markdown("---")
285
  gr.Markdown("### ⌨️ Text Input")
286
  text_input = gr.Textbox(
287
- placeholder="Or type your question here… (any language)",
288
  label="",
289
  lines=3,
290
  )
@@ -305,7 +272,7 @@ with gr.Blocks(
305
  """
306
  ---
307
  *Responses are AI-generated and do not replace professional medical advice.
308
- In emergencies, call 112 (EU) or your local emergency services.*
309
  """
310
  )
311
 
 
1
  # app.py β€” CareCompanion: Alzheimer's Caregiver Voice Assistant
2
  #
3
+ # Stack:
4
+ # STT: distil-whisper/distil-large-v3 (local, fast)
5
+ # LLM: openai/gpt-oss-20b + FAISS RAG (HF Inference API)
6
+ # TTS: parler-tts/parler-tts-mini-v1 (local, neutral American voice)
 
7
  #
8
  # Secrets needed in HF Space Settings:
9
+ # HF_TOKEN β€” your Hugging Face access token
10
 
11
  import os
 
12
  import faiss
13
  import pickle
 
14
  import numpy as np
15
  import gradio as gr
16
+ import torch
17
+ import scipy.io.wavfile
18
+ import tempfile
19
 
 
20
  from sentence_transformers import SentenceTransformer
21
+ from huggingface_hub import InferenceClient
22
+ from transformers import AutoTokenizer, pipeline
23
+ from parler_tts import ParlerTTSForConditionalGeneration
24
 
25
  # ── Auth ───────────────────────────────────────────────────────
26
  HF_TOKEN = os.environ.get("HF_TOKEN")
27
  if not HF_TOKEN:
28
  raise ValueError("HF_TOKEN is not set. Add it in Space Settings β†’ Repository Secrets.")
29
 
30
# ── Device ────────────────────────────────────────────────────
# Prefer GPU when available. fp16 halves memory on CUDA; fp32 on CPU,
# where half precision is slow or unsupported for many ops.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32
print(f"Running on: {device}")
34
 
35
# ── RAG: FAISS index ──────────────────────────────────────────
print("Loading FAISS index...")
index = faiss.read_index("alzheimers_index.faiss")  # prebuilt vector index shipped with the Space
with open("chunks.pkl", "rb") as f:
    # NOTE(review): pickle is only safe because this file ships with the repo;
    # never load a user-supplied pickle.
    chunks = pickle.load(f)

# The query embedder must match the model used to BUILD the index —
# TODO confirm alzheimers_index.faiss was built with all-MiniLM-L6-v2.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
print(f"Loaded {len(chunks)} chunks")
 
 
43
 
44
def retrieve_rag_context(query, k=5):
    """Return the top-k most relevant knowledge-base chunks for *query*.

    Encodes the query with the module-level sentence-transformer, searches
    the module-level FAISS index, and joins the matched chunk texts with a
    visible separator so the LLM can tell the chunks apart.
    """
    query_vec = np.array(embed_model.encode([query]))
    _, hit_ids = index.search(query_vec, k)

    texts = []
    for hit_id in hit_ids[0]:
        hit = chunks[hit_id]
        # Log which chunks were retrieved — useful for debugging RAG quality.
        print(f" RAG chunk: id={hit.get('id')} topic={hit.get('topic')} lang={hit.get('language')}")
        texts.append(hit["text"])
    return "\n\n---\n\n".join(texts)
53
 
54
# ── STT: Distil-Whisper ───────────────────────────────────────
# Loaded once at startup; reused by transcribe_audio() for every request.
print("Loading Whisper STT model...")
stt_pipe = pipeline(
    "automatic-speech-recognition",
    model="distil-whisper/distil-large-v3",
    torch_dtype=torch_dtype,
    device=device,
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
 
63
def transcribe_audio(audio_path):
    """Transcribe an audio file with the local Distil-Whisper pipeline.

    Args:
        audio_path: Path to the recorded audio file, or None when the user
            submitted without recording anything.

    Returns:
        The stripped transcript, or "" when there is no audio or speech
        recognition fails (callers treat "" as "could not transcribe").
    """
    if audio_path is None:
        return ""
    try:
        result = stt_pipe(
            audio_path,
            generate_kwargs={"task": "transcribe"},
            return_timestamps=False,
        )
    except Exception as e:
        # Don't let a bad/corrupt recording crash the Gradio callback —
        # voice_pipeline already shows a "could not transcribe" message for "".
        print(f"STT error: {e}")
        return ""
    transcript = result["text"].strip()
    print(f"Transcript: '{transcript}'")
    return transcript
74
+
75
# ── TTS: Parler TTS mini v1 (neutral American voice) ─────────
# Using base mini-v1 model — NOT jenny (which is Irish)
# Laura is a warm, calm American speaker in this model
print("Loading Parler TTS model...")
TTS_REPO = "parler-tts/parler-tts-mini-v1"

tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
    TTS_REPO,
    torch_dtype=torch_dtype,  # fp16 on CUDA; output is cast back to fp32 before the WAV write
    low_cpu_mem_usage=True,
).to(device)

tts_tokenizer = AutoTokenizer.from_pretrained(TTS_REPO)
# Output sample rate comes from the model's audio codec configuration.
sampling_rate = tts_model.audio_encoder.config.sampling_rate

# Natural-language "voice prompt": Parler conditions speaker identity and
# speaking style on this description text at generation time.
VOICE_DESCRIPTION = (
    "Laura speaks with a warm, calm and empathetic American accent. "
    "She speaks clearly at a gentle, measured pace, like a caring nurse. "
    "The audio is very clean with no background noise."
)
95
+
96
def text_to_speech(text):
    """Synthesize *text* into a WAV file with Parler-TTS.

    Returns the path of a temporary .wav file, or None when *text* is empty
    or synthesis fails (the chat text is still shown without audio).
    """
    if not text:
        return None
    try:
        # Tokenize the fixed voice description and the reply text separately:
        # Parler conditions the speaker on the former and speaks the latter.
        desc_ids = tts_tokenizer(VOICE_DESCRIPTION, return_tensors="pt").input_ids.to(device)
        text_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(device)

        with torch.no_grad():
            waveform = tts_model.generate(
                input_ids=desc_ids,
                prompt_input_ids=text_ids,
            )

        # The model may run in float16; scipy needs float32 to write the WAV.
        samples = waveform.cpu().to(torch.float32).numpy().squeeze()

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out:
            scipy.io.wavfile.write(out.name, rate=sampling_rate, data=samples)
        return out.name

    except Exception as e:
        print(f"TTS error: {e}")
        return None  # best-effort: skip the audio, the text response still shows
 
 
 
 
 
 
 
 
123
 
124
  # ── LLM: HF Inference API + RAG ───────────────────────────────
125
  SYSTEM_PROMPT = """You are a warm, calm, and knowledgeable support assistant for caregivers of people with Alzheimer's disease.
126
 
127
  Your role is to:
128
  - Provide clear, compassionate guidance for caregiving challenges
129
+ - Suggest relevant local support services when available in the retrieved context
130
  - Give practical, actionable advice
131
+ - Keep responses concise β€” under 120 words β€” so they are easy to listen to
132
  - Always be encouraging and non-judgmental
133
  - Respond in the same language the user wrote in
134
 
135
  If asked about local resources, ONLY reference services mentioned in the retrieved context. Do not invent services.
136
+ If no relevant local services are in the context, say so honestly.
137
  Always remind caregivers that asking for help is a sign of strength, not weakness."""
138
 
139
+ def respond_to_message(message, history):
 
140
  if not message.strip():
141
  return ""
142
 
143
+ client = InferenceClient(token=HF_TOKEN, model="openai/gpt-oss-20b")
144
 
 
145
  rag_context = retrieve_rag_context(message)
146
  full_system = (
147
  f"{SYSTEM_PROMPT}\n\n"
 
148
  f"=== RETRIEVED KNOWLEDGE BASE CONTEXT ===\n{rag_context}\n"
149
  f"========================================\n"
150
  f"Only use the above context for local resource recommendations."
151
  )
152
 
 
153
  messages = [{"role": "system", "content": full_system}]
154
+ for h in history[-6:]:
155
  if isinstance(h, dict):
156
  messages.append({"role": h["role"], "content": h["content"]})
157
  messages.append({"role": "user", "content": message})
158
 
 
159
  response = ""
160
  try:
161
+ for chunk in client.chat_completion(
162
  messages,
 
163
  max_tokens=350,
164
  stream=True,
165
  temperature=0.7,
 
167
  ):
168
  if chunk.choices and chunk.choices[0].delta.content:
169
  response += chunk.choices[0].delta.content
 
 
170
  return response.strip()
 
171
  except Exception as e:
172
  print(f"LLM error: {e}")
173
  return "I'm sorry, I had trouble generating a response. Please try again."
174
 
175
+ # ── Pipelines ─────────────────────────────────────────────────
176
# ── Pipelines ─────────────────────────────────────────────────
def voice_pipeline(audio_input, history):
    """Mic → STT → LLM+RAG → TTS round trip for the voice input.

    Args:
        audio_input: Path to the recorded audio (or None).
        history: Chat history as a list of {"role", "content"} dicts (or None).

    Returns:
        (updated history, reply audio path or None, status/transcript text).
    """
    # Normalize BEFORE calling respond_to_message — it slices history[-6:],
    # which raises TypeError on the first turn when Gradio passes None.
    history = history or []

    transcript = transcribe_audio(audio_input)
    if not transcript:
        return history, None, "⚠️ Could not transcribe audio. Please try again."

    reply = respond_to_message(transcript, history)

    history.append({"role": "user", "content": transcript})
    history.append({"role": "assistant", "content": reply})

    audio_out = text_to_speech(reply)
    return history, audio_out, f'"{transcript}"'
189
 
 
 
 
 
190
def text_pipeline(text_input, history):
    """Typed text → LLM+RAG → TTS for the text input box.

    Args:
        text_input: The user's typed question.
        history: Chat history as a list of {"role", "content"} dicts (or None).

    Returns:
        (updated history, reply audio path or None, "" to clear the textbox).
    """
    if not text_input.strip():
        return history, None, ""

    # Normalize BEFORE calling respond_to_message — it slices history[-6:],
    # which raises TypeError on the first turn when Gradio passes None.
    history = history or []
    reply = respond_to_message(text_input, history)

    history.append({"role": "user", "content": text_input})
    history.append({"role": "assistant", "content": reply})

    audio_out = text_to_speech(reply)
    return history, audio_out, ""
202
 
203
+ # ── Gradio UI ─────────────────────────────────────────────────
204
  with gr.Blocks(
205
  theme=gr.themes.Soft(
206
  primary_hue="green",
 
214
 
215
  gr.Markdown(
216
  """
217
+ # 🀍 CareCompanion
218
+ ### Alzheimer's Caregiver Support Assistant
219
+ *Ask anything β€” by voice or text. Responses draw from a curated Alzheimer's knowledge base.*
 
220
  """
221
  )
222
 
 
251
  gr.Markdown("---")
252
  gr.Markdown("### ⌨️ Text Input")
253
  text_input = gr.Textbox(
254
+ placeholder="Or type your question here…",
255
  label="",
256
  lines=3,
257
  )
 
272
  """
273
  ---
274
  *Responses are AI-generated and do not replace professional medical advice.
275
+ In emergencies, call 112 or your local emergency services.*
276
  """
277
  )
278