Update app.py
app.py
CHANGED
(Removed side of the diff; most deleted lines did not survive extraction. The recoverable deletions, by hunk, are:)

@@ -1,28 +1,44 @@
-import torch
-import os
-import re
@@ -31,6 +47,18 @@ TTS_ONNX_MODEL_PATH = "swahili_tts.onnx"
@@ -48,125 +76,224 @@ class WeeboAssistant:
-self.llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
-inputs = {k: v.to(self.device) for k, v in inputs.items()}
-write_wav(output_path, self.TTS_SAMPLE_RATE, ...
-eos_token_id=terminators,
@@ -174,7 +301,7 @@ def s2s_pipeline(audio_input, chat_history):
@@ -187,10 +314,11 @@ def clear_textbox():
@@ -201,7 +329,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
@@ -225,46 +353,48 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
-queue=True

(The new version of app.py follows, with unchanged regions collapsed as in the diff view.)
# -*- coding: utf-8 -*-
"""
Fixed and self-contained Swahili multimodal assistant for Hugging Face Spaces.

Key fixes / improvements over the original:
- Robust loading of an LLM repo that may lack `model_type` in config.json:
  the model object is loaded directly, with `trust_remote_code=True` as a
  fallback. This avoids `pipeline(...)` raising a ValueError from AutoConfig.
- Correct handling of `pipeline(..., device=...)`, which expects an int GPU
  index or -1 for CPU (previously a string like "cpu" was passed).
- Streaming generation via `model.generate(..., streamer=TextIteratorStreamer(...))`,
  run in a background thread so the main thread can iterate over the streamer.
- Use of the standard HF env var `HF_TOKEN`, with a graceful error message if
  it is not set.
- Minor robustness improvements (audio resampling, mono/stereo handling,
  temp filenames, etc.).

Drop this file into your Space and replace the old app.py contents.
"""

import os
import re
import threading
import numpy as np
import gradio as gr
import librosa
import torch
from scipy.io.wavfile import write as write_wav
from huggingface_hub import login
import onnxruntime

from transformers import (
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    pipeline,
    TextIteratorStreamer,
)

# -------------------- Configuration --------------------
STT_MODEL_ID = "EYEDOL/SALAMA_C3"
LLM_MODEL_ID = "EYEDOL/Llama-3.2-1B_ON_ALPACA5"
TTS_TOKENIZER_ID = "facebook/mms-tts-swh"
TTS_ONNX_MODEL_PATH = "swahili_tts.onnx"

TEMP_DIR = "temp"
os.makedirs(TEMP_DIR, exist_ok=True)

# Use the standard environment variable name used by Spaces
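# ("hugface" is kept as a fallback; it appears to be the secret name the original Space used)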
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("hugface")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN not found. Please set it in the Hugging Face Space repository secrets.")

# Attempt to log in to the HF Hub (Spaces usually injects the token already, but this keeps parity)
try:
    login(token=HF_TOKEN)
    print("Successfully logged into Hugging Face Hub!")
except Exception as e:
    print("Warning: could not call huggingface_hub.login(). Proceeding; make sure the token in your environment is valid. Error:", e)


class WeeboAssistant:
    def __init__(self):
        # ... (lines 65-75 unchanged and collapsed in the diff view; presumably
        # self.SYSTEM_PROMPT, the sample-rate constants, and self.device selection) ...
        self.torch_dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
        print(f"Using device: {self.device}")

        # ---------------- STT ----------------
        print(f"Loading STT model: {STT_MODEL_ID}")
        self.stt_processor = AutoProcessor.from_pretrained(STT_MODEL_ID)
        # Speech seq2seq model (e.g. Whisper-like)
        self.stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            STT_MODEL_ID,
            torch_dtype=self.torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
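        # from_pretrained leaves the model on CPU unless device_map is used,
        # so move it to the GPU explicitly when one is available.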
        if self.device == "cuda":
            try:
                self.stt_model = self.stt_model.to("cuda")
            except Exception:
                pass
        print("STT model loaded successfully.")

        # ---------------- LLM ----------------
        print(f"Loading LLM: {LLM_MODEL_ID}")
        self.llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID, use_fast=True)

        # Attempt robust loading. If the repo lacks a model_type in config.json,
        # retry with trust_remote_code=True (which allows custom model code in the repo).
        try:
            config = AutoConfig.from_pretrained(LLM_MODEL_ID)
            # If the config loaded but lacks model_type, force the trusted-load path
            if not getattr(config, "model_type", None):
                raise ValueError("config missing model_type - forcing trusted load")

            # Load into a causal LM class (works for most standard model types)
            self.llm_model = AutoModelForCausalLM.from_pretrained(
                LLM_MODEL_ID,
                config=config,
                torch_dtype=self.torch_dtype,
                low_cpu_mem_usage=True,
            )
        except Exception as first_err:
            print("Standard AutoConfig/AutoModel load failed or model_type missing. Trying trust_remote_code=True. Error:", first_err)
            # trust_remote_code imports repo-specific model code if the repo ships any
            try:
                config = AutoConfig.from_pretrained(LLM_MODEL_ID, trust_remote_code=True)
                self.llm_model = AutoModelForCausalLM.from_pretrained(
                    LLM_MODEL_ID,
                    config=config,
                    torch_dtype=self.torch_dtype,
                    trust_remote_code=True,
                    low_cpu_mem_usage=True,
                    device_map="auto" if torch.cuda.is_available() else None,
                )
            except Exception as second_err:
                # No further fallback; surface a clear error
                print("Fallback load also failed:", second_err)
                raise RuntimeError(
                    "Unable to load the LLM. Check the model repo: config.json must contain a model_type, or trust_remote_code must be allowed."
                )

        # If device_map wasn't used, make sure the model actually sits on the CPU
        if self.device == "cpu":
            try:
                self.llm_model = self.llm_model.to("cpu")
            except Exception:
                pass

        # Convenience pipeline for non-streaming calls (`device` expects an int index or -1)
        device_index = 0 if torch.cuda.is_available() else -1
        try:
            self.llm_pipeline = pipeline(
                "text-generation",
                model=self.llm_model,
                tokenizer=self.llm_tokenizer,
                device=device_index,
                model_kwargs={"torch_dtype": self.torch_dtype},
            )
        except Exception:
            # The pipeline is optional; the streaming flow via model.generate works without it
            self.llm_pipeline = None

        print("LLM loaded successfully.")

        # ---------------- TTS ----------------
        print(f"Loading TTS model: {TTS_ONNX_MODEL_PATH}")
        # ONNX Runtime session; include the CUDA provider when a GPU is available
        providers = ["CPUExecutionProvider"]
        if torch.cuda.is_available():
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        self.tts_session = onnxruntime.InferenceSession(TTS_ONNX_MODEL_PATH, providers=providers)
        self.tts_tokenizer = AutoTokenizer.from_pretrained(TTS_TOKENIZER_ID)
        print("TTS model and tokenizer loaded successfully.")

        print("-" * 30)
        print("All models initialized successfully! ✅")

    # ---------------- Utility methods ----------------
    def transcribe_audio(self, audio_tuple):
        """Take a Gradio audio tuple (sample_rate, np_audio) and return a transcription string."""
        if audio_tuple is None:
            return ""
        sample_rate, audio_data = audio_tuple
        # Convert to mono
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
        # Normalize to float32
        if audio_data.dtype != np.float32:
            # Handle common integer audio dtypes
            if np.issubdtype(audio_data.dtype, np.integer):
                max_val = np.iinfo(audio_data.dtype).max
                audio_data = audio_data.astype(np.float32) / float(max_val)
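                # e.g. int16 samples are scaled to float32 values in [-1.0, 1.0]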
            else:
                audio_data = audio_data.astype(np.float32)
        # Resample if needed
        if sample_rate != self.STT_SAMPLE_RATE:
            audio_data = librosa.resample(y=audio_data, orig_sr=sample_rate, target_sr=self.STT_SAMPLE_RATE)
        if len(audio_data) < 1000:
            return "(Audio too short to transcribe)"

        inputs = self.stt_processor(audio_data, sampling_rate=self.STT_SAMPLE_RATE, return_tensors="pt")
        # Match the model's device and dtype (the model may be bfloat16 on GPU)
        model_device = next(self.stt_model.parameters()).device
        inputs = {
            k: (v.to(model_device, dtype=self.torch_dtype) if v.is_floating_point() else v.to(model_device))
            for k, v in inputs.items()
        }
        with torch.no_grad():
            generated_ids = self.stt_model.generate(**inputs, max_new_tokens=128)
        transcription = self.stt_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return transcription.strip()

    def generate_speech(self, text):
        """Synthesize speech using the ONNX TTS model and return a filepath to a WAV file."""
        if not text:
            return None
        text = text.strip()
        # Tokenize with numpy arrays for ONNX
        inputs = self.tts_tokenizer(text, return_tensors="np")
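        # Query the session for its input name instead of hard-coding it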
        input_name = self.tts_session.get_inputs()[0].name
        ort_inputs = {input_name: inputs["input_ids"]}
        audio_waveform = self.tts_session.run(None, ort_inputs)[0].flatten()

        # The ONNX model may emit float audio in [-1, 1] or int16, depending on the export;
        # guard for both and normalize to int16 for the WAV container.
        if np.issubdtype(audio_waveform.dtype, np.floating):
            # Clip and convert
            audio_clip = np.clip(audio_waveform, -1.0, 1.0)
            audio_int16 = (audio_clip * 32767).astype(np.int16)
        else:
            audio_int16 = audio_waveform.astype(np.int16)

        output_path = os.path.join(TEMP_DIR, f"{os.urandom(8).hex()}.wav")
        write_wav(output_path, self.TTS_SAMPLE_RATE, audio_int16)
        return output_path

    def get_llm_response(self, chat_history):
        """Return a TextIteratorStreamer that yields generated text pieces as the model produces them.

        Generation runs via self.llm_model.generate(...) with a TextIteratorStreamer in a
        background thread, so the caller can simply iterate over the streamer.
        """
        # Build the prompt from the system prompt plus the conversation. Adjust this
        # template to match your LLM's preferred format.
        prompt_lines = [self.SYSTEM_PROMPT.strip(), "\n"]
        for user_msg, assistant_msg in chat_history:
            if user_msg:
                # Tag user messages clearly so the model can follow dialogue turns
                prompt_lines.append("User: " + user_msg)
            if assistant_msg:
                prompt_lines.append("Assistant: " + assistant_msg)
        prompt_lines.append("Assistant: ")
        prompt = "\n".join(prompt_lines)
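        # The assembled prompt has the shape:
        #   <SYSTEM_PROMPT>
        #   User: <message>
        #   Assistant: <reply>
        #   ...
        #   Assistant:
        # The trailing "Assistant: " opens the turn the model should complete.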

        # Tokenize and move the inputs to the same device as the model
        inputs = self.llm_tokenizer(prompt, return_tensors="pt")
        try:
            model_device = next(self.llm_model.parameters()).device
        except StopIteration:
            model_device = torch.device("cpu")
        inputs = {k: v.to(model_device) for k, v in inputs.items()}

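        # skip_prompt=True keeps the echoed input prompt out of the stream;
        # skip_special_tokens strips BOS/EOS markers from the streamed text.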
        streamer = TextIteratorStreamer(self.llm_tokenizer, skip_prompt=True, skip_special_tokens=True)

        generation_kwargs = dict(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask", None),
            max_new_tokens=512,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            streamer=streamer,
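            # getattr guards against tokenizers that define no eos_token_id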
            eos_token_id=getattr(self.llm_tokenizer, "eos_token_id", None),
        )

        # Launch generation in a background thread so we can yield from the streamer here
        gen_thread = threading.Thread(target=self.llm_model.generate, kwargs=generation_kwargs, daemon=True)
        gen_thread.start()

        return streamer


# -------------------- Create assistant instance --------------------
assistant = WeeboAssistant()


# -------------------- Gradio pipelines --------------------

def s2s_pipeline(audio_input, chat_history):
    # `chat_history` is a list of (user_text, assistant_text) tuples
    user_text = assistant.transcribe_audio(audio_input)
    if not user_text or user_text.startswith("("):
        chat_history.append((user_text or "(No valid speech detected)", None))
        yield chat_history, None, "Please record your voice again."
        return

    chat_history.append((user_text, ""))
    yield chat_history, None, "..."

    response_stream = assistant.get_llm_response(chat_history)
    llm_response_text = ""
    for text_chunk in response_stream:
        llm_response_text += text_chunk
        # Update the last turn in the chat history
        chat_history[-1] = (user_text, llm_response_text)
        yield chat_history, None, llm_response_text

    # Once finished, synthesize audio
    final_audio_path = assistant.generate_speech(llm_response_text)
    yield chat_history, final_audio_path, llm_response_text


def t2t_pipeline(text_input, chat_history):
    chat_history.append((text_input, ""))
    yield chat_history

    response_stream = assistant.get_llm_response(chat_history)
    llm_response_text = ""
    for text_chunk in response_stream:
        # ... (lines 308-313 unchanged and collapsed in the diff view: accumulate
        # chunks and yield the updated chat_history, as in s2s_pipeline) ...


def clear_textbox():
    return gr.Textbox(value="")


# -------------------- Gradio UI --------------------
with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
    gr.Markdown("# 🤖 Msaidizi wa Sauti wa Kiswahili (Swahili Voice Assistant)")
    gr.Markdown("Ongea na msaidizi kwa Kiswahili. Toa sauti, andika maandishi, na upate majibu kwa sauti au maandishi.")

    with gr.Tabs():
        with gr.TabItem("🎙️ Sauti-kwa-Sauti (Speech-to-Speech)"):
            with gr.Row():
                # ... (lines 325-328 unchanged and collapsed in the diff view) ...
            s2s_chatbot = gr.Chatbot(label="Mazungumzo (Conversation)", bubble_full_width=False, height=400)
            s2s_audio_out = gr.Audio(type="filepath", label="Jibu la Sauti (Audio Response)", autoplay=True)
            s2s_text_out = gr.Textbox(label="Jibu la Maandishi (Text Response)", interactive=False)

        with gr.TabItem("⌨️ Maandishi-kwa-Maandishi (Text-to-Text)"):
            t2t_chatbot = gr.Chatbot(label="Mazungumzo (Conversation)", bubble_full_width=False, height=500)
            with gr.Row():
                # ... (lines 336-352 unchanged and collapsed in the diff view: the
                # remaining widgets and the opening `.click(` of the s2s submit binding) ...
        fn=s2s_pipeline,
        inputs=[s2s_audio_in, s2s_chatbot],
        outputs=[s2s_chatbot, s2s_audio_out, s2s_text_out],
        queue=True,
    ).then(
        fn=lambda: gr.Audio(value=None),
        inputs=None,
        outputs=s2s_audio_in,
    )

    t2t_submit_btn.click(
        fn=t2t_pipeline,
        inputs=[t2t_text_in, t2t_chatbot],
        outputs=[t2t_chatbot],
        queue=True,
    ).then(
        fn=clear_textbox,
        inputs=None,
        outputs=t2t_text_in,
    )

    t2t_text_in.submit(
        fn=t2t_pipeline,
        inputs=[t2t_text_in, t2t_chatbot],
        outputs=[t2t_chatbot],
        queue=True,
    ).then(
        fn=clear_textbox,
        inputs=None,
        outputs=t2t_text_in,
    )

    tool_s2t_btn.click(
        fn=assistant.transcribe_audio,
        inputs=tool_s2t_audio_in,
        outputs=tool_s2t_text_out,
        queue=True,
    )

    tool_t2s_btn.click(
        fn=assistant.generate_speech,
        inputs=tool_t2s_text_in,
        outputs=tool_t2s_audio_out,
        queue=True,
    )


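# Gradio's queue is required for generator-based handlers to stream partial results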
demo.queue().launch(debug=True)