Update app.py
app.py CHANGED
@@ -13,184 +13,118 @@ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer
 from scipy.io.wavfile import write as write_wav
 import os
 import re
-
-
-import os
 from huggingface_hub import login
 
-#
-
-
-
+# --- Login to Hugging Face using secret ---
+# Make sure HF_TOKEN is set in your Hugging Face Space > Settings > Repository secrets
+hf_token = os.environ.get("HF_TOKEN")
+if not hf_token:
+    raise ValueError("HF_TOKEN not found. Please set it in Hugging Face Space repository secrets.")
 login(token=hf_token)
-
 print("Successfully logged into Hugging Face Hub!")
 
 # --- Configuration ---
-
 STT_MODEL_ID = "EYEDOL/SALAMA_C3"
-
-#Swahili LLM.
-LLM_MODEL_ID = "google/gemma-3-1b-it"
-
-# This is the tokenizer for your ONNX TTS model.
+LLM_MODEL_ID = "google/gemma-3-1b-it"
 TTS_TOKENIZER_ID = "facebook/mms-tts-swh"
 TTS_ONNX_MODEL_PATH = "swahili_tts.onnx"
 
-# Ensure the temporary directory for audio files exists
 TEMP_DIR = "temp"
 os.makedirs(TEMP_DIR, exist_ok=True)
 
 
 class WeeboAssistant:
     def __init__(self):
-        # Audio settings
         self.STT_SAMPLE_RATE = 16000
         self.TTS_SAMPLE_RATE = 16000
-
-
-
-
+        self.SYSTEM_PROMPT = (
+            "Wewe ni msaidizi mwenye akili, jibu swali lililoulizwa kwa UFUPI na kwa usahihi. "
+            "Jibu kwa lugha ya Kiswahili pekee. Hakuna jibu refu."
+        )
        self._init_models()
 
     def _init_models(self):
-        """Initializes all models required for the pipeline."""
         print("Initializing models...")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.torch_dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
         print(f"Using device: {self.device}")
 
-        #
+        # STT
         print(f"Loading STT model: {STT_MODEL_ID}")
-
-
-
-
-
-
-
-
-
-
-        except Exception as e:
-            print(f"FATAL: Could not load STT model. Please check the model ID and ensure you have access. Error: {e}")
-            # In a real app, you might want to handle this more gracefully
-            raise
-
-        # --- 2. Initialize Language Model (LLM) ---
+        self.stt_processor = AutoProcessor.from_pretrained(STT_MODEL_ID)
+        self.stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            STT_MODEL_ID,
+            torch_dtype=self.torch_dtype,
+            low_cpu_mem_usage=True,
+            use_safetensors=True
+        ).to(self.device)
+        print("STT model loaded successfully.")
+
+        # LLM
         print(f"Loading LLM: {LLM_MODEL_ID}")
-
-
-
-
-
-
-
-        )
-        print("LLM pipeline loaded successfully.")
-        except Exception as e:
-            print(f"FATAL: Could not load LLM. Error: {e}")
-            raise
+        self.llm_pipeline = pipeline(
+            "text-generation",
+            model=LLM_MODEL_ID,
+            model_kwargs={"torch_dtype": self.torch_dtype},
+            device=self.device,
+        )
+        print("LLM pipeline loaded successfully.")
 
-        #
+        # TTS
         print(f"Loading TTS model: {TTS_ONNX_MODEL_PATH}")
-
-
-
-
-
-
-
-        print("TTS model and tokenizer loaded successfully.")
-        except Exception as e:
-            print(f"FATAL: Could not load TTS model. Make sure '{TTS_ONNX_MODEL_PATH}' is in the repository. Error: {e}")
-            raise
-
+        self.tts_session = onnxruntime.InferenceSession(
+            TTS_ONNX_MODEL_PATH,
+            providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
+        )
+        self.tts_tokenizer = AutoTokenizer.from_pretrained(TTS_TOKENIZER_ID)
+        print("TTS model and tokenizer loaded successfully.")
+
         print("-" * 30)
         print("All models initialized successfully! ✅")
 
-    def transcribe_audio(self, audio_tuple
-        """
-        Transcribes audio from Gradio's audio component.
-        The input is a tuple (sample_rate, numpy_array).
-        """
+    def transcribe_audio(self, audio_tuple):
         if audio_tuple is None:
             return ""
-
         sample_rate, audio_data = audio_tuple
-
-        # Convert to mono float32
         if audio_data.ndim > 1:
             audio_data = audio_data.mean(axis=1)
         if audio_data.dtype != np.float32:
             audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
-
-        # Resample if necessary
         if sample_rate != self.STT_SAMPLE_RATE:
             audio_data = librosa.resample(y=audio_data, orig_sr=sample_rate, target_sr=self.STT_SAMPLE_RATE)
-
-        if len(audio_data) < 1000: # Ignore very short audio clips
+        if len(audio_data) < 1000:
             return "(Audio too short to transcribe)"
-
-        # Process and transcribe
         inputs = self.stt_processor(audio_data, sampling_rate=self.STT_SAMPLE_RATE, return_tensors="pt")
-        inputs = {
-
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
         with torch.no_grad():
             generated_ids = self.stt_model.generate(**inputs, max_new_tokens=128)
-
         transcription = self.stt_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
         return transcription.strip()
 
-    def generate_speech(self, text
-        """
-        Generates audio from text and saves it to a temporary file.
-        Returns the path to the audio file.
-        """
+    def generate_speech(self, text):
         if not text:
             return None
-
-        # Clean text
         text = text.strip()
-
-
-
-
-
-
-
-
-        output_path = os.path.join(TEMP_DIR, f"{os.urandom(8).hex()}.wav")
-        write_wav(output_path, self.TTS_SAMPLE_RATE, audio_waveform)
-        return output_path
-        except Exception as e:
-            print(f"Error during audio generation: {e}")
-            return None
-
-    def get_llm_response(self, chat_history: list):
-        """
-        Gets a streaming response from the LLM.
-        Yields the updated full response at each step.
-        """
-        # Format messages for the pipeline
-        # The Gemma-2 instruction-tuned model uses a specific turn-based format
+        inputs = self.tts_tokenizer(text, return_tensors="np")
+        ort_inputs = {self.tts_session.get_inputs()[0].name: inputs.input_ids}
+        audio_waveform = self.tts_session.run(None, ort_inputs)[0].flatten()
+        output_path = os.path.join(TEMP_DIR, f"{os.urandom(8).hex()}.wav")
+        write_wav(output_path, self.TTS_SAMPLE_RATE, audio_waveform)
+        return output_path
+
+    def get_llm_response(self, chat_history):
         messages = [{'role': 'system', 'content': self.SYSTEM_PROMPT}]
         for turn in chat_history:
-
-
+            messages.append({'role': 'user', 'content': turn[0]})
+            if turn[1] is not None:
                 messages.append({'role': 'assistant', 'content': turn[1]})
-
         prompt = self.llm_pipeline.tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
+            messages, tokenize=False, add_generation_prompt=True
         )
-
         terminators = [
             self.llm_pipeline.tokenizer.eos_token_id,
             self.llm_pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
         ]
-
         streamer = self.llm_pipeline(
             prompt,
             max_new_tokens=512,
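
For reference, the new ONNX TTS path can be exercised on its own, outside Gradio. The snippet below is a hedged smoke-test sketch, not part of the commit: it assumes swahili_tts.onnx takes a single int64 input_ids tensor and returns a 16 kHz waveform, which is the contract generate_speech() above relies on.

# Smoke test for the ONNX TTS path -- an assumption-laden sketch, not part
# of the commit. Presumes swahili_tts.onnx accepts one int64 input_ids
# tensor and returns a float waveform sampled at 16 kHz.
import onnxruntime
from scipy.io.wavfile import write as write_wav
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-swh")
session = onnxruntime.InferenceSession("swahili_tts.onnx",
                                       providers=["CPUExecutionProvider"])

inputs = tokenizer("Habari ya leo!", return_tensors="np")
ort_inputs = {session.get_inputs()[0].name: inputs.input_ids}
waveform = session.run(None, ort_inputs)[0].flatten()
write_wav("tts_check.wav", 16000, waveform)  # matches TTS_SAMPLE_RATE
print(f"wrote {len(waveform) / 16000:.2f}s of audio to tts_check.wav")

If the exported graph declares more inputs than input_ids, session.get_inputs() will list them and the feed dict needs extending to match.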
@@ -202,59 +136,48 @@ class WeeboAssistant:
         )
         return streamer
 
-# --- Gradio Interface Logic ---
 
-# Instantiate the assistant
 assistant = WeeboAssistant()
 
+
 def s2s_pipeline(audio_input, chat_history):
-    """The main function for the Speech-to-Speech tab."""
-    # 1. Transcribe user's speech
     user_text = assistant.transcribe_audio(audio_input)
     if not user_text or user_text.startswith("("):
         chat_history.append((user_text or "(No valid speech detected)", None))
         yield chat_history, None, "Please record your voice again."
         return
-
     chat_history.append((user_text, None))
-    yield chat_history, None, "..."
-
-    # 2. Get LLM response as a stream
+    yield chat_history, None, "..."
     response_stream = assistant.get_llm_response(chat_history)
-
-    # Stream the response text to the UI
     llm_response_text = ""
     for text_chunk in response_stream:
         llm_response_text = text_chunk
         chat_history[-1] = (user_text, llm_response_text)
         yield chat_history, None, llm_response_text
-
-    # 3. Synthesize the final LLM response to speech
     final_audio_path = assistant.generate_speech(llm_response_text)
-
-    # 4. Final update to the UI
     yield chat_history, final_audio_path, llm_response_text
 
+
 def t2t_pipeline(text_input, chat_history):
-    """The main function for the Text-to-Text tab."""
     chat_history.append((text_input, None))
     yield chat_history, "..."
-
     response_stream = assistant.get_llm_response(chat_history)
-
     llm_response_text = ""
     for text_chunk in response_stream:
         llm_response_text = text_chunk
         chat_history[-1] = (text_input, llm_response_text)
         yield chat_history, llm_response_text
 
-
+
+def clear_textbox():
+    return ""
+
+
 with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
     gr.Markdown("# 🤖 Msaidizi wa Sauti wa Kiswahili (Swahili Voice Assistant)")
     gr.Markdown("Ongea na msaidizi kwa Kiswahili. Toa sauti, andika maandishi, na upate majibu kwa sauti au maandishi.")
 
     with gr.Tabs():
-        # Tab 1: Speech-to-Speech
         with gr.TabItem("🎙️ Sauti-kwa-Sauti (Speech-to-Speech)"):
             with gr.Row():
                 with gr.Column(scale=2):
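
Both pipeline functions keep the chat history as a list of (user, assistant) tuples, with None marking the reply still being generated; get_llm_response() flattens that history into role-tagged messages. A self-contained sketch of the flattening (the prompt string here is an abbreviated stand-in):

# Self-contained sketch of the message list get_llm_response() builds from
# the (user, assistant) tuple history used by both pipelines above.
SYSTEM_PROMPT = "Wewe ni msaidizi mwenye akili..."  # abbreviated stand-in
chat_history = [("Habari!", "Nzuri, karibu."), ("Niambie zaidi.", None)]

messages = [{'role': 'system', 'content': SYSTEM_PROMPT}]
for turn in chat_history:
    messages.append({'role': 'user', 'content': turn[0]})
    if turn[1] is not None:  # skip the in-flight turn's empty reply
        messages.append({'role': 'assistant', 'content': turn[1]})

for m in messages:
    print(m['role'], '->', m['content'])
# -> system, user, assistant, user: ready for apply_chat_template(...)

The if turn[1] is not None guard added in this commit is what keeps the half-finished final turn out of the prompt passed to apply_chat_template.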
@@ -265,32 +188,25 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
                 s2s_audio_out = gr.Audio(type="filepath", label="Jibu la Sauti (Audio Response)", autoplay=True)
                 s2s_text_out = gr.Textbox(label="Jibu la Maandishi (Text Response)", interactive=False)
 
-        # Tab 2: Text-to-Text
         with gr.TabItem("⌨️ Maandishi-kwa-Maandishi (Text-to-Text)"):
             t2t_chatbot = gr.Chatbot(label="Mazungumzo (Conversation)", bubble_full_width=False, height=500)
             with gr.Row():
                 t2t_text_in = gr.Textbox(label="Andika Hapa (Write Here)", placeholder="Habari yako...", scale=4)
                 t2t_submit_btn = gr.Button("Tuma (Submit)", variant="primary", scale=1)
 
-        # Tab 3: Direct Tools
         with gr.TabItem("🛠️ Zana (Tools)"):
             with gr.Row():
-                # Speech to Text Tool
                 with gr.Column():
                     gr.Markdown("### Unukuzi wa Sauti (Speech Transcription)")
                     tool_s2t_audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Sauti ya Kuingiza (Input Audio)")
                     tool_s2t_text_out = gr.Textbox(label="Maandishi Yaliyonukuliwa (Transcribed Text)", interactive=False)
                     tool_s2t_btn = gr.Button("Nukuu (Transcribe)")
-                # Text to Speech Tool
                 with gr.Column():
                     gr.Markdown("### Utengenezaji wa Sauti (Speech Synthesis)")
                     tool_t2s_text_in = gr.Textbox(label="Maandishi ya Kuingiza (Input Text)", placeholder="Andika Kiswahili hapa...")
                     tool_t2s_audio_out = gr.Audio(type="filepath", label="Sauti Iliyotengenezwa (Synthesized Audio)", autoplay=False)
                     tool_t2s_btn = gr.Button("Tengeneza Sauti (Synthesize)")
 
-    # --- Event Handlers ---
-
-    # Speech-to-Speech handler
     s2s_submit_btn.click(
         fn=s2s_pipeline,
         inputs=[s2s_audio_in, s2s_chatbot],
@@ -298,17 +214,17 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
         queue=True
     )
 
-    # Text-to-Text handler
     t2t_submit_btn.click(
         fn=t2t_pipeline,
         inputs=[t2t_text_in, t2t_chatbot],
-        outputs=[t2t_chatbot, t2t_text_in
+        outputs=[t2t_chatbot, t2t_text_in],
         queue=True
     ).then(
-
-
+        fn=clear_textbox,
+        inputs=None,
+        outputs=t2t_text_in
+    )
 
-    # Tool handlers
     tool_s2t_btn.click(
         fn=assistant.transcribe_audio,
         inputs=tool_s2t_audio_in,
@@ -320,5 +236,4 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
         outputs=tool_t2s_audio_out
     )
 
-
-demo.queue().launch(debug=True)
+demo.queue().launch(debug=True)
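
The event wiring streams generator output into components via click(..., queue=True); the text tab additionally chains .then(fn=clear_textbox, ...) to reset the input once the stream ends. A minimal self-contained sketch of that click/.then pattern, with stub logic and illustrative component names rather than the app's:

# Minimal, self-contained sketch of the click/.then() wiring used above:
# a generator handler streams snapshots into the Chatbot, then the chained
# .then() clears the Textbox. Stub echo logic only -- no models.
import gradio as gr

def fake_stream(text, history):
    history.append((text, None))
    reply = "umesema: " + text          # stand-in for the LLM response
    for i in range(1, len(reply) + 1):  # yield growing snapshots, the way
        history[-1] = (text, reply[:i]) # t2t_pipeline does
        yield history

with gr.Blocks() as sketch:
    chat = gr.Chatbot()
    box = gr.Textbox()
    btn = gr.Button("Tuma")
    btn.click(fn=fake_stream, inputs=[box, chat], outputs=chat, queue=True
              ).then(fn=lambda: "", inputs=None, outputs=box)

# sketch.queue().launch()

.then() fires only after the click event finishes, so the textbox is cleared once streaming completes rather than mid-generation.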