Update app.py
app.py (CHANGED)
@@ -1,24 +1,23 @@
 # -*- coding: utf-8 -*-
 """
-[removed docstring lines not shown]
-Drop this file into your Space and replace the old app.py contents.
+Salama Assistant — fixed full app.py with PEFT adapter loading (base + adapter)
+
+Drop this file into your Hugging Face Space (replace your existing app.py).
+
+Requirements:
+- transformers
+- peft
+- onnxruntime
+- librosa
+- huggingface_hub
+- gradio
+
+Note: install `peft` (e.g. add to requirements.txt: "peft>=0.4.0") or pip install in your environment.
 """

 import os
-import [module name not shown]
+import json
+import tempfile
 import threading
 import numpy as np
 import gradio as gr
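Note: the new docstring lists the runtime dependencies, but a missing package only surfaces later as an import error. A minimal, optional startup check (not part of this diff; the message wording is illustrative) could fail fast if `peft` is absent:

try:
    import peft  # noqa: F401  -- only verifying the package is installed
except ImportError as err:
    raise SystemExit(
        'The "peft" package is required; add "peft>=0.4.0" to requirements.txt. Original error: ' + str(err)
    )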
@@ -38,26 +37,29 @@ from transformers import (
     TextIteratorStreamer,
 )

+# PEFT imports
+from peft import PeftModel, PeftConfig
+
 # -------------------- Configuration --------------------
 STT_MODEL_ID = "EYEDOL/SALAMA_C3"
-[removed line not shown]
+ADAPTER_REPO_ID = "EYEDOL/Llama-3.2-1B_ON_ALPACA5"  # adapter-only repo
+BASE_MODEL_ID = "unsloth/Llama-3.2-1B-Instruct"  # full base model referenced by adapter
 TTS_TOKENIZER_ID = "facebook/mms-tts-swh"
 TTS_ONNX_MODEL_PATH = "swahili_tts.onnx"

 TEMP_DIR = "temp"
 os.makedirs(TEMP_DIR, exist_ok=True)

-# Use
+# Use HF token from env; Spaces normally provide HF_TOKEN
 HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("hugface")
 if not HF_TOKEN:
-    [removed lines not shown]
-    print("Warning: could not call huggingface_hub.login(). Proceeding — ensure your token is valid in the environment. Error:", e)
+    print("Warning: HF_TOKEN not found in env. Public models may still load, but private repos require a token.")
+else:
+    try:
+        login(token=HF_TOKEN)
+        print("Successfully logged into Hugging Face Hub!")
+    except Exception as e:
+        print("Warning: huggingface_hub.login() failed:", e)


 class WeeboAssistant:
@@ -79,7 +81,6 @@ class WeeboAssistant:
         # ---------------- STT ----------------
         print(f"Loading STT model: {STT_MODEL_ID}")
         self.stt_processor = AutoProcessor.from_pretrained(STT_MODEL_ID)
-        # Speech seq2seq model (e.g. Whisper-like)
         self.stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
             STT_MODEL_ID,
             torch_dtype=self.torch_dtype,
@@ -93,56 +94,53 @@ class WeeboAssistant:
             pass
         print("STT model loaded successfully.")

-        # ---------------- LLM ----------------
-        print(f"Loading LLM: {
-        self.llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID, use_fast=True)
+        # ---------------- LLM (base + PEFT adapter) ----------------
+        print(f"Loading base LLM: {BASE_MODEL_ID} and applying adapter: {ADAPTER_REPO_ID}")

-        #
-        # try loading with trust_remote_code=True (this allows custom model code in repo).
+        # 1) Tokenizer: prefer base tokenizer
         try:
-            [removed lines not shown]
+            self.llm_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
+        except Exception as e:
+            print("Warning: could not load base tokenizer, falling back to adapter tokenizer. Error:", e)
+            self.llm_tokenizer = AutoTokenizer.from_pretrained(ADAPTER_REPO_ID, use_fast=True)

-        [removed line not shown]
+        # 2) Load base model
+        device_map = "auto" if torch.cuda.is_available() else None
+        try:
            self.llm_model = AutoModelForCausalLM.from_pretrained(
-                [removed line not shown]
-                config=config,
+                BASE_MODEL_ID,
                torch_dtype=self.torch_dtype,
                low_cpu_mem_usage=True,
+                device_map=device_map,
+                trust_remote_code=True,
+            )
+        except Exception as e:
+            # Helpful error info and hint
+            raise RuntimeError(
+                "Failed to load base model. Ensure the base model ID is correct and the HF_TOKEN has access if private. Error: "
+                + str(e)
            )
-        except Exception as first_err:
-            print("Standard AutoConfig/AutoModel load failed or model_type missing. Trying trust_remote_code=True. Error:", first_err)
-            # Try using trust_remote_code which will import repo-specific model code if present
-            try:
-                config = AutoConfig.from_pretrained(LLM_MODEL_ID, trust_remote_code=True)
-                self.llm_model = AutoModelForCausalLM.from_pretrained(
-                    LLM_MODEL_ID,
-                    config=config,
-                    torch_dtype=self.torch_dtype,
-                    trust_remote_code=True,
-                    low_cpu_mem_usage=True,
-                    device_map="auto" if torch.cuda.is_available() else None,
-                )
-            except Exception as second_err:
-                # Final fallback: try to load without special configs — may still fail for custom repos
-                print("Fallback load also failed:", second_err)
-                raise RuntimeError(
-                    "Unable to load LLM model. Check the model repo, ensure config.json contains a model_type or that trust_remote_code is allowed."
-                )
-
-        # If device_map wasn't used and model is on CPU, ensure model is moved to CPU
-        if self.device == "cpu":
-            try:
-                # Many Hugging Face helpers use device_map; if not used, move model
-                self.llm_model = self.llm_model.to("cpu")
-            except Exception:
-                pass

-        #
-        device_index = 0 if torch.cuda.is_available() else -1
+        # 3) Load and apply PEFT adapter (adapter-only repo)
         try:
+            # This discovers adapter config (adapter_config.json) and applies weights
+            peft_config = PeftConfig.from_pretrained(ADAPTER_REPO_ID)
+            self.llm_model = PeftModel.from_pretrained(
+                self.llm_model,
+                ADAPTER_REPO_ID,
+                device_map=device_map,
+                torch_dtype=self.torch_dtype,
+                low_cpu_mem_usage=True,
+            )
+        except Exception as e:
+            raise RuntimeError(
+                "Failed to load/apply PEFT adapter from adapter repo. Make sure adapter files (adapter_config.json and adapter_model.safetensors) are present and HF_TOKEN has access if private. Error: "
+                + str(e)
+            )
+
+        # 4) Optionally create a non-streaming pipeline for quick tests
+        try:
+            device_index = 0 if torch.cuda.is_available() else -1
            self.llm_pipeline = pipeline(
                "text-generation",
                model=self.llm_model,
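Note: if the adapter in EYEDOL/Llama-3.2-1B_ON_ALPACA5 is a LoRA-style adapter, the PeftModel produced in step 3 can optionally be folded into the base weights to remove the adapter indirection at inference time. A hedged sketch of such an optional helper (maybe_merge_adapter is a hypothetical name, not part of this diff); it could be applied as self.llm_model = maybe_merge_adapter(self.llm_model) right after step 3:

from peft import PeftModel

def maybe_merge_adapter(model):
    # Fold LoRA weights into the base model when supported; otherwise return the model unchanged.
    if isinstance(model, PeftModel) and hasattr(model, "merge_and_unload"):
        return model.merge_and_unload()  # returns a plain transformers model
    return model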
@@ -150,15 +148,14 @@ class WeeboAssistant:
                device=device_index,
                model_kwargs={"torch_dtype": self.torch_dtype},
            )
-        except Exception:
-            [removed line not shown]
+        except Exception as e:
+            print("Warning: could not create text-generation pipeline. Streaming generate will still work. Error:", e)
            self.llm_pipeline = None

-        print("LLM loaded successfully.")
+        print("LLM base + adapter loaded successfully.")

        # ---------------- TTS ----------------
        print(f"Loading TTS model: {TTS_ONNX_MODEL_PATH}")
-        # ONNX runtime session; providers include CUDA if available
        providers = ["CPUExecutionProvider"]
        if torch.cuda.is_available():
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
@@ -171,22 +168,17 @@ class WeeboAssistant:

    # ---------------- Utility methods ----------------
    def transcribe_audio(self, audio_tuple):
-        """Take a Gradio audio tuple (sample_rate, np_audio) and return transcription string."""
        if audio_tuple is None:
            return ""
        sample_rate, audio_data = audio_tuple
-        # Convert to mono
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
-        # Normalize to float32
        if audio_data.dtype != np.float32:
-            # handle common integer audio dtypes
            if np.issubdtype(audio_data.dtype, np.integer):
                max_val = np.iinfo(audio_data.dtype).max
                audio_data = audio_data.astype(np.float32) / float(max_val)
            else:
                audio_data = audio_data.astype(np.float32)
-        # Resample if needed
        if sample_rate != self.STT_SAMPLE_RATE:
            audio_data = librosa.resample(y=audio_data, orig_sr=sample_rate, target_sr=self.STT_SAMPLE_RATE)
        if len(audio_data) < 1000:
@@ -200,20 +192,15 @@ class WeeboAssistant:
        return transcription.strip()

    def generate_speech(self, text):
-        """Synthesize speech using the ONNX TTS model and return a filepath to a WAV file."""
        if not text:
            return None
        text = text.strip()
-        # Tokenize with numpy arrays for ONNX
        inputs = self.tts_tokenizer(text, return_tensors="np")
        input_name = self.tts_session.get_inputs()[0].name
        ort_inputs = {input_name: inputs["input_ids"]}
        audio_waveform = self.tts_session.run(None, ort_inputs)[0].flatten()

-        # ONNX model might produce float audio in range [-1,1] or int16 depending on model. We'll safe-guard.
-        # Normalize to int16 WAV
        if np.issubdtype(audio_waveform.dtype, np.floating):
-            # Clip and convert
            audio_clip = np.clip(audio_waveform, -1.0, 1.0)
            audio_int16 = (audio_clip * 32767).astype(np.int16)
        else:
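Note: generate_speech feeds only input_ids to the ONNX session and then converts float output to int16 PCM. A standalone sketch for sanity-checking the exported swahili_tts.onnx outside the Space (the 16 kHz rate is an assumption typical of MMS-TTS models; confirm it against the exported model):

import numpy as np
import onnxruntime as ort
from scipy.io import wavfile
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-swh")
session = ort.InferenceSession("swahili_tts.onnx", providers=["CPUExecutionProvider"])

inputs = tokenizer("Habari ya leo", return_tensors="np")
input_name = session.get_inputs()[0].name
waveform = session.run(None, {input_name: inputs["input_ids"]})[0].flatten()

# Mirror the app's float -> int16 conversion and write a test WAV.
pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)
wavfile.write("test_tts.wav", 16000, pcm)  # assumed sample rate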
@@ -224,23 +211,17 @@ class WeeboAssistant:
        return output_path

    def get_llm_response(self, chat_history):
-        [removed docstring lines not shown]
-        This implementation uses self.llm_model.generate(...) with a TextIteratorStreamer and
-        runs generate in a background thread so the caller can iterate over streamer.
-        """
-        # Build prompt from system + conversation. Adjust this template to match your LLM's preferred format.
-        prompt_lines = [self.SYSTEM_PROMPT.strip(), "\n"]
+        prompt_lines = [self.SYSTEM_PROMPT.strip(), "\n"]
        for user_msg, assistant_msg in chat_history:
            if user_msg:
-                # tag user messages clearly so model understands dialogue turns
                prompt_lines.append("User: " + user_msg)
            if assistant_msg:
                prompt_lines.append("Assistant: " + assistant_msg)
        prompt_lines.append("Assistant: ")
-        prompt = "\n".join(prompt_lines)
+        prompt = "\n".join(prompt_lines)

-        # Tokenize and prepare inputs on the same device as the model
        inputs = self.llm_tokenizer(prompt, return_tensors="pt")
        try:
            model_device = next(self.llm_model.parameters()).device
@@ -261,7 +242,6 @@ class WeeboAssistant:
            eos_token_id=getattr(self.llm_tokenizer, "eos_token_id", None),
        )

-        # Launch generation in a thread so we can yield from the streamer in the main thread
        gen_thread = threading.Thread(target=self.llm_model.generate, kwargs=generation_kwargs, daemon=True)
        gen_thread.start()
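Note: the hunks only show the prompt construction and the background generate call; the streaming recipe they rely on is the standard TextIteratorStreamer pattern. A generic, self-contained sketch of that pattern (the model name and generation settings are placeholders, not the app's actual values):

import threading
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model for illustration
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("User: Habari!\nAssistant: ", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=64)
threading.Thread(target=model.generate, kwargs=generation_kwargs, daemon=True).start()

response = ""
for chunk in streamer:  # yields decoded text pieces as generation progresses
    response += chunk
print(response)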
@@ -275,7 +255,6 @@ assistant = WeeboAssistant()
 # -------------------- Gradio pipelines --------------------

 def s2s_pipeline(audio_input, chat_history):
-    # `chat_history` is expected to be a list of (user_text, assistant_text) tuples
     user_text = assistant.transcribe_audio(audio_input)
     if not user_text or user_text.startswith("("):
         chat_history.append((user_text or "(No valid speech detected)", None))
@@ -289,11 +268,9 @@ def s2s_pipeline(audio_input, chat_history):
     llm_response_text = ""
     for text_chunk in response_stream:
         llm_response_text += text_chunk
-        # Update last turn in chat history
         chat_history[-1] = (user_text, llm_response_text)
         yield chat_history, None, llm_response_text

-    # Once finished, synthesize audio
     final_audio_path = assistant.generate_speech(llm_response_text)
     yield chat_history, final_audio_path, llm_response_text
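Note: s2s_pipeline is a generator yielding (chat_history, audio_path, response_text) triples, so partial text streams first and the synthesized audio arrives on the final yield. The Blocks wiring is outside the shown hunks; a minimal sketch of how such a generator could be hooked up (Gradio 4.x-style components; names and labels are illustrative, not the Space's actual layout):

import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Mazungumzo")
    mic = gr.Audio(sources=["microphone"], type="numpy", label="Sema hapa")
    audio_out = gr.Audio(label="Jibu (sauti)")
    text_out = gr.Textbox(label="Jibu (maandishi)")

    # Each yield from the generator updates all three outputs.
    mic.stop_recording(
        s2s_pipeline,
        inputs=[mic, chatbot],
        outputs=[chatbot, audio_out, text_out],
    )

demo.launch()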