VoiceChat

Paused

App Files Files Community

legolasyiu committed 23 days ago

Commit

738d49d

verified ·

1 Parent(s): 9b1e5ee

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -38

app.py CHANGED Viewed

@@ -3,15 +3,15 @@ import torch
 import librosa
 import soundfile as sf
 import tempfile
-import os
 from transformers import (
     AutoProcessor,
     AutoModelForImageTextToText,
     AutoTokenizer,
-    AutoModelForCausalLM,
 )
 # -----------------------------
 # CONFIG
 # -----------------------------
@@ -19,35 +19,47 @@ STT_MODEL_ID = "EpistemeAI/Audiogemma-3N-finetune"
 TTS_MODEL_ID = "EpistemeAI/LexiVox"
 TARGET_SR = 16000
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
 # -----------------------------
-# LOAD MODELS (ONCE)
 # -----------------------------
 print("Loading STT model...")
 processor = AutoProcessor.from_pretrained(STT_MODEL_ID)
-model = AutoModelForImageTextToText.from_pretrained(
     STT_MODEL_ID,
     torch_dtype="auto",
     device_map="auto",
 )
-print("Loading TTS model...")
 tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
-tts_model = AutoModelForCausalLM.from_pretrained(
-    TTS_MODEL_ID,
-    torch_dtype="auto",
-)
-def transcribe_and_translate(audio_file):
-    if audio_file is None:
-        return "Please upload an audio file."
-    # Save temp file path
-    audio_path = audio_file
-    prompt = f"Transcribe the audio accurately in German."
     messages = [
         {
@@ -55,7 +67,7 @@ def transcribe_and_translate(audio_file):
             "content": [
                 {"type": "audio", "audio": audio_path},
                 {"type": "text", "text": prompt},
-            ]
         }
     ]
@@ -63,54 +75,58 @@ def transcribe_and_translate(audio_file):
         messages,
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
-        return_tensors="pt"
     )
-    inputs = {k: v.to(model.device) for k, v in inputs.items()}
-    with torch.no_grad():
-        outputs = model.generate(
             **inputs,
             max_new_tokens=MAX_TOKENS,
             do_sample=False,
             temperature=0.2,
         )
-    decoded = processor.batch_decode(
         outputs,
         skip_special_tokens=True,
-        clean_up_tokenization_spaces=True
-    )
-    return decoded[0]
 # -----------------------------
-# PIPELINE FUNCTION
 # -----------------------------
 def speech_to_speech(audio_file):
     if audio_file is None:
         return "", None
-    # Load + resample
-    audio, sr = librosa.load(audio_file, sr=TARGET_SR)
     # ---------- STT ----------
-    transcription = transcribe_and_translate(audio_file)
     # ---------- TTS ----------
     tts_inputs = tts_tokenizer(
         transcription,
         return_tensors="pt",
-    )
-    with torch.no_grad():
-        speech = tts_model.generate(**tts_inputs)
-    audio_out = speech.cpu().numpy().squeeze()
-    # Save temp wav
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
     sf.write(tmp.name, audio_out, TARGET_SR)
@@ -119,14 +135,13 @@ def speech_to_speech(audio_file):
 # -----------------------------
 # GRADIO UI
 # -----------------------------
-with gr.Blocks(title="Audiogemma → LexiVox Speech Loop") as demo:
     gr.Markdown(
         """
         # 🎙️ Speech → Text → Speech
-        **Audiogemma-3N + LexiVox**
-        Upload audio or use the microphone.
-        The system transcribes speech, then speaks it back using an LLM-based TTS.
         """
     )

 import librosa
 import soundfile as sf
 import tempfile
 from transformers import (
     AutoProcessor,
     AutoModelForImageTextToText,
     AutoTokenizer,
 )
+from unsloth import FastLanguageModel
 # -----------------------------
 # CONFIG
 # -----------------------------
 TTS_MODEL_ID = "EpistemeAI/LexiVox"
 TARGET_SR = 16000
+MAX_TOKENS = 512
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
 # -----------------------------
+# LOAD STT MODEL
 # -----------------------------
 print("Loading STT model...")
 processor = AutoProcessor.from_pretrained(STT_MODEL_ID)
+stt_model = AutoModelForImageTextToText.from_pretrained(
     STT_MODEL_ID,
     torch_dtype="auto",
     device_map="auto",
 )
+stt_model.eval()
+# -----------------------------
+# LOAD TTS MODEL (UNSLOTH)
+# -----------------------------
+print("Loading TTS model with Unsloth...")
 tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
+tts_model, _ = FastLanguageModel.from_pretrained(
+    model_name = TTS_MODEL_ID,
+    max_seq_length = 4096,
+    dtype = DTYPE,
+    load_in_4bit = True,
+)
+FastLanguageModel.for_inference(tts_model)
+tts_model.eval()
+# -----------------------------
+# STT FUNCTION
+# -----------------------------
+def transcribe(audio_path):
+    prompt = "Transcribe the audio accurately in German."
     messages = [
         {
             "content": [
                 {"type": "audio", "audio": audio_path},
                 {"type": "text", "text": prompt},
+            ],
         }
     ]
         messages,
         add_generation_prompt=True,
         tokenize=True,
+        return_tensors="pt",
         return_dict=True,
     )
+    inputs = {k: v.to(stt_model.device) for k, v in inputs.items()}
+    with torch.inference_mode():
+        outputs = stt_model.generate(
             **inputs,
             max_new_tokens=MAX_TOKENS,
             do_sample=False,
             temperature=0.2,
         )
+    text = processor.batch_decode(
         outputs,
         skip_special_tokens=True,
+        clean_up_tokenization_spaces=True,
+    )[0]
+    return text
 # -----------------------------
+# SPEECH → SPEECH PIPELINE
 # -----------------------------
 def speech_to_speech(audio_file):
     if audio_file is None:
         return "", None
+    # Decode the file up front so unreadable audio fails before the models run (the decoded samples are not used below)
+    _audio, _ = librosa.load(audio_file, sr=TARGET_SR)
     # ---------- STT ----------
+    transcription = transcribe(audio_file)
     # ---------- TTS ----------
     tts_inputs = tts_tokenizer(
         transcription,
         return_tensors="pt",
+    ).to(tts_model.device)
+    with torch.inference_mode():
+        speech_tokens = tts_model.generate(
+            **tts_inputs,
+            max_new_tokens=2048,
+            do_sample=False,
+            temperature=0.7,
+        )
+    audio_out = speech_tokens.cpu().numpy().squeeze()
+    # Save temporary WAV
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
     sf.write(tmp.name, audio_out, TARGET_SR)
 # -----------------------------
 # GRADIO UI
 # -----------------------------
+with gr.Blocks(title="Audiogemma → LexiVox (Unsloth)") as demo:
     gr.Markdown(
         """
         # 🎙️ Speech → Text → Speech
+        **Audiogemma-3N + LexiVox (Unsloth Accelerated)**
+        Upload audio or use your microphone.
         """
     )