VoiceChat

Paused

App Files Files Community

legolasyiu committed on Jan 21

Commit

d0faf3c

verified ·

1 Parent(s): 9d1e3b9

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -73

app.py CHANGED Viewed

@@ -1,98 +1,127 @@
-import torch
 import gradio as gr
 import tempfile
-from transformers import AutoProcessor, AutoModelForImageTextToText
-# ---------------- CONFIG ---------------- #
-MODEL_ID = "EpistemeAI/Audiogemma-3N-finetune"
-MAX_TOKENS = 256
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print("Loading model and processor...")
-processor = AutoProcessor.from_pretrained(MODEL_ID, device_map="auto")
-model = AutoModelForImageTextToText.from_pretrained(
-    MODEL_ID,
-    torch_dtype="auto",
-    device_map="auto"
 )
-print("Model loaded.")
-# ---------------- INFERENCE FUNCTION ---------------- #
-def transcribe_and_translate(audio_file, target_language):
     if audio_file is None:
-        return "Please upload an audio file."
-    # Save temp file path
-    audio_path = audio_file
-    prompt = f"Transcribe this audio into English, and then translate it into {target_language}."
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "audio", "audio": audio_path},
-                {"type": "text", "text": prompt},
-            ]
-        }
-    ]
-    inputs = processor.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors="pt"
-    )
-    inputs = {k: v.to(model.device) for k, v in inputs.items()}
     with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=MAX_TOKENS,
-            do_sample=False,
-            temperature=0.2,
         )
-    decoded = processor.batch_decode(
-        outputs,
         skip_special_tokens=True,
-        clean_up_tokenization_spaces=True
     )
-    return decoded[0]
-# ---------------- GRADIO UI ---------------- #
-with gr.Blocks(title="Audiogemma Multilingual Transcriber") as demo:
-    gr.Markdown("# 🎧 Audiogemma Multilingual Transcriber")
-    gr.Markdown("Upload an audio file and get transcription + translation powered by **Gemma 3N**.")
-    with gr.Row():
-        audio_input = gr.Audio(type="filepath", label="Upload Audio or Use Microphone")
-        language_input = gr.Dropdown(
-            choices=[
-                "French", "Spanish", "German", "Chinese", "Japanese",
-                "Korean", "Italian", "Portuguese", "Arabic", "Hindi"
-            ],
-            value="French",
-            label="Translate To"
-        )
-    transcribe_btn = gr.Button("Transcribe & Translate")
-    output_text = gr.Textbox(label="Result", lines=12)
-    transcribe_btn.click(
-        fn=transcribe_and_translate,
-        inputs=[audio_input, language_input],
-        outputs=output_text
     )
 demo.launch()

 import gradio as gr
+import torch
+import librosa
+import soundfile as sf
 import tempfile
+import os
+from transformers import (
+    AutoProcessor,
+    AutoModelForImageTextToText,
+    AutoTokenizer,
+    AutoModelForTextToSpeech,
 )
+# -----------------------------
+# CONFIG
+# -----------------------------
+# Hub repo ids passed to the from_pretrained() calls below.
+STT_MODEL_ID = "EpistemeAI/Audiogemma-3N-finetune"
+TTS_MODEL_ID = "EpistemeAI/LexiVox"
+# Sample rate (Hz) used when loading input audio, when calling the STT
+# processor, and when writing the synthesized wav.
+TARGET_SR = 16000
+# Use the GPU when torch reports one; bfloat16 weights on GPU, float32 on CPU.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
+# -----------------------------
+# LOAD MODELS (ONCE)
+# -----------------------------
+# These are module-level statements, so both models are loaded when the
+# module is imported/run, not per request.
+print("Loading STT model...")
+stt_processor = AutoProcessor.from_pretrained(STT_MODEL_ID)
+# Placement of the STT model is delegated to device_map="auto" rather than
+# an explicit .to(DEVICE).
+stt_model = AutoModelForImageTextToText.from_pretrained(
+    STT_MODEL_ID,
+    torch_dtype=DTYPE,
+    device_map="auto",
+)
+print("Loading TTS model...")
+tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
+# The TTS model is moved to DEVICE explicitly (no device_map here).
+# NOTE(review): confirm AutoModelForTextToSpeech is actually exported by the
+# installed transformers version; the import at the top fails otherwise.
+tts_model = AutoModelForTextToSpeech.from_pretrained(
+    TTS_MODEL_ID,
+    torch_dtype=DTYPE,
+).to(DEVICE)
+# -----------------------------
+# PIPELINE FUNCTION
+# -----------------------------
+def speech_to_speech(audio_file):
+    """Transcribe an audio file and synthesize the transcription as speech.
+
+    Args:
+        audio_file: Path to the input audio (the Gradio input component is
+            declared with ``type="filepath"``), or ``None`` when no audio
+            was provided.
+
+    Returns:
+        A ``(transcription, wav_path)`` tuple. ``wav_path`` is the path of a
+        temporary ``.wav`` file holding the synthesized speech, or ``None``
+        when there is no input audio or nothing was transcribed.
+    """
     if audio_file is None:
+        return "", None
+    # Load and resample to the rate that is also passed to the STT processor.
+    audio, _ = librosa.load(audio_file, sr=TARGET_SR)
+    # ---------- STT ----------
+    # The STT model is placed by device_map="auto", so follow the model's own
+    # device and dtype instead of the global DEVICE; float32 features fed to
+    # bfloat16 weights can otherwise fail with a dtype mismatch.
+    # NOTE(review): the prompt is plain text with no chat template or audio
+    # placeholder; confirm the processor attaches the audio as intended.
+    stt_inputs = stt_processor(
+        audio=audio,
+        sampling_rate=TARGET_SR,
+        text="Transcribe the audio accurately.",
+        return_tensors="pt",
+    ).to(stt_model.device, dtype=stt_model.dtype)
+    prompt_len = stt_inputs["input_ids"].shape[-1]
     with torch.no_grad():
+        output_ids = stt_model.generate(
+            **stt_inputs,
+            max_new_tokens=512,
         )
+    # generate() returns prompt + continuation; decode only the new tokens so
+    # the instruction text is not echoed into the transcription.
+    transcription = stt_processor.decode(
+        output_ids[0][prompt_len:],
         skip_special_tokens=True,
     )
+    if not transcription.strip():
+        # Nothing to speak: skip TTS rather than synthesizing empty text.
+        return transcription, None
+    # ---------- TTS ----------
+    tts_inputs = tts_tokenizer(
+        transcription,
+        return_tensors="pt",
+    ).to(DEVICE)
+    with torch.no_grad():
+        speech = tts_model.generate(**tts_inputs)
+    # NumPy has no bfloat16 dtype, so upcast before leaving torch (DTYPE is
+    # bfloat16 on CUDA).
+    # NOTE(review): assumes generate() returns a waveform tensor - confirm.
+    audio_out = speech.float().cpu().numpy().squeeze()
+    # Reserve a path and close the handle before writing, so the descriptor
+    # is not leaked and soundfile can reopen the file on every platform.
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+        wav_path = tmp.name
+    # NOTE(review): written at TARGET_SR; confirm this matches the TTS
+    # model's native output rate.
+    sf.write(wav_path, audio_out, TARGET_SR)
+    return transcription, wav_path
+# -----------------------------
+# GRADIO UI
+# -----------------------------
+# Blocks app with one audio input, a run button, and two outputs
+# (transcription text and synthesized audio) wired to speech_to_speech.
+with gr.Blocks(title="Audiogemma → LexiVox Speech Loop") as demo:
+    gr.Markdown(
+        """
+        # 🎙️ Speech → Text → Speech
+        **Audiogemma-3N + LexiVox**
+        Upload audio or use the microphone.
+        The system transcribes speech, then speaks it back using an LLM-based TTS.
+        """
+    )
+    # type="filepath": speech_to_speech passes this value straight to
+    # librosa.load, i.e. it is treated as a path on disk.
+    audio_input = gr.Audio(
+        sources=["microphone", "upload"],
+        type="filepath",
+        label="Input Audio",
+    )
+    run_btn = gr.Button("Run Speech Loop")
+    text_output = gr.Textbox(
+        label="Transcription",
+        lines=4,
+    )
+    # Fed from the second element returned by speech_to_speech (a wav path
+    # or None).
+    audio_output = gr.Audio(
+        label="Synthesized Speech",
+        type="filepath",
+    )
+    # The callback's two return values map, in order, onto the two outputs.
+    run_btn.click(
+        fn=speech_to_speech,
+        inputs=audio_input,
+        outputs=[text_output, audio_output],
     )
 demo.launch()