Update app.py
app.py (changed)
Summary of changes:

- Dropped the comment above os.environ["TORCHDYNAMO_DISABLE"] = "1" ("disable TorchDynamo since Unsloth models can have issues with TorchDynamo"); the setting itself is kept.
- Swapped the checkpoint in FastModel.from_pretrained from EpistemeAI/Audiogemma-3N-finetune to unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit, keeping max_seq_length=2048, dtype=None, load_in_4bit=True, and full_finetuning=False, and adding disable_altup=True (flagged in the code as the fix for a uint8 clamp crash) and device_map="auto". The processor is loaded explicitly via AutoProcessor.from_pretrained for the same checkpoint.
- Cleaned up transcribe_and_translate: the input is unpacked as sample_rate, audio (previously audio_array), converted to mono, cast to float32, and resampled to 16 kHz with librosa.resample; the system prompt is shortened (previously "You are a model that accurately transcribes spoken audio and translates it to German."); generation uses temperature=0.7, top_p=0.95, top_k=50 with a TextIteratorStreamer; and the streaming loop now accumulates tokens and yields the running output (see the sketch below).
- Rewired the UI: the previous event wiring (fn=transcribe_and_translate, inputs=audio_input, outputs=text_output) is replaced by an explicit gr.Button("Transcribe and Translate", variant="primary") and btn.click(...), and the audio input and output textbox get labels and a fixed size.
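One detail worth noting before the full file: transcribe_and_translate is a generator, and Gradio streams generator output by replacing the component value with each yielded result. That is why the loop yields the accumulated output string rather than individual tokens. A minimal, self-contained sketch of the same pattern, independent of the model (the names count_up, box, and sketch and the 0.2 s delay are illustrative only, not part of the commit):

import time
import gradio as gr

def count_up():
    # Each yield replaces the Textbox contents, so we yield the running
    # string, exactly as transcribe_and_translate yields `output`.
    text = ""
    for i in range(5):
        text += f"{i} "
        time.sleep(0.2)
        yield text

with gr.Blocks() as sketch:
    box = gr.Textbox()
    gr.Button("Run").click(count_up, None, box)

# sketch.launch() would stream "0 ", "0 1 ", ... into the box.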
Updated app.py:

import os
os.environ["TORCHDYNAMO_DISABLE"] = "1"

import gradio as gr

# The import block (lines 5-11) is elided in the diff view; reconstructed
# here from usage in the rest of the file.
import numpy as np
import librosa
import torch
from threading import Thread

from unsloth import FastModel
from transformers import AutoProcessor, TextIteratorStreamer

TARGET_SAMPLING_RATE = 16000
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading Gemma-3N audio model...")

# IMPORTANT: disable alt-up (fixes uint8 clamp crash)
model, tokenizer = FastModel.from_pretrained(
    model_name="unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
    full_finetuning=False,
    disable_altup=True,  # ← critical fix
    device_map="auto",
)

processor = AutoProcessor.from_pretrained(
    "unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit"
)

model.eval()
print("Model loaded on", device)

# ---------------- AUDIO PIPELINE ---------------- #

def transcribe_and_translate(audio_input):
    if audio_input is None:
        yield "Please upload or record audio."
        return

    sample_rate, audio = audio_input

    # convert to mono
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    audio = audio.astype(np.float32)

    # resample to 16k
    if sample_rate != TARGET_SAMPLING_RATE:
        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=TARGET_SAMPLING_RATE)

    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You transcribe spoken audio and translate it into German."}
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio},
                {"type": "text", "text": "Please transcribe this audio and translate it to German."}
            ],
        },
    ]

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(device)

    streamer = TextIteratorStreamer(processor, skip_prompt=True)

    generation_kwargs = dict(
        **inputs,
        max_new_tokens=1024,
        temperature=0.7,
        top_p=0.95,
        top_k=50,
        streamer=streamer,
    )

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    output = ""
    for token in streamer:
        output += token
        yield output

# ---------------- GRADIO UI ---------------- #

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Gemma-3N Audio Transcription + German Translation")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["upload", "microphone"],
            type="numpy",
            label="Audio Input"
        )
        text_output = gr.Textbox(
            label="Transcription + Translation",
            lines=12
        )

    btn = gr.Button("Transcribe and Translate", variant="primary")
    btn.click(transcribe_and_translate, audio_input, text_output)

if __name__ == "__main__":
    demo.launch()
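For a quick sanity check outside the UI, the generator can be driven directly with a synthetic (sample_rate, numpy array) tuple, the same shape of input that gr.Audio(type="numpy") provides. A minimal sketch, assuming the file above has been loaded (the 440 Hz tone, 48 kHz rate, and one-second duration are arbitrary test values):

import numpy as np

# One second of a 440 Hz tone at 48 kHz; the array is 1-D, so the mono
# branch is skipped and the 48 kHz -> 16 kHz resample path is exercised.
sr = 48000
t = np.linspace(0.0, 1.0, sr, endpoint=False)
tone = (0.1 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)

for partial in transcribe_and_translate((sr, tone)):
    print(partial)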