VoiceChat

Paused

App Files Files Community

legolasyiu committed about 1 month ago

Commit

cf972e4

verified ·

1 Parent(s): 505b4c9

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -20

app.py CHANGED Viewed

@@ -8,34 +8,63 @@ from transformers import AutoProcessor, TextIteratorStreamer
 from threading import Thread
 TARGET_SAMPLING_RATE = 16000
-print("Loading model and processor...")
 processor = AutoProcessor.from_pretrained('EpistemeAI/Audiogemma-3N-finetune')
 model, _ = FastModel.from_pretrained(
     model_name='EpistemeAI/Audiogemma-3N-finetune',
     max_seq_length=512,
     load_in_4bit=True,
     dtype=torch.bfloat16,
 )
-print("Model and processor loaded successfully.")
 def transcribe_and_translate(audio_input):
     """
-    This function takes audio data from the Gradio component, processes it,
-    and then streams the model's transcription and translation back to the UI.
     """
     if audio_input is None:
         yield "Error: Please upload or record a German audio file first."
         return
-    sample_rate, audio_array = audio_input
     if audio_array.ndim > 1:
         audio_array = audio_array.mean(axis=1)
     audio_array = audio_array.astype(np.float32)
     if sample_rate != TARGET_SAMPLING_RATE:
         audio_array = librosa.resample(
             y=audio_array,
             orig_sr=sample_rate,
             target_sr=TARGET_SAMPLING_RATE
         )
     messages = [
         {
             'role': 'system',
@@ -50,34 +79,77 @@ def transcribe_and_translate(audio_input):
             'role': 'user',
             'content': [
                 {'type': 'audio', 'audio': audio_array},
-                {'type': 'text', 'text': 'Please transcribe this audio and translate it to English. Give both, the transcription and the translation.'}
             ]
         }
     ]
     inputs = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
         return_tensors='pt'
-    ).to('cuda', dtype=torch.bfloat16)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = dict(
-        **inputs,
-        streamer=streamer,
-        max_new_tokens=1024,
-        do_sample=False,
     )
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     output_text = ""
-    for new_text in streamer:
-        output_text += new_text
-        yield output_text
-# Grab all wav files in the directory
 example_audios = glob.glob('test_wav_files/*.wav')
-example_list = [ for audio in example_audios]
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
@@ -90,11 +162,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         audio_input = gr.Audio(sources=["upload", "microphone"], type="numpy", label="German Audio")
         text_output = gr.Textbox(label="Transcription and Translation", lines=10, interactive=False)
     submit_btn = gr.Button("Transcribe and Translate", variant="primary")
     submit_btn.click(
         fn=transcribe_and_translate,
         inputs=audio_input,
         outputs=text_output
     )
     gr.Examples(
         examples=example_list,
         inputs=audio_input,
@@ -102,5 +177,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         fn=transcribe_and_translate,
         cache_examples=False
     )
 if __name__ == "__main__":
-    demo.launch(share=True)

 from threading import Thread
 TARGET_SAMPLING_RATE = 16000
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print("Loading processor and model...")
 processor = AutoProcessor.from_pretrained('EpistemeAI/Audiogemma-3N-finetune')
+# FastModel.from_pretrained returns a 2-tuple here; only the first element (the model) is kept, as in the previous version.
+# Note: load_in_4bit and dtype handling depend on your environment and FastModel implementation.
 model, _ = FastModel.from_pretrained(
     model_name='EpistemeAI/Audiogemma-3N-finetune',
     max_seq_length=512,
     load_in_4bit=True,
     dtype=torch.bfloat16,
 )
+# Move model to device if needed (FastModel might already handle device_map)
+try:
+    model.to(device)
+except Exception:
+    # some FastModel wrappers manage device automatically; ignore if .to is unsupported
+    pass
+print("Model and processor loaded successfully. Device:", device)
 def transcribe_and_translate(audio_input):
     """
+    Generator function for Gradio streaming. Yields progressive output text.
+    audio_input from gr.Audio(type="numpy") is (sample_rate, np_array)
     """
     if audio_input is None:
         yield "Error: Please upload or record a German audio file first."
         return
+    # Unpack Gradio audio tuple
+    try:
+        sample_rate, audio_array = audio_input
+    except Exception:
+        # Unpacking failed: treat the input as a bare numpy array and assume it is already at TARGET_SAMPLING_RATE
+        audio_array = audio_input
+        sample_rate = TARGET_SAMPLING_RATE
+    # Mono conversion
+    if audio_array is None:
+        yield "Error: audio data is empty."
+        return
     if audio_array.ndim > 1:
         audio_array = audio_array.mean(axis=1)
     audio_array = audio_array.astype(np.float32)
+    # Resample if needed
     if sample_rate != TARGET_SAMPLING_RATE:
         audio_array = librosa.resample(
             y=audio_array,
             orig_sr=sample_rate,
             target_sr=TARGET_SAMPLING_RATE
         )
     messages = [
         {
             'role': 'system',
             'role': 'user',
             'content': [
                 {'type': 'audio', 'audio': audio_array},
+                {'type': 'text', 'text': 'Please transcribe this audio and translate it to English. Give both the transcription and the translation.'}
             ]
         }
     ]
+    # Build model inputs. apply_chat_template returns tensors when return_tensors='pt'.
     inputs = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
         return_tensors='pt'
     )
+    # Move any tensors to the device (do NOT force dtype changes on integer tensors)
+    def _move_to_device(obj):
+        if isinstance(obj, torch.Tensor):
+            return obj.to(device)
+        if isinstance(obj, dict):
+            return {k: _move_to_device(v) for k, v in obj.items()}
+        if isinstance(obj, (list, tuple)):
+            return type(obj)(_move_to_device(x) for x in obj)
+        return obj
+    inputs = _move_to_device(inputs)
+    # Prepare the tokenizer-based streamer (TextIteratorStreamer expects a tokenizer)
+    tokenizer = getattr(processor, "tokenizer", None)
+    if tokenizer is None:
+        # fallback: try attribute name used by some processors
+        tokenizer = getattr(processor, "tokenizer_fast", None)
+    if tokenizer is None:
+        yield "Error: tokenizer not found on processor (needed for streaming)."
+        return
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    # Prepare generation args - only include tensor keys model.generate expects (e.g., input_ids, attention_mask)
+    gen_inputs = {}
+    for k, v in inputs.items():
+        # typical keys: input_ids, attention_mask, etc. pass tensors only.
+        if isinstance(v, torch.Tensor):
+            gen_inputs[k] = v
+    gen_inputs.update({
+        "streamer": streamer,
+        "max_new_tokens": 1024,
+        "do_sample": False,
+    })
+    # Run generation in background thread so we can stream results
+    thread = Thread(target=model.generate, kwargs=gen_inputs, daemon=True)
     thread.start()
+    # Collect and yield streaming text
     output_text = ""
+    try:
+        for new_text in streamer:
+            output_text += new_text
+            yield output_text
+    except GeneratorExit:
+        # Gradio closed the generator early
+        return
+    finally:
+        # Wait up to 1 second for the generation thread; this does not guarantee it has finished (it is a daemon thread)
+        thread.join(timeout=1)
+# Grab all wav files in the directory and format examples as lists for one input component
 example_audios = glob.glob('test_wav_files/*.wav')
+example_list = [[audio] for audio in example_audios]  # gr.Examples expects each example to match inputs
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
         audio_input = gr.Audio(sources=["upload", "microphone"], type="numpy", label="German Audio")
         text_output = gr.Textbox(label="Transcription and Translation", lines=10, interactive=False)
     submit_btn = gr.Button("Transcribe and Translate", variant="primary")
+    # NOTE: For Gradio streaming to the Textbox, Gradio supports generator-returning functions mapped to an output component.
     submit_btn.click(
         fn=transcribe_and_translate,
         inputs=audio_input,
         outputs=text_output
     )
     gr.Examples(
         examples=example_list,
         inputs=audio_input,
         fn=transcribe_and_translate,
         cache_examples=False
     )
 if __name__ == "__main__":
+    demo.launch(share=True)