Spaces:

bilguun
/

gemma3n-audio-mn

Running on Zero

App Files Files Community

bilguun committed on Aug 7

Commit

63456dc

verified ·

1 Parent(s): b912268

chore: add requirements

Browse files

chore: sample audio files

feat: update app

feat: update app

feat: update app

feat: update app

feat: update app

feat: update app

feat: update app

feat: update app

feat: update app

feat: update app

chore: update config

Files changed (4) hide show

.gitattributes +1 -0
README.md +4 -4
app.py +80 -59
requirements.txt +7 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
-title: Gemma3n Audio Mn
-emoji: 😻
-colorFrom: pink
-colorTo: yellow
 sdk: gradio
 sdk_version: 5.41.1
 app_file: app.py

 ---
+title: Gemma3n Audio MN
+emoji: 🎤
+colorFrom: red
+colorTo: blue
 sdk: gradio
 sdk_version: 5.41.1
 app_file: app.py

app.py CHANGED Viewed

@@ -1,26 +1,26 @@
-import gc
 import gradio as gr
 import spaces
 import torch
 from transformers import AutoModelForImageTextToText, AutoProcessor
 BASE_GEMMA_MODEL_ID = "google/gemma-3n-E2B-it"
 GEMMA_MODEL_ID = "bilguun/gemma-3n-E2B-it-audio-en-mn"
 print("Loading processor and model...")
-processor = AutoProcessor.from_pretrained(BASE_GEMMA_MODEL_ID, device_map="cuda")
-model = AutoModelForImageTextToText.from_pretrained(GEMMA_MODEL_ID, device_map="cuda")
-if hasattr(model, "eval"):
-    model.eval()
 print("Model loaded successfully!")
-@spaces.GPU
-def process_audio(audio_file, prompt_type):
     if audio_file is None:
         return "Please upload an audio file."
@@ -32,55 +32,44 @@ def process_audio(audio_file, prompt_type):
     selected_prompt = prompts[prompt_type]
-    try:
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "audio", "audio": audio_file},
-                    {"type": "text", "text": selected_prompt},
-                ],
-            }
-        ]
-        with torch.no_grad():
-            input_ids = processor.apply_chat_template(
-                messages,
-                add_generation_prompt=True,
-                tokenize=True,
-                return_dict=True,
-                return_tensors="pt",
-            )
-            input_ids = {
-                k: v.to(model.device, dtype=torch.long if "input_ids" in k else v.dtype)
-                for k, v in input_ids.items()
-            }
-            outputs = model.generate(
-                **input_ids,
-                max_new_tokens=128,
-                pad_token_id=processor.tokenizer.eos_token_id,
-            )
-            input_length = input_ids["input_ids"].shape[1]
-            generated_tokens = outputs[:, input_length:]
-            text = processor.batch_decode(
-                generated_tokens,
-                skip_special_tokens=True,
-                clean_up_tokenization_spaces=False,
-            )
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        del input_ids, outputs, generated_tokens
-        gc.collect()
-        return text[0]
-    except Exception as e:
-        return f"Error processing audio: {str(e)}"
 with gr.Blocks(title="Gemma 3n Audio Transcription & Translation") as demo:
@@ -113,22 +102,54 @@ with gr.Blocks(title="Gemma 3n Audio Transcription & Translation") as demo:
                 max_lines=20,
                 placeholder="Transcribed text will appear here...",
                 show_copy_button=True,
             )
     process_btn.click(
         fn=process_audio,
-        inputs=[audio_input, prompt_dropdown],
         outputs=output_text,
     )
     gr.Examples(
         examples=[
-            ["./audio_samples/en1.wav", "Transcribe"],
-            ["./audio_samples/en3.wav", "Transcribe EN to MN"],
-            ["./audio_samples/mn2.wav", "Transcribe"],
-            ["./audio_samples/mn2.wav", "Transcribe MN to EN"],
         ],
-        inputs=[audio_input, prompt_dropdown],
         outputs=output_text,
         fn=process_audio,
         cache_examples=True,

+from threading import Thread
 import gradio as gr
 import spaces
 import torch
 from transformers import AutoModelForImageTextToText, AutoProcessor
+from transformers.generation.streamers import TextIteratorStreamer
 BASE_GEMMA_MODEL_ID = "google/gemma-3n-E2B-it"
 GEMMA_MODEL_ID = "bilguun/gemma-3n-E2B-it-audio-en-mn"
 print("Loading processor and model...")
+processor = AutoProcessor.from_pretrained(BASE_GEMMA_MODEL_ID)
+model = AutoModelForImageTextToText.from_pretrained(
+    GEMMA_MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto"
+)
 print("Model loaded successfully!")
+@spaces.GPU(duration=60)
+@torch.inference_mode()
+def process_audio(audio_file, prompt_type, max_tokens):
     if audio_file is None:
         return "Please upload an audio file."
     selected_prompt = prompts[prompt_type]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "audio", "audio": audio_file},
+                {"type": "text", "text": selected_prompt},
+            ],
+        }
+    ]
+    input_ids = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    )
+    input_ids = input_ids.to(model.device, dtype=model.dtype)
+    streamer = TextIteratorStreamer(
+        processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True
+    )
+    generate_kwargs = dict(
+        input_ids,
+        streamer=streamer,
+        max_new_tokens=max_tokens,
+        disable_compile=True,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    output = ""
+    for delta in streamer:
+        output += delta
+        yield output
 with gr.Blocks(title="Gemma 3n Audio Transcription & Translation") as demo:
                 max_lines=20,
                 placeholder="Transcribed text will appear here...",
                 show_copy_button=True,
+                interactive=False,
+            )
+    with gr.Row():
+        with gr.Accordion("Additional Settings", open=False):
+            max_tokens_slider = gr.Slider(
+                minimum=16,
+                maximum=512,
+                value=128,
+                step=16,
+                label="Max New Tokens",
+                info="Maximum number of tokens to generate",
             )
     process_btn.click(
         fn=process_audio,
+        inputs=[audio_input, prompt_dropdown, max_tokens_slider],
         outputs=output_text,
     )
     gr.Examples(
         examples=[
+            [
+                "https://github.com/bilguun0203/gemma3n-audio-mn/raw/refs/heads/main/audio_samples/en1.wav",
+                "Transcribe",
+                128,
+            ],
+            [
+                "https://github.com/bilguun0203/gemma3n-audio-mn/raw/refs/heads/main/audio_samples/en3.wav",
+                "Transcribe EN to MN",
+                128,
+            ],
+            [
+                "https://github.com/bilguun0203/gemma3n-audio-mn/raw/refs/heads/main/audio_samples/mn2.wav",
+                "Transcribe",
+                128,
+            ],
+            [
+                "https://github.com/bilguun0203/gemma3n-audio-mn/raw/refs/heads/main/audio_samples/mn2.wav",
+                "Transcribe MN to EN",
+                128,
+            ],
+        ],
+        inputs=[
+            audio_input,
+            prompt_dropdown,
+            max_tokens_slider,
         ],
         outputs=output_text,
         fn=process_audio,
         cache_examples=True,

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+datasets[audio]
+peft
+tensorboard
+timm
+torch<2.7
+transformers<4.55
+trl