Spaces:

oddadmix
/

egyptian-code-swtiching

Running on Zero

App Files Community

oddadmix committed on Nov 5, 2025

Commit

672dce6

verified ·

1 Parent(s): 58404fd

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -27

app.py CHANGED Viewed

@@ -1,8 +1,7 @@
 import spaces
 import gradio as gr
-from unsloth import FastModel, FastLanguageModel
 import torch
-from transformers import Gemma3nProcessor
 import os
 # Global variables for model and processor
@@ -14,19 +13,18 @@ def load_model():
     global model, processor
     print("Loading model...")
-    model, _ = FastModel.from_pretrained(
-        model_name = "oddadmix/gemma-4b-egyptian-code-switching-b4-g2",
-        dtype = None,
-        max_seq_length = 2048,
-        load_in_4bit = True,  # Enable 4bit for GPU memory efficiency
-        full_finetuning = False,
-    )
-    processor = Gemma3nProcessor.from_pretrained("google/gemma-3n-E4B-it")
-    # Set model to inference mode
-    FastLanguageModel.for_inference(model)
     print("Model loaded successfully!")
 @spaces.GPU
 def transcribe_audio(audio_path, max_tokens=128):
     """Transcribe audio file using the loaded model"""
@@ -62,18 +60,20 @@ def transcribe_audio(audio_path, max_tokens=128):
             tokenize=True,
             return_dict=True,
             return_tensors="pt",
-        ).to("cuda")
         # Generate transcription
-        output = model.generate(
-            **inputs,
-            max_new_tokens=max_tokens,
-            do_sample=False
-        )
-        # Get only the newly generated tokens
-        generated_tokens = output[0][inputs["input_ids"].shape[-1]:]
-        response = processor.decode(generated_tokens, skip_special_tokens=True)
         return response
@@ -84,13 +84,13 @@ def transcribe_audio(audio_path, max_tokens=128):
 load_model()
 # Create Gradio interface
-with gr.Blocks(title="Egyptian Arabic ASR") as demo:
     gr.Markdown(
         """
-        # 🎙️ Egyptian Arabic Speech Recognition
         Upload an audio file or record your voice to get an automatic transcription.
-        This model is optimized for Egyptian Arabic code-switching.
         """
     )
@@ -121,7 +121,7 @@ with gr.Blocks(title="Egyptian Arabic ASR") as demo:
         """
         ### Tips:
         - For best results, use clear audio with minimal background noise
-        - The model handles Egyptian Arabic and code-switching with English
         - Recording length should be reasonable (under 30 seconds recommended)
         """
     )
@@ -142,5 +142,4 @@ with gr.Blocks(title="Egyptian Arabic ASR") as demo:
 # Launch the app
 if __name__ == "__main__":
-    demo.launch()

 import spaces
 import gradio as gr
+from transformers import AutoProcessor, Gemma3nForConditionalGeneration
 import torch
 import os
 # Global variables for model and processor
     global model, processor
     print("Loading model...")
+    model_id = "google/gemma-3n-e4b-it"
+    model = Gemma3nForConditionalGeneration.from_pretrained(
+        model_id,
+        device_map="auto",
+        torch_dtype=torch.bfloat16,
+    ).eval()
+    processor = AutoProcessor.from_pretrained(model_id)
     print("Model loaded successfully!")
 @spaces.GPU
 def transcribe_audio(audio_path, max_tokens=128):
     """Transcribe audio file using the loaded model"""
             tokenize=True,
             return_dict=True,
             return_tensors="pt",
+        ).to(model.device)
+        input_len = inputs["input_ids"].shape[-1]
         # Generate transcription
+        with torch.inference_mode():
+            generation = model.generate(
+                **inputs,
+                max_new_tokens=max_tokens,
+                do_sample=False
+            )
+            generation = generation[0][input_len:]
+        response = processor.decode(generation, skip_special_tokens=True)
         return response
 load_model()
 # Create Gradio interface
+with gr.Blocks(title="Gemma 3n Audio Transcription") as demo:
     gr.Markdown(
         """
+        # 🎙️ Gemma 3n Audio Transcription
         Upload an audio file or record your voice to get an automatic transcription.
+        Powered by Google's Gemma 3n-E4B-IT multimodal model.
         """
     )
         """
         ### Tips:
         - For best results, use clear audio with minimal background noise
+        - The model can handle various languages and accents
         - Recording length should be reasonable (under 30 seconds recommended)
         """
     )
 # Launch the app
 if __name__ == "__main__":
+    demo.launch()