oddadmix committed on
Commit
672dce6
·
verified ·
1 Parent(s): 58404fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -27
app.py CHANGED
@@ -1,8 +1,7 @@
1
  import spaces
2
  import gradio as gr
3
- from unsloth import FastModel, FastLanguageModel
4
  import torch
5
- from transformers import Gemma3nProcessor
6
  import os
7
 
8
  # Global variables for model and processor
@@ -14,19 +13,18 @@ def load_model():
14
  global model, processor
15
 
16
  print("Loading model...")
17
- model, _ = FastModel.from_pretrained(
18
- model_name = "oddadmix/gemma-4b-egyptian-code-switching-b4-g2",
19
- dtype = None,
20
- max_seq_length = 2048,
21
- load_in_4bit = True, # Enable 4bit for GPU memory efficiency
22
- full_finetuning = False,
23
- )
24
 
25
- processor = Gemma3nProcessor.from_pretrained("google/gemma-3n-E4B-it")
26
 
27
- # Set model to inference mode
28
- FastLanguageModel.for_inference(model)
29
  print("Model loaded successfully!")
 
30
  @spaces.GPU
31
  def transcribe_audio(audio_path, max_tokens=128):
32
  """Transcribe audio file using the loaded model"""
@@ -62,18 +60,20 @@ def transcribe_audio(audio_path, max_tokens=128):
62
  tokenize=True,
63
  return_dict=True,
64
  return_tensors="pt",
65
- ).to("cuda")
 
 
66
 
67
  # Generate transcription
68
- output = model.generate(
69
- **inputs,
70
- max_new_tokens=max_tokens,
71
- do_sample=False
72
- )
 
 
73
 
74
- # Get only the newly generated tokens
75
- generated_tokens = output[0][inputs["input_ids"].shape[-1]:]
76
- response = processor.decode(generated_tokens, skip_special_tokens=True)
77
 
78
  return response
79
 
@@ -84,13 +84,13 @@ def transcribe_audio(audio_path, max_tokens=128):
84
  load_model()
85
 
86
  # Create Gradio interface
87
- with gr.Blocks(title="Egyptian Arabic ASR") as demo:
88
  gr.Markdown(
89
  """
90
- # 🎙️ Egyptian Arabic Speech Recognition
91
 
92
  Upload an audio file or record your voice to get an automatic transcription.
93
- This model is optimized for Egyptian Arabic code-switching.
94
  """
95
  )
96
 
@@ -121,7 +121,7 @@ with gr.Blocks(title="Egyptian Arabic ASR") as demo:
121
  """
122
  ### Tips:
123
  - For best results, use clear audio with minimal background noise
124
- - The model handles Egyptian Arabic and code-switching with English
125
  - Recording length should be reasonable (under 30 seconds recommended)
126
  """
127
  )
@@ -142,5 +142,4 @@ with gr.Blocks(title="Egyptian Arabic ASR") as demo:
142
 
143
  # Launch the app
144
  if __name__ == "__main__":
145
- demo.launch()
146
-
 
1
  import spaces
2
  import gradio as gr
3
+ from transformers import AutoProcessor, Gemma3nForConditionalGeneration
4
  import torch
 
5
  import os
6
 
7
  # Global variables for model and processor
 
13
  global model, processor
14
 
15
  print("Loading model...")
16
+ model_id = "google/gemma-3n-e4b-it"
17
+
18
+ model = Gemma3nForConditionalGeneration.from_pretrained(
19
+ model_id,
20
+ device_map="auto",
21
+ torch_dtype=torch.bfloat16,
22
+ ).eval()
23
 
24
+ processor = AutoProcessor.from_pretrained(model_id)
25
 
 
 
26
  print("Model loaded successfully!")
27
+
28
  @spaces.GPU
29
  def transcribe_audio(audio_path, max_tokens=128):
30
  """Transcribe audio file using the loaded model"""
 
60
  tokenize=True,
61
  return_dict=True,
62
  return_tensors="pt",
63
+ ).to(model.device)
64
+
65
+ input_len = inputs["input_ids"].shape[-1]
66
 
67
  # Generate transcription
68
+ with torch.inference_mode():
69
+ generation = model.generate(
70
+ **inputs,
71
+ max_new_tokens=max_tokens,
72
+ do_sample=False
73
+ )
74
+ generation = generation[0][input_len:]
75
 
76
+ response = processor.decode(generation, skip_special_tokens=True)
 
 
77
 
78
  return response
79
 
 
84
  load_model()
85
 
86
  # Create Gradio interface
87
+ with gr.Blocks(title="Gemma 3n Audio Transcription") as demo:
88
  gr.Markdown(
89
  """
90
+ # 🎙️ Gemma 3n Audio Transcription
91
 
92
  Upload an audio file or record your voice to get an automatic transcription.
93
+ Powered by Google's Gemma 3n-E4B-IT multimodal model.
94
  """
95
  )
96
 
 
121
  """
122
  ### Tips:
123
  - For best results, use clear audio with minimal background noise
124
+ - The model can handle various languages and accents
125
  - Recording length should be reasonable (under 30 seconds recommended)
126
  """
127
  )
 
142
 
143
  # Launch the app
144
  if __name__ == "__main__":
145
+ demo.launch()