oddadmix committed on
Commit
7d2baf1
·
verified ·
1 Parent(s): 6a53468

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -0
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from unsloth import FastModel, FastLanguageModel
3
+ import torch
4
+ from transformers import Gemma3nProcessor
5
+ import os
6
+
7
# Shared inference state; both names hold None until load_model() assigns them.
model = processor = None
10
+
11
def load_model():
    """Fill the module-level ``model`` and ``processor``; meant to run once at startup."""
    global model, processor

    print("Loading model...")

    pretrained_options = dict(
        model_name="oddadmix/gemma-4b-egyptian-code-switching-b4-g2",
        dtype=None,
        max_seq_length=2048,
        load_in_4bit=True,  # 4-bit loading for GPU memory efficiency
        full_finetuning=False,
    )
    # Only the first element of the returned pair is kept; the processor is
    # loaded separately below.
    model, _discarded = FastModel.from_pretrained(**pretrained_options)

    processor = Gemma3nProcessor.from_pretrained("google/gemma-3n-E4B-it")

    # Put the model into inference mode before serving requests.
    FastLanguageModel.for_inference(model)
    print("Model loaded successfully!")
29
+
30
def transcribe_audio(audio_path, max_tokens=128):
    """Transcribe an audio file using the globally loaded model.

    Args:
        audio_path: Path of the recording to transcribe. ``None`` or an
            empty string means no audio was supplied.
        max_tokens: Upper bound on newly generated tokens. Coerced to
            ``int`` because a UI slider may deliver the value as a float.

    Returns:
        The decoded transcription, or a human-readable message when the
        model is not loaded, no audio was supplied, or inference failed.
    """
    import logging  # function-scope import keeps this block self-contained

    if model is None or processor is None:
        return "Error: Model not loaded"

    if not audio_path:
        return "Please upload or record an audio file"

    try:
        messages = [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "You are an assistant that transcribes speech accurately.",
                    }
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "audio", "url": audio_path},
                    {"type": "text", "text": "Please transcribe this audio."},
                ],
            },
        ]

        # Follow the device the model actually lives on rather than assuming
        # "cuda"; fall back to the previous hard-coded value if the model
        # object does not expose a ``device`` attribute.
        target_device = getattr(model, "device", "cuda")

        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(target_device)

        # Greedy decoding; the token budget must be an integer.
        output = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            do_sample=False,
        )

        # Drop the prompt tokens so only the newly generated part is decoded.
        prompt_length = inputs["input_ids"].shape[-1]
        generated_tokens = output[0][prompt_length:]
        return processor.decode(generated_tokens, skip_special_tokens=True)

    except Exception as e:
        # Best-effort boundary for the UI: keep returning a message, but
        # record the traceback so failures are diagnosable from the logs.
        logging.getLogger(__name__).exception(
            "Transcription failed for %r", audio_path
        )
        return f"Error during transcription: {str(e)}"
81
+
82
# Load the model before the interface is built.
load_model()

# Build the Gradio interface: inputs on the left, transcription on the right.
with gr.Blocks(title="Egyptian Arabic ASR") as demo:
    gr.Markdown(
        """
        # 🎙️ Egyptian Arabic Speech Recognition

        Upload an audio file or record your voice to get an automatic transcription.
        This model is optimized for Egyptian Arabic code-switching.
        """
    )

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio Input",
            )
            max_tokens_slider = gr.Slider(
                minimum=32,
                maximum=512,
                value=128,
                step=32,
                label="Max Output Tokens",
            )
            transcribe_btn = gr.Button("Transcribe", variant="primary")

        with gr.Column():
            output_text = gr.Textbox(
                label="Transcription",
                placeholder="Your transcription will appear here...",
                lines=10,
            )

    gr.Markdown(
        """
        ### Tips:
        - For best results, use clear audio with minimal background noise
        - The model handles Egyptian Arabic and code-switching with English
        - Recording length should be reasonable (under 30 seconds recommended)
        """
    )

    # The button click and any change of the audio value (upload/record)
    # both run the same handler with the same inputs and output.
    for register_event in (transcribe_btn.click, audio_input.change):
        register_event(
            fn=transcribe_audio,
            inputs=[audio_input, max_tokens_slider],
            outputs=output_text,
        )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()