legolasyiu committed on
Commit
9be43ff
·
verified ·
1 Parent(s): 92e1e76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -44
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import os
2
- # disable TorchDynamo since UnsloTh models can have issues with TorchDynamo
3
  os.environ["TORCHDYNAMO_DISABLE"] = "1"
4
 
5
  import gradio as gr
@@ -13,68 +12,67 @@ from transformers import AutoProcessor, TextIteratorStreamer
13
  TARGET_SAMPLING_RATE = 16000
14
  device = "cuda" if torch.cuda.is_available() else "cpu"
15
 
16
- print("Loading model + processor...")
17
 
18
- # load the processor & model from the right repo
19
- processor = AutoProcessor.from_pretrained("EpistemeAI/Audiogemma-3N-finetune")
20
- model, _ = FastModel.from_pretrained(
21
- model_name="EpistemeAI/Audiogemma-3N-finetune",
22
- dtype=None,
23
  max_seq_length=2048,
 
24
  load_in_4bit=True,
25
  full_finetuning=False,
26
- device_map="auto"
 
 
 
 
 
27
  )
28
 
29
  model.eval()
30
- print("Loaded Gemma-3N on", device)
 
 
31
 
32
  def transcribe_and_translate(audio_input):
33
  if audio_input is None:
34
- yield "Upload or record audio first."
35
  return
36
 
37
- sample_rate, audio_array = audio_input
38
 
39
- # mono
40
- if audio_array.ndim > 1:
41
- audio_array = audio_array.mean(axis=1)
42
 
43
- audio_array = audio_array.astype(np.float32)
44
 
45
  # resample to 16k
46
  if sample_rate != TARGET_SAMPLING_RATE:
47
- audio_array = librosa.resample(
48
- audio_array, orig_sr=sample_rate, target_sr=TARGET_SAMPLING_RATE
49
- )
50
 
51
- # prepare prompt
52
  messages = [
53
  {
54
  "role": "system",
55
  "content": [
56
- {
57
- "type": "text",
58
- "text": "You are a model that accurately transcribes spoken audio and translates it to German."
59
- }
60
  ],
61
  },
62
  {
63
  "role": "user",
64
  "content": [
65
- {"type": "audio", "audio": audio_array},
66
- {"type": "text", "text": "Transcribe the spoken audio and translate to German."}
67
  ],
68
  },
69
  ]
70
 
71
- # tokenize & prep inputs
72
  inputs = processor.apply_chat_template(
73
  messages,
74
- add_generation_prompt=True,
75
  tokenize=True,
 
 
76
  return_dict=True,
77
- return_tensors="pt"
78
  ).to(device)
79
 
80
  streamer = TextIteratorStreamer(processor, skip_prompt=True)
@@ -82,40 +80,39 @@ def transcribe_and_translate(audio_input):
82
  generation_kwargs = dict(
83
  **inputs,
84
  max_new_tokens=1024,
85
- temperature=1.0,
86
  top_p=0.95,
87
  top_k=50,
88
- streamer=streamer
89
  )
90
 
91
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
92
  thread.start()
93
 
94
- output_text = ""
95
- for chunk in streamer:
96
- output_text += chunk
97
- yield output_text
 
 
 
98
 
99
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
100
  gr.Markdown("# Gemma-3N Audio Transcription + German Translation")
101
 
102
  with gr.Row():
103
  audio_input = gr.Audio(
104
- sources=["upload","microphone"],
105
  type="numpy",
106
- label="Your Audio"
107
  )
108
  text_output = gr.Textbox(
109
- label="Transcript & Translation",
110
- lines=10
111
  )
112
 
113
- submit_btn = gr.Button("Transcribe + Translate")
114
- submit_btn.click(
115
- fn=transcribe_and_translate,
116
- inputs=audio_input,
117
- outputs=text_output
118
- )
119
 
120
  if __name__ == "__main__":
121
  demo.launch()
 
1
  import os
 
2
  os.environ["TORCHDYNAMO_DISABLE"] = "1"
3
 
4
  import gradio as gr
 
12
  TARGET_SAMPLING_RATE = 16000
13
  device = "cuda" if torch.cuda.is_available() else "cpu"
14
 
15
+ print("Loading Gemma-3N audio model...")
16
 
17
+ # IMPORTANT: disable alt-up (fixes uint8 clamp crash)
18
+ model, tokenizer = FastModel.from_pretrained(
19
+ model_name="unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit",
 
 
20
  max_seq_length=2048,
21
+ dtype=None,
22
  load_in_4bit=True,
23
  full_finetuning=False,
24
+ disable_altup=True, # ← critical fix
25
+ device_map="auto",
26
+ )
27
+
28
+ processor = AutoProcessor.from_pretrained(
29
+ "unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit"
30
  )
31
 
32
  model.eval()
33
+ print("Model loaded on", device)
34
+
35
+ # ---------------- AUDIO PIPELINE ---------------- #
36
 
37
def transcribe_and_translate(audio_input):
    """Stream a transcription of *audio_input* plus its German translation.

    Args:
        audio_input: Gradio ``type="numpy"`` audio value — a
            ``(sample_rate, np.ndarray)`` tuple, or ``None`` when the user
            submitted without recording/uploading anything.

    Yields:
        str: the accumulated generated text so far; Gradio re-renders the
        output textbox on every yield, producing a live-typing effect.
    """
    if audio_input is None:
        yield "Please upload or record audio."
        return

    sample_rate, audio = audio_input

    # convert to mono — Gradio delivers multi-channel audio as (samples, channels)
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    audio = audio.astype(np.float32)

    # resample to 16k
    if sample_rate != TARGET_SAMPLING_RATE:
        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=TARGET_SAMPLING_RATE)

    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You transcribe spoken audio and translate it into German."}
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio},
                {"type": "text", "text": "Please transcribe this audio and translate it to German."}
            ],
        },
    ]

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(device)

    # FIX: skip_special_tokens=True so chat-template markers (e.g. end-of-turn
    # tokens) never leak into the user-visible transcript.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        **inputs,
        max_new_tokens=1024,
        temperature=0.7,
        top_p=0.95,
        top_k=50,
        streamer=streamer,
    )

    # generate() blocks, so run it on a worker thread and consume the streamer here.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    try:
        output = ""
        for token in streamer:
            output += token
            yield output
    finally:
        # FIX: the original never joined the worker thread, leaving it dangling
        # (and hiding generate() failures) if the client disconnected mid-stream.
        thread.join()
96
+
97
+
98
# ---------------- GRADIO UI ---------------- #

# Build the Blocks layout: audio in on the left, streamed text out on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Gemma-3N Audio Transcription + German Translation")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["upload", "microphone"],
            type="numpy",
            label="Audio Input",
        )
        text_output = gr.Textbox(
            label="Transcription + Translation",
            lines=12,
        )

    btn = gr.Button("Transcribe and Translate", variant="primary")
    # transcribe_and_translate is a generator, so the textbox updates live.
    btn.click(
        fn=transcribe_and_translate,
        inputs=audio_input,
        outputs=text_output,
    )


if __name__ == "__main__":
    demo.launch()