legolasyiu committed on
Commit
92e1e76
·
verified ·
1 Parent(s): 7725773

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -37
app.py CHANGED
@@ -1,73 +1,80 @@
1
  import os
 
2
  os.environ["TORCHDYNAMO_DISABLE"] = "1"
3
 
4
  import gradio as gr
5
  import torch
6
  import librosa
7
  import numpy as np
8
- import glob
9
  from unsloth import FastModel
10
  from transformers import AutoProcessor, TextIteratorStreamer
11
- from threading import Thread
12
 
13
  TARGET_SAMPLING_RATE = 16000
14
  device = "cuda" if torch.cuda.is_available() else "cpu"
15
 
16
- print("Loading processor and model...")
17
 
 
18
  processor = AutoProcessor.from_pretrained("EpistemeAI/Audiogemma-3N-finetune")
19
-
20
  model, _ = FastModel.from_pretrained(
21
  model_name="EpistemeAI/Audiogemma-3N-finetune",
22
  dtype=None,
23
- max_seq_length=1024,
24
  load_in_4bit=True,
25
  full_finetuning=False,
 
26
  )
27
 
28
  model.eval()
29
-
30
- print("Model loaded on", device)
31
 
32
  def transcribe_and_translate(audio_input):
33
  if audio_input is None:
34
- yield "Please upload or record an audio file."
35
  return
36
 
37
  sample_rate, audio_array = audio_input
38
 
 
39
  if audio_array.ndim > 1:
40
  audio_array = audio_array.mean(axis=1)
41
 
42
  audio_array = audio_array.astype(np.float32)
43
 
 
44
  if sample_rate != TARGET_SAMPLING_RATE:
45
  audio_array = librosa.resample(
46
  audio_array, orig_sr=sample_rate, target_sr=TARGET_SAMPLING_RATE
47
  )
48
 
 
49
  messages = [
50
  {
51
  "role": "system",
52
  "content": [
53
- {"type": "text", "text": "You are an assistant that transcribes and translates speech accurately."}
 
 
 
54
  ],
55
  },
56
  {
57
  "role": "user",
58
  "content": [
59
  {"type": "audio", "audio": audio_array},
60
- {"type": "text", "text": "Please transcribe this audio and translate it to German."}
61
  ],
62
  },
63
  ]
64
 
 
65
  inputs = processor.apply_chat_template(
66
  messages,
67
  add_generation_prompt=True,
68
  tokenize=True,
69
  return_dict=True,
70
- return_tensors="pt",
71
  ).to(device)
72
 
73
  streamer = TextIteratorStreamer(processor, skip_prompt=True)
@@ -75,46 +82,39 @@ def transcribe_and_translate(audio_input):
75
  generation_kwargs = dict(
76
  **inputs,
77
  max_new_tokens=1024,
78
- temperature=0.8,
79
  top_p=0.95,
80
- top_k=64,
81
- streamer=streamer,
82
  )
83
 
84
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
85
  thread.start()
86
 
87
  output_text = ""
88
- for new_text in streamer:
89
- output_text += new_text
90
  yield output_text
91
 
92
-
93
- example_audios = glob.glob("test_wav_files/*.wav")
94
- example_list = [[audio] for audio in example_audios]
95
-
96
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
97
- gr.Markdown(
98
- """
99
- # Audio Transcription & Translation (Gemma-3N)
100
- Upload or record audio and receive transcription and German translation.
101
- Powered by Audiogemma-3N + Unsloth.
102
- """
103
- )
104
 
105
  with gr.Row():
106
- audio_input = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Audio Input")
107
- text_output = gr.Textbox(label="Transcription & Translation", lines=12)
108
-
109
- submit_btn = gr.Button("Transcribe and Translate", variant="primary")
110
- submit_btn.click(fn=transcribe_and_translate, inputs=audio_input, outputs=text_output)
 
 
 
 
111
 
112
- gr.Examples(
113
- examples=example_list,
114
- inputs=audio_input,
115
- outputs=text_output,
116
  fn=transcribe_and_translate,
117
- cache_examples=False,
 
118
  )
119
 
120
  if __name__ == "__main__":
 
1
  import os
2
+ # disable TorchDynamo since UnsloTh models can have issues with TorchDynamo
3
  os.environ["TORCHDYNAMO_DISABLE"] = "1"
4
 
5
  import gradio as gr
6
  import torch
7
  import librosa
8
  import numpy as np
9
+ from threading import Thread
10
  from unsloth import FastModel
11
  from transformers import AutoProcessor, TextIteratorStreamer
 
12
 
13
  TARGET_SAMPLING_RATE = 16000
14
  device = "cuda" if torch.cuda.is_available() else "cpu"
15
 
16
+ print("Loading model + processor...")
17
 
18
+ # load the processor & model from the right repo
19
  processor = AutoProcessor.from_pretrained("EpistemeAI/Audiogemma-3N-finetune")
 
20
  model, _ = FastModel.from_pretrained(
21
  model_name="EpistemeAI/Audiogemma-3N-finetune",
22
  dtype=None,
23
+ max_seq_length=2048,
24
  load_in_4bit=True,
25
  full_finetuning=False,
26
+ device_map="auto"
27
  )
28
 
29
  model.eval()
30
+ print("Loaded Gemma-3N on", device)
 
31
 
32
  def transcribe_and_translate(audio_input):
33
  if audio_input is None:
34
+ yield "Upload or record audio first."
35
  return
36
 
37
  sample_rate, audio_array = audio_input
38
 
39
+ # mono
40
  if audio_array.ndim > 1:
41
  audio_array = audio_array.mean(axis=1)
42
 
43
  audio_array = audio_array.astype(np.float32)
44
 
45
+ # resample to 16k
46
  if sample_rate != TARGET_SAMPLING_RATE:
47
  audio_array = librosa.resample(
48
  audio_array, orig_sr=sample_rate, target_sr=TARGET_SAMPLING_RATE
49
  )
50
 
51
+ # prepare prompt
52
  messages = [
53
  {
54
  "role": "system",
55
  "content": [
56
+ {
57
+ "type": "text",
58
+ "text": "You are a model that accurately transcribes spoken audio and translates it to German."
59
+ }
60
  ],
61
  },
62
  {
63
  "role": "user",
64
  "content": [
65
  {"type": "audio", "audio": audio_array},
66
+ {"type": "text", "text": "Transcribe the spoken audio and translate to German."}
67
  ],
68
  },
69
  ]
70
 
71
+ # tokenize & prep inputs
72
  inputs = processor.apply_chat_template(
73
  messages,
74
  add_generation_prompt=True,
75
  tokenize=True,
76
  return_dict=True,
77
+ return_tensors="pt"
78
  ).to(device)
79
 
80
  streamer = TextIteratorStreamer(processor, skip_prompt=True)
 
82
  generation_kwargs = dict(
83
  **inputs,
84
  max_new_tokens=1024,
85
+ temperature=1.0,
86
  top_p=0.95,
87
+ top_k=50,
88
+ streamer=streamer
89
  )
90
 
91
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
92
  thread.start()
93
 
94
  output_text = ""
95
+ for chunk in streamer:
96
+ output_text += chunk
97
  yield output_text
98
 
 
 
 
 
99
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
100
+ gr.Markdown("# Gemma-3N Audio Transcription + German Translation")
 
 
 
 
 
 
101
 
102
  with gr.Row():
103
+ audio_input = gr.Audio(
104
+ sources=["upload","microphone"],
105
+ type="numpy",
106
+ label="Your Audio"
107
+ )
108
+ text_output = gr.Textbox(
109
+ label="Transcript & Translation",
110
+ lines=10
111
+ )
112
 
113
+ submit_btn = gr.Button("Transcribe + Translate")
114
+ submit_btn.click(
 
 
115
  fn=transcribe_and_translate,
116
+ inputs=audio_input,
117
+ outputs=text_output
118
  )
119
 
120
  if __name__ == "__main__":