legolasyiu commited on
Commit
94faa68
·
verified ·
1 Parent(s): c3206c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -38
app.py CHANGED
@@ -1,27 +1,16 @@
1
- import os
2
  import torch
3
- import torchaudio
4
- import gradio as gr
5
  from transformers import AutoProcessor, AutoModelForImageTextToText
6
 
7
- # Load model
8
  processor = AutoProcessor.from_pretrained("EpistemeAI/Audiogemma-3N-finetune")
9
  model = AutoModelForImageTextToText.from_pretrained(
10
  "EpistemeAI/Audiogemma-3N-finetune",
11
- torch_dtype=torch.bfloat16,
12
- ).to("cuda")
13
-
14
-
15
-
16
 
17
  def convert_audio_to_text(audio_file):
18
- # Load audio
19
- waveform, sample_rate = torchaudio.load(audio_file)
20
-
21
- # Convert to mono if stereo
22
- if waveform.shape[0] > 1:
23
- waveform = waveform.mean(dim=0, keepdim=True)
24
-
25
  messages = [
26
  {
27
  "role": "user",
@@ -32,36 +21,34 @@ def convert_audio_to_text(audio_file):
32
  }
33
  ]
34
 
35
- input_ids = processor.apply_chat_template(
 
36
  messages,
37
  add_generation_prompt=True,
38
- tokenize=True, return_dict=True,
 
39
  return_tensors="pt",
40
  ).to("cuda")
41
 
42
- outputs = model.generate(**input_ids, max_new_tokens=512)
 
 
 
 
 
 
43
 
 
44
  result = processor.batch_decode(
45
  outputs,
46
- skip_special_tokens=False,
47
- clean_up_tokenization_spaces=False
48
  )
49
- return result
50
-
51
-
52
- with gr.Blocks() as demo:
53
- gr.Markdown("## 🎙️ Audio Transcription with Audiogemma")
54
 
55
- with gr.Tab("Microphone"):
56
- mic = gr.Audio(sources="microphone", type="filepath")
57
- mic_out = gr.Textbox()
58
- mic_btn = gr.Button("Transcribe")
59
- mic_btn.click(convert_audio_to_text, mic, mic_out)
60
-
61
- with gr.Tab("Audio File"):
62
- file = gr.Audio(sources="upload", type="filepath")
63
- file_out = gr.Textbox()
64
- file_btn = gr.Button("Transcribe")
65
- file_btn.click(convert_audio_to_text, file, file_out)
66
 
67
- demo.launch()
 
 
 
 
 
 
import torch

from transformers import AutoProcessor, AutoModelForImageTextToText

# Single source of truth for the checkpoint id (was duplicated verbatim
# in both from_pretrained calls below).
MODEL_ID = "EpistemeAI/Audiogemma-3N-finetune"

# Load processor and model once at import time so every call reuses them.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,  # load weights directly in bfloat16
    device_map="auto",           # place on GPU when available, else CPU
)
def convert_audio_to_text(audio_file):
    """Transcribe/translate an audio file with the Audiogemma model.

    Parameters
    ----------
    audio_file : str
        Path to the audio file to feed into the multimodal chat template.

    Returns
    -------
    list[str]
        Decoded generation(s) from ``processor.batch_decode``.
    """
    # Build the multimodal chat message.
    # NOTE(review): the exact message content was collapsed in the diff this
    # was reconstructed from — confirm the audio/text entries against the
    # upstream app.py.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio_file},
                {"type": "text", "text": "Transcribe this audio."},
            ],
        }
    ]

    # Prepare model inputs from the chat template.
    # Fix: the model is loaded with device_map="auto", so inputs must follow
    # the model's actual device; the previous hard-coded .to("cuda") crashed
    # on CPU-only hosts.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # Cast floating-point tensors (e.g. audio features) to bfloat16 so they
    # match the model weights and generate() does not mix dtypes.
    for key, value in inputs.items():
        if torch.is_floating_point(value):
            inputs[key] = value.to(dtype=torch.bfloat16)

    # Generate output tokens.
    outputs = model.generate(**inputs, max_new_tokens=512)

    # Decode token ids back to clean text.
    result = processor.batch_decode(
        outputs,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return result
 
 
 
 
 
 
 
 
 
 
# Example usage: run this module directly to transcribe a sample file.
if __name__ == "__main__":
    sample_path = "sample_audio.wav"
    print(convert_audio_to_text(sample_path))