legolasyiu committed on
Commit
2d0ebc3
·
verified ·
1 Parent(s): 7bdc961

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -76
app.py CHANGED
@@ -1,105 +1,90 @@
1
- import os
2
- os.environ["TORCHDYNAMO_DISABLE"] = "1"
3
-
4
  import gradio as gr
5
  import torch
6
- import librosa
7
- import numpy as np
8
- from threading import Thread
9
- from unsloth import FastModel
10
- from transformers import AutoProcessor, TextIteratorStreamer
11
-
12
- TARGET_SAMPLING_RATE = 16000
13
- device = "cuda" if torch.cuda.is_available() else "cpu"
14
-
15
- print("Loading Gemma-3N audio model...")
16
-
17
- # IMPORTANT: disable alt-up (fixes uint8 clamp crash)
18
- model, tokenizer = FastModel.from_pretrained(
19
- model_name="unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit",
20
- max_seq_length=2048,
21
- dtype=None,
22
- load_in_4bit=True,
23
- full_finetuning=False,
24
- device_map="auto",
25
- )
26
-
27
- processor = AutoProcessor.from_pretrained(
28
- "EpistemeAI/Audiogemma-3N-finetune"
29
  )
30
 
31
- model.eval()
32
- print("Model loaded on", device)
33
 
34
- # ---------------- AUDIO PIPELINE ---------------- #
35
-
36
- def transcribe_and_translate(audio_input):
37
- if audio_input is None:
38
- yield "Please upload or record audio."
39
- return
40
 
41
  messages = [
42
- {
43
- "role": "system",
44
- "content": [
45
- {"type": "text", "text": "You transcribe spoken audio and translate it into German."}
46
- ],
47
- },
48
  {
49
  "role": "user",
50
  "content": [
51
  {"type": "audio", "audio": audio_input},
52
- {"type": "text", "text": "Please transcribe this audio and translate it to German."}
53
- ],
54
- },
 
 
 
55
  ]
56
 
57
- inputs = processor.apply_chat_template(
58
  messages,
59
- tokenize=True,
60
  add_generation_prompt=True,
61
- return_tensors="pt",
62
  return_dict=True,
63
- ).to(device)
 
64
 
65
- streamer = TextIteratorStreamer(processor, skip_prompt=True)
66
 
67
- generation_kwargs = dict(
68
- **inputs,
69
- max_new_tokens=1024,
70
- temperature=0.7,
71
- top_p=0.95,
72
- top_k=50,
73
- streamer=streamer,
74
- )
75
 
76
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
77
- thread.start()
 
 
 
78
 
79
- output = ""
80
- for token in streamer:
81
- output += token
82
- yield output
83
 
84
 
85
  # ---------------- GRADIO UI ---------------- #
86
 
87
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
88
- gr.Markdown("# Gemma-3N Audio Transcription + German Translation")
 
 
 
 
 
 
 
89
 
90
  with gr.Row():
91
- audio_input = gr.Audio(
92
- sources=["upload", "microphone"],
93
- type="numpy",
94
- label="Audio Input"
95
- )
96
- text_output = gr.Textbox(
97
- label="Transcription + Translation",
98
- lines=12
99
  )
100
 
101
- btn = gr.Button("Transcribe and Translate", variant="primary")
102
- btn.click(transcribe_and_translate, audio_input, text_output)
 
 
 
 
 
 
 
 
 
 
 
103
 
104
- if __name__ == "__main__":
105
- demo.launch()
 
 
 
 
1
  import gradio as gr
2
  import torch
3
+ from transformers import AutoProcessor, AutoModelForImageTextToText
4
+ import nest_asyncio
5
+ nest_asyncio.apply()
6
+
7
+ # ---------------- MODEL SETUP ---------------- #
8
+ MODEL_ID = "EpistemeAI/Audiogemma-3N-finetune"
9
+
10
+ processor = AutoProcessor.from_pretrained(MODEL_ID)
11
+ model = AutoModelForImageTextToText.from_pretrained(
12
+ MODEL_ID,
13
+ torch_dtype=torch.bfloat16,
14
+ device_map="auto"
 
 
 
 
 
 
 
 
 
 
 
15
  )
16
 
17
+ # ---------------- TRANSLATION FUNCTION ---------------- #
 
18
 
19
+ def transcribe_and_translate(audio_input, target_language):
 
 
 
 
 
20
 
21
  messages = [
 
 
 
 
 
 
22
  {
23
  "role": "user",
24
  "content": [
25
  {"type": "audio", "audio": audio_input},
26
+ {
27
+ "type": "text",
28
+ "text": f"Transcribe this audio into English, and then translate it into {target_language}."
29
+ },
30
+ ]
31
+ }
32
  ]
33
 
34
+ input_ids = processor.apply_chat_template(
35
  messages,
 
36
  add_generation_prompt=True,
37
+ tokenize=True,
38
  return_dict=True,
39
+ return_tensors="pt",
40
+ )
41
 
42
+ input_ids = input_ids.to(model.device, dtype=model.dtype)
43
 
44
+ with torch.no_grad():
45
+ outputs = model.generate(**input_ids, max_new_tokens=256)
 
 
 
 
 
 
46
 
47
+ text = processor.batch_decode(
48
+ outputs,
49
+ skip_special_tokens=True,
50
+ clean_up_tokenization_spaces=True
51
+ )
52
 
53
+ return text[0]
 
 
 
54
 
55
 
56
  # ---------------- GRADIO UI ---------------- #
57
 
58
+ LANGUAGES = [
59
+ "French", "Spanish", "German", "Italian", "Portuguese",
60
+ "Chinese", "Japanese", "Korean", "Arabic", "Hindi",
61
+ "Russian", "Ukrainian", "Hebrew", "Thai", "Vietnamese"
62
+ ]
63
+
64
+ with gr.Blocks() as demo:
65
+ gr.Markdown("## 🎙️ Multilingual Audio Translator")
66
+ gr.Markdown("Speak English. The model will transcribe and translate into your chosen language.")
67
 
68
  with gr.Row():
69
+ audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
70
+ language_dropdown = gr.Dropdown(
71
+ choices=LANGUAGES,
72
+ value="French",
73
+ label="Target Language"
 
 
 
74
  )
75
 
76
+ translate_btn = gr.Button("Translate")
77
+
78
+ output_text = gr.Textbox(
79
+ label="Translation Output",
80
+ lines=10,
81
+ interactive=False
82
+ )
83
+
84
+ translate_btn.click(
85
+ fn=transcribe_and_translate,
86
+ inputs=[audio_input, language_dropdown],
87
+ outputs=output_text
88
+ )
89
 
90
+ demo.launch(debug=True)