legolasyiu committed on
Commit
d0faf3c
·
verified ·
1 Parent(s): 9d1e3b9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -73
app.py CHANGED
@@ -1,98 +1,127 @@
1
- import torch
2
  import gradio as gr
 
 
 
3
  import tempfile
4
- from transformers import AutoProcessor, AutoModelForImageTextToText
5
-
6
- # ---------------- CONFIG ---------------- #
7
- MODEL_ID = "EpistemeAI/Audiogemma-3N-finetune"
8
- MAX_TOKENS = 256
9
 
10
- device = "cuda" if torch.cuda.is_available() else "cpu"
11
-
12
- print("Loading model and processor...")
13
- processor = AutoProcessor.from_pretrained(MODEL_ID, device_map="auto")
14
- model = AutoModelForImageTextToText.from_pretrained(
15
- MODEL_ID,
16
- torch_dtype="auto",
17
- device_map="auto"
18
  )
19
 
20
- print("Model loaded.")
21
-
22
-
23
- # ---------------- INFERENCE FUNCTION ---------------- #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
- def transcribe_and_translate(audio_file, target_language):
 
 
 
 
 
 
 
 
 
 
26
  if audio_file is None:
27
- return "Please upload an audio file."
28
-
29
- # Save temp file path
30
- audio_path = audio_file
31
-
32
- prompt = f"Transcribe this audio into English, and then translate it into {target_language}."
33
-
34
- messages = [
35
- {
36
- "role": "user",
37
- "content": [
38
- {"type": "audio", "audio": audio_path},
39
- {"type": "text", "text": prompt},
40
- ]
41
- }
42
- ]
43
-
44
- inputs = processor.apply_chat_template(
45
- messages,
46
- add_generation_prompt=True,
47
- tokenize=True,
48
- return_dict=True,
49
- return_tensors="pt"
50
- )
51
 
52
- inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
 
 
 
 
 
 
 
 
53
 
54
  with torch.no_grad():
55
- outputs = model.generate(
56
- **inputs,
57
- max_new_tokens=MAX_TOKENS,
58
- do_sample=False,
59
- temperature=0.2,
60
  )
61
 
62
- decoded = processor.batch_decode(
63
- outputs,
64
  skip_special_tokens=True,
65
- clean_up_tokenization_spaces=True
66
  )
67
 
68
- return decoded[0]
 
 
 
 
69
 
 
 
70
 
71
- # ---------------- GRADIO UI ---------------- #
72
 
73
- with gr.Blocks(title="Audiogemma Multilingual Transcriber") as demo:
74
- gr.Markdown("# 🎧 Audiogemma Multilingual Transcriber")
75
- gr.Markdown("Upload an audio file and get transcription + translation powered by **Gemma 3N**.")
76
 
77
- with gr.Row():
78
- audio_input = gr.Audio(type="filepath", label="Upload Audio or Use Microphone")
79
- language_input = gr.Dropdown(
80
- choices=[
81
- "French", "Spanish", "German", "Chinese", "Japanese",
82
- "Korean", "Italian", "Portuguese", "Arabic", "Hindi"
83
- ],
84
- value="French",
85
- label="Translate To"
86
- )
87
 
88
- transcribe_btn = gr.Button("Transcribe & Translate")
 
 
 
 
 
 
 
 
 
89
 
90
- output_text = gr.Textbox(label="Result", lines=12)
 
 
 
 
 
 
 
 
 
 
91
 
92
- transcribe_btn.click(
93
- fn=transcribe_and_translate,
94
- inputs=[audio_input, language_input],
95
- outputs=output_text
96
  )
97
 
98
  demo.launch()
 
 
1
  import gradio as gr
2
+ import torch
3
+ import librosa
4
+ import soundfile as sf
5
  import tempfile
6
+ import os
 
 
 
 
7
 
8
+ from transformers import (
9
+ AutoProcessor,
10
+ AutoModelForImageTextToText,
11
+ AutoTokenizer,
12
+ AutoModelForTextToSpeech,
 
 
 
13
  )
14
 
15
# ---- Configuration ----
# Hub identifiers for the speech-to-text and text-to-speech checkpoints.
STT_MODEL_ID = "EpistemeAI/Audiogemma-3N-finetune"
TTS_MODEL_ID = "EpistemeAI/LexiVox"

# Sample rate (Hz) the input audio is resampled to before transcription.
TARGET_SR = 16000

# Use the GPU with bfloat16 weights when CUDA is available; otherwise fall
# back to the CPU with float32 weights.
if torch.cuda.is_available():
    DEVICE, DTYPE = "cuda", torch.bfloat16
else:
    DEVICE, DTYPE = "cpu", torch.float32

# ---- Model loading (runs once, at import time) ----
print("Loading STT model...")
stt_processor = AutoProcessor.from_pretrained(STT_MODEL_ID)
stt_model = AutoModelForImageTextToText.from_pretrained(
    STT_MODEL_ID, torch_dtype=DTYPE, device_map="auto"
)

print("Loading TTS model...")
tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
tts_model = AutoModelForTextToSpeech.from_pretrained(TTS_MODEL_ID, torch_dtype=DTYPE)
tts_model = tts_model.to(DEVICE)
42
+
43
+ # -----------------------------
44
+ # PIPELINE FUNCTION
45
+ # -----------------------------
46
def speech_to_speech(audio_file):
    """Transcribe an audio file, then synthesize the transcription as speech.

    Args:
        audio_file: Path to the input audio file, or ``None`` when the user
            supplied nothing.

    Returns:
        A ``(transcription, wav_path)`` tuple. ``wav_path`` is the path of a
        temporary ``.wav`` file containing the synthesized speech, or ``None``
        when there was no input or the transcription came back empty.
    """
    if audio_file is None:
        return "", None

    # Load the audio and resample it to the rate handed to the STT processor.
    audio, _ = librosa.load(audio_file, sr=TARGET_SR)

    # ---------- STT ----------
    # NOTE(review): the previous revision of this app built the prompt with
    # apply_chat_template and an {"type": "audio"} entry; confirm that this
    # raw-text call still attaches the audio to the prompt.
    stt_inputs = stt_processor(
        audio=audio,
        sampling_rate=TARGET_SR,
        text="Transcribe the audio accurately.",
        return_tensors="pt",
    )
    # The STT model is loaded with device_map="auto", so follow the device and
    # dtype it actually ended up with instead of the global DEVICE constant.
    stt_inputs = stt_inputs.to(stt_model.device, dtype=stt_model.dtype)
    prompt_len = stt_inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output_ids = stt_model.generate(
            **stt_inputs,
            max_new_tokens=512,
        )

    # generate() returns prompt + continuation; decode only the new tokens so
    # the instruction text does not leak into the transcription.
    transcription = stt_processor.decode(
        output_ids[0][prompt_len:],
        skip_special_tokens=True,
    ).strip()

    if not transcription:
        # Nothing to synthesize; skip the TTS model entirely.
        return transcription, None

    # ---------- TTS ----------
    tts_inputs = tts_tokenizer(
        transcription,
        return_tensors="pt",
    ).to(DEVICE)

    with torch.no_grad():
        speech = tts_model.generate(**tts_inputs)

    # NumPy has no bfloat16 dtype, so upcast floating-point output first.
    if speech.is_floating_point():
        speech = speech.float()
    audio_out = speech.cpu().numpy().squeeze()

    # Reserve a temp path and close the handle right away so the descriptor is
    # not leaked on every request; the file itself is kept (delete=False).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        wav_path = tmp.name

    # NOTE(review): TARGET_SR is the STT input rate; confirm the TTS model
    # really produces audio at this sample rate.
    sf.write(wav_path, audio_out, TARGET_SR)

    return transcription, wav_path
88
+
89
# ---- Gradio front end ----
with gr.Blocks(title="Audiogemma LexiVox Speech Loop") as demo:
    # Page header.
    gr.Markdown(
        """
        # 🎙️ Speech → Text → Speech
        **Audiogemma-3N + LexiVox**

        Upload audio or use the microphone.
        The system transcribes speech, then speaks it back using an LLM-based TTS.
        """
    )

    # Input: a recording or an uploaded file, handed to the callback as a path.
    audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input Audio")

    run_btn = gr.Button("Run Speech Loop")

    # Outputs: the transcription text and the synthesized audio file.
    text_output = gr.Textbox(label="Transcription", lines=4)
    audio_output = gr.Audio(label="Synthesized Speech", type="filepath")

    # Wire the button to the speech -> text -> speech pipeline.
    run_btn.click(speech_to_speech, audio_input, [text_output, audio_output])

demo.launch()