# AudioGemma / app.py
# (Hugging Face Space page residue converted to comments: author "legolasyiu",
#  commit 180f6b8 "Update app.py" — these lines were not valid Python.)
import torch
import gradio as gr
import tempfile
from transformers import AutoProcessor, AutoModelForImageTextToText
# ---------------- CONFIG ---------------- #
MODEL_ID = "EpistemeAI/Audiogemma-3N-finetune"
MAX_TOKENS = 256  # upper bound on newly generated tokens per request

# Kept for backward compatibility; the model itself is placed via device_map.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading model and processor...")
# NOTE(review): `device_map` is a model-loading kwarg — it has no meaning for a
# processor, so it was dropped from the processor call.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",   # use the checkpoint's native dtype
    device_map="auto",    # let accelerate place/shard the weights
)
print("Model loaded.")
# ---------------- INFERENCE FUNCTION ---------------- #
def transcribe_and_translate(audio_file, target_language):
    """Transcribe an audio file to English, then translate it.

    Parameters
    ----------
    audio_file : str | None
        Filesystem path to the uploaded audio (Gradio ``type="filepath"``).
    target_language : str
        Language name spliced into the instruction prompt.

    Returns
    -------
    str
        The model's newly generated text, or a help message when no
        audio file was provided.
    """
    if audio_file is None:
        return "Please upload an audio file."

    prompt = f"Transcribe this audio into English, and then translate it into {target_language}."
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio_file},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    # Move only tensor entries to the model's device; any non-tensor entry
    # (no `.to` attribute) is passed through unchanged instead of crashing.
    inputs = {
        k: v.to(model.device) if hasattr(v, "to") else v
        for k, v in inputs.items()
    }

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_TOKENS,
            # Greedy decoding. `temperature` was removed: it is ignored when
            # do_sample=False and only produced a transformers warning.
            do_sample=False,
        )

    # `generate` returns prompt + completion; slice off the prompt tokens so
    # the UI shows only the model's answer, not the echoed chat template.
    prompt_len = inputs["input_ids"].shape[-1]
    decoded = processor.batch_decode(
        outputs[:, prompt_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return decoded[0]
# ---------------- GRADIO UI ---------------- #
# Assemble the demo: an audio input and a language picker wired to the
# inference function, with the generated text shown in a read-out textbox.
with gr.Blocks(title="Audiogemma Multilingual Transcriber") as demo:
    gr.Markdown("# 🎧 Audiogemma Multilingual Transcriber")
    gr.Markdown("Upload an audio file and get transcription + translation powered by **Gemma 3N**.")

    with gr.Row():
        audio_in = gr.Audio(type="filepath", label="Upload Audio or Use Microphone")
        target_lang = gr.Dropdown(
            choices=[
                "French", "Spanish", "German", "Chinese", "Japanese",
                "Korean", "Italian", "Portuguese", "Arabic", "Hindi",
            ],
            value="French",
            label="Translate To",
        )

    run_btn = gr.Button("Transcribe & Translate")
    result_box = gr.Textbox(label="Result", lines=12)

    # Button click drives the inference function.
    run_btn.click(
        fn=transcribe_and_translate,
        inputs=[audio_in, target_lang],
        outputs=result_box,
    )

demo.launch()