import torch
import gradio as gr
import tempfile
from transformers import AutoProcessor, AutoModelForImageTextToText
# ---------------- CONFIG ---------------- #
# Fine-tuned Gemma-3N checkpoint for audio transcription/translation.
MODEL_ID = "EpistemeAI/Audiogemma-3N-finetune"
# Upper bound on generated tokens per request (see model.generate below).
MAX_TOKENS = 256
# NOTE(review): `device` is computed but never read — placement is handled by
# `device_map="auto"` below. Kept for now; confirm before removing.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Loading model and processor...")
# NOTE(review): `device_map` is a model-loading kwarg; processors normally
# ignore it — verify it is intentional here.
processor = AutoProcessor.from_pretrained(MODEL_ID, device_map="auto")
# torch_dtype="auto" picks the dtype stored in the checkpoint config;
# device_map="auto" shards/places the model across available devices.
model = AutoModelForImageTextToText.from_pretrained(
MODEL_ID,
torch_dtype="auto",
device_map="auto"
)
print("Model loaded.")
# ---------------- INFERENCE FUNCTION ---------------- #
def transcribe_and_translate(audio_file, target_language):
    """Transcribe an audio file to English and translate it.

    Args:
        audio_file: Filesystem path to the uploaded audio clip (Gradio
            ``type="filepath"``), or ``None`` when nothing was uploaded.
        target_language: Name of the language to translate the English
            transcription into (e.g. ``"French"``).

    Returns:
        The model's generated text (transcription + translation), or a
        short instruction string when no audio was provided.
    """
    if audio_file is None:
        return "Please upload an audio file."
    prompt = f"Transcribe this audio into English, and then translate it into {target_language}."
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio_file},
                {"type": "text", "text": prompt},
            ]
        }
    ]
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Remember prompt length so we can return only newly generated tokens.
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        # Greedy decoding; `temperature` was removed because it is ignored
        # (and triggers a warning) when do_sample=False.
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_TOKENS,
            do_sample=False,
        )
    # Slice off the prompt tokens — decoding the full sequence would echo
    # the chat-template prompt back to the user.
    decoded = processor.batch_decode(
        outputs[:, prompt_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )
    return decoded[0].strip()
# ---------------- GRADIO UI ---------------- #
TARGET_LANGUAGES = [
    "French", "Spanish", "German", "Chinese", "Japanese",
    "Korean", "Italian", "Portuguese", "Arabic", "Hindi",
]

# Build the app layout: audio + language pickers side by side, a trigger
# button, and a textbox that receives the model output.
with gr.Blocks(title="Audiogemma Multilingual Transcriber") as demo:
    gr.Markdown("# 🎧 Audiogemma Multilingual Transcriber")
    gr.Markdown("Upload an audio file and get transcription + translation powered by **Gemma 3N**.")

    with gr.Row():
        audio_in = gr.Audio(
            type="filepath",
            label="Upload Audio or Use Microphone",
        )
        language_in = gr.Dropdown(
            choices=TARGET_LANGUAGES,
            value="French",
            label="Translate To",
        )

    run_button = gr.Button("Transcribe & Translate")
    result_box = gr.Textbox(label="Result", lines=12)

    # Wire the button to the inference function.
    run_button.click(
        fn=transcribe_and_translate,
        inputs=[audio_in, language_in],
        outputs=result_box,
    )

demo.launch()