import torch
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText

# ---------------- CONFIG ---------------- #
MODEL_ID = "EpistemeAI/Audiogemma-3N-finetune"
MAX_TOKENS = 256

print("Loading model and processor...")
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",  # handles device placement, so no manual .to(device) is needed
)
print("Model loaded.")


# ---------------- INFERENCE FUNCTION ---------------- #
def transcribe_and_translate(audio_file, target_language):
    if audio_file is None:
        return "Please upload an audio file."

    # gr.Audio(type="filepath") already hands us a path to a file on disk.
    audio_path = audio_file

    prompt = (
        f"Transcribe this audio into English, and then translate it "
        f"into {target_language}."
    )
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio_path},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_TOKENS,
            do_sample=False,  # greedy decoding; temperature only applies when sampling
        )

    # Decode only the newly generated tokens; otherwise the chat-template
    # prompt is echoed back into the result box.
    generated = outputs[:, inputs["input_ids"].shape[1]:]
    decoded = processor.batch_decode(
        generated,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return decoded[0]


# ---------------- GRADIO UI ---------------- #
with gr.Blocks(title="Audiogemma Multilingual Transcriber") as demo:
    gr.Markdown("# 🎧 Audiogemma Multilingual Transcriber")
    gr.Markdown(
        "Upload an audio file and get transcription + translation "
        "powered by **Gemma 3N**."
    )

    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload Audio or Use Microphone")
        language_input = gr.Dropdown(
            choices=[
                "French", "Spanish", "German", "Chinese", "Japanese",
                "Korean", "Italian", "Portuguese", "Arabic", "Hindi",
            ],
            value="French",
            label="Translate To",
        )

    transcribe_btn = gr.Button("Transcribe & Translate")
    output_text = gr.Textbox(label="Result", lines=12)

    transcribe_btn.click(
        fn=transcribe_and_translate,
        inputs=[audio_input, language_input],
        outputs=output_text,
    )

demo.launch()
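
# Usage sketch (assumption: this file is saved as app.py and run directly).
# Gradio serves the UI locally by default (http://127.0.0.1:7860); for a
# temporary public link, pass share=True to the launch call above, e.g.
#
#   python app.py
#   # or, in the script: demo.launch(share=True)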