# Hugging Face Space: Audiogemma multilingual transcriber (Gradio demo).
import torch
import gradio as gr
import tempfile
from transformers import AutoProcessor, AutoModelForImageTextToText

# ---------------- CONFIG ---------------- #
MODEL_ID = "EpistemeAI/Audiogemma-3N-finetune"
MAX_TOKENS = 256  # cap on newly generated tokens per request

# NOTE(review): `device` is computed but model placement below relies on
# device_map="auto"; kept in case later code (not in view) reads it.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading model and processor...")
# AutoProcessor is a preprocessing object, not a model — it takes no
# device_map argument (passing one can raise TypeError on some versions).
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",   # pick dtype from the checkpoint config
    device_map="auto",    # place/shard across available devices
)
print("Model loaded.")
# ---------------- INFERENCE FUNCTION ---------------- #
def transcribe_and_translate(audio_file, target_language):
    """Transcribe an audio clip into English, then translate it.

    Args:
        audio_file: Filesystem path to the audio clip (Gradio supplies a
            filepath string), or None when nothing was uploaded.
        target_language: Name of the language to translate the English
            transcript into (e.g. "French").

    Returns:
        The model's decoded response text, or a short message when no
        audio was provided.
    """
    if audio_file is None:
        return "Please upload an audio file."

    prompt = f"Transcribe this audio into English, and then translate it into {target_language}."
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio_file},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_TOKENS,
            # Greedy decoding; `temperature` removed — it is ignored (and
            # warned about) when do_sample=False.
            do_sample=False,
        )

    # Decode only the newly generated tokens so the chat template and
    # prompt are not echoed back into the result.
    prompt_len = inputs["input_ids"].shape[1]
    decoded = processor.batch_decode(
        outputs[:, prompt_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return decoded[0]
# ---------------- GRADIO UI ---------------- #
# Layout: header, input row (audio + target language), action button,
# and a read-only result box wired to the inference function.
with gr.Blocks(title="Audiogemma Multilingual Transcriber") as demo:
    gr.Markdown("# 🎧 Audiogemma Multilingual Transcriber")
    gr.Markdown(
        "Upload an audio file and get transcription + translation "
        "powered by **Gemma 3N**."
    )

    with gr.Row():
        audio_in = gr.Audio(type="filepath", label="Upload Audio or Use Microphone")
        target_lang = gr.Dropdown(
            choices=[
                "French", "Spanish", "German", "Chinese", "Japanese",
                "Korean", "Italian", "Portuguese", "Arabic", "Hindi",
            ],
            value="French",
            label="Translate To",
        )

    run_btn = gr.Button("Transcribe & Translate")
    result_box = gr.Textbox(label="Result", lines=12)

    run_btn.click(
        fn=transcribe_and_translate,
        inputs=[audio_in, target_lang],
        outputs=result_box,
    )

demo.launch()