File size: 2,645 Bytes
505b4c9
180f6b8
 
2d0ebc3
 
180f6b8
2d0ebc3
180f6b8
2d0ebc3
180f6b8
fa1257a
180f6b8
 
2d0ebc3
 
180f6b8
2d0ebc3
94faa68
cf972e4
180f6b8
 
 
 
 
 
 
 
 
 
 
9be43ff
180f6b8
7725773
d55b3ca
505b4c9
7725773
 
fa1257a
180f6b8
2d0ebc3
 
d55b3ca
7725773
fa1257a
d55b3ca
9be43ff
2d0ebc3
180f6b8
fa1257a
2d0ebc3
7725773
180f6b8
1691129
2d0ebc3
fa1257a
 
180f6b8
 
 
fa1257a
7725773
180f6b8
2d0ebc3
 
 
 
7725773
180f6b8
9be43ff
 
 
d4f45f5
180f6b8
 
 
7725773
31ec06b
180f6b8
 
 
 
 
 
 
 
 
 
 
7725773
180f6b8
2d0ebc3
180f6b8
2d0ebc3
180f6b8
2d0ebc3
 
7725773
180f6b8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import torch
import gradio as gr
import tempfile
from transformers import AutoProcessor, AutoModelForImageTextToText

# ---------------- CONFIG ---------------- #
MODEL_ID = "EpistemeAI/Audiogemma-3N-finetune"  # HF Hub checkpoint to serve
MAX_TOKENS = 256  # cap on newly generated tokens per request

# Prefer GPU when one is visible; final layer placement is still decided by
# accelerate via device_map="auto" below.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading model and processor...")
# The processor bundles the tokenizer and the audio feature extractor.
processor = AutoProcessor.from_pretrained(MODEL_ID, device_map="auto")
# torch_dtype="auto" keeps the checkpoint's native precision (e.g. bf16).
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, torch_dtype="auto", device_map="auto"
)
print("Model loaded.")


# ---------------- INFERENCE FUNCTION ---------------- #

def transcribe_and_translate(audio_file, target_language):
    """Transcribe an audio clip to English and translate it.

    Args:
        audio_file: Filesystem path to the uploaded clip (Gradio passes a
            filepath string because the Audio component uses type="filepath"),
            or None when nothing was uploaded.
        target_language: Human-readable name of the language to translate into.

    Returns:
        The model's generated text (transcription + translation), or a short
        instruction string when no audio was provided.
    """
    if audio_file is None:
        return "Please upload an audio file."

    prompt = f"Transcribe this audio into English, and then translate it into {target_language}."

    # Chat-style multimodal message: the audio clip followed by the text task.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio_file},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # generate() returns prompt + completion concatenated; remember the prompt
    # length so we can return only the completion.
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_TOKENS,
            # Greedy decoding. NOTE: the original also passed temperature=0.2,
            # which is ignored (and warned about) when do_sample=False.
            do_sample=False,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    decoded = processor.batch_decode(
        outputs[:, prompt_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    return decoded[0].strip()


# ---------------- GRADIO UI ---------------- #

with gr.Blocks(title="Audiogemma Multilingual Transcriber") as demo:
    # Page header and short usage blurb.
    gr.Markdown("# 🎧 Audiogemma Multilingual Transcriber")
    gr.Markdown(
        "Upload an audio file and get transcription + translation "
        "powered by **Gemma 3N**."
    )

    with gr.Row():
        audio_in = gr.Audio(type="filepath", label="Upload Audio or Use Microphone")
        target_lang = gr.Dropdown(
            label="Translate To",
            value="French",
            choices=[
                "French", "Spanish", "German", "Chinese", "Japanese",
                "Korean", "Italian", "Portuguese", "Arabic", "Hindi",
            ],
        )

    run_btn = gr.Button("Transcribe & Translate")
    result_box = gr.Textbox(label="Result", lines=12)

    # Wire the button to the inference function defined above.
    run_btn.click(
        fn=transcribe_and_translate,
        inputs=[audio_in, target_lang],
        outputs=result_box,
    )

demo.launch()