Spaces:

ariG23498
/

gemma3n-image-audio

Runtime error

File size: 1,665 Bytes

59ec2ed
2395e7b
59ec2ed
 
2395e7b
59ec2ed
e7427b0
59ec2ed
e7427b0
 
 
a3e6d78
2395e7b
59ec2ed
 
20169cb
 
 
 
 
 
 
 
59ec2ed
cc04b09
20169cb
 
 
 
 
 
 
 
cc04b09
e7427b0
cc04b09
20169cb
 
 
e7427b0
cc04b09
20169cb
59ec2ed
 
20169cb
 
59ec2ed

import spaces
import gradio as gr
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

# Checkpoint identifier shared by the processor and the model.
MODEL_PATH = "google/gemma-3n-E4B-it"

# The processor builds model inputs from chat messages and decodes outputs.
processor = AutoProcessor.from_pretrained(MODEL_PATH)

# Load the weights in bfloat16, switch to evaluation mode, then move the
# model onto the GPU.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_PATH, torch_dtype=torch.bfloat16
)
model = model.eval()
model = model.to("cuda")

@spaces.GPU
def process_inputs(image, audio, *, max_tokens: int = 256) -> str:
    """Generate a text answer from an image and an audio question.

    Builds a single-turn user message containing the image and the audio,
    runs it through the module-level ``processor`` and ``model``, and
    returns the decoded continuation.

    Args:
        image: Image payload from the image input, forwarded unchanged to
            the processor's chat template.
        audio: Audio payload from the audio input, forwarded unchanged to
            the processor's chat template.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded model reply, without the prompt tokens.

    Raises:
        gr.Error: If the image or the audio input is missing.
    """
    # Gradio passes None for an empty component; fail with a readable
    # message instead of an opaque error from inside the processor.
    if image is None or audio is None:
        raise gr.Error("Please provide both an image and an audio question.")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "audio", "audio": audio},
            ],
        },
    ]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    # Remember the prompt length so it can be stripped from the output.
    input_len = inputs["input_ids"].shape[-1]

    inputs = inputs.to(model.device, dtype=model.dtype)
    # inference_mode must be *called* to obtain a context manager.
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            disable_compile=True,
        )

    # Decode only the tokens generated after the prompt.
    text = processor.batch_decode(
        outputs[:, input_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return text[0]

# Gradio interface: one image and one audio clip in, one text answer out.
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        # Request a PIL image and an audio file path rather than Gradio's
        # default NumPy array and (sample_rate, data) tuple; these are the
        # formats the Transformers chat-template loaders are documented to
        # accept. NOTE(review): confirm against the installed versions.
        gr.Image(label="Upload Image", type="pil"),
        gr.Audio(label="Ask Question about the Image", type="filepath"),
    ],
    outputs=gr.Textbox(label="Answer"),
    title="Image and Audio Question Answering",
    description="Upload an image as context and ask a question about the image. The model will generate a text response.",
)

# Start the Gradio app only when this file is executed as a script, so that
# importing the module does not launch a server.
if __name__ == "__main__":
    iface.launch()