import spaces
import gradio as gr
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor
# Load model and processor
MODEL_PATH = "google/gemma-3n-E4B-it"
MAX_NEW_TOKENS = 256  # cap on generated tokens per answer; adjust as needed

processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
).eval().to("cuda")
@spaces.GPU  # ZeroGPU: allocate a GPU for the duration of this call
def process_inputs(image, audio):
    # Build a single-turn chat message containing both modalities
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "audio", "audio": audio},
            ],
        },
    ]
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    input_len = inputs["input_ids"].shape[-1]
    inputs = inputs.to(model.device, dtype=model.dtype)
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            disable_compile=True,
        )
    # Decode only the newly generated tokens, skipping the prompt
    text = processor.batch_decode(
        outputs[:, input_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return text[0]
# Gradio interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Audio(type="filepath", label="Ask a Question About the Image"),
    ],
    outputs=gr.Textbox(label="Answer"),
    title="Image and Audio Question Answering",
    description="Upload an image and ask a spoken question about it. The model will generate a text answer.",
)
if __name__ == "__main__":
    iface.launch()