File size: 1,665 Bytes
59ec2ed
2395e7b
59ec2ed
 
2395e7b
59ec2ed
e7427b0
59ec2ed
e7427b0
 
 
a3e6d78
2395e7b
59ec2ed
 
20169cb
 
 
 
 
 
 
 
59ec2ed
cc04b09
20169cb
 
 
 
 
 
 
 
cc04b09
e7427b0
cc04b09
20169cb
 
 
e7427b0
cc04b09
20169cb
59ec2ed
 
20169cb
 
59ec2ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import spaces
import gradio as gr
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

# Checkpoint id shared by the processor and the model.
MODEL_PATH = "google/gemma-3n-E4B-it"

# The processor turns chat messages into model inputs and decodes output ids.
processor = AutoProcessor.from_pretrained(MODEL_PATH)

# Load the weights in bfloat16, switch to eval mode, then move to the GPU.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_PATH, torch_dtype=torch.bfloat16
)
model = model.eval()
model = model.to("cuda")

@spaces.GPU
def process_inputs(image, audio, max_tokens=256):
    """Generate a text answer from an image and an audio question.

    Builds a single-turn user message containing the image and the audio,
    runs it through the processor's chat template, generates with the
    module-level ``model`` and decodes only the newly generated tokens.

    Args:
        image: Image value from the Gradio ``Image`` input; forwarded to the
            processor's chat template unchanged.
        audio: Audio value from the Gradio ``Audio`` input; forwarded to the
            processor's chat template unchanged.
        max_tokens: Maximum number of new tokens to generate. The default of
            256 is a review-time choice (the original referenced an undefined
            ``max_tokens`` name) -- tune as needed.

    Returns:
        The decoded text of the first generated sequence, without the prompt.

    Raises:
        gr.Error: If either the image or the audio input is missing.
    """
    # Gradio passes None for inputs the user left empty; fail with a readable
    # message instead of an opaque error from inside the processor.
    if image is None or audio is None:
        raise gr.Error("Please provide both an image and an audio question.")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "audio", "audio": audio},
            ],
        },
    ]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    # Remember the prompt length so it can be sliced off the generated ids.
    input_len = inputs["input_ids"].shape[-1]

    inputs = inputs.to(model.device, dtype=model.dtype)
    # inference_mode must be *called* to obtain a context manager.
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            disable_compile=True,
        )

    text = processor.batch_decode(
        outputs[:, input_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return text[0]

# Gradio interface: one image upload plus one audio question in, text out.
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        # Gradio's default type="numpy" hands the callback a raw array (image)
        # and a (sample_rate, data) tuple (audio). Request a PIL image and a
        # file path instead.
        # NOTE(review): chosen because the Transformers chat-template loaders
        # are expected to accept PIL images and audio file paths -- confirm
        # against the installed transformers version.
        gr.Image(label="Upload Image", type="pil"),
        gr.Audio(label="Ask Question about the Image", type="filepath"),
    ],
    outputs=gr.Textbox(label="Answer"),
    title="Image and Audio Question Answering",
    description="Upload an image as context and ask a question about the image. The model will generate a text response.",
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()