import spaces
import torch
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText

MODEL_ID = "google/medgemma-1.5-4b-it"

# Loaded once at import time so every request reuses the same weights.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16
).to("cuda")

# Control-token ids used when generating and post-processing.
# NOTE(review): the token literals were empty strings here, which cannot match
# the constant names; they are restored from those names -- confirm against
# the tokenizer vocabulary of MODEL_ID.
# Everything generated up to and including this marker is dropped from the
# reply (presumably it closes the model's reasoning trace -- verify).
UNUSED95_ID = processor.tokenizer.convert_tokens_to_ids("<unused95>")
# Passed to generate() as the stop token.
EOT_ID = processor.tokenizer.convert_tokens_to_ids("<end_of_turn>")


def extract_response(output_ids: torch.Tensor, input_length: int) -> str:
    """Decode the model's reply from a generated token sequence.

    Args:
        output_ids: 1-D tensor holding the prompt tokens followed by the
            generated tokens (one row of ``model.generate`` output).
        input_length: Number of prompt tokens at the start of ``output_ids``.

    Returns:
        The decoded text with special tokens removed and surrounding
        whitespace stripped. When the generated part contains
        ``UNUSED95_ID``, only the tokens after its first occurrence are
        decoded; otherwise the whole generated part is decoded.
    """
    # Only inspect what the model produced: a marker id that happens to occur
    # inside the prompt must not decide where the reply starts.
    generated = output_ids.tolist()[input_length:]
    try:
        response_ids = generated[generated.index(UNUSED95_ID) + 1:]
    except ValueError:
        # Marker absent: the whole generation is the reply.
        response_ids = generated
    return processor.decode(response_ids, skip_special_tokens=True).strip()


@spaces.GPU(duration=90)
def analyze(image1, image2, image3, image4, text_prompt):
    """Run the model on up to four optional images plus a text prompt.

    Args:
        image1, image2, image3, image4: PIL images from the UI, or ``None``
            for slots the user left empty.
        text_prompt: The user's instruction text.

    Returns:
        The decoded model reply, or a short notice when neither an image nor
        a non-blank prompt was supplied.
    """
    images = [img for img in (image1, image2, image3, image4) if img is not None]
    prompt = text_prompt or ""  # a None prompt is treated as empty text

    # Nothing to analyze: skip the GPU round trip instead of sending an empty turn.
    if not images and not prompt.strip():
        return "Please provide a prompt or at least one image."

    # Single user turn: all images first, then the text instruction.
    content = [{"type": "image", "image": img} for img in images]
    content.append({"type": "text", "text": prompt})
    messages = [{"role": "user", "content": content}]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device, dtype=torch.bfloat16)

    with torch.inference_mode():
        output = model.generate(**inputs, max_new_tokens=2048, eos_token_id=EOT_ID)

    # output[0] is the single sequence in the batch; the prompt length tells
    # extract_response where the generated tokens begin.
    return extract_response(output[0], inputs["input_ids"].shape[1])


demo = gr.Interface(
    fn=analyze,
    inputs=[
        gr.Image(type="pil", label="Image 1 (optional)"),
        gr.Image(type="pil", label="Image 2 (optional)"),
        gr.Image(type="pil", label="Image 3 (optional)"),
        gr.Image(type="pil", label="Image 4 (optional)"),
        gr.Textbox(label="Prompt"),
    ],
    outputs=gr.Textbox(label="Response"),
    title="MedGemma 1.5 4B",
)

demo.launch()