"""Gradio demo that captions an uploaded image with a fine-tuned OmniParser model."""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "proteus-computer-use/omniparser-finetuned"

# Half precision is only worthwhile on GPU; on CPU float16 kernels are slow
# and, depending on the torch version, missing for some ops, so fall back to
# float32 there.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TORCH_DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

# The processor comes from the base Florence-2 checkpoint, while the weights
# come from the fine-tuned repository named by ``model_id``.
processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-base",
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=TORCH_DTYPE,
    trust_remote_code=True,
).to(DEVICE)


def caption(image) -> str:
    """Generate a short text caption for *image*.

    Args:
        image: The PIL image delivered by the ``gr.Image(type="pil")`` input,
            or ``None`` when the user submitted the form without an image.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Surface a readable message in the UI instead of an opaque
        # processor traceback.
        raise gr.Error("Please upload an image before submitting.")

    # NOTE(review): Florence-2 checkpoints are usually driven by a task token
    # such as "<CAPTION>"; the empty prompt is kept as-is here -- confirm it
    # matches how this checkpoint was fine-tuned.
    inputs = processor(images=image, text="", return_tensors="pt")

    # Move to the model's device AND dtype: BatchFeature.to casts only the
    # floating-point tensors (pixel_values) and leaves the integer input_ids
    # untouched. Without the dtype cast, float32 pixels hit float16 weights.
    inputs = inputs.to(model.device, model.dtype)

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=20,
    )

    result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    return result


demo = gr.Interface(
    fn=caption,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="OmniParser Icon Caption Model",
)

demo.launch()