import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
from threading import Thread
import re
import time
import torch
import spaces
import math
import os
# from qwen_vl_utils import process_vision_info, fetch_image

# run locally: CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 MODEL=./model_dir python app.py
# and open http://localhost:7860

# pretrained_model_name_or_path = os.environ.get("MODEL", "amrn/gmdsv5mx3")
# pretrained_model_name_or_path = os.environ.get("MODEL", "amrn/gr1")
pretrained_model_name_or_path = os.environ.get("MODEL", "amrn/mrcxr1")
auth_token = os.environ.get("HF_TOKEN") or True  # True falls back to the locally saved HF token

DEFAULT_PROMPT = "Find abnormalities and support devices."

model = AutoModelForImageTextToText.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    dtype=torch.bfloat16,
    token=auth_token,
).eval().to("cuda")
processor = AutoProcessor.from_pretrained(pretrained_model_name_or_path, use_fast=True)


@spaces.GPU
def model_inference(text, history, image):
    print(f"text: {text}")
    print(f"history: {history}")

    if len(text) == 0:
        raise gr.Error("Please input a query.", duration=3, print_exception=False)
    if image is None:
        raise gr.Error("Please provide an image.", duration=3, print_exception=False)
    print(f"image0: {image} size: {image.size}")
    # image = fetch_image({"image": image, "min_pixels": 28*28*2, "max_pixels": 476*476})
    # image.thumbnail((512, 512))  # resize to fit within 512x512, preserving aspect ratio
    # print(f"image1: {image} size: {image.size}")

    # Rebuild the chat history, skipping empty turns. valid_index marks the
    # user turn that precedes the first non-empty assistant reply; everything
    # before it is dropped, since those turns were sent without the image.
    messages = []
    if len(history) > 0:
        valid_index = None
        for h in history:
            if len(h.get("content").strip()) > 0:
                if valid_index is None and h["role"] == "assistant":
                    # Index into `messages`, not `history`: empty turns are
                    # skipped, so the two lists may not line up.
                    valid_index = len(messages) - 1
                messages.append({"role": h["role"], "content": [{"type": "text", "text": h["content"]}]})
        if valid_index is None:
            messages = []  # no completed exchange yet; start fresh
        elif len(messages) > 0 and valid_index > 0:
            messages = messages[valid_index:]  # remove previous messages (without image)

    # Append the current prompt and attach the image to the first message.
    messages.append({"role": "user", "content": [{"type": "text", "text": text}]})
    messages[0]["content"].insert(0, {"type": "image"})
    print(f"messages: {messages}")

    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = inputs.to("cuda")

    # Generate in a background thread and stream tokens back to the UI.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_args = dict(inputs, streamer=streamer, max_new_tokens=4096)

    # torch.inference_mode is thread-local, so enter it inside the worker
    # thread; wrapping only the Thread() call would have no effect.
    def generate():
        with torch.inference_mode():
            model.generate(**generation_args)

    thread = Thread(target=generate)
    thread.start()

    yield "..."  # placeholder while the first tokens arrive
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer


with gr.Blocks() as demo:
    # gr.Markdown('
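    # --- The original file is truncated above this point. ---
    # A minimal, assumed continuation (not the original app's UI): it wires
    # model_inference into gr.ChatInterface, passing the image as an extra
    # input so it reaches the `image` parameter. Component choices and labels
    # here are guesses for illustration only.
    gr.ChatInterface(
        fn=model_inference,
        type="messages",
        additional_inputs=[gr.Image(type="pil", label="Input image")],
        textbox=gr.Textbox(value=DEFAULT_PROMPT, label="Query"),
    )

demo.launch()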