"""Gradio demo for CoVT-7B visual question answering.

Loads a Qwen2.5-VL-based CoVT model once, caches it globally, and serves a
simple image + question -> answer UI with timing information.
"""

import time

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

# ================= Configuration Area =================
# You can change these defaults as you like
DEFAULT_MODEL_NAME = "Wakals/CoVT-7B-seg_depth_dino"
DEFAULT_CKPT_PATH = None  # Or set to your local checkpoint path
# ======================================================

# Global cache for model and processor to avoid re-loading every call
_cached_model = None
_cached_processor = None


def load_model_and_processor(
    model_name: str,
    ckpt: str = None,
):
    """Load a single CoVT-7B model and its corresponding processor.

    Args:
        model_name: Hub repo id to load from when no local checkpoint is given.
        ckpt: Optional local checkpoint path; takes precedence over model_name.

    Returns:
        (model, processor) — the model is in eval mode with device_map="auto".
    """
    # Both branches load identically; only the source path differs.
    if ckpt is not None:
        source = ckpt
        print(f"Loading model from ckpt: {ckpt}")
    else:
        source = model_name
        print(f"Loading model from hub: {model_name}")
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        source, torch_dtype=torch.bfloat16, device_map="auto"
    ).eval()
    processor = AutoProcessor.from_pretrained(
        source, min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28
    )
    return model, processor


def get_cached_model_and_processor(
    model_name: str = DEFAULT_MODEL_NAME,
    ckpt: str = DEFAULT_CKPT_PATH,
):
    """Lazy-load and cache the model and processor.

    Subsequent calls return the cached pair regardless of arguments, so the
    heavyweight load happens at most once per process.
    """
    global _cached_model, _cached_processor

    # If already loaded, just return them
    if _cached_model is not None and _cached_processor is not None:
        return _cached_model, _cached_processor

    # Otherwise load and cache
    _cached_model, _cached_processor = load_model_and_processor(
        model_name=model_name,
        ckpt=ckpt,
    )
    return _cached_model, _cached_processor


def run_single_inference(
    model,
    processor,
    image,  # can be either a PIL.Image or a path string
    question: str,
    max_new_tokens: int = 512,
    temperature: float = 0.0,
    top_p: float = 0.9,
    do_sample: bool = False,
    seed: int = 42,
):
    """Run one image + question through the model.

    Args:
        model: Loaded Qwen2_5_VLForConditionalGeneration instance.
        processor: Matching AutoProcessor.
        image: PIL.Image or a filesystem path to an image.
        question: User question about the image.
        max_new_tokens: Generation length cap.
        temperature / top_p: Sampling parameters; only used when do_sample.
        do_sample: Greedy decoding when False.
        seed: RNG seed applied before sampling for reproducibility.

    Returns:
        (answer, elapsed) — decoded answer string and generation time in seconds.

    Raises:
        ValueError: if `image` is neither a PIL.Image nor a path string.
    """
    # 1) Prepare conversation.
    # For Gradio we usually get a PIL image, but we also support a path
    # string for compatibility.
    if isinstance(image, str):
        pil_image = Image.open(image).convert("RGB")
        image_ref = image  # path for the "image" field
    elif isinstance(image, Image.Image):
        pil_image = image.convert("RGB")
        # The chat template only needs a placeholder reference; the real
        # pixels are supplied via the processor's `images` argument below.
        image_ref = "gradio_image"
    else:
        raise ValueError("image must be a PIL.Image or a path string.")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_ref},
                {"type": "text", "text": question},
            ],
        }
    ]

    # 2) Apply chat template
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # 3) Encode image and text
    inputs = processor(text=[prompt], images=[pil_image], return_tensors="pt")

    # Move tensor inputs to the same device as the model
    device = model.device
    inputs = {
        k: (v.to(device) if isinstance(v, torch.Tensor) else v)
        for k, v in inputs.items()
    }

    # BUG FIX: `seed` was accepted but never used; seed the RNG so that
    # sampling runs are reproducible. (No effect on greedy decoding.)
    if do_sample:
        torch.manual_seed(seed)

    # BUG FIX: temperature/top_p must only be passed when sampling.
    # Recent transformers versions reject temperature=0.0 (it must be > 0),
    # and passing sampling args with do_sample=False triggers warnings.
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": processor.tokenizer.eos_token_id,
        "eos_token_id": processor.tokenizer.eos_token_id,
    }
    if do_sample:
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = top_p

    # 4) Timing + generation. Synchronize around the timed region so CUDA's
    # async execution doesn't distort the measurement.
    if device.type == "cuda":
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    start = time.time()
    with torch.no_grad():
        generated_ids = model.generate(**inputs, **gen_kwargs)
    if device.type == "cuda":
        torch.cuda.synchronize()
    elapsed = time.time() - start

    # 5) Decode only newly generated tokens (skip the echoed prompt)
    input_len = inputs["input_ids"].shape[1]
    new_tokens = generated_ids[0, input_len:]
    answer = processor.decode(new_tokens, skip_special_tokens=True)

    return answer, elapsed


def gradio_inference(
    image,
    question,
    max_new_tokens,
    temperature,
    top_p,
    seed,
):
    """Gradio callback: run inference and return (answer, elapsed seconds)."""
    if image is None:
        return "Please upload an image.", 0.0

    # Get (or load) model and processor
    model, processor = get_cached_model_and_processor()

    # Run inference. `image` is a PIL.Image here because the gr.Image
    # component below is declared with type="pil".
    answer, elapsed = run_single_inference(
        model=model,
        processor=processor,
        image=image,
        question=question,
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        do_sample=(temperature > 0.0),  # temperature 0 means greedy decoding
        seed=int(seed),
    )
    return answer, elapsed


# ===================== Gradio UI =====================
def build_demo():
    """Build and return the Gradio Blocks UI for the demo."""
    with gr.Blocks() as demo:
        gr.Markdown(
            "# CoVT-7B Gradio Demo\n"
            "Upload an image and input a question to run visual question answering."
        )
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(label="Input Image", type="pil")
                question_input = gr.Textbox(label="Question", value="", lines=2)
                max_new_tokens = gr.Slider(
                    label="max_new_tokens", minimum=1, maximum=1024, value=512, step=1
                )
                temperature = gr.Slider(
                    label="temperature", minimum=0.0, maximum=1.0, value=0.0, step=0.01
                )
                top_p = gr.Slider(
                    label="top_p", minimum=0.1, maximum=1.0, value=0.9, step=0.01
                )
                seed = gr.Slider(
                    label="random_seed", minimum=0, maximum=1000, value=42, step=1
                )
                run_button = gr.Button("Run Inference")
            with gr.Column():
                answer_output = gr.Textbox(label="Answer", lines=10)
                elapsed_output = gr.Number(label="Elapsed time (seconds)")

        run_button.click(
            fn=gradio_inference,
            inputs=[
                image_input,
                question_input,
                max_new_tokens,
                temperature,
                top_p,
                seed,
            ],
            outputs=[answer_output, elapsed_output],
        )
    return demo


if __name__ == "__main__":
    demo = build_demo()
    # You can set share=True if you want a public link
    demo.launch()