Spaces:

derektan95
/

LISA-AVS-demo

Running on Zero

File size: 15,197 Bytes

import argparse
import os
import re
import sys

import bleach
import cv2
import gradio as gr
from matplotlib import pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import AutoTokenizer, BitsAndBytesConfig, CLIPImageProcessor

from model.LISA import LISAForCausalLM
from model.llava import conversation as conversation_lib
from model.llava.mm_utils import tokenizer_image_token
from model.segment_anything.utils.transforms import ResizeLongestSide
from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
                         DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
import spaces

def parse_args(args):
    parser = argparse.ArgumentParser(description="LISA chat")
    parser.add_argument("--version", default="derektan95/LISA-AVS")
    parser.add_argument("--vis_save_path", default="./vis_output", type=str)
    parser.add_argument(
        "--precision",
        default="bf16",
        type=str,
        choices=["fp32", "bf16", "fp16"],
        help="precision for inference",
    )
    parser.add_argument("--image_size", default=1024, type=int, help="image size")
    parser.add_argument("--model_max_length", default=512, type=int)
    parser.add_argument("--lora_r", default=8, type=int)
    parser.add_argument(
        "--vision-tower", default="openai/clip-vit-large-patch14", type=str
    )
    parser.add_argument("--local-rank", default=0, type=int, help="node rank")
    parser.add_argument("--load_in_8bit", action="store_true", default=False)
    parser.add_argument("--load_in_4bit", action="store_true", default=False)
    parser.add_argument("--use_mm_start_end", action="store_true", default=True)
    parser.add_argument(
        "--conv_type",
        default="llava_v1",
        type=str,
        choices=["llava_v1", "llava_llama_2"],
    )
    return parser.parse_args(args)


def preprocess(
    x,
    pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
    pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
    img_size=1024,
) -> torch.Tensor:
    """Normalize pixel values and pad to a square input."""
    # Normalize colors
    x = (x - pixel_mean) / pixel_std
    # Pad
    h, w = x.shape[-2:]
    padh = img_size - h
    padw = img_size - w
    x = F.pad(x, (0, padw, 0, padh))
    return x


args = parse_args(sys.argv[1:])
os.makedirs(args.vis_save_path, exist_ok=True)

# Create model
tokenizer = AutoTokenizer.from_pretrained(
    args.version,
    cache_dir=None,
    model_max_length=args.model_max_length,
    padding_side="right",
    use_fast=False,
)
tokenizer.pad_token = tokenizer.unk_token
args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]

torch_dtype = torch.float32
if args.precision == "bf16":
    torch_dtype = torch.bfloat16
elif args.precision == "fp16":
    torch_dtype = torch.half

kwargs = {"torch_dtype": torch_dtype}
if args.load_in_4bit:
    kwargs.update(
        {
            "torch_dtype": torch.half,
            "load_in_4bit": True,
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                llm_int8_skip_modules=["visual_model"],
            ),
        }
    )
elif args.load_in_8bit:
    kwargs.update(
        {
            "torch_dtype": torch.half,
            "quantization_config": BitsAndBytesConfig(
                llm_int8_skip_modules=["visual_model"],
                load_in_8bit=True,
            ),
        }
    )

model = LISAForCausalLM.from_pretrained(
    args.version, low_cpu_mem_usage=True, vision_tower=args.vision_tower, seg_token_idx=args.seg_token_idx, **kwargs
)

model.config.eos_token_id = tokenizer.eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

model.get_model().initialize_vision_modules(model.get_model().config)
vision_tower = model.get_model().get_vision_tower()
vision_tower.to(dtype=torch_dtype)

if args.precision == "bf16":
    model = model.bfloat16().cuda()
elif (
    args.precision == "fp16" and (not args.load_in_4bit) and (not args.load_in_8bit)
):
    vision_tower = model.get_model().get_vision_tower()
    model.model.vision_tower = None
    import deepspeed

    model_engine = deepspeed.init_inference(
        model=model,
        dtype=torch.half,
        replace_with_kernel_inject=True,
        replace_method="auto",
    )
    model = model_engine.module
    model.model.vision_tower = vision_tower.half().cuda()
elif args.precision == "fp32":
    model = model.float().cuda()

vision_tower = model.get_model().get_vision_tower()
vision_tower.to(device=args.local_rank)

clip_image_processor = CLIPImageProcessor.from_pretrained(model.config.vision_tower)
transform = ResizeLongestSide(args.image_size)

model.eval()


# Gradio
examples_in_domain = [
    [
        "./imgs/examples/Animalia_Chordata_Aves_Charadriiformes_Laridae_Larus_marinus/80645_39.76079_-74.10316.jpg",
        "Where can I find the shore birds (Larus marinus) in this image? Please output segmentation mask.",
    ],    
    [
        "./imgs/examples/Animalia_Chordata_Mammalia_Rodentia_Caviidae_Hydrochoerus_hydrochaeris/28871_-12.80255_-69.29999.jpg",
        "Where can I find the capybaras (Hydrochoerus hydrochaeris) in this image? Please output segmentation mask and explain why.",
    ],  
    [
        "./imgs/examples/Animalia_Arthropoda_Malacostraca_Decapoda_Ocypodidae_Ocypode_quadrata/277303_38.72364_-75.07749.jpg",
        "Where can I find the crabs (Ocypode quadrata) in this image? Please output segmentation mask.",
    ],
    [
        "./imgs/examples/Animalia_Chordata_Mammalia_Rodentia_Sciuridae_Marmota_marmota/388246_45.49036_7.14796.jpg",
        "Where can I find the marmots (Marmota marmota) in this image? Please output segmentation mask and explain why.",
    ],
    [
        "./imgs/examples/Animalia_Chordata_Reptilia_Squamata_Varanidae_Varanus_salvator/410613_5.35573_100.28948.jpg",
        "Where can I find monitor lizard (Varanus salvator) in this image? Please output segmentation mask.",
    ],

]

examples_out_domain = [
    [
        "./imgs/examples/Animalia_Chordata_Mammalia_Carnivora_Phocidae_Mirounga_angustirostris/27423_35.64005_-121.17595.jpg",
        "Where can I find the seals (Animalia Chordata Mammalia Carnivora Phocidae Mirounga angustirostris) in this image? Please output segmentation mask and explain why.",
    ],
    [
        "./imgs/examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Urocyon_littoralis/304160_34.0144_-119.54417.jpg",
        "Where can I find the raccoons (Animalia Chordata Mammalia Carnivora Canidae Urocyon littoralis) in this image? Please output segmentation mask.",
    ],
    [
        "./imgs/examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Canis_aureus/1528408_13.00422_80.23033.jpg",
        "Where can I find the wolves (Animalia Chordata Mammalia Carnivora Canidae Canis aureus) in this image? Please output segmentation mask and explain why.",
    ],
]
output_labels = ["Segmentation Output"]

title = "LISA-AVS: LISA 7B Model Finetuned on AVS-Bench Dataset"

description = """
<font size=4>
This is an adapted version of the online demo for <a href='https://github.com/dvlab-research/LISA' target='_blank'>LISA</a>, where we finetune from scratch the LISA model (7B) with data from <a href='https://search-tta.github.io/' target='_blank'>AVS-Bench (Search-TTA)</a>. \n
**Note**: Different prompts can lead to significantly varied results. Please **standardize** your input text prompts to **avoid ambiguity**, and pay attention to whether the **punctuations** of the input are correct. \n
**Usage**: <br>
&ensp;(1) To let LISA-AVS **segment something**, input prompt like: "Where can I find the <em>Common Name</em> (<em>Taxonomy Name</em>) in this image? Please output segmentation mask."; <br>
&ensp;(2) To let LISA-AVS **output an explanation**, input prompt like: "Where can I find the <em>Common Name</em> (<em>Taxonomy Name</em>) in this image? Please output segmentation mask and explain why."; <br>
&ensp;(3) To obtain **solely language output**, you can input like what you should do in current multi-modal LLM (e.g., LLaVA), like: "Where can I find the <em>Common Name</em> (<em>Taxonomy Name</em>) in this image?" <br>

</font>
"""

article = """
<p style='text-align: center'>
<a href='https://search-tta.github.io/' target='_blank'>
Search-TTA
</a>
\n
<p style='text-align: center'>
<a href='https://huggingface.co/datasets/derektan95/avs-bench' target='_blank'>
AVS-Bench
</a>
\n
<p style='text-align: center'>
<a href='https://github.com/dvlab-research/LISA' target='_blank'> LISA Project </a></p>
"""


## to be implemented
@spaces.GPU
def inference(input_image, input_str):
    ## filter out special chars
    input_str = bleach.clean(input_str)

    print("input_str: ", input_str, "input_image: ", input_image)

    ## basic validity check: non-empty and reasonable length only
    if len(input_str.strip()) == 0 or len(input_str) > 1024:
        output_str = f"[Error] Invalid input length: {len(input_str)}"
        # Create a red placeholder image to indicate an error
        output_image = np.zeros((128, 128, 3), dtype=np.uint8)
        output_image[:] = (0, 0, 0)  # Red color in RGB
        return output_image, output_str

    # Model Inference
    conv = conversation_lib.conv_templates[args.conv_type].copy()
    conv.messages = []

    prompt = input_str
    prompt = DEFAULT_IMAGE_TOKEN + "\n" + prompt
    if args.use_mm_start_end:
        replace_token = (
            DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
        )
        prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)

    conv.append_message(conv.roles[0], prompt)
    conv.append_message(conv.roles[1], "")
    prompt = conv.get_prompt()

    image_np = cv2.imread(input_image)
    image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
    original_size_list = [image_np.shape[:2]]

    image_clip = (
        clip_image_processor.preprocess(image_np, return_tensors="pt")[
            "pixel_values"
        ][0]
        .unsqueeze(0)
        .cuda()
    )
    if args.precision == "bf16":
        image_clip = image_clip.bfloat16()
    elif args.precision == "fp16":
        image_clip = image_clip.half()
    else:
        image_clip = image_clip.float()

    image = transform.apply_image(image_np)
    resize_list = [image.shape[:2]]

    image = (
        preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous())
        .unsqueeze(0)
        .cuda()
    )
    if args.precision == "bf16":
        image = image.bfloat16()
    elif args.precision == "fp16":
        image = image.half()
    else:
        image = image.float()


    input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
    input_ids = input_ids.unsqueeze(0).cuda()

    output_ids, pred_masks = model.evaluate(
        image_clip,
        image,
        input_ids,
        resize_list,
        original_size_list,
        max_new_tokens=512,
        tokenizer=tokenizer,
    )
    output_ids = output_ids[0][output_ids[0] != IMAGE_TOKEN_INDEX]

    text_output = tokenizer.decode(output_ids, skip_special_tokens=False)
    text_output = text_output.replace("\n", "").replace("  ", " ")
    text_output = text_output.split("ASSISTANT: ")[-1]

    print("text_output: ", text_output)
    save_img = None
    for i, pred_mask in enumerate(pred_masks):
        if pred_mask.shape[0] == 0:
            continue

        pred_mask_np = pred_mask.detach().cpu().numpy()[0]
        
        # Normalize the continuous score mask to 0-255 range for visualization
        min_val = float(pred_mask_np.min())
        max_val = float(pred_mask_np.max())
        # Avoid division by zero if min_val == max_val
        denom = (max_val - min_val) if (max_val - min_val) != 0 else 1e-8
        
        # Normalize to [0, 255] for image display
        normalized_mask = ((pred_mask_np - min_val) / denom * 255).astype(np.uint8)
        
        # Apply colormap (jet) to create a colored visualization
        save_img = cv2.applyColorMap(normalized_mask, cv2.COLORMAP_VIRIDIS)
        save_img = cv2.cvtColor(save_img, cv2.COLOR_BGR2RGB)

        # -------------------------------------------------------------
        # Add a vertical legend (color bar) to the right of save_img
        # -------------------------------------------------------------
        legend_width = 30
        legend_height = save_img.shape[0]
        # Create vertical gradient from 255 (top) to 0 (bottom)
        gradient = np.linspace(255, 0, legend_height, dtype=np.uint8).reshape(-1, 1)
        gradient = np.repeat(gradient, legend_width, axis=1)
        legend_color = cv2.applyColorMap(gradient, cv2.COLORMAP_VIRIDIS)
        legend_color = cv2.cvtColor(legend_color, cv2.COLOR_BGR2RGB)

        # Put min / max text on legend
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.4
        thickness = 1
        cv2.putText(legend_color, f"{max_val:.2f}", (2, 12), font, font_scale, (255, 255, 255), thickness, cv2.LINE_AA)
        cv2.putText(legend_color, f"{min_val:.2f}", (2, legend_height - 4), font, font_scale, (255, 255, 255), thickness, cv2.LINE_AA)

        # Concatenate original visualization with legend
        save_img = np.concatenate([save_img, legend_color], axis=1)

    output_str = "ASSISTANT: " + text_output  # input_str
    if save_img is not None:
        output_image = save_img  # input_image
    else:
        ## no segmentation output, return a black placeholder image
        output_image = np.zeros((128, 128, 3), dtype=np.uint8)
    return output_image, output_str


with gr.Blocks() as demo:
    # Title, description and article
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    # gr.Markdown(articSle)

    # Input and output columns side-by-side
    with gr.Row():
        with gr.Column():  # Left column – inputs
            inp_image = gr.Image(type="filepath", label="Input Image",sources=["upload"])
            inp_text = gr.Textbox(lines=1, placeholder=None, label="Text Instruction")
            run_btn = gr.Button("Run", variant="primary")
        with gr.Column():  # Right column – outputs
            out_seg = gr.Image(type="pil", label="Segmentation Output")
            out_text = gr.Textbox(lines=1, label="Text Output")

    # Bind the button to inference
    run_btn.click(fn=inference, inputs=[inp_image, inp_text], outputs=[out_seg, out_text])

    # ---------------- Example Galleries ----------------
    gr.Markdown("### In-Domain Taxonomy")
    gr.Examples(
        examples=examples_in_domain,
        inputs=[inp_image, inp_text],
        outputs=[out_seg, out_text],
        fn=inference,
        cache_examples=False,
    )

    gr.Markdown("### Out-Domain Taxonomy")
    gr.Examples(
        examples=examples_out_domain,
        inputs=[inp_image, inp_text],
        outputs=[out_seg, out_text],
        fn=inference,
        cache_examples=False,
    )

demo.queue()
demo.launch()