wangzeze committed on
Commit
0453c63
·
verified ·
1 Parent(s): 296ec95

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +7 -0
  2. .ipynb_checkpoints/batch_generate-checkpoint.py +401 -0
  3. .ipynb_checkpoints/batch_generate-checkpoint.sh +14 -0
  4. .ipynb_checkpoints/batch_generate_prefill_accelerate-checkpoint.py +418 -0
  5. .ipynb_checkpoints/chat-checkpoint.py +255 -0
  6. .ipynb_checkpoints/chat_prefill-checkpoint.py +282 -0
  7. .ipynb_checkpoints/train_aff-checkpoint.py +620 -0
  8. README.md +79 -3
  9. app.py +329 -0
  10. batch_generate.sh +14 -0
  11. batch_generate_prefill_accelerate.py +418 -0
  12. chat.py +255 -0
  13. chat_prefill.py +282 -0
  14. ckpts/AffordanceVLM-7B/.gitattributes +35 -0
  15. ckpts/AffordanceVLM-7B/README.md +3 -0
  16. ckpts/AffordanceVLM-7B/added_tokens.json +7 -0
  17. ckpts/AffordanceVLM-7B/config.json +42 -0
  18. ckpts/AffordanceVLM-7B/eval_result.txt +1 -0
  19. ckpts/AffordanceVLM-7B/generation_config.json +7 -0
  20. ckpts/AffordanceVLM-7B/pytorch_model-00001-of-00002.bin +3 -0
  21. ckpts/AffordanceVLM-7B/pytorch_model-00002-of-00002.bin +3 -0
  22. ckpts/AffordanceVLM-7B/pytorch_model.bin.index.json +930 -0
  23. ckpts/AffordanceVLM-7B/special_tokens_map.json +24 -0
  24. ckpts/AffordanceVLM-7B/tokenizer.model +3 -0
  25. ckpts/AffordanceVLM-7B/tokenizer_config.json +35 -0
  26. ckpts/sam_vit_h_4b8939.pth +3 -0
  27. client.py +67 -0
  28. data_curation/.ipynb_checkpoints/check_dataset-checkpoint.py +100 -0
  29. data_curation/build_vlpart.py +105 -0
  30. data_curation/check_dataset.py +100 -0
  31. data_curation/prompt_generation_handal_easy_reasoning.py +126 -0
  32. data_curation/prompt_generation_handal_hard_reasoning.py +136 -0
  33. data_curation/vlpart_sam2_tracking.py +187 -0
  34. docs/dataset.md +93 -0
  35. docs/installation.md +10 -0
  36. docs/training_and_evaluation.md +56 -0
  37. imgs/.ipynb_checkpoints/AffordanceNet-checkpoint.jpg +3 -0
  38. imgs/AffordanceNet.jpg +3 -0
  39. imgs/AffordanceNet.png +3 -0
  40. merge_lora_weights_and_save_hf_model.py +162 -0
  41. model/AffordanceVLM.py +428 -0
  42. model/__pycache__/AffordanceVLM.cpython-39.pyc +0 -0
  43. model/llava/__init__.py +1 -0
  44. model/llava/__pycache__/__init__.cpython-39.pyc +0 -0
  45. model/llava/__pycache__/constants.cpython-39.pyc +0 -0
  46. model/llava/__pycache__/conversation.cpython-39.pyc +0 -0
  47. model/llava/__pycache__/mm_utils.cpython-39.pyc +0 -0
  48. model/llava/constants.py +12 -0
  49. model/llava/conversation.py +399 -0
  50. model/llava/mm_utils.py +88 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ imgs/.ipynb_checkpoints/AffordanceNet-checkpoint.jpg filter=lfs diff=lfs merge=lfs -text
37
+ imgs/AffordanceNet.jpg filter=lfs diff=lfs merge=lfs -text
38
+ imgs/AffordanceNet.png filter=lfs diff=lfs merge=lfs -text
39
+ vis_output/.ipynb_checkpoints/my_workspace-checkpoint.JPG filter=lfs diff=lfs merge=lfs -text
40
+ vis_output/.ipynb_checkpoints/my_workspace_masked_img_0-checkpoint.jpg filter=lfs diff=lfs merge=lfs -text
41
+ vis_output/my_workspace.JPG filter=lfs diff=lfs merge=lfs -text
42
+ vis_output/my_workspace_masked_img_0.jpg filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/batch_generate-checkpoint.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Batch affordance mask generation for per-step datasets.
3
+
4
+ Reads a per-step dataset (converted by convert_lerobot_to_perstep.py) and
5
+ generates affordance masks for every image_primary.jpg and image_wrist.jpg
6
+ using AffordanceVLM.
7
+
8
+ Input structure:
9
+ {data_dir}/
10
+ ├── meta_info.h5
11
+ └── episodes/
12
+ └── {episode_id:06d}/
13
+ └── steps/
14
+ └── {step_id:04d}/
15
+ ├── other.h5 # language_instruction
16
+ ├── image_primary.jpg
17
+ └── image_wrist.jpg
18
+
19
+ Output structure:
20
+ {save_dir}/
21
+ └── episode_{episode_id}/
22
+ └── steps/
23
+ └── step_{step_id}/
24
+ ├── image_primary_mask.png # binary 0/255
25
+ └── image_wrist_mask.png
26
+
27
+ Usage:
28
+ python batch_generate.py \
29
+ --data_dir /path/to/perstep_dataset \
30
+ --save_dir /path/to/mask_output \
31
+ --start_episode 0 --end_episode 10
32
+ """
33
+
34
+ import argparse
35
+ import os
36
+ import sys
37
+ from pathlib import Path
38
+
39
+ import cv2
40
+ import h5py
41
+ import numpy as np
42
+ import torch
43
+ import torch.nn.functional as F
44
+ from transformers import AutoTokenizer, BitsAndBytesConfig, CLIPImageProcessor
45
+
46
+ from model.AffordanceVLM import AffordanceVLMForCausalLM
47
+ from model.llava import conversation as conversation_lib
48
+ from model.llava.mm_utils import tokenizer_image_token
49
+ from model.segment_anything.utils.transforms import ResizeLongestSide
50
+ from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
51
+ DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
52
+
53
+
54
def parse_args(args):
    """Parse command-line options for batch mask generation.

    args: argv list without the program name (e.g. ``sys.argv[1:]``).
    Returns an argparse.Namespace.  Note: load_model later attaches
    ``seg_token_idx`` / ``aff_token_idx`` onto the same namespace.
    """
    p = argparse.ArgumentParser(
        description="Batch affordance mask generation for per-step datasets"
    )

    # --- model options (mirrors chat.py) ---
    p.add_argument("--version", default="/gemini/code/AffordanceNet/ckpts/AffordanceVLM-7B")
    p.add_argument(
        "--precision", default="bf16", type=str,
        choices=["fp32", "bf16", "fp16"],
    )
    p.add_argument("--image_size", default=1024, type=int)
    p.add_argument("--model_max_length", default=512, type=int)
    p.add_argument("--lora_r", default=8, type=int)
    p.add_argument("--vision-tower", default="openai/clip-vit-large-patch14", type=str)
    p.add_argument("--local-rank", default=0, type=int)
    p.add_argument("--load_in_8bit", action="store_true", default=False)
    p.add_argument("--load_in_4bit", action="store_true", default=False)
    p.add_argument("--use_mm_start_end", action="store_true", default=True)
    p.add_argument(
        "--conv_type", default="llava_v1", type=str,
        choices=["llava_v1", "llava_llama_2"],
    )

    # --- batch-processing options ---
    p.add_argument("--data_dir", type=str, required=True,
                   help="Root of per-step dataset (contains episodes/)")
    p.add_argument("--save_dir", type=str, required=True,
                   help="Output directory for masks")
    p.add_argument("--prompt_template", type=str,
                   default="{}",
                   help="Template wrapping language_instruction. Use {} as placeholder.")
    p.add_argument("--start_episode", type=int, default=None,
                   help="First episode index to process (inclusive)")
    p.add_argument("--end_episode", type=int, default=None,
                   help="Last episode index to process (exclusive)")
    return p.parse_args(args)
90
+
91
+
92
def preprocess(
    x,
    pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
    pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
    img_size=1024,
) -> torch.Tensor:
    """Normalize an image tensor with SAM's per-channel statistics, then
    zero-pad the bottom and right edges up to an img_size x img_size square.
    """
    normalized = (x - pixel_mean) / pixel_std
    height, width = normalized.shape[-2:]
    # F.pad order is (left, right, top, bottom) on the last two dims.
    return F.pad(normalized, (0, img_size - width, 0, img_size - height))
105
+
106
+
107
def load_model(args):
    """Load tokenizer and AffordanceVLM model for inference (same recipe as chat.py).

    Returns (model, tokenizer, clip_image_processor, transform), where
    transform is the SAM ResizeLongestSide used on raw images.

    Side effect: stores the token ids of the added "[SEG]" / "[AFF]" tokens
    on ``args`` (seg_token_idx / aff_token_idx) for the model constructor.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        args.version,
        cache_dir=None,
        model_max_length=args.model_max_length,
        padding_side="right",
        use_fast=False,
    )
    tokenizer.pad_token = tokenizer.unk_token
    # Register the special segmentation/affordance tokens and remember their ids.
    tokenizer.add_tokens("[SEG]")
    args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]
    tokenizer.add_tokens("[AFF]")
    args.aff_token_idx = tokenizer("[AFF]", add_special_tokens=False).input_ids[0]

    # Map the --precision flag to a torch dtype for weight loading.
    torch_dtype = torch.float32
    if args.precision == "bf16":
        torch_dtype = torch.bfloat16
    elif args.precision == "fp16":
        torch_dtype = torch.half

    kwargs = {"torch_dtype": torch_dtype}
    if args.load_in_4bit:
        # 4-bit NF4 quantization; the SAM visual model is kept unquantized.
        kwargs.update({
            "torch_dtype": torch.half,
            "load_in_4bit": True,
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                llm_int8_skip_modules=["visual_model"],
            ),
        })
    elif args.load_in_8bit:
        kwargs.update({
            "torch_dtype": torch.half,
            "quantization_config": BitsAndBytesConfig(
                llm_int8_skip_modules=["visual_model"],
                load_in_8bit=True,
            ),
        })

    model = AffordanceVLMForCausalLM.from_pretrained(
        args.version,
        low_cpu_mem_usage=True,
        vision_tower=args.vision_tower,
        seg_token_idx=args.seg_token_idx,
        aff_token_idx=args.aff_token_idx,
        **kwargs,
    )

    model.config.eos_token_id = tokenizer.eos_token_id
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    # Build the CLIP vision tower and cast it to the chosen dtype before
    # moving the whole model, so its weights match the rest of the network.
    model.get_model().initialize_vision_modules(model.get_model().config)
    vision_tower = model.get_model().get_vision_tower()
    vision_tower.to(dtype=torch_dtype)

    if args.precision == "bf16":
        model = model.bfloat16().cuda()
    elif args.precision == "fp16" and (not args.load_in_4bit) and (not args.load_in_8bit):
        # DeepSpeed kernel injection cannot handle the vision tower, so it is
        # detached before init_inference and re-attached (half, on GPU) after.
        vision_tower = model.get_model().get_vision_tower()
        model.model.vision_tower = None
        import deepspeed
        model_engine = deepspeed.init_inference(
            model=model,
            dtype=torch.half,
            replace_with_kernel_inject=True,
            replace_method="auto",
        )
        model = model_engine.module
        model.model.vision_tower = vision_tower.half().cuda()
    elif args.precision == "fp32":
        model = model.float().cuda()

    # NOTE(review): args.local_rank is used as the CUDA device index here —
    # assumes one process per GPU; confirm for multi-GPU launches.
    vision_tower = model.get_model().get_vision_tower()
    vision_tower.to(device=args.local_rank)

    clip_image_processor = CLIPImageProcessor.from_pretrained(model.config.vision_tower)
    transform = ResizeLongestSide(args.image_size)

    model.eval()
    return model, tokenizer, clip_image_processor, transform
192
+
193
+
194
def build_prompt(text: str, args) -> str:
    """Assemble the full conversation prompt for one text query.

    Wraps the query in the conversation template chosen by args.conv_type,
    prefixes the image placeholder token, and leaves the assistant turn
    empty so the model generates the answer.
    """
    conversation = conversation_lib.conv_templates[args.conv_type].copy()
    conversation.messages = []

    user_turn = f"{DEFAULT_IMAGE_TOKEN}\nYou are an embodied robot. {text}"
    if args.use_mm_start_end:
        # Surround the image placeholder with explicit start/end markers.
        wrapped = (
            DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
        )
        user_turn = user_turn.replace(DEFAULT_IMAGE_TOKEN, wrapped)

    conversation.append_message(conversation.roles[0], user_turn)
    conversation.append_message(conversation.roles[1], "")
    return conversation.get_prompt()
209
+
210
+
211
def infer_single_image(
    image_path: str,
    prompt_str: str,
    model,
    tokenizer,
    clip_image_processor,
    transform,
    args,
) -> "np.ndarray | None":
    """Run affordance inference on one image.

    Returns a binary uint8 mask (H, W) with values 0/255, or None when the
    image cannot be read or the model produces no mask.
    """
    image_np = cv2.imread(image_path)
    if image_np is None:
        print(f" [WARNING] Cannot read image: {image_path}")
        return None
    # OpenCV loads BGR; the model pipelines expect RGB.
    image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
    original_size_list = [image_np.shape[:2]]

    # CLIP preprocessing (input to the vision-language branch).
    image_clip = (
        clip_image_processor.preprocess(image_np, return_tensors="pt")["pixel_values"][0]
        .unsqueeze(0)
        .cuda()
    )
    if args.precision == "bf16":
        image_clip = image_clip.bfloat16()
    elif args.precision == "fp16":
        image_clip = image_clip.half()
    else:
        image_clip = image_clip.float()

    # SAM preprocessing: resize longest side, normalize, pad to square.
    image = transform.apply_image(image_np)
    resize_list = [image.shape[:2]]
    image = (
        preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous())
        .unsqueeze(0)
        .cuda()
    )
    if args.precision == "bf16":
        image = image.bfloat16()
    elif args.precision == "fp16":
        image = image.half()
    else:
        image = image.float()

    # Tokenize the conversation prompt (image token handled specially).
    input_ids = tokenizer_image_token(prompt_str, tokenizer, return_tensors="pt")
    input_ids = input_ids.unsqueeze(0).cuda()

    # Autoregressive generation; model.evaluate is expected to return the
    # generated ids plus per-image mask predictions (resized back to the
    # original resolution via resize_list/original_size_list).
    with torch.no_grad():
        output_ids, pred_masks = model.evaluate(
            image_clip,
            image,
            input_ids,
            resize_list,
            original_size_list,
            max_new_tokens=512,
            tokenizer=tokenizer,
        )

    # Merge all predicted masks via union (logical OR); logits > 0 means
    # foreground.
    h, w = original_size_list[0]
    merged = np.zeros((h, w), dtype=bool)
    has_mask = False
    for pred_mask in pred_masks:
        if pred_mask.shape[0] == 0:
            continue
        mask_np = pred_mask.detach().cpu().numpy()[0]  # (H, W)
        merged |= (mask_np > 0)
        has_mask = True

    if not has_mask:
        return None

    return (merged.astype(np.uint8) * 255)
287
+
288
+
289
def read_language_instruction(h5_path: str) -> str:
    """Return the language_instruction string stored in a step's other.h5."""
    with h5py.File(h5_path, "r") as h5_file:
        raw = h5_file["language_instruction"][()]
    # h5py may hand back bytes; normalize to str either way.
    return raw.decode("utf-8") if isinstance(raw, bytes) else str(raw)
296
+
297
+
298
def main(args):
    """Generate affordance masks for every step image in a per-step dataset.

    args: raw argv list (e.g. ``sys.argv[1:]``); parsed by parse_args.
    Walks {data_dir}/episodes/{episode}/steps/{step}/, runs the model on
    image_primary.jpg / image_wrist.jpg, and writes binary PNG masks under
    {save_dir} with the same directory layout.
    """
    args = parse_args(args)
    data_dir = Path(args.data_dir)
    save_dir = Path(args.save_dir)

    episodes_dir = data_dir / "episodes"
    if not episodes_dir.is_dir():
        print(f"Error: episodes directory not found at {episodes_dir}")
        sys.exit(1)

    # Episode names are zero-padded, so lexicographic sort == numeric sort.
    episode_dirs = sorted(
        [d for d in episodes_dir.iterdir() if d.is_dir()],
        key=lambda p: p.name,
    )

    # Optionally restrict to the half-open range [start_episode, end_episode).
    if args.start_episode is not None or args.end_episode is not None:
        start = args.start_episode if args.start_episode is not None else 0
        end = args.end_episode if args.end_episode is not None else len(episode_dirs)
        # Fix: skip non-numeric directory names instead of crashing in int().
        episode_dirs = [
            d for d in episode_dirs
            if d.name.isdigit() and start <= int(d.name) < end
        ]

    print(f"Data dir : {data_dir}")
    print(f"Save dir : {save_dir}")
    print(f"Episodes : {len(episode_dirs)}")
    print(f"Prompt : {args.prompt_template}")
    print()

    # Load model
    print("Loading model...")
    model, tokenizer, clip_image_processor, transform = load_model(args)
    print("Model loaded.\n")

    total_steps = 0
    empty_mask_count = 0

    for ep_dir in episode_dirs:
        episode_id = ep_dir.name  # e.g. "000000"
        steps_dir = ep_dir / "steps"
        if not steps_dir.is_dir():
            print(f" [WARNING] No steps/ in {ep_dir}, skipping.")
            continue

        step_dirs = sorted(
            [d for d in steps_dir.iterdir() if d.is_dir()],
            key=lambda p: p.name,
        )

        for step_dir in step_dirs:
            step_id = step_dir.name  # e.g. "0000"

            # Read language instruction for this step.
            other_h5 = step_dir / "other.h5"
            if not other_h5.exists():
                print(f" [WARNING] Missing other.h5 in {step_dir}, skipping.")
                continue
            language_instruction = read_language_instruction(str(other_h5))

            # Build the conversation prompt once per step (shared by cameras).
            query_text = args.prompt_template.format(language_instruction)
            prompt_str = build_prompt(query_text, args)

            # Mirror the input layout: episodes/{episode_id}/steps/{step_id}/
            out_dir = save_dir / "episodes" / episode_id / "steps" / step_id
            out_dir.mkdir(parents=True, exist_ok=True)

            # Process both cameras.
            for cam_name in ("image_primary", "image_wrist"):
                img_path = step_dir / f"{cam_name}.jpg"
                mask_path = out_dir / f"{cam_name}_mask.png"

                if not img_path.exists():
                    print(f" [WARNING] Missing {img_path}, skipping.")
                    continue

                mask = infer_single_image(
                    str(img_path), prompt_str,
                    model, tokenizer, clip_image_processor, transform, args,
                )

                if mask is None:
                    # No mask predicted: fall back to an all-black mask.
                    # Fix: cv2.imread returns None for unreadable files (the
                    # same condition that makes infer_single_image return
                    # None), so guard it instead of crashing on `.shape`.
                    img = cv2.imread(str(img_path))
                    if img is None:
                        print(f" [WARNING] Unreadable image {img_path}, skipping.")
                        continue
                    mask = np.zeros(img.shape[:2], dtype=np.uint8)
                    empty_mask_count += 1

                cv2.imwrite(str(mask_path), mask)

                total_steps += 1
                if total_steps % 50 == 0:
                    print(f" Processed {total_steps} steps (episode {episode_id}, step {step_id})")

        print(f"Episode {episode_id} done ({len(step_dirs)} steps)")

    print(f"\nFinished. {total_steps} steps processed, {empty_mask_count} empty masks.")


if __name__ == "__main__":
    main(sys.argv[1:])
.ipynb_checkpoints/batch_generate-checkpoint.sh ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Run batch_generate.py once per LIBERO subset, one subset after another.

SRC_BASE="/gemini/space/wrz/libero_per_frame"
DST_BASE="/gemini/space/wrz/ragnet_results"

for subset in libero_object libero_goal libero_spatial libero_10; do
    echo "========== Processing ${subset} =========="
    CUDA_VISIBLE_DEVICES=0 python batch_generate.py \
        --data_dir "${SRC_BASE}/${subset}_converted" \
        --save_dir "${DST_BASE}/${subset}"
    echo "========== ${subset} done =========="
    echo
done
.ipynb_checkpoints/batch_generate_prefill_accelerate-checkpoint.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Batch affordance mask generation for per-step datasets.
3
+
4
+ Reads a per-step dataset (converted by convert_lerobot_to_perstep.py) and
5
+ generates affordance masks for every image_primary.jpg and image_wrist.jpg
6
+ using AffordanceVLM.
7
+
8
+ Input structure:
9
+ {data_dir}/
10
+ ├── meta_info.h5
11
+ └── episodes/
12
+ └── {episode_id:06d}/
13
+ └── steps/
14
+ └── {step_id:04d}/
15
+ ├── other.h5 # language_instruction
16
+ ├── image_primary.jpg
17
+ └── image_wrist.jpg
18
+
19
+ Output structure:
20
+ {save_dir}/
21
+ └── episodes/
22
+ └── {episode_id:06d}/
23
+ └── steps/
24
+ └── {step_id:04d}/
25
+ ├── image_primary_mask.png # binary 0/255
26
+ └── image_wrist_mask.png
27
+
28
+ Usage:
29
+ CUDA_VISIBLE_DEVICES=1 python batch_generate_prefill_accelerate.py \
30
+ --data_dir /gemini/space/wrz/libero_per_frame/libero_spatial_converted \
31
+ --save_dir /gemini/space/wrz/ragnet_results/libero_spatial
32
+ """
33
+
34
+ import argparse
35
+ import os
36
+ import sys
37
+ from pathlib import Path
38
+
39
+ import cv2
40
+ import h5py
41
+ import numpy as np
42
+ import torch
43
+ import torch.nn.functional as F
44
+ from transformers import AutoTokenizer, BitsAndBytesConfig, CLIPImageProcessor
45
+
46
+ from model.AffordanceVLM import AffordanceVLMForCausalLM
47
+ from model.llava import conversation as conversation_lib
48
+ from model.llava.mm_utils import tokenizer_image_token
49
+ from model.segment_anything.utils.transforms import ResizeLongestSide
50
+ from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
51
+ DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
52
+
53
+
54
def parse_args(args):
    """Parse command-line options for prefill-style batch mask generation.

    args: argv list without the program name (e.g. ``sys.argv[1:]``).
    Returns an argparse.Namespace.  Note: load_model later attaches
    ``seg_token_idx`` / ``aff_token_idx`` onto the same namespace.
    """
    p = argparse.ArgumentParser(
        description="Batch affordance mask generation for per-step datasets"
    )

    # --- model options (mirrors chat.py) ---
    p.add_argument("--version", default="/gemini/code/AffordanceNet/ckpts/AffordanceVLM-7B")
    p.add_argument(
        "--precision", default="bf16", type=str,
        choices=["fp32", "bf16", "fp16"],
    )
    p.add_argument("--image_size", default=1024, type=int)
    p.add_argument("--model_max_length", default=512, type=int)
    p.add_argument("--lora_r", default=8, type=int)
    p.add_argument("--vision-tower", default="openai/clip-vit-large-patch14", type=str)
    p.add_argument("--local-rank", default=0, type=int)
    p.add_argument("--load_in_8bit", action="store_true", default=False)
    p.add_argument("--load_in_4bit", action="store_true", default=False)
    p.add_argument("--use_mm_start_end", action="store_true", default=True)
    p.add_argument(
        "--conv_type", default="llava_v1", type=str,
        choices=["llava_v1", "llava_llama_2"],
    )

    # --- batch-processing options ---
    p.add_argument("--data_dir", type=str, required=True,
                   help="Root of per-step dataset (contains episodes/)")
    p.add_argument("--save_dir", type=str, required=True,
                   help="Output directory for masks")
    p.add_argument("--prompt_template", type=str,
                   default="{}",
                   help="Template wrapping language_instruction. Use {} as placeholder.")
    # Previously-tried templates, kept for reference:
    #   "{}"
    #   Segment the most suitable manipulation region on the single target object for the task '{}'.
    #   Segment the affordance map for the task '{}' in this image.
    #   Segment the affordance map of the single target object for the task '{}' in this image.
    #   Given the task instruction '{}', what is the affordance map of the target object in this image? Please output segmentation mask.
    #   Given the task instruction '{}', what is the affordance map of the single target object in this image? There is only one target object. Please output segmentation mask.
    p.add_argument("--start_episode", type=int, default=None,
                   help="First episode index to process (inclusive)")
    p.add_argument("--end_episode", type=int, default=None,
                   help="Last episode index to process (exclusive)")
    return p.parse_args(args)
96
+
97
+
98
def preprocess(
    x,
    pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
    pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
    img_size=1024,
) -> torch.Tensor:
    """Apply SAM mean/std normalization, then zero-pad right and bottom
    edges to reach a square img_size canvas."""
    x = (x - pixel_mean) / pixel_std
    h, w = x.shape[-2:]
    pad_right, pad_bottom = img_size - w, img_size - h
    # Pad spec is (left, right, top, bottom) over the trailing two dims.
    return F.pad(x, (0, pad_right, 0, pad_bottom))
111
+
112
+
113
def load_model(args):
    """Load tokenizer and AffordanceVLM model for inference (same recipe as chat.py).

    Returns (model, tokenizer, clip_image_processor, transform), where
    transform is the SAM ResizeLongestSide used on raw images.

    Side effect: stores the token ids of the added "[SEG]" / "[AFF]" tokens
    on ``args`` (seg_token_idx / aff_token_idx) for the model constructor.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        args.version,
        cache_dir=None,
        model_max_length=args.model_max_length,
        padding_side="right",
        use_fast=False,
    )
    tokenizer.pad_token = tokenizer.unk_token
    # Register the special segmentation/affordance tokens and remember their ids.
    tokenizer.add_tokens("[SEG]")
    args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]
    tokenizer.add_tokens("[AFF]")
    args.aff_token_idx = tokenizer("[AFF]", add_special_tokens=False).input_ids[0]

    # Map the --precision flag to a torch dtype for weight loading.
    torch_dtype = torch.float32
    if args.precision == "bf16":
        torch_dtype = torch.bfloat16
    elif args.precision == "fp16":
        torch_dtype = torch.half

    kwargs = {"torch_dtype": torch_dtype}
    if args.load_in_4bit:
        # 4-bit NF4 quantization; the SAM visual model is kept unquantized.
        kwargs.update({
            "torch_dtype": torch.half,
            "load_in_4bit": True,
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                llm_int8_skip_modules=["visual_model"],
            ),
        })
    elif args.load_in_8bit:
        kwargs.update({
            "torch_dtype": torch.half,
            "quantization_config": BitsAndBytesConfig(
                llm_int8_skip_modules=["visual_model"],
                load_in_8bit=True,
            ),
        })

    model = AffordanceVLMForCausalLM.from_pretrained(
        args.version,
        low_cpu_mem_usage=True,
        vision_tower=args.vision_tower,
        seg_token_idx=args.seg_token_idx,
        aff_token_idx=args.aff_token_idx,
        **kwargs,
    )

    model.config.eos_token_id = tokenizer.eos_token_id
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    # Build the CLIP vision tower and cast it to the chosen dtype before
    # moving the whole model, so its weights match the rest of the network.
    model.get_model().initialize_vision_modules(model.get_model().config)
    vision_tower = model.get_model().get_vision_tower()
    vision_tower.to(dtype=torch_dtype)

    if args.precision == "bf16":
        model = model.bfloat16().cuda()
    elif args.precision == "fp16" and (not args.load_in_4bit) and (not args.load_in_8bit):
        # DeepSpeed kernel injection cannot handle the vision tower, so it is
        # detached before init_inference and re-attached (half, on GPU) after.
        vision_tower = model.get_model().get_vision_tower()
        model.model.vision_tower = None
        import deepspeed
        model_engine = deepspeed.init_inference(
            model=model,
            dtype=torch.half,
            replace_with_kernel_inject=True,
            replace_method="auto",
        )
        model = model_engine.module
        model.model.vision_tower = vision_tower.half().cuda()
    elif args.precision == "fp32":
        model = model.float().cuda()

    # NOTE(review): args.local_rank is used as the CUDA device index here —
    # assumes one process per GPU; confirm for multi-GPU launches.
    vision_tower = model.get_model().get_vision_tower()
    vision_tower.to(device=args.local_rank)

    clip_image_processor = CLIPImageProcessor.from_pretrained(model.config.vision_tower)
    transform = ResizeLongestSide(args.image_size)

    model.eval()
    return model, tokenizer, clip_image_processor, transform
198
+
199
+
200
def build_prompt(text: str, args) -> str:
    """Assemble the full conversation prompt for one text query.

    Unlike the autoregressive variant, the assistant turn is pre-filled with
    "[AFF]." so the mask embedding can be extracted from a single forward
    pass (no generation needed).
    """
    conversation = conversation_lib.conv_templates[args.conv_type].copy()
    conversation.messages = []

    user_turn = f"{DEFAULT_IMAGE_TOKEN}\nYou are an embodied robot. {text}"
    if args.use_mm_start_end:
        # Surround the image placeholder with explicit start/end markers.
        wrapped = (
            DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
        )
        user_turn = user_turn.replace(DEFAULT_IMAGE_TOKEN, wrapped)

    conversation.append_message(conversation.roles[0], user_turn)
    conversation.append_message(conversation.roles[1], "[AFF].")
    return conversation.get_prompt()
215
+
216
+
217
def infer_single_image(
    image_path: str,
    prompt_str: str,
    model,
    tokenizer,
    clip_image_processor,
    transform,
    args,
) -> "np.ndarray | None":
    """Run affordance inference on one image via a single prefill forward pass.

    The prompt already contains the "[AFF]." answer, so instead of calling
    model.evaluate (autoregressive generation) this runs one teacher-forced
    forward with inference=True and reads the predicted masks directly.
    Returns a binary uint8 mask (H, W) with values 0/255, or None when the
    image cannot be read or the model produces no mask.
    """
    image_np = cv2.imread(image_path)
    if image_np is None:
        print(f" [WARNING] Cannot read image: {image_path}")
        return None
    # OpenCV loads BGR; the model pipelines expect RGB.
    image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
    original_size_list = [image_np.shape[:2]]

    # CLIP preprocessing (input to the vision-language branch).
    image_clip = (
        clip_image_processor.preprocess(image_np, return_tensors="pt")["pixel_values"][0]
        .unsqueeze(0)
        .cuda()
    )
    if args.precision == "bf16":
        image_clip = image_clip.bfloat16()
    elif args.precision == "fp16":
        image_clip = image_clip.half()
    else:
        image_clip = image_clip.float()

    # SAM preprocessing: resize longest side, normalize, pad to square.
    image = transform.apply_image(image_np)
    resize_list = [image.shape[:2]]
    image = (
        preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous())
        .unsqueeze(0)
        .cuda()
    )
    if args.precision == "bf16":
        image = image.bfloat16()
    elif args.precision == "fp16":
        image = image.half()
    else:
        image = image.float()

    # Tokenize
    input_ids = tokenizer_image_token(prompt_str, tokenizer, return_tensors="pt")
    input_ids = input_ids.unsqueeze(0).cuda()
    attention_masks = input_ids.ne(tokenizer.pad_token_id)

    # Prefill inference (single forward pass instead of autoregressive
    # generation).  labels/offset/masks_list/label_list are presumably dummy
    # placeholders required by the training-style forward signature —
    # TODO(review): confirm against AffordanceVLMForCausalLM.forward.
    h, w = original_size_list[0]
    labels = input_ids.clone()
    offset = torch.LongTensor([0, 1]).cuda()
    masks_list = [torch.zeros(1, h, w).float().cuda()]
    label_list = [torch.zeros(h, w).long().cuda()]

    with torch.no_grad():
        output_dict = model(
            images=image,
            images_clip=image_clip,
            input_ids=input_ids,
            labels=labels,
            attention_masks=attention_masks,
            offset=offset,
            masks_list=masks_list,
            label_list=label_list,
            resize_list=resize_list,
            inference=True,
        )

    pred_masks = output_dict["pred_masks"]

    # Merge all predicted masks via union (logical OR); logits > 0 means
    # foreground.
    merged = np.zeros((h, w), dtype=bool)
    has_mask = False
    for pred_mask in pred_masks:
        if pred_mask.shape[0] == 0:
            continue
        mask_np = pred_mask.detach().cpu().numpy()[0]  # (H, W)
        merged |= (mask_np > 0)
        has_mask = True

    if not has_mask:
        return None

    return (merged.astype(np.uint8) * 255)
304
+
305
+
306
def read_language_instruction(h5_path: str) -> str:
    """Fetch the language_instruction entry from a step's other.h5 file."""
    with h5py.File(h5_path, "r") as f:
        value = f["language_instruction"][()]
    # h5py may return raw bytes; decode those, stringify anything else.
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return str(value)
313
+
314
+
315
def main(args):
    """Generate affordance masks for every step image in a per-step dataset.

    args: raw argv list (e.g. ``sys.argv[1:]``); parsed by parse_args.
    Walks {data_dir}/episodes/{episode}/steps/{step}/, runs the model on
    image_primary.jpg / image_wrist.jpg, and writes binary PNG masks under
    {save_dir} with the same directory layout.
    """
    args = parse_args(args)
    data_dir = Path(args.data_dir)
    save_dir = Path(args.save_dir)

    episodes_dir = data_dir / "episodes"
    if not episodes_dir.is_dir():
        print(f"Error: episodes directory not found at {episodes_dir}")
        sys.exit(1)

    # Episode names are zero-padded, so lexicographic sort == numeric sort.
    episode_dirs = sorted(
        [d for d in episodes_dir.iterdir() if d.is_dir()],
        key=lambda p: p.name,
    )

    # Optionally restrict to the half-open range [start_episode, end_episode).
    if args.start_episode is not None or args.end_episode is not None:
        start = args.start_episode if args.start_episode is not None else 0
        end = args.end_episode if args.end_episode is not None else len(episode_dirs)
        # Fix: skip non-numeric directory names instead of crashing in int().
        episode_dirs = [
            d for d in episode_dirs
            if d.name.isdigit() and start <= int(d.name) < end
        ]

    print(f"Data dir : {data_dir}")
    print(f"Save dir : {save_dir}")
    print(f"Episodes : {len(episode_dirs)}")
    print(f"Prompt : {args.prompt_template}")
    print()

    # Load model
    print("Loading model...")
    model, tokenizer, clip_image_processor, transform = load_model(args)
    print("Model loaded.\n")

    total_steps = 0
    empty_mask_count = 0

    for ep_dir in episode_dirs:
        episode_id = ep_dir.name  # e.g. "000000"
        steps_dir = ep_dir / "steps"
        if not steps_dir.is_dir():
            print(f" [WARNING] No steps/ in {ep_dir}, skipping.")
            continue

        step_dirs = sorted(
            [d for d in steps_dir.iterdir() if d.is_dir()],
            key=lambda p: p.name,
        )

        for step_dir in step_dirs:
            step_id = step_dir.name  # e.g. "0000"

            # Read language instruction for this step.
            other_h5 = step_dir / "other.h5"
            if not other_h5.exists():
                print(f" [WARNING] Missing other.h5 in {step_dir}, skipping.")
                continue
            language_instruction = read_language_instruction(str(other_h5))

            # Build the conversation prompt once per step (shared by cameras).
            query_text = args.prompt_template.format(language_instruction)
            prompt_str = build_prompt(query_text, args)

            # Mirror the input layout: episodes/{episode_id}/steps/{step_id}/
            out_dir = save_dir / "episodes" / episode_id / "steps" / step_id
            out_dir.mkdir(parents=True, exist_ok=True)

            # Process both cameras.
            for cam_name in ("image_primary", "image_wrist"):
                img_path = step_dir / f"{cam_name}.jpg"
                mask_path = out_dir / f"{cam_name}_mask.png"

                if not img_path.exists():
                    print(f" [WARNING] Missing {img_path}, skipping.")
                    continue

                mask = infer_single_image(
                    str(img_path), prompt_str,
                    model, tokenizer, clip_image_processor, transform, args,
                )

                if mask is None:
                    # No mask predicted: fall back to an all-black mask.
                    # Fix: cv2.imread returns None for unreadable files (the
                    # same condition that makes infer_single_image return
                    # None), so guard it instead of crashing on `.shape`.
                    img = cv2.imread(str(img_path))
                    if img is None:
                        print(f" [WARNING] Unreadable image {img_path}, skipping.")
                        continue
                    mask = np.zeros(img.shape[:2], dtype=np.uint8)
                    empty_mask_count += 1

                cv2.imwrite(str(mask_path), mask)

                total_steps += 1
                if total_steps % 50 == 0:
                    print(f" Processed {total_steps} steps (episode {episode_id}, step {step_id})")

        print(f"Episode {episode_id} done ({len(step_dirs)} steps)")

    print(f"\nFinished. {total_steps} steps processed, {empty_mask_count} empty masks.")


if __name__ == "__main__":
    main(sys.argv[1:])
.ipynb_checkpoints/chat-checkpoint.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import sys
4
+
5
+ import cv2
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from transformers import AutoTokenizer, BitsAndBytesConfig, CLIPImageProcessor
10
+
11
+ from model.AffordanceVLM import AffordanceVLMForCausalLM
12
+ from model.llava import conversation as conversation_lib
13
+ from model.llava.mm_utils import tokenizer_image_token
14
+ from model.segment_anything.utils.transforms import ResizeLongestSide
15
+ from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
16
+ DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
17
+
18
+
19
+ def parse_args(args):
20
+ parser = argparse.ArgumentParser(description="LISA chat")
21
+ parser.add_argument("--version", default="/gemini/code/AffordanceNet/ckpts/AffordanceVLM-7B")
22
+ parser.add_argument("--vis_save_path", default="./vis_output", type=str)
23
+ parser.add_argument(
24
+ "--precision",
25
+ default="bf16",
26
+ type=str,
27
+ choices=["fp32", "bf16", "fp16"],
28
+ help="precision for inference",
29
+ )
30
+ parser.add_argument("--image_size", default=1024, type=int, help="image size")
31
+ parser.add_argument("--model_max_length", default=512, type=int)
32
+ parser.add_argument("--lora_r", default=8, type=int)
33
+ parser.add_argument(
34
+ "--vision-tower", default="openai/clip-vit-large-patch14", type=str
35
+ )
36
+ parser.add_argument("--local-rank", default=0, type=int, help="node rank")
37
+ parser.add_argument("--load_in_8bit", action="store_true", default=False)
38
+ parser.add_argument("--load_in_4bit", action="store_true", default=False)
39
+ parser.add_argument("--use_mm_start_end", action="store_true", default=True)
40
+ parser.add_argument(
41
+ "--conv_type",
42
+ default="llava_v1",
43
+ type=str,
44
+ choices=["llava_v1", "llava_llama_2"],
45
+ )
46
+ return parser.parse_args(args)
47
+
48
+
49
+ def preprocess(
50
+ x,
51
+ pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
52
+ pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
53
+ img_size=1024,
54
+ ) -> torch.Tensor:
55
+ """Normalize pixel values and pad to a square input."""
56
+ # Normalize colors
57
+ x = (x - pixel_mean) / pixel_std
58
+ # Pad
59
+ h, w = x.shape[-2:]
60
+ padh = img_size - h
61
+ padw = img_size - w
62
+ x = F.pad(x, (0, padw, 0, padh))
63
+ return x
64
+
65
+
66
+ def main(args):
67
+ args = parse_args(args)
68
+ os.makedirs(args.vis_save_path, exist_ok=True)
69
+
70
+ # Create model
71
+ tokenizer = AutoTokenizer.from_pretrained(
72
+ args.version,
73
+ cache_dir=None,
74
+ model_max_length=args.model_max_length,
75
+ padding_side="right",
76
+ use_fast=False,
77
+ )
78
+ tokenizer.pad_token = tokenizer.unk_token
79
+ num_added_tokens = tokenizer.add_tokens("[SEG]")
80
+ args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]
81
+ num_added_tokens = tokenizer.add_tokens("[AFF]")
82
+ args.aff_token_idx = tokenizer("[AFF]", add_special_tokens=False).input_ids[0]
83
+
84
+ torch_dtype = torch.float32
85
+ if args.precision == "bf16":
86
+ torch_dtype = torch.bfloat16
87
+ elif args.precision == "fp16":
88
+ torch_dtype = torch.half
89
+
90
+ kwargs = {"torch_dtype": torch_dtype}
91
+ if args.load_in_4bit:
92
+ kwargs.update(
93
+ {
94
+ "torch_dtype": torch.half,
95
+ "load_in_4bit": True,
96
+ "quantization_config": BitsAndBytesConfig(
97
+ load_in_4bit=True,
98
+ bnb_4bit_compute_dtype=torch.float16,
99
+ bnb_4bit_use_double_quant=True,
100
+ bnb_4bit_quant_type="nf4",
101
+ llm_int8_skip_modules=["visual_model"],
102
+ ),
103
+ }
104
+ )
105
+ elif args.load_in_8bit:
106
+ kwargs.update(
107
+ {
108
+ "torch_dtype": torch.half,
109
+ "quantization_config": BitsAndBytesConfig(
110
+ llm_int8_skip_modules=["visual_model"],
111
+ load_in_8bit=True,
112
+ ),
113
+ }
114
+ )
115
+
116
+ model = AffordanceVLMForCausalLM.from_pretrained(
117
+ args.version, low_cpu_mem_usage=True, vision_tower=args.vision_tower, seg_token_idx=args.seg_token_idx, aff_token_idx=args.aff_token_idx, **kwargs
118
+ )
119
+
120
+ model.config.eos_token_id = tokenizer.eos_token_id
121
+ model.config.bos_token_id = tokenizer.bos_token_id
122
+ model.config.pad_token_id = tokenizer.pad_token_id
123
+
124
+ model.get_model().initialize_vision_modules(model.get_model().config)
125
+ vision_tower = model.get_model().get_vision_tower()
126
+ vision_tower.to(dtype=torch_dtype)
127
+
128
+ if args.precision == "bf16":
129
+ model = model.bfloat16().cuda()
130
+ elif (
131
+ args.precision == "fp16" and (not args.load_in_4bit) and (not args.load_in_8bit)
132
+ ):
133
+ vision_tower = model.get_model().get_vision_tower()
134
+ model.model.vision_tower = None
135
+ import deepspeed
136
+
137
+ model_engine = deepspeed.init_inference(
138
+ model=model,
139
+ dtype=torch.half,
140
+ replace_with_kernel_inject=True,
141
+ replace_method="auto",
142
+ )
143
+ model = model_engine.module
144
+ model.model.vision_tower = vision_tower.half().cuda()
145
+ elif args.precision == "fp32":
146
+ model = model.float().cuda()
147
+
148
+ vision_tower = model.get_model().get_vision_tower()
149
+ vision_tower.to(device=args.local_rank)
150
+
151
+ clip_image_processor = CLIPImageProcessor.from_pretrained(model.config.vision_tower)
152
+ transform = ResizeLongestSide(args.image_size)
153
+
154
+ model.eval()
155
+
156
+ while True:
157
+ conv = conversation_lib.conv_templates[args.conv_type].copy()
158
+ conv.messages = []
159
+
160
+ prompt = input("Please input your prompt: ")
161
+ prompt = DEFAULT_IMAGE_TOKEN + "\n" + "You are an embodied robot. " + prompt
162
+ if args.use_mm_start_end:
163
+ replace_token = (
164
+ DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
165
+ )
166
+ prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
167
+
168
+ conv.append_message(conv.roles[0], prompt)
169
+ conv.append_message(conv.roles[1], "")
170
+ prompt = conv.get_prompt()
171
+
172
+ image_path = input("Please input the image path: ")
173
+ if not os.path.exists(image_path):
174
+ print("File not found in {}".format(image_path))
175
+ continue
176
+
177
+ image_np = cv2.imread(image_path)
178
+ image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
179
+ original_size_list = [image_np.shape[:2]]
180
+
181
+ image_clip = (
182
+ clip_image_processor.preprocess(image_np, return_tensors="pt")[
183
+ "pixel_values"
184
+ ][0]
185
+ .unsqueeze(0)
186
+ .cuda()
187
+ )
188
+ if args.precision == "bf16":
189
+ image_clip = image_clip.bfloat16()
190
+ elif args.precision == "fp16":
191
+ image_clip = image_clip.half()
192
+ else:
193
+ image_clip = image_clip.float()
194
+
195
+ image = transform.apply_image(image_np)
196
+ resize_list = [image.shape[:2]]
197
+
198
+ image = (
199
+ preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous())
200
+ .unsqueeze(0)
201
+ .cuda()
202
+ )
203
+ if args.precision == "bf16":
204
+ image = image.bfloat16()
205
+ elif args.precision == "fp16":
206
+ image = image.half()
207
+ else:
208
+ image = image.float()
209
+
210
+ input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
211
+ input_ids = input_ids.unsqueeze(0).cuda()
212
+
213
+ output_ids, pred_masks = model.evaluate(
214
+ image_clip,
215
+ image,
216
+ input_ids,
217
+ resize_list,
218
+ original_size_list,
219
+ max_new_tokens=512,
220
+ tokenizer=tokenizer,
221
+ )
222
+ output_ids = output_ids[0][output_ids[0] != IMAGE_TOKEN_INDEX]
223
+
224
+ text_output = tokenizer.decode(output_ids, skip_special_tokens=False)
225
+ text_output = text_output.replace("\n", "").replace(" ", " ")
226
+ print("text_output: ", text_output)
227
+
228
+ for i, pred_mask in enumerate(pred_masks):
229
+ if pred_mask.shape[0] == 0:
230
+ continue
231
+
232
+ pred_mask = pred_mask.detach().cpu().numpy()[0]
233
+ pred_mask = pred_mask > 0
234
+
235
+ save_path = "{}/{}_mask_{}.jpg".format(
236
+ args.vis_save_path, image_path.split("/")[-1].split(".")[0], i
237
+ )
238
+ cv2.imwrite(save_path, pred_mask * 100)
239
+ print("{} has been saved.".format(save_path))
240
+
241
+ save_path = "{}/{}_masked_img_{}.jpg".format(
242
+ args.vis_save_path, image_path.split("/")[-1].split(".")[0], i
243
+ )
244
+ save_img = image_np.copy()
245
+ save_img[pred_mask] = (
246
+ image_np * 0.5
247
+ + pred_mask[:, :, None].astype(np.uint8) * np.array([255, 0, 0]) * 0.5
248
+ )[pred_mask]
249
+ save_img = cv2.cvtColor(save_img, cv2.COLOR_RGB2BGR)
250
+ cv2.imwrite(save_path, save_img)
251
+ print("{} has been saved.".format(save_path))
252
+
253
+
254
+ if __name__ == "__main__":
255
+ main(sys.argv[1:])
.ipynb_checkpoints/chat_prefill-checkpoint.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Interactive affordance mask generation using prefill mode (single forward pass).
3
+
4
+ Same interactive workflow as chat.py, but uses prefill inference instead of
5
+ autoregressive generation. The assistant response "[AFF]." is pre-filled in the
6
+ prompt, so the model only does one forward pass to extract mask embeddings.
7
+ """
8
+
9
+ import argparse
10
+ import os
11
+ import sys
12
+
13
+ import cv2
14
+ import numpy as np
15
+ import torch
16
+ import torch.nn.functional as F
17
+ from transformers import AutoTokenizer, BitsAndBytesConfig, CLIPImageProcessor
18
+
19
+ from model.AffordanceVLM import AffordanceVLMForCausalLM
20
+ from model.llava import conversation as conversation_lib
21
+ from model.llava.mm_utils import tokenizer_image_token
22
+ from model.segment_anything.utils.transforms import ResizeLongestSide
23
+ from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
24
+ DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
25
+
26
+
27
+ def parse_args(args):
28
+ parser = argparse.ArgumentParser(description="AffordanceVLM chat (prefill mode)")
29
+ parser.add_argument("--version", default="/gemini/code/AffordanceNet/ckpts/AffordanceVLM-7B")
30
+ parser.add_argument("--vis_save_path", default="./vis_output_prefill", type=str)
31
+ parser.add_argument(
32
+ "--precision", default="bf16", type=str,
33
+ choices=["fp32", "bf16", "fp16"],
34
+ )
35
+ parser.add_argument("--image_size", default=1024, type=int)
36
+ parser.add_argument("--model_max_length", default=512, type=int)
37
+ parser.add_argument("--lora_r", default=8, type=int)
38
+ parser.add_argument("--vision-tower", default="openai/clip-vit-large-patch14", type=str)
39
+ parser.add_argument("--local-rank", default=0, type=int)
40
+ parser.add_argument("--load_in_8bit", action="store_true", default=False)
41
+ parser.add_argument("--load_in_4bit", action="store_true", default=False)
42
+ parser.add_argument("--use_mm_start_end", action="store_true", default=True)
43
+ parser.add_argument(
44
+ "--conv_type", default="llava_v1", type=str,
45
+ choices=["llava_v1", "llava_llama_2"],
46
+ )
47
+ parser.add_argument("--prompt_template", type=str,
48
+ default="Segment the most suitable manipulation region on the single target object for the task '{}'.",
49
+ help="Template wrapping language_instruction. Use {} as placeholder.")
50
+ # Segment the most suitable manipulation region on the single target object for the task '{}'.
51
+ # Segment the affordance map for the task '{}' in this image.
52
+ # Segment the affordance map of the single target object for the task '{}' in this image.
53
+ # Given the task instruction '{}', what is the affordance map of the target object in this image? Please output segmentation mask.
54
+ # Given the task instruction '{}', what is the affordance map of the single target object in this image? There is only one target object. Please output segmentation mask.
55
+ return parser.parse_args(args)
56
+
57
+
58
+ def preprocess(
59
+ x,
60
+ pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
61
+ pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
62
+ img_size=1024,
63
+ ) -> torch.Tensor:
64
+ """Normalize pixel values and pad to a square input."""
65
+ x = (x - pixel_mean) / pixel_std
66
+ h, w = x.shape[-2:]
67
+ padh = img_size - h
68
+ padw = img_size - w
69
+ x = F.pad(x, (0, padw, 0, padh))
70
+ return x
71
+
72
+
73
+ def main(args):
74
+ args = parse_args(args)
75
+ os.makedirs(args.vis_save_path, exist_ok=True)
76
+
77
+ # Create model
78
+ tokenizer = AutoTokenizer.from_pretrained(
79
+ args.version,
80
+ cache_dir=None,
81
+ model_max_length=args.model_max_length,
82
+ padding_side="right",
83
+ use_fast=False,
84
+ )
85
+ tokenizer.pad_token = tokenizer.unk_token
86
+ tokenizer.add_tokens("[SEG]")
87
+ args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]
88
+ tokenizer.add_tokens("[AFF]")
89
+ args.aff_token_idx = tokenizer("[AFF]", add_special_tokens=False).input_ids[0]
90
+
91
+ torch_dtype = torch.float32
92
+ if args.precision == "bf16":
93
+ torch_dtype = torch.bfloat16
94
+ elif args.precision == "fp16":
95
+ torch_dtype = torch.half
96
+
97
+ kwargs = {"torch_dtype": torch_dtype}
98
+ if args.load_in_4bit:
99
+ kwargs.update({
100
+ "torch_dtype": torch.half,
101
+ "load_in_4bit": True,
102
+ "quantization_config": BitsAndBytesConfig(
103
+ load_in_4bit=True,
104
+ bnb_4bit_compute_dtype=torch.float16,
105
+ bnb_4bit_use_double_quant=True,
106
+ bnb_4bit_quant_type="nf4",
107
+ llm_int8_skip_modules=["visual_model"],
108
+ ),
109
+ })
110
+ elif args.load_in_8bit:
111
+ kwargs.update({
112
+ "torch_dtype": torch.half,
113
+ "quantization_config": BitsAndBytesConfig(
114
+ llm_int8_skip_modules=["visual_model"],
115
+ load_in_8bit=True,
116
+ ),
117
+ })
118
+
119
+ model = AffordanceVLMForCausalLM.from_pretrained(
120
+ args.version,
121
+ low_cpu_mem_usage=True,
122
+ vision_tower=args.vision_tower,
123
+ seg_token_idx=args.seg_token_idx,
124
+ aff_token_idx=args.aff_token_idx,
125
+ **kwargs,
126
+ )
127
+
128
+ model.config.eos_token_id = tokenizer.eos_token_id
129
+ model.config.bos_token_id = tokenizer.bos_token_id
130
+ model.config.pad_token_id = tokenizer.pad_token_id
131
+
132
+ model.get_model().initialize_vision_modules(model.get_model().config)
133
+ vision_tower = model.get_model().get_vision_tower()
134
+ vision_tower.to(dtype=torch_dtype)
135
+
136
+ if args.precision == "bf16":
137
+ model = model.bfloat16().cuda()
138
+ elif args.precision == "fp16" and (not args.load_in_4bit) and (not args.load_in_8bit):
139
+ vision_tower = model.get_model().get_vision_tower()
140
+ model.model.vision_tower = None
141
+ import deepspeed
142
+ model_engine = deepspeed.init_inference(
143
+ model=model,
144
+ dtype=torch.half,
145
+ replace_with_kernel_inject=True,
146
+ replace_method="auto",
147
+ )
148
+ model = model_engine.module
149
+ model.model.vision_tower = vision_tower.half().cuda()
150
+ elif args.precision == "fp32":
151
+ model = model.float().cuda()
152
+
153
+ vision_tower = model.get_model().get_vision_tower()
154
+ vision_tower.to(device=args.local_rank)
155
+
156
+ clip_image_processor = CLIPImageProcessor.from_pretrained(model.config.vision_tower)
157
+ transform = ResizeLongestSide(args.image_size)
158
+
159
+ model.eval()
160
+
161
+ # debug
162
+ template = "Given the task instruction '{}', what is the affordance map of the target object in this image? Please output segmentation mask."
163
+
164
+ while True:
165
+ conv = conversation_lib.conv_templates[args.conv_type].copy()
166
+ conv.messages = []
167
+
168
+ prompt = input("Please input your prompt: ")
169
+ # 加入模版
170
+ prompt = args.prompt_template.format(prompt)
171
+
172
+ prompt = DEFAULT_IMAGE_TOKEN + "\n" + "You are an embodied robot. " + prompt
173
+ if args.use_mm_start_end:
174
+ replace_token = (
175
+ DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
176
+ )
177
+ prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
178
+
179
+ conv.append_message(conv.roles[0], prompt)
180
+ conv.append_message(conv.roles[1], "[AFF].")
181
+ prompt = conv.get_prompt()
182
+
183
+ image_path = input("Please input the image path: ")
184
+ if not os.path.exists(image_path):
185
+ print("File not found in {}".format(image_path))
186
+ continue
187
+
188
+ image_np = cv2.imread(image_path)
189
+ image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
190
+ original_size_list = [image_np.shape[:2]]
191
+ h, w = original_size_list[0]
192
+
193
+ image_clip = (
194
+ clip_image_processor.preprocess(image_np, return_tensors="pt")[
195
+ "pixel_values"
196
+ ][0]
197
+ .unsqueeze(0)
198
+ .cuda()
199
+ )
200
+ if args.precision == "bf16":
201
+ image_clip = image_clip.bfloat16()
202
+ elif args.precision == "fp16":
203
+ image_clip = image_clip.half()
204
+ else:
205
+ image_clip = image_clip.float()
206
+
207
+ image = transform.apply_image(image_np)
208
+ resize_list = [image.shape[:2]]
209
+
210
+ image = (
211
+ preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous())
212
+ .unsqueeze(0)
213
+ .cuda()
214
+ )
215
+ if args.precision == "bf16":
216
+ image = image.bfloat16()
217
+ elif args.precision == "fp16":
218
+ image = image.half()
219
+ else:
220
+ image = image.float()
221
+
222
+ input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
223
+ input_ids = input_ids.unsqueeze(0).cuda()
224
+ attention_masks = input_ids.ne(tokenizer.pad_token_id)
225
+
226
+ # Print the full prompt text (prefill mode has no generated text)
227
+ # debug
228
+ text_ids = input_ids[0][input_ids[0] != IMAGE_TOKEN_INDEX]
229
+ text_output = tokenizer.decode(text_ids, skip_special_tokens=False)
230
+ text_output = text_output.replace("\n", "").replace(" ", " ")
231
+ print("text_output: ", text_output)
232
+
233
+ # Prefill inference
234
+ labels = input_ids.clone()
235
+ offset = torch.LongTensor([0, 1]).cuda()
236
+ masks_list = [torch.zeros(1, h, w).float().cuda()]
237
+ label_list = [torch.zeros(h, w).long().cuda()]
238
+
239
+ with torch.no_grad():
240
+ output_dict = model(
241
+ images=image,
242
+ images_clip=image_clip,
243
+ input_ids=input_ids,
244
+ labels=labels,
245
+ attention_masks=attention_masks,
246
+ offset=offset,
247
+ masks_list=masks_list,
248
+ label_list=label_list,
249
+ resize_list=resize_list,
250
+ inference=True,
251
+ )
252
+
253
+ pred_masks = output_dict["pred_masks"]
254
+
255
+ for i, pred_mask in enumerate(pred_masks):
256
+ if pred_mask.shape[0] == 0:
257
+ continue
258
+
259
+ pred_mask = pred_mask.detach().cpu().numpy()[0]
260
+ pred_mask = pred_mask > 0
261
+
262
+ save_path = "{}/{}_mask_{}.jpg".format(
263
+ args.vis_save_path, image_path.split("/")[-1].split(".")[0], i
264
+ )
265
+ cv2.imwrite(save_path, pred_mask * 100)
266
+ print("{} has been saved.".format(save_path))
267
+
268
+ save_path = "{}/{}_masked_img_{}.jpg".format(
269
+ args.vis_save_path, image_path.split("/")[-1].split(".")[0], i
270
+ )
271
+ save_img = image_np.copy()
272
+ save_img[pred_mask] = (
273
+ image_np * 0.5
274
+ + pred_mask[:, :, None].astype(np.uint8) * np.array([255, 0, 0]) * 0.5
275
+ )[pred_mask]
276
+ save_img = cv2.cvtColor(save_img, cv2.COLOR_RGB2BGR)
277
+ cv2.imwrite(save_path, save_img)
278
+ print("{} has been saved.".format(save_path))
279
+
280
+
281
+ if __name__ == "__main__":
282
+ main(sys.argv[1:])
.ipynb_checkpoints/train_aff-checkpoint.py ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import shutil
4
+ import sys
5
+ import time
6
+ from functools import partial
7
+
8
+ import deepspeed
9
+ import numpy as np
10
+ import torch
11
+ import tqdm
12
+ import transformers
13
+ from peft import LoraConfig, get_peft_model
14
+ from torch.utils.tensorboard import SummaryWriter
15
+
16
+ from model.AffordanceVLM import AffordanceVLMForCausalLM
17
+ from model.llava import conversation as conversation_lib
18
+ from utils.dataset import HybridDataset, ValDataset, collate_fn
19
+ from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
20
+ AverageMeter, ProgressMeter, Summary, dict_to_cuda,
21
+ intersectionAndUnionGPU)
22
+
23
+ from utils.aff_seg_dataset import AffValDataset
24
+ from utils.reason_aff_dataset import ReasonAffValDataset
25
+
26
+
27
+ def parse_args(args):
28
+ parser = argparse.ArgumentParser(description="LISA Model Training")
29
+ parser.add_argument("--local_rank", default=0, type=int, help="node rank")
30
+ parser.add_argument(
31
+ "--version", default="liuhaotian/llava-llama-2-13b-chat-lightning-preview"
32
+ )
33
+ parser.add_argument("--vis_save_path", default="./vis_output", type=str)
34
+ parser.add_argument(
35
+ "--precision",
36
+ default="bf16",
37
+ type=str,
38
+ choices=["fp32", "bf16", "fp16"],
39
+ help="precision for inference",
40
+ )
41
+ parser.add_argument("--image_size", default=1024, type=int, help="image size")
42
+ parser.add_argument("--model_max_length", default=512, type=int)
43
+ parser.add_argument("--lora_r", default=8, type=int)
44
+ parser.add_argument(
45
+ "--vision-tower", default="openai/clip-vit-large-patch14", type=str
46
+ )
47
+ parser.add_argument("--load_in_8bit", action="store_true", default=False)
48
+ parser.add_argument("--load_in_4bit", action="store_true", default=False)
49
+
50
+ parser.add_argument(
51
+ "--dataset", default="sem_seg||refer_seg||vqa||reason_seg", type=str
52
+ )
53
+ parser.add_argument("--sample_rates", default="9,3,3,1", type=str)
54
+ parser.add_argument(
55
+ "--sem_seg_data",
56
+ default="ade20k||cocostuff||pascal_part||paco_lvis||mapillary",
57
+ type=str,
58
+ )
59
+ parser.add_argument(
60
+ "--refer_seg_data", default="refclef||refcoco||refcoco+||refcocog", type=str
61
+ )
62
+ parser.add_argument("--vqa_data", default="llava_instruct_150k", type=str)
63
+ parser.add_argument("--reason_seg_data", default="ReasonSeg|train", type=str)
64
+ parser.add_argument("--aff_seg_data", default="handal", type=str)
65
+ parser.add_argument("--aff_sample_rates", default="1", type=str)
66
+ parser.add_argument("--reason_aff_data", default="handal_hard_reasoning", type=str)
67
+ parser.add_argument("--reason_aff_sample_rates", default="1", type=str)
68
+ parser.add_argument("--val_dataset", default="ReasonSeg|val", type=str)
69
+ parser.add_argument("--dataset_dir", default="./dataset", type=str)
70
+ parser.add_argument("--log_base_dir", default="./runs", type=str)
71
+ parser.add_argument("--exp_name", default="lisa", type=str)
72
+ parser.add_argument("--epochs", default=10, type=int)
73
+ parser.add_argument("--steps_per_epoch", default=500, type=int)
74
+ parser.add_argument(
75
+ "--batch_size", default=2, type=int, help="batch size per device per step"
76
+ )
77
+ parser.add_argument(
78
+ "--grad_accumulation_steps",
79
+ default=10,
80
+ type=int,
81
+ )
82
+ parser.add_argument("--val_batch_size", default=1, type=int)
83
+ parser.add_argument("--workers", default=4, type=int)
84
+ parser.add_argument("--lr", default=0.0003, type=float)
85
+ parser.add_argument("--ce_loss_weight", default=1.0, type=float)
86
+ parser.add_argument("--dice_loss_weight", default=0.5, type=float)
87
+ parser.add_argument("--bce_loss_weight", default=2.0, type=float)
88
+ parser.add_argument("--lora_alpha", default=16, type=int)
89
+ parser.add_argument("--lora_dropout", default=0.05, type=float)
90
+ parser.add_argument("--lora_target_modules", default="q_proj,v_proj", type=str)
91
+ parser.add_argument("--explanatory", default=0.1, type=float)
92
+ parser.add_argument("--beta1", default=0.9, type=float)
93
+ parser.add_argument("--beta2", default=0.95, type=float)
94
+ parser.add_argument("--num_classes_per_sample", default=3, type=int)
95
+ parser.add_argument("--exclude_val", action="store_true", default=False)
96
+ parser.add_argument("--no_eval", action="store_true", default=False)
97
+ parser.add_argument("--eval_only", action="store_true", default=False)
98
+ parser.add_argument("--eval_affordance", action="store_true", default=False)
99
+ parser.add_argument("--eval_reason_aff", action="store_true", default=False)
100
+ parser.add_argument("--vision_pretrained", default="PATH_TO_SAM_ViT-H", type=str)
101
+ parser.add_argument("--out_dim", default=256, type=int)
102
+ parser.add_argument("--resume", default="", type=str)
103
+ parser.add_argument("--print_freq", default=1, type=int)
104
+ parser.add_argument("--start_epoch", default=0, type=int)
105
+ parser.add_argument("--gradient_checkpointing", action="store_true", default=True)
106
+ parser.add_argument("--train_mask_decoder", action="store_true", default=True)
107
+ parser.add_argument("--use_mm_start_end", action="store_true", default=True)
108
+ parser.add_argument("--auto_resume", action="store_true", default=True)
109
+ parser.add_argument(
110
+ "--conv_type",
111
+ default="llava_v1",
112
+ type=str,
113
+ choices=["llava_v1", "llava_llama_2"],
114
+ )
115
+ return parser.parse_args(args)
116
+
117
+
118
+ def main(args):
119
+ args = parse_args(args)
120
+ args.log_dir = os.path.join(args.log_base_dir, args.exp_name)
121
+ if args.local_rank == 0:
122
+ os.makedirs(args.log_dir, exist_ok=True)
123
+ writer = SummaryWriter(args.log_dir)
124
+ else:
125
+ writer = None
126
+
127
+ # Create model
128
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
129
+ args.version,
130
+ cache_dir=None,
131
+ model_max_length=args.model_max_length,
132
+ padding_side="right",
133
+ use_fast=False,
134
+ )
135
+ tokenizer.pad_token = tokenizer.unk_token
136
+ num_added_tokens = tokenizer.add_tokens("[SEG]")
137
+ args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]
138
+ num_added_tokens = tokenizer.add_tokens("[AFF]")
139
+ args.aff_token_idx = tokenizer("[AFF]", add_special_tokens=False).input_ids[0]
140
+
141
+ if args.use_mm_start_end:
142
+ tokenizer.add_tokens(
143
+ [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
144
+ )
145
+
146
+ model_args = {
147
+ "train_mask_decoder": args.train_mask_decoder,
148
+ "out_dim": args.out_dim,
149
+ "ce_loss_weight": args.ce_loss_weight,
150
+ "dice_loss_weight": args.dice_loss_weight,
151
+ "bce_loss_weight": args.bce_loss_weight,
152
+ "seg_token_idx": args.seg_token_idx,
153
+ "aff_token_idx": args.aff_token_idx,
154
+ "vision_pretrained": args.vision_pretrained,
155
+ "vision_tower": args.vision_tower,
156
+ "use_mm_start_end": args.use_mm_start_end,
157
+ }
158
+ torch_dtype = torch.float32
159
+ if args.precision == "bf16":
160
+ torch_dtype = torch.bfloat16
161
+ elif args.precision == "fp16":
162
+ torch_dtype = torch.half
163
+ model = AffordanceVLMForCausalLM.from_pretrained(
164
+ args.version, torch_dtype=torch_dtype, low_cpu_mem_usage=True, **model_args
165
+ )
166
+ model.config.eos_token_id = tokenizer.eos_token_id
167
+ model.config.bos_token_id = tokenizer.bos_token_id
168
+ model.config.pad_token_id = tokenizer.pad_token_id
169
+
170
+ model.enable_input_require_grads()
171
+ model.gradient_checkpointing_enable()
172
+
173
+ model.get_model().initialize_vision_modules(model.get_model().config)
174
+ vision_tower = model.get_model().get_vision_tower()
175
+ vision_tower.to(dtype=torch_dtype, device=args.local_rank)
176
+ if not args.eval_only:
177
+ model.get_model().initialize_lisa_modules(model.get_model().config)
178
+
179
+ for p in vision_tower.parameters():
180
+ p.requires_grad = False
181
+ for p in model.get_model().mm_projector.parameters():
182
+ p.requires_grad = False
183
+
184
+ conversation_lib.default_conversation = conversation_lib.conv_templates[
185
+ args.conv_type
186
+ ]
187
+
188
+ lora_r = args.lora_r
189
+ if lora_r > 0:
190
+
191
+ def find_linear_layers(model, lora_target_modules):
192
+ cls = torch.nn.Linear
193
+ lora_module_names = set()
194
+ for name, module in model.named_modules():
195
+ if (
196
+ isinstance(module, cls)
197
+ and all(
198
+ [
199
+ x not in name
200
+ for x in [
201
+ "visual_model",
202
+ "vision_tower",
203
+ "mm_projector",
204
+ "text_hidden_fcs",
205
+ ]
206
+ ]
207
+ )
208
+ and any([x in name for x in lora_target_modules])
209
+ ):
210
+ lora_module_names.add(name)
211
+ return sorted(list(lora_module_names))
212
+
213
+ lora_alpha = args.lora_alpha
214
+ lora_dropout = args.lora_dropout
215
+ lora_target_modules = find_linear_layers(
216
+ model, args.lora_target_modules.split(",")
217
+ )
218
+ lora_config = LoraConfig(
219
+ r=lora_r,
220
+ lora_alpha=lora_alpha,
221
+ target_modules=lora_target_modules,
222
+ lora_dropout=lora_dropout,
223
+ bias="none",
224
+ task_type="CAUSAL_LM",
225
+ )
226
+ model = get_peft_model(model, lora_config)
227
+ model.print_trainable_parameters()
228
+
229
+ model.resize_token_embeddings(len(tokenizer))
230
+
231
+ # make text_hidden_fcs, mask_decoder, lm_head, embed_tokens trainable
232
+ for n, p in model.named_parameters():
233
+ if any(
234
+ [
235
+ x in n
236
+ for x in ["lm_head", "embed_tokens", "mask_decoder", "text_hidden_fcs"]
237
+ ]
238
+ ):
239
+ print("n: ", n, "p.shape: ", p.shape)
240
+ p.requires_grad = True
241
+
242
+ world_size = torch.cuda.device_count()
243
+ args.distributed = world_size > 1
244
+ train_dataset = HybridDataset(
245
+ args.dataset_dir,
246
+ tokenizer,
247
+ args.vision_tower,
248
+ samples_per_epoch=args.batch_size
249
+ * args.grad_accumulation_steps
250
+ * args.steps_per_epoch
251
+ * world_size,
252
+ precision=args.precision,
253
+ image_size=args.image_size,
254
+ num_classes_per_sample=args.num_classes_per_sample,
255
+ exclude_val=args.exclude_val,
256
+ dataset=args.dataset,
257
+ sample_rate=[float(x) for x in args.sample_rates.split(",")],
258
+ sem_seg_data=args.sem_seg_data,
259
+ refer_seg_data=args.refer_seg_data,
260
+ vqa_data=args.vqa_data,
261
+ reason_seg_data=args.reason_seg_data,
262
+ aff_seg_data=args.aff_seg_data,
263
+ aff_sample_rate=[float(x) for x in args.aff_sample_rates.split(",")],
264
+ reason_aff_data=args.reason_aff_data,
265
+ reason_aff_sample_rate=[float(x) for x in args.reason_aff_sample_rates.split(",")],
266
+ explanatory=args.explanatory,
267
+ )
268
+
269
+ if args.no_eval == False:
270
+ if args.eval_affordance:
271
+ val_dataset = AffValDataset(
272
+ args.dataset_dir,
273
+ tokenizer,
274
+ args.vision_tower,
275
+ args.val_dataset,
276
+ args.image_size,
277
+ )
278
+ elif args.eval_reason_aff:
279
+ val_dataset = ReasonAffValDataset(
280
+ args.dataset_dir,
281
+ tokenizer,
282
+ args.vision_tower,
283
+ args.val_dataset,
284
+ args.image_size,
285
+ )
286
+ else:
287
+ val_dataset = ValDataset(
288
+ args.dataset_dir,
289
+ tokenizer,
290
+ args.vision_tower,
291
+ args.val_dataset,
292
+ args.image_size,
293
+ )
294
+ print(
295
+ f"Training with {len(train_dataset)} examples and validating with {len(val_dataset)} examples."
296
+ )
297
+ else:
298
+ val_dataset = None
299
+ print(f"Training with {len(train_dataset)} examples.")
300
+
301
+ ds_config = {
302
+ "train_micro_batch_size_per_gpu": args.batch_size,
303
+ "gradient_accumulation_steps": args.grad_accumulation_steps,
304
+ "optimizer": {
305
+ "type": "AdamW",
306
+ "params": {
307
+ "lr": args.lr,
308
+ "weight_decay": 0.0,
309
+ "betas": (args.beta1, args.beta2),
310
+ },
311
+ },
312
+ "scheduler": {
313
+ "type": "WarmupDecayLR",
314
+ "params": {
315
+ "total_num_steps": args.epochs * args.steps_per_epoch,
316
+ "warmup_min_lr": 0,
317
+ "warmup_max_lr": args.lr,
318
+ "warmup_num_steps": 100,
319
+ "warmup_type": "linear",
320
+ },
321
+ },
322
+ "fp16": {
323
+ "enabled": args.precision == "fp16",
324
+ },
325
+ "bf16": {
326
+ "enabled": args.precision == "bf16",
327
+ },
328
+ "gradient_clipping": 1.0,
329
+ "zero_optimization": {
330
+ "stage": 2,
331
+ "contiguous_gradients": True,
332
+ "overlap_comm": True,
333
+ "reduce_scatter": True,
334
+ "reduce_bucket_size": 5e8,
335
+ "allgather_bucket_size": 5e8,
336
+ },
337
+ }
338
+ model_engine, optimizer, train_loader, scheduler = deepspeed.initialize(
339
+ model=model,
340
+ model_parameters=model.parameters(),
341
+ training_data=train_dataset,
342
+ collate_fn=partial(
343
+ collate_fn,
344
+ tokenizer=tokenizer,
345
+ conv_type=args.conv_type,
346
+ use_mm_start_end=args.use_mm_start_end,
347
+ local_rank=args.local_rank,
348
+ ),
349
+ config=ds_config,
350
+ )
351
+
352
+ # resume deepspeed checkpoint
353
+ if args.auto_resume and len(args.resume) == 0:
354
+ resume = os.path.join(args.log_dir, "ckpt_model")
355
+ if os.path.exists(resume):
356
+ args.resume = resume
357
+
358
+ if args.resume:
359
+ load_path, client_state = model_engine.load_checkpoint(args.resume)
360
+ with open(os.path.join(args.resume, "latest"), "r") as f:
361
+ ckpt_dir = f.readlines()[0].strip()
362
+ args.start_epoch = (
363
+ int(ckpt_dir.replace("global_step", "")) // args.steps_per_epoch
364
+ )
365
+ print(
366
+ "resume training from {}, start from epoch {}".format(
367
+ args.resume, args.start_epoch
368
+ )
369
+ )
370
+
371
+ # validation dataset
372
+ if val_dataset is not None:
373
+ assert args.val_batch_size == 1
374
+ val_sampler = torch.utils.data.distributed.DistributedSampler(
375
+ val_dataset, shuffle=False, drop_last=False
376
+ )
377
+ val_loader = torch.utils.data.DataLoader(
378
+ val_dataset,
379
+ batch_size=args.val_batch_size,
380
+ shuffle=False,
381
+ num_workers=args.workers,
382
+ pin_memory=False,
383
+ sampler=val_sampler,
384
+ collate_fn=partial(
385
+ collate_fn,
386
+ tokenizer=tokenizer,
387
+ conv_type=args.conv_type,
388
+ use_mm_start_end=args.use_mm_start_end,
389
+ local_rank=args.local_rank,
390
+ ),
391
+ )
392
+
393
+ train_iter = iter(train_loader)
394
+ best_score, cur_ciou = 0.0, 0.0
395
+
396
+ if args.eval_only:
397
+ giou, ciou = validate(val_loader, model_engine, 0, writer, args)
398
+ if args.local_rank == 0:
399
+ with open(os.path.join(args.version, "eval_result.txt"), "a") as f:
400
+ f.write(f"dataset: {args.val_dataset}, giou: {giou}, ciou: {ciou} \n")
401
+ exit()
402
+
403
+ for epoch in range(args.start_epoch, args.epochs):
404
+ # train for one epoch
405
+ train_iter = train(
406
+ train_loader,
407
+ model_engine,
408
+ epoch,
409
+ scheduler,
410
+ writer,
411
+ train_iter,
412
+ args,
413
+ )
414
+
415
+ if args.no_eval == False:
416
+ giou, ciou = validate(val_loader, model_engine, epoch, writer, args)
417
+ is_best = giou > best_score
418
+ best_score = max(giou, best_score)
419
+ cur_ciou = ciou if is_best else cur_ciou
420
+
421
+ if args.no_eval or is_best:
422
+ save_dir = os.path.join(args.log_dir, "ckpt_model")
423
+ if args.local_rank == 0:
424
+ torch.save(
425
+ {"epoch": epoch},
426
+ os.path.join(
427
+ args.log_dir,
428
+ "meta_log_giou{:.3f}_ciou{:.3f}.pth".format(
429
+ best_score, cur_ciou
430
+ ),
431
+ ),
432
+ )
433
+ if os.path.exists(save_dir):
434
+ shutil.rmtree(save_dir)
435
+ torch.distributed.barrier()
436
+ model_engine.save_checkpoint(save_dir)
437
+
438
+
439
def train(
    train_loader,
    model,
    epoch,
    scheduler,
    writer,
    train_iter,
    args,
):
    """Run one training epoch of ``args.steps_per_epoch`` optimizer steps.

    Each step accumulates gradients over ``args.grad_accumulation_steps``
    micro-batches drawn from ``train_iter``; the iterator is re-created from
    ``train_loader`` whenever it is exhausted, so an epoch always consumes
    exactly ``steps_per_epoch * grad_accumulation_steps`` batches.

    Args:
        train_loader: DataLoader used to re-create the iterator on exhaustion.
        model: DeepSpeed engine exposing ``backward()`` / ``step()``.
        epoch: Zero-based epoch index (used for display and log step offset).
        scheduler: LR scheduler queried for TensorBoard logging.
        writer: TensorBoard SummaryWriter; only rank 0 writes to it.
        train_iter: Iterator over ``train_loader`` carried across epochs.
        args: Parsed CLI namespace (precision, print_freq, local_rank, ...).

    Returns:
        The (possibly re-created) ``train_iter`` so the caller can keep
        consuming the same iterator in the next epoch.
    """
    batch_time = AverageMeter("Time", ":6.3f")
    data_time = AverageMeter("Data", ":6.3f")
    losses = AverageMeter("Loss", ":.4f")
    ce_losses = AverageMeter("CeLoss", ":.4f")
    mask_bce_losses = AverageMeter("MaskBCELoss", ":.4f")
    mask_dice_losses = AverageMeter("MaskDICELoss", ":.4f")
    mask_losses = AverageMeter("MaskLoss", ":.4f")

    progress = ProgressMeter(
        args.steps_per_epoch,
        [
            batch_time,
            losses,
            ce_losses,
            mask_losses,
            mask_bce_losses,
            mask_dice_losses,
        ],
        prefix="Epoch: [{}]".format(epoch),
    )

    # switch to train mode
    model.train()
    end = time.time()
    for global_step in range(args.steps_per_epoch):
        # Monotonic TensorBoard x-axis; the raw per-epoch global_step would
        # overwrite scalars logged in previous epochs.
        log_step = epoch * args.steps_per_epoch + global_step
        for _ in range(args.grad_accumulation_steps):
            try:
                input_dict = next(train_iter)
            except StopIteration:
                # Fix: was a bare `except:` — restart the loader only when it
                # is genuinely exhausted instead of masking arbitrary errors.
                train_iter = iter(train_loader)
                input_dict = next(train_iter)

            data_time.update(time.time() - end)
            input_dict = dict_to_cuda(input_dict)

            # Cast image tensors to the configured training precision.
            if args.precision == "fp16":
                input_dict["images"] = input_dict["images"].half()
                input_dict["images_clip"] = input_dict["images_clip"].half()
            elif args.precision == "bf16":
                input_dict["images"] = input_dict["images"].bfloat16()
                input_dict["images_clip"] = input_dict["images_clip"].bfloat16()
            else:
                input_dict["images"] = input_dict["images"].float()
                input_dict["images_clip"] = input_dict["images_clip"].float()

            output_dict = model(**input_dict)

            loss = output_dict["loss"]
            ce_loss = output_dict["ce_loss"]
            mask_bce_loss = output_dict["mask_bce_loss"]
            mask_dice_loss = output_dict["mask_dice_loss"]
            mask_loss = output_dict["mask_loss"]

            batch_size = input_dict["images"].size(0)
            losses.update(loss.item(), batch_size)
            ce_losses.update(ce_loss.item(), batch_size)
            mask_bce_losses.update(mask_bce_loss.item(), batch_size)
            mask_dice_losses.update(mask_dice_loss.item(), batch_size)
            mask_losses.update(mask_loss.item(), batch_size)
            # DeepSpeed handles gradient-accumulation loss scaling internally.
            model.backward(loss)
            model.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if global_step % args.print_freq == 0:
            if args.distributed:
                batch_time.all_reduce()
                data_time.all_reduce()

                losses.all_reduce()
                ce_losses.all_reduce()
                mask_bce_losses.all_reduce()
                mask_dice_losses.all_reduce()
                mask_losses.all_reduce()

            if args.local_rank == 0:
                progress.display(global_step + 1)
                writer.add_scalar("train/loss", losses.avg, log_step)
                writer.add_scalar("train/ce_loss", ce_losses.avg, log_step)
                writer.add_scalar(
                    "train/mask_bce_loss", mask_bce_losses.avg, log_step
                )
                writer.add_scalar(
                    "train/mask_dice_loss", mask_dice_losses.avg, log_step
                )
                writer.add_scalar("train/mask_loss", mask_losses.avg, log_step)
                writer.add_scalar(
                    "metrics/total_secs_per_batch", batch_time.avg, log_step
                )
                writer.add_scalar(
                    "metrics/data_secs_per_batch", data_time.avg, log_step
                )

            # Reset window statistics after each report.
            batch_time.reset()
            data_time.reset()
            losses.reset()
            ce_losses.reset()
            mask_bce_losses.reset()
            mask_dice_losses.reset()
            mask_losses.reset()

        if global_step != 0:
            curr_lr = scheduler.get_last_lr()
            if args.local_rank == 0:
                writer.add_scalar("train/lr", curr_lr[0], log_step)

    return train_iter
557
+
558
+
559
def validate(val_loader, model_engine, epoch, writer, args):
    """Evaluate segmentation quality over ``val_loader``.

    Computes two binary-mask IoU metrics (foreground class index 1):
    cIoU (cumulative intersection / cumulative union over the whole set)
    and gIoU (mean per-sample IoU).  Results are all-reduced across ranks;
    rank 0 additionally logs them to TensorBoard.

    Returns:
        (giou, ciou) as scalars.
    """
    # Summary.SUM so all_reduce() accumulates raw sums across ranks.
    intersection_meter = AverageMeter("Intersec", ":6.3f", Summary.SUM)
    union_meter = AverageMeter("Union", ":6.3f", Summary.SUM)
    acc_iou_meter = AverageMeter("gIoU", ":6.3f", Summary.SUM)

    model_engine.eval()

    for input_dict in tqdm.tqdm(val_loader):
        torch.cuda.empty_cache()

        input_dict = dict_to_cuda(input_dict)
        # Cast image tensors to the evaluation precision.
        if args.precision == "fp16":
            input_dict["images"] = input_dict["images"].half()
            input_dict["images_clip"] = input_dict["images_clip"].half()
        elif args.precision == "bf16":
            input_dict["images"] = input_dict["images"].bfloat16()
            input_dict["images_clip"] = input_dict["images_clip"].bfloat16()
        else:
            input_dict["images"] = input_dict["images"].float()
            input_dict["images_clip"] = input_dict["images_clip"].float()

        with torch.no_grad():
            output_dict = model_engine(**input_dict)

        pred_masks = output_dict["pred_masks"]
        masks_list = output_dict["gt_masks"][0].int()
        # Threshold logits at 0 to obtain binary predictions.
        output_list = (pred_masks[0] > 0).int()
        # Validation runs with val_batch_size == 1 (asserted by the caller).
        assert len(pred_masks) == 1

        intersection, union, acc_iou = 0.0, 0.0, 0.0
        for mask_i, output_i in zip(masks_list, output_list):
            # Two classes (background / foreground); 255 marks ignored pixels.
            intersection_i, union_i, _ = intersectionAndUnionGPU(
                output_i.contiguous().clone(), mask_i.contiguous(), 2, ignore_index=255
            )
            intersection += intersection_i
            union += union_i
            acc_iou += intersection_i / (union_i + 1e-5)
            acc_iou[union_i == 0] += 1.0  # no-object target
        intersection, union = intersection.cpu().numpy(), union.cpu().numpy()
        # Average per-sample IoU over the masks in this sample.
        acc_iou = acc_iou.cpu().numpy() / masks_list.shape[0]
        intersection_meter.update(intersection), union_meter.update(
            union
        ), acc_iou_meter.update(acc_iou, n=masks_list.shape[0])

    # Aggregate statistics across all distributed ranks.
    intersection_meter.all_reduce()
    union_meter.all_reduce()
    acc_iou_meter.all_reduce()

    iou_class = intersection_meter.sum / (union_meter.sum + 1e-10)
    # Index 1 is the foreground class.
    ciou = iou_class[1]
    giou = acc_iou_meter.avg[1]

    if args.local_rank == 0:
        writer.add_scalar("val/giou", giou, epoch)
        writer.add_scalar("val/ciou", ciou, epoch)
        print("giou: {:.4f}, ciou: {:.4f}".format(giou, ciou))

    return giou, ciou
617
+
618
+
619
# CLI entry point: forward all arguments after the program name to main().
if __name__ == "__main__":
    main(sys.argv[1:])
README.md CHANGED
@@ -1,3 +1,79 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+ <h1>
3
+ <b>
4
+ RAGNet: Large-scale Reasoning-based Affordance Segmentation Benchmark towards General Grasping
5
+ </b>
6
+ </h1>
7
+ </div>
8
+
9
+ <div align="center">
10
+
11
+ | [**📑 Paper**](https://arxiv.org/abs/2507.23734) | [**🤗 Model**](https://huggingface.co/Dongming97/AffordanceVLM) | [**🤗 Dataset**](https://huggingface.co/datasets/Dongming97/RAGNet) | [**🖥️ Website**](https://wudongming97.github.io/RAGNet/) |
12
+
13
+ </div>
14
+
15
+
16
+ <p align="center"><img src="./imgs/AffordanceNet.jpg" width="800"/></p>
17
+
18
+
19
+ > **[RAGNet: Large-scale Reasoning-based Affordance Segmentation Benchmark towards General Grasping](https://arxiv.org/abs/2507.23734)**
20
+ >
21
+ > Dongming Wu, Yanping Fu, Saike Huang, Yingfei Liu, Fan Jia, Nian Liu, Feng Dai, Tiancai Wang, Rao Muhammad Anwer, Fahad Shahbaz Khan, Jianbing Shen
22
+
23
+ ## 📝 TL;DR
24
+ - To push forward general robotic grasping, we introduce a large-scale reasoning-based affordance segmentation benchmark, **RAGNet**. It contains 273k images, 180 categories, and 26k reasoning instructions.
25
+ - Furthermore, we propose a comprehensive affordance-based grasping framework, named AffordanceNet, which consists of a VLM (named AffordanceVLM) pre-trained on our massive affordance data and a grasping network that conditions an affordance map to grasp the target.
26
+
27
+ ---
28
+
29
+ ## 📰 News
30
+ - [2025.08] Paper is released at [arXiv](https://arxiv.org/abs/2507.23734).
31
+ - [2025.07] Inference code and the [AffordanceVLM](https://huggingface.co/Dongming97/AffordanceVLM) model are released. Welcome to try it!
32
+ - [2025.06] Paper is accepted by ICCV 2025!
33
+
34
+ ---
35
+
36
+ ## 🚀 Getting Started
37
+
38
+ * [Installation](docs/installation.md)
39
+ * [Download dataset](docs/dataset.md)
40
+ * [Training and evaluation](docs/training_and_evaluation.md)
41
+ * To deploy using Gradio, run the following command:
42
+
43
+ ```bash
44
+ python app.py --version='./exps/AffordanceVLM-7B'
45
+ ```
46
+
47
+
48
+
49
+ ## 📊 Main Results
50
+ ### 🔹 Affordance Segmentation
51
+ | Method | HANDAL gIoU | HANDAL cIoU | HANDAL† gIoU | HANDAL† cIoU | GraspNet seen gIoU | GraspNet seen cIoU | GraspNet novel gIoU | GraspNet novel cIoU | 3DOI gIoU | 3DOI cIoU |
52
+ |--------------------------------------|-------------|-------------|---------------|---------------|----------------------|----------------------|------------------------|------------------------|------------|------------|
53
+ | AffordanceNet | 60.3| 60.8 |60.5|60.3|63.3 |64.0| 45.6 |33.2 | 37.4| 37.4 |
54
+
55
+ ### 🔸 Reasoning-Based Affordance Segmentation
56
+
57
+ | Method | HANDAL (easy) gIoU | HANDAL (easy) cIoU | HANDAL (hard) gIoU | HANDAL (hard) cIoU | 3DOI gIoU | 3DOI cIoU |
58
+ |---------|---------------------|---------------------|---------------------|---------------------|-----------|-----------|
59
+ | AffordanceNet| 58.3| 58.1 | 58.2| 57.8 | 38.1 | 39.4|
60
+
61
+
62
+ ## 📚 Citation
63
+ If you find our work useful, please consider citing:
64
+
65
+ ```bibtex
66
+ @inproceedings{wu2025ragnet,
67
+ title={RAGNet: Large-scale Reasoning-based Affordance Segmentation Benchmark towards General Grasping},
68
+ author={Wu, Dongming and Fu, Yanping and Huang, Saike and Liu, Yingfei and Jia, Fan and Liu, Nian and Dai, Feng and Wang, Tiancai and Anwer, Rao Muhammad and Khan, Fahad Shahbaz and others},
69
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
70
+ pages={11980--11990},
71
+ year={2025}
72
+ }
73
+ ```
74
+
75
+ ## 🙏 Acknowledgements
76
+ We thank the authors who open-sourced the following projects.
77
+ - [LISA](https://github.com/dvlab-research/LISA)
78
+ - [LLaVA](https://github.com/haotian-liu/LLaVA)
79
+ - [SAM](https://github.com/facebookresearch/segment-anything)
app.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import re
4
+ import sys
5
+
6
+ import bleach
7
+ import cv2
8
+ import gradio as gr
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from PIL import Image
13
+ from transformers import AutoTokenizer, BitsAndBytesConfig, CLIPImageProcessor
14
+
15
+ from model.AffordanceVLM import AffordanceVLMForCausalLM
16
+ from model.llava import conversation as conversation_lib
17
+ from model.llava.mm_utils import tokenizer_image_token
18
+ from model.segment_anything.utils.transforms import ResizeLongestSide
19
+ from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
20
+ DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
21
+
22
+ from datetime import datetime
23
+
24
+
25
def parse_args(args):
    """Parse CLI options for the AffordanceVLM Gradio demo.

    Args:
        args: Argument list (typically ``sys.argv[1:]``).

    Returns:
        The populated ``argparse.Namespace``.
    """
    ap = argparse.ArgumentParser(description="AffordanceVLM chat")
    # Checkpoint and output locations.
    ap.add_argument("--version", default="./exps/AffordanceVLM-7B")
    ap.add_argument("--vis_save_path", default="./vis_output", type=str)
    # Inference precision.
    ap.add_argument(
        "--precision",
        default="bf16",
        type=str,
        choices=["fp32", "bf16", "fp16"],
        help="precision for inference",
    )
    ap.add_argument("--image_size", default=1024, type=int, help="image size")
    ap.add_argument("--model_max_length", default=512, type=int)
    ap.add_argument("--lora_r", default=8, type=int)
    # Vision backbone and device placement.
    ap.add_argument(
        "--vision-tower", default="openai/clip-vit-large-patch14", type=str
    )
    ap.add_argument("--local-rank", default=0, type=int, help="node rank")
    # Optional bitsandbytes quantization.
    ap.add_argument("--load_in_8bit", action="store_true", default=False)
    ap.add_argument("--load_in_4bit", action="store_true", default=False)
    # Prompt formatting.
    ap.add_argument("--use_mm_start_end", action="store_true", default=True)
    ap.add_argument(
        "--conv_type",
        default="llava_v1",
        type=str,
        choices=["llava_v1", "llava_llama_2"],
    )
    return ap.parse_args(args)
53
+
54
+
55
def preprocess(
    x,
    pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
    pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
    img_size=1024,
) -> torch.Tensor:
    """Standardize an image tensor and zero-pad it to img_size x img_size.

    The content stays anchored at the top-left: padding is appended on the
    bottom and right only.  Expects a (..., H, W) layout with H, W <= img_size.
    """
    # Per-channel standardization with ImageNet-style statistics.
    normalized = (x - pixel_mean) / pixel_std
    # Pad bottom/right up to the square target size.
    height, width = normalized.shape[-2:]
    pad_bottom = img_size - height
    pad_right = img_size - width
    return F.pad(normalized, (0, pad_right, 0, pad_bottom))
70
+
71
# ---------------------------------------------------------------------------
# Script-level setup (runs once at import): tokenizer, model, preprocessors.
# ---------------------------------------------------------------------------
args = parse_args(sys.argv[1:])
os.makedirs(args.vis_save_path, exist_ok=True)

# Create model
tokenizer = AutoTokenizer.from_pretrained(
    args.version,
    cache_dir=None,
    model_max_length=args.model_max_length,
    padding_side="right",
    use_fast=False,
)
tokenizer.pad_token = tokenizer.unk_token
# Ids of the special [SEG]/[AFF] tokens; presumably already present in the
# fine-tuned checkpoint's vocabulary since no add_tokens() call is made
# here — verify against the checkpoint if loading a different model.
args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]
args.aff_token_idx = tokenizer("[AFF]", add_special_tokens=False).input_ids[0]

# Map the --precision flag onto a torch dtype (fp32 by default).
torch_dtype = torch.float32
if args.precision == "bf16":
    torch_dtype = torch.bfloat16
elif args.precision == "fp16":
    torch_dtype = torch.half

kwargs = {"torch_dtype": torch_dtype}
if args.load_in_4bit:
    # 4-bit NF4 quantization; the SAM backbone ("visual_model") is skipped.
    kwargs.update(
        {
            "torch_dtype": torch.half,
            "load_in_4bit": True,
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                llm_int8_skip_modules=["visual_model"],
            ),
        }
    )
elif args.load_in_8bit:
    # 8-bit quantization; again keep the SAM backbone unquantized.
    kwargs.update(
        {
            "torch_dtype": torch.half,
            "quantization_config": BitsAndBytesConfig(
                llm_int8_skip_modules=["visual_model"],
                load_in_8bit=True,
            ),
        }
    )

model = AffordanceVLMForCausalLM.from_pretrained(
    args.version, low_cpu_mem_usage=True, vision_tower=args.vision_tower, seg_token_idx=args.seg_token_idx, aff_token_idx=args.aff_token_idx, **kwargs
)

# Keep the model config's special token ids in sync with the tokenizer.
model.config.eos_token_id = tokenizer.eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Instantiate the CLIP vision tower and cast it to the inference dtype.
model.get_model().initialize_vision_modules(model.get_model().config)
vision_tower = model.get_model().get_vision_tower()
vision_tower.to(dtype=torch_dtype)

if args.precision == "bf16":
    model = model.bfloat16().cuda()
elif (
    args.precision == "fp16" and (not args.load_in_4bit) and (not args.load_in_8bit)
):
    # fp16 path: wrap the LLM in a DeepSpeed inference engine.  The vision
    # tower is detached first so kernel injection does not touch it, then
    # re-attached in half precision afterwards.
    vision_tower = model.get_model().get_vision_tower()
    model.model.vision_tower = None
    import deepspeed

    model_engine = deepspeed.init_inference(
        model=model,
        dtype=torch.half,
        replace_with_kernel_inject=True,
        replace_method="auto",
    )
    model = model_engine.module
    model.model.vision_tower = vision_tower.half().cuda()
elif args.precision == "fp32":
    model = model.float().cuda()

# Move the vision tower onto the target GPU.
vision_tower = model.get_model().get_vision_tower()
vision_tower.to(device=args.local_rank)

# Preprocessors: CLIP normalization for the VLM branch and longest-side
# resize for the SAM branch.
clip_image_processor = CLIPImageProcessor.from_pretrained(model.config.vision_tower)
transform = ResizeLongestSide(args.image_size)

model.eval()
157
+
158
+
159
+ # Gradio
160
+ examples = [
161
+ [
162
+ "Please segment the affordance map of mug in this image.",
163
+ "/data/AffordanceNet/vis_output/my_workspace.JPG",
164
+ ],
165
+ ]
166
+ output_labels = ["Segmentation Output"]
167
+
168
+ title = "RAGNet: Large-scale Reasoning-based Affordance Segmentation Benchmark towards General Grasping"
169
+
170
+ description = """
171
+ <font size=4>
172
+ This is the online demo of AffordanceVLM. \n
173
+ **Note**: **Different prompts can lead to significantly varied results**. \n
174
+ **Note**: Please try to **standardize** your input text prompts to **avoid ambiguity**, and also pay attention to whether the **punctuations** of the input are correct. \n
175
+ **Note**: Current model is **AffordanceVLM-7B**. \n
176
+ **Usage**: <br>
177
+ To let AffordanceVLM **segment something**, input prompt like: "Can you segment the affordance map of xxx in this image?", "What is the affordance map of xxx in this image?"; <br>
178
+ </font>
179
+ """
180
+
181
+ article = """
182
+ <p style='text-align: center'>
183
+ <a href='https://arxiv.org/abs/2507.23734' target='_blank'>
184
+ Preprint Paper
185
+ </a>
186
+ \n
187
+ <p style='text-align: center'>
188
+ <a href='https://github.com/wudongming97/AffordanceNet' target='_blank'> Github Repo </a></p>
189
+ """
190
+
191
+
192
+ ## to be implemented
193
def inference(input_str, input_image):
    """Run AffordanceVLM on one (instruction, image) pair for the Gradio UI.

    Args:
        input_str: Free-form text instruction typed by the user.
        input_image: Filesystem path of the uploaded image
            (``gr.Image(type="filepath")``).

    Returns:
        (output_image, output_str): the input image with the predicted
        affordance mask blended in red (or an error placeholder image), and
        the model's text reply.
    """
    # Sanitize: strip any HTML/script content from the instruction.
    input_str = bleach.clean(input_str)

    print("input_str: ", input_str, "input_image: ", input_image)

    # Reject instructions containing characters outside a conservative
    # whitelist (letters, spaces and basic punctuation).
    if not re.match(r"^[A-Za-z ,.!?\'\"]+$", input_str) or len(input_str) < 1:
        # Fix: the original `"..." , input_str` built a tuple, not a string.
        output_str = "[Error] Invalid input: " + input_str
        output_image = cv2.imread("./resources/error_happened.png")[:, :, ::-1]
        return output_image, output_str

    # Build the conversation prompt with the image token(s) prepended.
    conv = conversation_lib.conv_templates[args.conv_type].copy()
    conv.messages = []

    prompt = input_str
    prompt = DEFAULT_IMAGE_TOKEN + "\n" + "You are an embodied robot. " + prompt
    if args.use_mm_start_end:
        replace_token = (
            DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
        )
        prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)

    conv.append_message(conv.roles[0], prompt)
    conv.append_message(conv.roles[1], "")
    prompt = conv.get_prompt()

    image_np = cv2.imread(input_image)

    # Archive the uploaded image with a timestamped name for later inspection.
    SAVE_DIR = "./gradio_images/"
    os.makedirs(SAVE_DIR, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{timestamp}.png"
    save_path = os.path.join(SAVE_DIR, filename)
    cv2.imwrite(save_path, image_np)

    image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
    original_size_list = [image_np.shape[:2]]

    # CLIP branch input.
    image_clip = (
        clip_image_processor.preprocess(image_np, return_tensors="pt")[
            "pixel_values"
        ][0]
        .unsqueeze(0)
        .cuda()
    )
    if args.precision == "bf16":
        image_clip = image_clip.bfloat16()
    elif args.precision == "fp16":
        image_clip = image_clip.half()
    else:
        image_clip = image_clip.float()

    # SAM branch input: longest side resized to args.image_size, then padded.
    image = transform.apply_image(image_np)
    resize_list = [image.shape[:2]]

    image = (
        preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous())
        .unsqueeze(0)
        .cuda()
    )
    if args.precision == "bf16":
        image = image.bfloat16()
    elif args.precision == "fp16":
        image = image.half()
    else:
        image = image.float()

    input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
    input_ids = input_ids.unsqueeze(0).cuda()

    output_ids, pred_masks = model.evaluate(
        image_clip,
        image,
        input_ids,
        resize_list,
        original_size_list,
        max_new_tokens=512,
        tokenizer=tokenizer,
    )
    # Drop image-token placeholders before decoding.
    output_ids = output_ids[0][output_ids[0] != IMAGE_TOKEN_INDEX]

    text_output = tokenizer.decode(output_ids, skip_special_tokens=False)
    # Fix: collapse double spaces (the original `.replace(" ", " ")` was a
    # no-op) and keep only the assistant's reply.
    text_output = text_output.replace("\n", "").replace("  ", " ")
    text_output = text_output.split("ASSISTANT: ")[-1].replace("</s>", "")

    print("text_output: ", text_output)
    save_img = None
    for pred_mask in pred_masks:
        if pred_mask.shape[0] == 0:
            continue

        pred_mask = pred_mask.detach().cpu().numpy()[0]
        pred_mask = pred_mask > 0

        # Blend a 50/50 red overlay onto the masked region.
        save_img = image_np.copy()
        save_img[pred_mask] = (
            image_np * 0.5
            + pred_mask[:, :, None].astype(np.uint8) * np.array([255, 0, 0]) * 0.5
        )[pred_mask]

    # Fix: the prefix was misspelled "ASSITANT: " in the original.
    output_str = "ASSISTANT: " + text_output
    if save_img is not None:
        output_image = save_img
    else:
        # Model produced no segmentation output; show a placeholder.
        output_image = cv2.imread("./resources/no_seg_out.png")[:, :, ::-1]
    return output_image, output_str
308
+
309
+
310
# Wire the inference function into a two-input / two-output Gradio UI.
demo = gr.Interface(
    inference,
    inputs=[
        gr.Textbox(lines=1, placeholder=None, label="Text Instruction"),
        gr.Image(type="filepath", label="Input Image"),
    ],
    outputs=[
        gr.Image(type="pil", label="Affordance Output"),
        gr.Textbox(lines=1, placeholder=None, label="Text Output"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
    allow_flagging="auto",
)

# Queue requests so concurrent users are served sequentially.
demo.queue()
# demo.launch()
# Listen on all interfaces on port 3200 (exposes the demo to the network).
demo.launch(server_name="0.0.0.0", server_port=3200)
batch_generate.sh ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Batch generate affordance masks for all four LIBERO subsets sequentially.

# Root of the per-frame source datasets and root for the generated masks.
SRC_ROOT="/gemini/space/wrz/libero_per_frame"
TGT_ROOT="/gemini/space/wrz/ragnet_results"

# Process each subset on GPU 0; input directories follow the
# "<subset>_converted" naming convention produced by the conversion script.
for ds in libero_object libero_goal libero_spatial libero_10; do
    echo "========== Processing ${ds} =========="
    CUDA_VISIBLE_DEVICES=0 python batch_generate.py \
        --data_dir "${SRC_ROOT}/${ds}_converted" \
        --save_dir "${TGT_ROOT}/${ds}"
    echo "========== ${ds} done =========="
    echo
done
batch_generate_prefill_accelerate.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Batch affordance mask generation for per-step datasets.
3
+
4
+ Reads a per-step dataset (converted by convert_lerobot_to_perstep.py) and
5
+ generates affordance masks for every image_primary.jpg and image_wrist.jpg
6
+ using AffordanceVLM.
7
+
8
+ Input structure:
9
+ {data_dir}/
10
+ ├── meta_info.h5
11
+ └── episodes/
12
+ └── {episode_id:06d}/
13
+ └── steps/
14
+ └── {step_id:04d}/
15
+ ├── other.h5 # language_instruction
16
+ ├── image_primary.jpg
17
+ └── image_wrist.jpg
18
+
19
+ Output structure:
20
+ {save_dir}/
21
+ └── episodes/
22
+ └── {episode_id:06d}/
23
+ └── steps/
24
+ └── {step_id:04d}/
25
+ ├── image_primary_mask.png # binary 0/255
26
+ └── image_wrist_mask.png
27
+
28
+ Usage:
29
+ CUDA_VISIBLE_DEVICES=1 python batch_generate_prefill_accelerate.py \
30
+ --data_dir /gemini/space/wrz/libero_per_frame/libero_spatial_converted \
31
+ --save_dir /gemini/space/wrz/ragnet_results/libero_spatial
32
+ """
33
+
34
+ import argparse
35
+ import os
36
+ import sys
37
+ from pathlib import Path
38
+
39
+ import cv2
40
+ import h5py
41
+ import numpy as np
42
+ import torch
43
+ import torch.nn.functional as F
44
+ from transformers import AutoTokenizer, BitsAndBytesConfig, CLIPImageProcessor
45
+
46
+ from model.AffordanceVLM import AffordanceVLMForCausalLM
47
+ from model.llava import conversation as conversation_lib
48
+ from model.llava.mm_utils import tokenizer_image_token
49
+ from model.segment_anything.utils.transforms import ResizeLongestSide
50
+ from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
51
+ DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
52
+
53
+
54
def parse_args(args):
    """Parse CLI options for batch affordance-mask generation.

    Args:
        args: Argument list (typically ``sys.argv[1:]``).

    Returns:
        The populated ``argparse.Namespace``.
    """
    ap = argparse.ArgumentParser(
        description="Batch affordance mask generation for per-step datasets"
    )
    # Model arguments (same as chat.py)
    ap.add_argument("--version", default="/gemini/code/AffordanceNet/ckpts/AffordanceVLM-7B")
    ap.add_argument(
        "--precision", default="bf16", type=str,
        choices=["fp32", "bf16", "fp16"],
    )
    ap.add_argument("--image_size", default=1024, type=int)
    ap.add_argument("--model_max_length", default=512, type=int)
    ap.add_argument("--lora_r", default=8, type=int)
    ap.add_argument("--vision-tower", default="openai/clip-vit-large-patch14", type=str)
    ap.add_argument("--local-rank", default=0, type=int)
    ap.add_argument("--load_in_8bit", action="store_true", default=False)
    ap.add_argument("--load_in_4bit", action="store_true", default=False)
    ap.add_argument("--use_mm_start_end", action="store_true", default=True)
    ap.add_argument(
        "--conv_type", default="llava_v1", type=str,
        choices=["llava_v1", "llava_llama_2"],
    )

    # Batch processing arguments
    ap.add_argument("--data_dir", type=str, required=True,
                    help="Root of per-step dataset (contains episodes/)")
    ap.add_argument("--save_dir", type=str, required=True,
                    help="Output directory for masks")
    ap.add_argument("--prompt_template", type=str,
                    default="{}",
                    help="Template wrapping language_instruction. Use {} as placeholder.")
    # Alternative templates tried during development:
    #   "Segment the most suitable manipulation region on the single target
    #    object for the task '{}'."
    #   "Segment the affordance map for the task '{}' in this image."
    #   "Segment the affordance map of the single target object for the task
    #    '{}' in this image."
    #   "Given the task instruction '{}', what is the affordance map of the
    #    target object in this image? Please output segmentation mask."
    #   "Given the task instruction '{}', what is the affordance map of the
    #    single target object in this image? There is only one target object.
    #    Please output segmentation mask."
    ap.add_argument("--start_episode", type=int, default=None,
                    help="First episode index to process (inclusive)")
    ap.add_argument("--end_episode", type=int, default=None,
                    help="Last episode index to process (exclusive)")
    return ap.parse_args(args)
96
+
97
+
98
def preprocess(
    x,
    pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
    pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
    img_size=1024,
) -> torch.Tensor:
    """Normalize pixel values and zero-pad the image to an img_size square.

    Args:
        x: image tensor with spatial dims last, i.e. (..., H, W).
        pixel_mean: per-channel mean, broadcast over H and W.
        pixel_std: per-channel std, broadcast over H and W.
        img_size: target side length; H and W are padded up to it.

    Returns:
        Normalized tensor padded on the bottom/right to (..., img_size, img_size).
    """
    normalized = (x - pixel_mean) / pixel_std
    height, width = normalized.shape[-2:]
    # Pad only bottom and right so pixel coordinates stay aligned to the origin.
    pad_bottom = img_size - height
    pad_right = img_size - width
    return F.pad(normalized, (0, pad_right, 0, pad_bottom))
111
+
112
+
113
def load_model(args):
    """Load tokenizer and model, identical to chat.py.

    Registers the [SEG]/[AFF] special tokens (storing their ids on ``args``),
    builds the AffordanceVLM model with the requested precision/quantization,
    moves the vision tower to the local device, and returns everything needed
    for inference.

    Returns:
        Tuple of (model, tokenizer, clip_image_processor, transform).
    """
    tokenizer = AutoTokenizer.from_pretrained(
        args.version,
        cache_dir=None,
        model_max_length=args.model_max_length,
        padding_side="right",
        use_fast=False,
    )
    tokenizer.pad_token = tokenizer.unk_token
    # Register the segmentation/affordance trigger tokens and record their ids
    # so the model can find them in the token stream.
    tokenizer.add_tokens("[SEG]")
    args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]
    tokenizer.add_tokens("[AFF]")
    args.aff_token_idx = tokenizer("[AFF]", add_special_tokens=False).input_ids[0]

    torch_dtype = torch.float32
    if args.precision == "bf16":
        torch_dtype = torch.bfloat16
    elif args.precision == "fp16":
        torch_dtype = torch.half

    kwargs = {"torch_dtype": torch_dtype}
    if args.load_in_4bit:
        # 4-bit NF4 quantization; the SAM visual model is excluded from quantization.
        kwargs.update({
            "torch_dtype": torch.half,
            "load_in_4bit": True,
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                llm_int8_skip_modules=["visual_model"],
            ),
        })
    elif args.load_in_8bit:
        kwargs.update({
            "torch_dtype": torch.half,
            "quantization_config": BitsAndBytesConfig(
                llm_int8_skip_modules=["visual_model"],
                load_in_8bit=True,
            ),
        })

    model = AffordanceVLMForCausalLM.from_pretrained(
        args.version,
        low_cpu_mem_usage=True,
        vision_tower=args.vision_tower,
        seg_token_idx=args.seg_token_idx,
        aff_token_idx=args.aff_token_idx,
        **kwargs,
    )

    # Sync special-token ids between tokenizer and model config.
    model.config.eos_token_id = tokenizer.eos_token_id
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    model.get_model().initialize_vision_modules(model.get_model().config)
    vision_tower = model.get_model().get_vision_tower()
    vision_tower.to(dtype=torch_dtype)

    if args.precision == "bf16":
        model = model.bfloat16().cuda()
    elif args.precision == "fp16" and (not args.load_in_4bit) and (not args.load_in_8bit):
        # DeepSpeed kernel injection cannot wrap the vision tower: detach it,
        # wrap the LLM, then re-attach the tower in fp16 on the GPU.
        vision_tower = model.get_model().get_vision_tower()
        model.model.vision_tower = None
        import deepspeed
        model_engine = deepspeed.init_inference(
            model=model,
            dtype=torch.half,
            replace_with_kernel_inject=True,
            replace_method="auto",
        )
        model = model_engine.module
        model.model.vision_tower = vision_tower.half().cuda()
    elif args.precision == "fp32":
        model = model.float().cuda()

    vision_tower = model.get_model().get_vision_tower()
    vision_tower.to(device=args.local_rank)

    clip_image_processor = CLIPImageProcessor.from_pretrained(model.config.vision_tower)
    # SAM-style resize: longest side scaled to args.image_size.
    transform = ResizeLongestSide(args.image_size)

    model.eval()
    return model, tokenizer, clip_image_processor, transform
198
+
199
+
200
def build_prompt(text: str, args) -> str:
    """Build the full conversation prompt from a text query.

    Wraps *text* into the conversation template selected by ``args.conv_type``,
    prefixing the image token and pre-filling the assistant turn with "[AFF]."
    for single-pass (prefill) inference.
    """
    conv = conversation_lib.conv_templates[args.conv_type].copy()
    conv.messages = []

    user_msg = DEFAULT_IMAGE_TOKEN + "\n" + "You are an embodied robot. " + text
    if args.use_mm_start_end:
        # Surround the image token with explicit start/end markers.
        wrapped = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
        user_msg = user_msg.replace(DEFAULT_IMAGE_TOKEN, wrapped)

    conv.append_message(conv.roles[0], user_msg)
    # Prefill the assistant response so no autoregressive decoding is needed.
    conv.append_message(conv.roles[1], "[AFF].")
    return conv.get_prompt()
215
+
216
+
217
def infer_single_image(
    image_path: str,
    prompt_str: str,
    model,
    tokenizer,
    clip_image_processor,
    transform,
    args,
) -> "np.ndarray | None":
    """Run inference on a single image. Returns binary mask (H, W) uint8 0/255 or None.

    Returns None when the image cannot be read or when the model produces no
    mask for any prediction slot.
    """
    image_np = cv2.imread(image_path)
    if image_np is None:
        print(f" [WARNING] Cannot read image: {image_path}")
        return None
    image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
    original_size_list = [image_np.shape[:2]]

    # CLIP preprocessing (for the LLaVA vision tower)
    image_clip = (
        clip_image_processor.preprocess(image_np, return_tensors="pt")["pixel_values"][0]
        .unsqueeze(0)
        .cuda()
    )
    if args.precision == "bf16":
        image_clip = image_clip.bfloat16()
    elif args.precision == "fp16":
        image_clip = image_clip.half()
    else:
        image_clip = image_clip.float()

    # SAM preprocessing: resize longest side, normalize, pad to a square
    image = transform.apply_image(image_np)
    resize_list = [image.shape[:2]]
    image = (
        preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous())
        .unsqueeze(0)
        .cuda()
    )
    if args.precision == "bf16":
        image = image.bfloat16()
    elif args.precision == "fp16":
        image = image.half()
    else:
        image = image.float()

    # Tokenize the prompt (image placeholder handled by tokenizer_image_token)
    input_ids = tokenizer_image_token(prompt_str, tokenizer, return_tensors="pt")
    input_ids = input_ids.unsqueeze(0).cuda()
    attention_masks = input_ids.ne(tokenizer.pad_token_id)

    # Prefill inference (single forward pass instead of autoregressive generation).
    # masks_list/label_list are dummy targets required by the training-style
    # forward signature; they do not influence the predicted masks.
    h, w = original_size_list[0]
    labels = input_ids.clone()
    offset = torch.LongTensor([0, 1]).cuda()
    masks_list = [torch.zeros(1, h, w).float().cuda()]
    label_list = [torch.zeros(h, w).long().cuda()]

    with torch.no_grad():
        output_dict = model(
            images=image,
            images_clip=image_clip,
            input_ids=input_ids,
            labels=labels,
            attention_masks=attention_masks,
            offset=offset,
            masks_list=masks_list,
            label_list=label_list,
            resize_list=resize_list,
            inference=True,
        )

    pred_masks = output_dict["pred_masks"]

    # Merge all predicted masks via union (logical OR); threshold logits at 0.
    merged = np.zeros((h, w), dtype=bool)
    has_mask = False
    for pred_mask in pred_masks:
        if pred_mask.shape[0] == 0:
            continue
        mask_np = pred_mask.detach().cpu().numpy()[0]  # (H, W)
        merged |= (mask_np > 0)
        has_mask = True

    if not has_mask:
        return None

    return (merged.astype(np.uint8) * 255)
304
+
305
+
306
def read_language_instruction(h5_path: str) -> str:
    """Read the language_instruction field from an other.h5 file.

    Decodes a bytes value as UTF-8; any other value is coerced with str().
    """
    with h5py.File(h5_path, "r") as h5_file:
        raw = h5_file["language_instruction"][()]
    return raw.decode("utf-8") if isinstance(raw, bytes) else str(raw)
313
+
314
+
315
def main(args):
    """Batch-generate affordance masks for every step of a per-step dataset.

    Walks ``data_dir/episodes/<ep>/steps/<step>/``, reads the language
    instruction from each step's other.h5, runs prefill inference on both
    camera images, and writes binary masks mirroring the input layout under
    ``save_dir``.
    """
    args = parse_args(args)
    data_dir = Path(args.data_dir)
    save_dir = Path(args.save_dir)

    episodes_dir = data_dir / "episodes"
    if not episodes_dir.is_dir():
        print(f"Error: episodes directory not found at {episodes_dir}")
        sys.exit(1)

    # Collect and sort episode directories (names are zero-padded, so
    # lexicographic order equals numeric order).
    episode_dirs = sorted(
        [d for d in episodes_dir.iterdir() if d.is_dir()],
        key=lambda p: p.name,
    )

    # Filter by episode range [start, end)
    if args.start_episode is not None or args.end_episode is not None:
        start = args.start_episode if args.start_episode is not None else 0
        end = args.end_episode if args.end_episode is not None else len(episode_dirs)
        episode_dirs = [
            d for d in episode_dirs
            if start <= int(d.name) < end
        ]

    print(f"Data dir : {data_dir}")
    print(f"Save dir : {save_dir}")
    print(f"Episodes : {len(episode_dirs)}")
    print(f"Prompt : {args.prompt_template}")
    print()

    # Load model once; reused for every image.
    print("Loading model...")
    model, tokenizer, clip_image_processor, transform = load_model(args)
    print("Model loaded.\n")

    total_steps = 0
    empty_mask_count = 0

    for ep_dir in episode_dirs:
        episode_id = ep_dir.name  # e.g. "000000"
        steps_dir = ep_dir / "steps"
        if not steps_dir.is_dir():
            print(f" [WARNING] No steps/ in {ep_dir}, skipping.")
            continue

        step_dirs = sorted(
            [d for d in steps_dir.iterdir() if d.is_dir()],
            key=lambda p: p.name,
        )

        for step_dir in step_dirs:
            step_id = step_dir.name  # e.g. "0000"

            # Read language instruction for this step
            other_h5 = step_dir / "other.h5"
            if not other_h5.exists():
                print(f" [WARNING] Missing other.h5 in {step_dir}, skipping.")
                continue
            language_instruction = read_language_instruction(str(other_h5))
            # debug
            # print(language_instruction)

            # Build prompt (same prompt is reused for both cameras of the step)
            query_text = args.prompt_template.format(language_instruction)
            prompt_str = build_prompt(query_text, args)

            # Output directory (same structure as input: episodes/{episode_id}/steps/{step_id}/)
            out_dir = save_dir / "episodes" / episode_id / "steps" / step_id
            out_dir.mkdir(parents=True, exist_ok=True)

            # Process both cameras
            for cam_name in ("image_primary", "image_wrist"):
                img_path = step_dir / f"{cam_name}.jpg"
                mask_path = out_dir / f"{cam_name}_mask.png"

                if not img_path.exists():
                    print(f" [WARNING] Missing {img_path}, skipping.")
                    continue

                mask = infer_single_image(
                    str(img_path), prompt_str,
                    model, tokenizer, clip_image_processor, transform, args,
                )

                if mask is None:
                    # Save blank mask and warn.
                    # NOTE(review): if infer_single_image returned None because
                    # the image itself was unreadable, this second imread also
                    # returns None and `.shape` raises AttributeError — confirm
                    # whether unreadable images should be skipped here instead.
                    h, w = cv2.imread(str(img_path)).shape[:2]
                    mask = np.zeros((h, w), dtype=np.uint8)
                    empty_mask_count += 1

                cv2.imwrite(str(mask_path), mask)

            total_steps += 1
            if total_steps % 50 == 0:
                print(f" Processed {total_steps} steps (episode {episode_id}, step {step_id})")

        print(f"Episode {episode_id} done ({len(step_dirs)} steps)")

    print(f"\nFinished. {total_steps} steps processed, {empty_mask_count} empty masks.")


if __name__ == "__main__":
    main(sys.argv[1:])
chat.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import sys
4
+
5
+ import cv2
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from transformers import AutoTokenizer, BitsAndBytesConfig, CLIPImageProcessor
10
+
11
+ from model.AffordanceVLM import AffordanceVLMForCausalLM
12
+ from model.llava import conversation as conversation_lib
13
+ from model.llava.mm_utils import tokenizer_image_token
14
+ from model.segment_anything.utils.transforms import ResizeLongestSide
15
+ from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
16
+ DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
17
+
18
+
19
def parse_args(args):
    """Parse command-line options for the interactive LISA chat demo."""
    parser = argparse.ArgumentParser(description="LISA chat")
    add = parser.add_argument
    add("--version", default="/gemini/code/AffordanceNet/ckpts/AffordanceVLM-7B")
    add("--vis_save_path", default="./vis_output", type=str)
    add("--precision", default="bf16", type=str,
        choices=["fp32", "bf16", "fp16"], help="precision for inference")
    add("--image_size", default=1024, type=int, help="image size")
    add("--model_max_length", default=512, type=int)
    add("--lora_r", default=8, type=int)
    add("--vision-tower", default="openai/clip-vit-large-patch14", type=str)
    add("--local-rank", default=0, type=int, help="node rank")
    add("--load_in_8bit", action="store_true", default=False)
    add("--load_in_4bit", action="store_true", default=False)
    add("--use_mm_start_end", action="store_true", default=True)
    add("--conv_type", default="llava_v1", type=str,
        choices=["llava_v1", "llava_llama_2"])
    return parser.parse_args(args)
47
+
48
+
49
def preprocess(
    x,
    pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
    pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
    img_size=1024,
) -> torch.Tensor:
    """Normalize pixel values and pad to a square input.

    Subtracts the per-channel mean, divides by the per-channel std, then
    zero-pads the bottom/right so both spatial dims equal ``img_size``.
    """
    normed = (x - pixel_mean) / pixel_std
    height, width = normed.shape[-2:]
    # Bottom/right padding keeps the image anchored at the top-left corner.
    return F.pad(normed, (0, img_size - width, 0, img_size - height))
64
+
65
+
66
def main(args):
    """Interactive chat loop: read a prompt and an image path, run generation,
    and write the predicted masks plus a red-overlay visualization to
    ``args.vis_save_path``.
    """
    args = parse_args(args)
    os.makedirs(args.vis_save_path, exist_ok=True)

    # Create model: tokenizer first, so [SEG]/[AFF] token ids can be passed in.
    tokenizer = AutoTokenizer.from_pretrained(
        args.version,
        cache_dir=None,
        model_max_length=args.model_max_length,
        padding_side="right",
        use_fast=False,
    )
    tokenizer.pad_token = tokenizer.unk_token
    num_added_tokens = tokenizer.add_tokens("[SEG]")
    args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]
    num_added_tokens = tokenizer.add_tokens("[AFF]")
    args.aff_token_idx = tokenizer("[AFF]", add_special_tokens=False).input_ids[0]

    torch_dtype = torch.float32
    if args.precision == "bf16":
        torch_dtype = torch.bfloat16
    elif args.precision == "fp16":
        torch_dtype = torch.half

    kwargs = {"torch_dtype": torch_dtype}
    if args.load_in_4bit:
        # 4-bit NF4 quantization; SAM visual model is kept un-quantized.
        kwargs.update(
            {
                "torch_dtype": torch.half,
                "load_in_4bit": True,
                "quantization_config": BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4",
                    llm_int8_skip_modules=["visual_model"],
                ),
            }
        )
    elif args.load_in_8bit:
        kwargs.update(
            {
                "torch_dtype": torch.half,
                "quantization_config": BitsAndBytesConfig(
                    llm_int8_skip_modules=["visual_model"],
                    load_in_8bit=True,
                ),
            }
        )

    model = AffordanceVLMForCausalLM.from_pretrained(
        args.version, low_cpu_mem_usage=True, vision_tower=args.vision_tower, seg_token_idx=args.seg_token_idx, aff_token_idx=args.aff_token_idx, **kwargs
    )

    # Sync special-token ids between tokenizer and model config.
    model.config.eos_token_id = tokenizer.eos_token_id
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    model.get_model().initialize_vision_modules(model.get_model().config)
    vision_tower = model.get_model().get_vision_tower()
    vision_tower.to(dtype=torch_dtype)

    if args.precision == "bf16":
        model = model.bfloat16().cuda()
    elif (
        args.precision == "fp16" and (not args.load_in_4bit) and (not args.load_in_8bit)
    ):
        # DeepSpeed kernel injection cannot wrap the vision tower: detach it,
        # wrap the LLM, then re-attach the tower in fp16.
        vision_tower = model.get_model().get_vision_tower()
        model.model.vision_tower = None
        import deepspeed

        model_engine = deepspeed.init_inference(
            model=model,
            dtype=torch.half,
            replace_with_kernel_inject=True,
            replace_method="auto",
        )
        model = model_engine.module
        model.model.vision_tower = vision_tower.half().cuda()
    elif args.precision == "fp32":
        model = model.float().cuda()

    vision_tower = model.get_model().get_vision_tower()
    vision_tower.to(device=args.local_rank)

    clip_image_processor = CLIPImageProcessor.from_pretrained(model.config.vision_tower)
    transform = ResizeLongestSide(args.image_size)

    model.eval()

    while True:
        # Fresh conversation per turn — history is not carried over.
        conv = conversation_lib.conv_templates[args.conv_type].copy()
        conv.messages = []

        prompt = input("Please input your prompt: ")
        prompt = DEFAULT_IMAGE_TOKEN + "\n" + "You are an embodied robot. " + prompt
        if args.use_mm_start_end:
            replace_token = (
                DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
            )
            prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)

        conv.append_message(conv.roles[0], prompt)
        # Empty assistant turn: the model generates the response autoregressively.
        conv.append_message(conv.roles[1], "")
        prompt = conv.get_prompt()

        image_path = input("Please input the image path: ")
        if not os.path.exists(image_path):
            print("File not found in {}".format(image_path))
            continue

        image_np = cv2.imread(image_path)
        image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
        original_size_list = [image_np.shape[:2]]

        # CLIP preprocessing (LLaVA vision tower input)
        image_clip = (
            clip_image_processor.preprocess(image_np, return_tensors="pt")[
                "pixel_values"
            ][0]
            .unsqueeze(0)
            .cuda()
        )
        if args.precision == "bf16":
            image_clip = image_clip.bfloat16()
        elif args.precision == "fp16":
            image_clip = image_clip.half()
        else:
            image_clip = image_clip.float()

        # SAM preprocessing (resize longest side + normalize + pad)
        image = transform.apply_image(image_np)
        resize_list = [image.shape[:2]]

        image = (
            preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous())
            .unsqueeze(0)
            .cuda()
        )
        if args.precision == "bf16":
            image = image.bfloat16()
        elif args.precision == "fp16":
            image = image.half()
        else:
            image = image.float()

        input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
        input_ids = input_ids.unsqueeze(0).cuda()

        # Autoregressive generation plus mask decoding.
        output_ids, pred_masks = model.evaluate(
            image_clip,
            image,
            input_ids,
            resize_list,
            original_size_list,
            max_new_tokens=512,
            tokenizer=tokenizer,
        )
        # Drop image placeholder tokens before decoding to text.
        output_ids = output_ids[0][output_ids[0] != IMAGE_TOKEN_INDEX]

        text_output = tokenizer.decode(output_ids, skip_special_tokens=False)
        # NOTE(review): this replace is a no-op as written; likely intended to
        # collapse double spaces ("  " -> " ") — confirm against upstream LISA.
        text_output = text_output.replace("\n", "").replace(" ", " ")
        print("text_output: ", text_output)

        for i, pred_mask in enumerate(pred_masks):
            if pred_mask.shape[0] == 0:
                continue

            # Threshold logits at 0 to get a boolean mask.
            pred_mask = pred_mask.detach().cpu().numpy()[0]
            pred_mask = pred_mask > 0

            save_path = "{}/{}_mask_{}.jpg".format(
                args.vis_save_path, image_path.split("/")[-1].split(".")[0], i
            )
            # Scale to 100 so the binary mask is visible in an 8-bit image.
            cv2.imwrite(save_path, pred_mask * 100)
            print("{} has been saved.".format(save_path))

            save_path = "{}/{}_masked_img_{}.jpg".format(
                args.vis_save_path, image_path.split("/")[-1].split(".")[0], i
            )
            # Blend a red overlay (50/50) onto masked pixels only.
            save_img = image_np.copy()
            save_img[pred_mask] = (
                image_np * 0.5
                + pred_mask[:, :, None].astype(np.uint8) * np.array([255, 0, 0]) * 0.5
            )[pred_mask]
            save_img = cv2.cvtColor(save_img, cv2.COLOR_RGB2BGR)
            cv2.imwrite(save_path, save_img)
            print("{} has been saved.".format(save_path))


if __name__ == "__main__":
    main(sys.argv[1:])
chat_prefill.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Interactive affordance mask generation using prefill mode (single forward pass).
3
+
4
+ Same interactive workflow as chat.py, but uses prefill inference instead of
5
+ autoregressive generation. The assistant response "[AFF]." is pre-filled in the
6
+ prompt, so the model only does one forward pass to extract mask embeddings.
7
+ """
8
+
9
+ import argparse
10
+ import os
11
+ import sys
12
+
13
+ import cv2
14
+ import numpy as np
15
+ import torch
16
+ import torch.nn.functional as F
17
+ from transformers import AutoTokenizer, BitsAndBytesConfig, CLIPImageProcessor
18
+
19
+ from model.AffordanceVLM import AffordanceVLMForCausalLM
20
+ from model.llava import conversation as conversation_lib
21
+ from model.llava.mm_utils import tokenizer_image_token
22
+ from model.segment_anything.utils.transforms import ResizeLongestSide
23
+ from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
24
+ DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
25
+
26
+
27
def parse_args(args):
    """Parse command-line options for the prefill-mode chat demo."""
    parser = argparse.ArgumentParser(description="AffordanceVLM chat (prefill mode)")
    add = parser.add_argument
    add("--version", default="/gemini/code/AffordanceNet/ckpts/AffordanceVLM-7B")
    add("--vis_save_path", default="./vis_output_prefill", type=str)
    add("--precision", default="bf16", type=str, choices=["fp32", "bf16", "fp16"])
    add("--image_size", default=1024, type=int)
    add("--model_max_length", default=512, type=int)
    add("--lora_r", default=8, type=int)
    add("--vision-tower", default="openai/clip-vit-large-patch14", type=str)
    add("--local-rank", default=0, type=int)
    add("--load_in_8bit", action="store_true", default=False)
    add("--load_in_4bit", action="store_true", default=False)
    add("--use_mm_start_end", action="store_true", default=True)
    add("--conv_type", default="llava_v1", type=str,
        choices=["llava_v1", "llava_llama_2"])
    add("--prompt_template", type=str,
        default="Segment the most suitable manipulation region on the single target object for the task '{}'.",
        help="Template wrapping language_instruction. Use {} as placeholder.")
    # Alternative templates tried during development:
    # Segment the affordance map for the task '{}' in this image.
    # Segment the affordance map of the single target object for the task '{}' in this image.
    # Given the task instruction '{}', what is the affordance map of the target object in this image? Please output segmentation mask.
    # Given the task instruction '{}', what is the affordance map of the single target object in this image? There is only one target object. Please output segmentation mask.
    return parser.parse_args(args)
56
+
57
+
58
def preprocess(
    x,
    pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
    pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
    img_size=1024,
) -> torch.Tensor:
    """Normalize pixel values and pad to a square input of side img_size."""
    standardized = (x - pixel_mean) / pixel_std
    h_cur, w_cur = standardized.shape[-2:]
    # Zero-pad bottom and right edges up to the square target size.
    pad_spec = (0, img_size - w_cur, 0, img_size - h_cur)
    return F.pad(standardized, pad_spec)
71
+
72
+
73
def main(args):
    """Interactive prefill-mode loop: read a task prompt and an image path,
    run a single forward pass (assistant turn pre-filled with "[AFF]."), and
    write the predicted masks plus a red-overlay visualization to
    ``args.vis_save_path``.
    """
    args = parse_args(args)
    os.makedirs(args.vis_save_path, exist_ok=True)

    # Create model: tokenizer first, so [SEG]/[AFF] token ids can be passed in.
    tokenizer = AutoTokenizer.from_pretrained(
        args.version,
        cache_dir=None,
        model_max_length=args.model_max_length,
        padding_side="right",
        use_fast=False,
    )
    tokenizer.pad_token = tokenizer.unk_token
    tokenizer.add_tokens("[SEG]")
    args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]
    tokenizer.add_tokens("[AFF]")
    args.aff_token_idx = tokenizer("[AFF]", add_special_tokens=False).input_ids[0]

    torch_dtype = torch.float32
    if args.precision == "bf16":
        torch_dtype = torch.bfloat16
    elif args.precision == "fp16":
        torch_dtype = torch.half

    kwargs = {"torch_dtype": torch_dtype}
    if args.load_in_4bit:
        # 4-bit NF4 quantization; SAM visual model stays un-quantized.
        kwargs.update({
            "torch_dtype": torch.half,
            "load_in_4bit": True,
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                llm_int8_skip_modules=["visual_model"],
            ),
        })
    elif args.load_in_8bit:
        kwargs.update({
            "torch_dtype": torch.half,
            "quantization_config": BitsAndBytesConfig(
                llm_int8_skip_modules=["visual_model"],
                load_in_8bit=True,
            ),
        })

    model = AffordanceVLMForCausalLM.from_pretrained(
        args.version,
        low_cpu_mem_usage=True,
        vision_tower=args.vision_tower,
        seg_token_idx=args.seg_token_idx,
        aff_token_idx=args.aff_token_idx,
        **kwargs,
    )

    # Sync special-token ids between tokenizer and model config.
    model.config.eos_token_id = tokenizer.eos_token_id
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    model.get_model().initialize_vision_modules(model.get_model().config)
    vision_tower = model.get_model().get_vision_tower()
    vision_tower.to(dtype=torch_dtype)

    if args.precision == "bf16":
        model = model.bfloat16().cuda()
    elif args.precision == "fp16" and (not args.load_in_4bit) and (not args.load_in_8bit):
        # DeepSpeed kernel injection cannot wrap the vision tower: detach it,
        # wrap the LLM, then re-attach the tower in fp16.
        vision_tower = model.get_model().get_vision_tower()
        model.model.vision_tower = None
        import deepspeed
        model_engine = deepspeed.init_inference(
            model=model,
            dtype=torch.half,
            replace_with_kernel_inject=True,
            replace_method="auto",
        )
        model = model_engine.module
        model.model.vision_tower = vision_tower.half().cuda()
    elif args.precision == "fp32":
        model = model.float().cuda()

    vision_tower = model.get_model().get_vision_tower()
    vision_tower.to(device=args.local_rank)

    clip_image_processor = CLIPImageProcessor.from_pretrained(model.config.vision_tower)
    transform = ResizeLongestSide(args.image_size)

    model.eval()

    # debug (unused; kept from development)
    template = "Given the task instruction '{}', what is the affordance map of the target object in this image? Please output segmentation mask."

    while True:
        # Fresh conversation per turn — history is not carried over.
        conv = conversation_lib.conv_templates[args.conv_type].copy()
        conv.messages = []

        prompt = input("Please input your prompt: ")
        # Apply the prompt template to the raw instruction.
        prompt = args.prompt_template.format(prompt)

        prompt = DEFAULT_IMAGE_TOKEN + "\n" + "You are an embodied robot. " + prompt
        if args.use_mm_start_end:
            replace_token = (
                DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
            )
            prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)

        conv.append_message(conv.roles[0], prompt)
        # Prefill the assistant turn so only one forward pass is needed.
        conv.append_message(conv.roles[1], "[AFF].")
        prompt = conv.get_prompt()

        image_path = input("Please input the image path: ")
        if not os.path.exists(image_path):
            print("File not found in {}".format(image_path))
            continue

        image_np = cv2.imread(image_path)
        image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
        original_size_list = [image_np.shape[:2]]
        h, w = original_size_list[0]

        # CLIP preprocessing (LLaVA vision tower input)
        image_clip = (
            clip_image_processor.preprocess(image_np, return_tensors="pt")[
                "pixel_values"
            ][0]
            .unsqueeze(0)
            .cuda()
        )
        if args.precision == "bf16":
            image_clip = image_clip.bfloat16()
        elif args.precision == "fp16":
            image_clip = image_clip.half()
        else:
            image_clip = image_clip.float()

        # SAM preprocessing (resize longest side + normalize + pad)
        image = transform.apply_image(image_np)
        resize_list = [image.shape[:2]]

        image = (
            preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous())
            .unsqueeze(0)
            .cuda()
        )
        if args.precision == "bf16":
            image = image.bfloat16()
        elif args.precision == "fp16":
            image = image.half()
        else:
            image = image.float()

        input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
        input_ids = input_ids.unsqueeze(0).cuda()
        attention_masks = input_ids.ne(tokenizer.pad_token_id)

        # Print the full prompt text (prefill mode has no generated text)
        # debug
        text_ids = input_ids[0][input_ids[0] != IMAGE_TOKEN_INDEX]
        text_output = tokenizer.decode(text_ids, skip_special_tokens=False)
        # NOTE(review): this replace is a no-op as written; likely intended to
        # collapse double spaces ("  " -> " ") — confirm against chat.py.
        text_output = text_output.replace("\n", "").replace(" ", " ")
        print("text_output: ", text_output)

        # Prefill inference: dummy labels/masks satisfy the training-style
        # forward signature; they do not affect the predicted masks.
        labels = input_ids.clone()
        offset = torch.LongTensor([0, 1]).cuda()
        masks_list = [torch.zeros(1, h, w).float().cuda()]
        label_list = [torch.zeros(h, w).long().cuda()]

        with torch.no_grad():
            output_dict = model(
                images=image,
                images_clip=image_clip,
                input_ids=input_ids,
                labels=labels,
                attention_masks=attention_masks,
                offset=offset,
                masks_list=masks_list,
                label_list=label_list,
                resize_list=resize_list,
                inference=True,
            )

        pred_masks = output_dict["pred_masks"]

        for i, pred_mask in enumerate(pred_masks):
            if pred_mask.shape[0] == 0:
                continue

            # Threshold logits at 0 to get a boolean mask.
            pred_mask = pred_mask.detach().cpu().numpy()[0]
            pred_mask = pred_mask > 0

            save_path = "{}/{}_mask_{}.jpg".format(
                args.vis_save_path, image_path.split("/")[-1].split(".")[0], i
            )
            # Scale to 100 so the binary mask is visible in an 8-bit image.
            cv2.imwrite(save_path, pred_mask * 100)
            print("{} has been saved.".format(save_path))

            save_path = "{}/{}_masked_img_{}.jpg".format(
                args.vis_save_path, image_path.split("/")[-1].split(".")[0], i
            )
            # Blend a red overlay (50/50) onto masked pixels only.
            save_img = image_np.copy()
            save_img[pred_mask] = (
                image_np * 0.5
                + pred_mask[:, :, None].astype(np.uint8) * np.array([255, 0, 0]) * 0.5
            )[pred_mask]
            save_img = cv2.cvtColor(save_img, cv2.COLOR_RGB2BGR)
            cv2.imwrite(save_path, save_img)
            print("{} has been saved.".format(save_path))


if __name__ == "__main__":
    main(sys.argv[1:])
ckpts/AffordanceVLM-7B/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
ckpts/AffordanceVLM-7B/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
ckpts/AffordanceVLM-7B/added_tokens.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "<im_end>": 32002,
3
+ "<im_patch>": 32000,
4
+ "<im_start>": 32001,
5
+ "[AFF]": 32004,
6
+ "[SEG]": 32003
7
+ }
ckpts/AffordanceVLM-7B/config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./LLaVA/LLaVA-Lightning-7B-v1-1",
3
+ "architectures": [
4
+ "AffordanceVLMForCausalLM"
5
+ ],
6
+ "bos_token_id": 1,
7
+ "eos_token_id": 2,
8
+ "freeze_mm_mlp_adapter": true,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 4096,
11
+ "image_aspect_ratio": "square",
12
+ "image_grid_pinpoints": null,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 11008,
15
+ "max_position_embeddings": 2048,
16
+ "max_sequence_length": 2048,
17
+ "mm_hidden_size": 1024,
18
+ "mm_use_im_patch_token": false,
19
+ "mm_use_im_start_end": true,
20
+ "mm_vision_select_feature": "patch",
21
+ "mm_vision_select_layer": -2,
22
+ "mm_vision_tower": "openai/clip-vit-large-patch14",
23
+ "model_type": "llava",
24
+ "num_attention_heads": 32,
25
+ "num_hidden_layers": 32,
26
+ "num_key_value_heads": 32,
27
+ "out_dim": 256,
28
+ "pad_token_id": 0,
29
+ "pretrain_mm_mlp_adapter": null,
30
+ "pretraining_tp": 1,
31
+ "rms_norm_eps": 1e-06,
32
+ "rope_scaling": null,
33
+ "tie_word_embeddings": false,
34
+ "torch_dtype": "bfloat16",
35
+ "train_mask_decoder": true,
36
+ "transformers_version": "4.31.0",
37
+ "tune_mm_mlp_adapter": false,
38
+ "use_cache": false,
39
+ "use_mm_proj": true,
40
+ "vision_tower": "openai/clip-vit-large-patch14",
41
+ "vocab_size": 32005
42
+ }
ckpts/AffordanceVLM-7B/eval_result.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ dataset: handal_all, giou: 0.60872483253479, ciou: 0.6054294109344482
ckpts/AffordanceVLM-7B/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.31.0"
7
+ }
ckpts/AffordanceVLM-7B/pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efdb3ff9accdd733412d083c770ba34ae1c6745b28e2bae07d3546dc9356bfec
3
+ size 9976675518
ckpts/AffordanceVLM-7B/pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7259eabdd3c03be21d45a328177ac3e46e1385cbc5ff2d757cd8bb70dec81ae9
3
+ size 6144654233
ckpts/AffordanceVLM-7B/pytorch_model.bin.index.json ADDED
@@ -0,0 +1,930 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 16121002176
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "pytorch_model-00002-of-00002.bin",
7
+ "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
8
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
9
+ "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
10
+ "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
11
+ "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
12
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
13
+ "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
14
+ "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
15
+ "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
16
+ "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
17
+ "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
18
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
19
+ "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
20
+ "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
21
+ "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
22
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
23
+ "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
24
+ "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
25
+ "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
26
+ "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
27
+ "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
28
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
29
+ "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
30
+ "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
31
+ "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
32
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
33
+ "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
34
+ "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
35
+ "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
36
+ "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
37
+ "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
38
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
39
+ "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
40
+ "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
41
+ "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
42
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
43
+ "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
44
+ "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
45
+ "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
46
+ "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
47
+ "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
48
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
49
+ "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
50
+ "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
51
+ "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
52
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
53
+ "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
54
+ "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
55
+ "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
56
+ "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
57
+ "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
58
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
59
+ "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
60
+ "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
61
+ "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
62
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
63
+ "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
64
+ "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
65
+ "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
66
+ "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
67
+ "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
68
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
69
+ "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
70
+ "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
71
+ "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
72
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
73
+ "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
74
+ "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
75
+ "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
76
+ "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
77
+ "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
78
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
79
+ "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
80
+ "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
81
+ "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
82
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
83
+ "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
84
+ "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
85
+ "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
86
+ "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
87
+ "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
88
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
89
+ "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
90
+ "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
91
+ "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
92
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
93
+ "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
94
+ "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
95
+ "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
96
+ "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
97
+ "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
98
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
99
+ "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
100
+ "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
101
+ "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
102
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
103
+ "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
104
+ "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
105
+ "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
106
+ "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
107
+ "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
108
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
109
+ "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
110
+ "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
111
+ "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
112
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
113
+ "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
114
+ "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
115
+ "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
116
+ "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
117
+ "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
118
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
119
+ "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
120
+ "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
121
+ "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
122
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
123
+ "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
124
+ "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
125
+ "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
126
+ "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
127
+ "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
128
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
129
+ "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
130
+ "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
131
+ "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
132
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
133
+ "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
134
+ "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
135
+ "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
136
+ "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
137
+ "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
138
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
139
+ "model.layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
140
+ "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
141
+ "model.layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
142
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
143
+ "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
144
+ "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
145
+ "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
146
+ "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
147
+ "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
148
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
149
+ "model.layers.21.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
150
+ "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
151
+ "model.layers.21.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
152
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
153
+ "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
154
+ "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
155
+ "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
156
+ "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
157
+ "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
158
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
159
+ "model.layers.22.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
160
+ "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
161
+ "model.layers.22.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
162
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
163
+ "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
164
+ "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
165
+ "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
166
+ "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
167
+ "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
168
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
169
+ "model.layers.23.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
170
+ "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
171
+ "model.layers.23.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
172
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
173
+ "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
174
+ "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
175
+ "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
176
+ "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
177
+ "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
178
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
179
+ "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
180
+ "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
181
+ "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
182
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
183
+ "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
184
+ "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
185
+ "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
186
+ "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
187
+ "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
188
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
189
+ "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
190
+ "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
191
+ "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
192
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
193
+ "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
194
+ "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
195
+ "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
196
+ "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
197
+ "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
198
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
199
+ "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
200
+ "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
201
+ "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
202
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
203
+ "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
204
+ "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
205
+ "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
206
+ "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
207
+ "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
208
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
209
+ "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
210
+ "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
211
+ "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
212
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
213
+ "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
214
+ "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
215
+ "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
216
+ "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
217
+ "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
218
+ "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
219
+ "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
220
+ "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
221
+ "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
222
+ "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
223
+ "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
224
+ "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
225
+ "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
226
+ "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
227
+ "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
228
+ "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
229
+ "model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
230
+ "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
231
+ "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
232
+ "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
233
+ "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
234
+ "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
235
+ "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
236
+ "model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
237
+ "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
238
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
239
+ "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
240
+ "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
241
+ "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
242
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
243
+ "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
244
+ "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
245
+ "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
246
+ "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
247
+ "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
248
+ "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
249
+ "model.layers.30.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
250
+ "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
251
+ "model.layers.30.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
252
+ "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
253
+ "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
254
+ "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
255
+ "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
256
+ "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
257
+ "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
258
+ "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
259
+ "model.layers.31.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
260
+ "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
261
+ "model.layers.31.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
262
+ "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
263
+ "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
264
+ "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
265
+ "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
266
+ "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
267
+ "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
268
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
269
+ "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
270
+ "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
271
+ "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
272
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
273
+ "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
274
+ "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
275
+ "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
276
+ "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
277
+ "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
278
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
279
+ "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
280
+ "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
281
+ "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
282
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
283
+ "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
284
+ "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
285
+ "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
286
+ "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
287
+ "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
288
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
289
+ "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
290
+ "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
291
+ "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
292
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
293
+ "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
294
+ "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
295
+ "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
296
+ "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
297
+ "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
298
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
299
+ "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
300
+ "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
301
+ "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
302
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
303
+ "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
304
+ "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
305
+ "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
306
+ "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
307
+ "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
308
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
309
+ "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
310
+ "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
311
+ "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
312
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
313
+ "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
314
+ "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
315
+ "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
316
+ "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
317
+ "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
318
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
319
+ "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
320
+ "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
321
+ "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
322
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
323
+ "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
324
+ "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
325
+ "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
326
+ "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
327
+ "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
328
+ "model.mm_projector.bias": "pytorch_model-00002-of-00002.bin",
329
+ "model.mm_projector.weight": "pytorch_model-00002-of-00002.bin",
330
+ "model.norm.weight": "pytorch_model-00002-of-00002.bin",
331
+ "model.text_hidden_fcs.0.0.bias": "pytorch_model-00002-of-00002.bin",
332
+ "model.text_hidden_fcs.0.0.weight": "pytorch_model-00002-of-00002.bin",
333
+ "model.text_hidden_fcs.0.2.bias": "pytorch_model-00002-of-00002.bin",
334
+ "model.text_hidden_fcs.0.2.weight": "pytorch_model-00002-of-00002.bin",
335
+ "model.visual_model.image_encoder.blocks.0.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
336
+ "model.visual_model.image_encoder.blocks.0.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
337
+ "model.visual_model.image_encoder.blocks.0.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
338
+ "model.visual_model.image_encoder.blocks.0.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
339
+ "model.visual_model.image_encoder.blocks.0.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
340
+ "model.visual_model.image_encoder.blocks.0.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
341
+ "model.visual_model.image_encoder.blocks.0.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
342
+ "model.visual_model.image_encoder.blocks.0.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
343
+ "model.visual_model.image_encoder.blocks.0.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
344
+ "model.visual_model.image_encoder.blocks.0.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
345
+ "model.visual_model.image_encoder.blocks.0.norm1.bias": "pytorch_model-00002-of-00002.bin",
346
+ "model.visual_model.image_encoder.blocks.0.norm1.weight": "pytorch_model-00002-of-00002.bin",
347
+ "model.visual_model.image_encoder.blocks.0.norm2.bias": "pytorch_model-00002-of-00002.bin",
348
+ "model.visual_model.image_encoder.blocks.0.norm2.weight": "pytorch_model-00002-of-00002.bin",
349
+ "model.visual_model.image_encoder.blocks.1.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
350
+ "model.visual_model.image_encoder.blocks.1.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
351
+ "model.visual_model.image_encoder.blocks.1.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
352
+ "model.visual_model.image_encoder.blocks.1.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
353
+ "model.visual_model.image_encoder.blocks.1.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
354
+ "model.visual_model.image_encoder.blocks.1.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
355
+ "model.visual_model.image_encoder.blocks.1.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
356
+ "model.visual_model.image_encoder.blocks.1.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
357
+ "model.visual_model.image_encoder.blocks.1.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
358
+ "model.visual_model.image_encoder.blocks.1.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
359
+ "model.visual_model.image_encoder.blocks.1.norm1.bias": "pytorch_model-00002-of-00002.bin",
360
+ "model.visual_model.image_encoder.blocks.1.norm1.weight": "pytorch_model-00002-of-00002.bin",
361
+ "model.visual_model.image_encoder.blocks.1.norm2.bias": "pytorch_model-00002-of-00002.bin",
362
+ "model.visual_model.image_encoder.blocks.1.norm2.weight": "pytorch_model-00002-of-00002.bin",
363
+ "model.visual_model.image_encoder.blocks.10.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
364
+ "model.visual_model.image_encoder.blocks.10.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
365
+ "model.visual_model.image_encoder.blocks.10.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
366
+ "model.visual_model.image_encoder.blocks.10.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
367
+ "model.visual_model.image_encoder.blocks.10.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
368
+ "model.visual_model.image_encoder.blocks.10.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
369
+ "model.visual_model.image_encoder.blocks.10.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
370
+ "model.visual_model.image_encoder.blocks.10.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
371
+ "model.visual_model.image_encoder.blocks.10.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
372
+ "model.visual_model.image_encoder.blocks.10.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
373
+ "model.visual_model.image_encoder.blocks.10.norm1.bias": "pytorch_model-00002-of-00002.bin",
374
+ "model.visual_model.image_encoder.blocks.10.norm1.weight": "pytorch_model-00002-of-00002.bin",
375
+ "model.visual_model.image_encoder.blocks.10.norm2.bias": "pytorch_model-00002-of-00002.bin",
376
+ "model.visual_model.image_encoder.blocks.10.norm2.weight": "pytorch_model-00002-of-00002.bin",
377
+ "model.visual_model.image_encoder.blocks.11.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
378
+ "model.visual_model.image_encoder.blocks.11.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
379
+ "model.visual_model.image_encoder.blocks.11.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
380
+ "model.visual_model.image_encoder.blocks.11.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
381
+ "model.visual_model.image_encoder.blocks.11.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
382
+ "model.visual_model.image_encoder.blocks.11.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
383
+ "model.visual_model.image_encoder.blocks.11.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
384
+ "model.visual_model.image_encoder.blocks.11.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
385
+ "model.visual_model.image_encoder.blocks.11.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
386
+ "model.visual_model.image_encoder.blocks.11.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
387
+ "model.visual_model.image_encoder.blocks.11.norm1.bias": "pytorch_model-00002-of-00002.bin",
388
+ "model.visual_model.image_encoder.blocks.11.norm1.weight": "pytorch_model-00002-of-00002.bin",
389
+ "model.visual_model.image_encoder.blocks.11.norm2.bias": "pytorch_model-00002-of-00002.bin",
390
+ "model.visual_model.image_encoder.blocks.11.norm2.weight": "pytorch_model-00002-of-00002.bin",
391
+ "model.visual_model.image_encoder.blocks.12.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
392
+ "model.visual_model.image_encoder.blocks.12.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
393
+ "model.visual_model.image_encoder.blocks.12.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
394
+ "model.visual_model.image_encoder.blocks.12.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
395
+ "model.visual_model.image_encoder.blocks.12.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
396
+ "model.visual_model.image_encoder.blocks.12.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
397
+ "model.visual_model.image_encoder.blocks.12.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
398
+ "model.visual_model.image_encoder.blocks.12.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
399
+ "model.visual_model.image_encoder.blocks.12.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
400
+ "model.visual_model.image_encoder.blocks.12.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
401
+ "model.visual_model.image_encoder.blocks.12.norm1.bias": "pytorch_model-00002-of-00002.bin",
402
+ "model.visual_model.image_encoder.blocks.12.norm1.weight": "pytorch_model-00002-of-00002.bin",
403
+ "model.visual_model.image_encoder.blocks.12.norm2.bias": "pytorch_model-00002-of-00002.bin",
404
+ "model.visual_model.image_encoder.blocks.12.norm2.weight": "pytorch_model-00002-of-00002.bin",
405
+ "model.visual_model.image_encoder.blocks.13.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
406
+ "model.visual_model.image_encoder.blocks.13.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
407
+ "model.visual_model.image_encoder.blocks.13.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
408
+ "model.visual_model.image_encoder.blocks.13.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
409
+ "model.visual_model.image_encoder.blocks.13.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
410
+ "model.visual_model.image_encoder.blocks.13.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
411
+ "model.visual_model.image_encoder.blocks.13.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
412
+ "model.visual_model.image_encoder.blocks.13.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
413
+ "model.visual_model.image_encoder.blocks.13.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
414
+ "model.visual_model.image_encoder.blocks.13.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
415
+ "model.visual_model.image_encoder.blocks.13.norm1.bias": "pytorch_model-00002-of-00002.bin",
416
+ "model.visual_model.image_encoder.blocks.13.norm1.weight": "pytorch_model-00002-of-00002.bin",
417
+ "model.visual_model.image_encoder.blocks.13.norm2.bias": "pytorch_model-00002-of-00002.bin",
418
+ "model.visual_model.image_encoder.blocks.13.norm2.weight": "pytorch_model-00002-of-00002.bin",
419
+ "model.visual_model.image_encoder.blocks.14.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
420
+ "model.visual_model.image_encoder.blocks.14.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
421
+ "model.visual_model.image_encoder.blocks.14.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
422
+ "model.visual_model.image_encoder.blocks.14.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
423
+ "model.visual_model.image_encoder.blocks.14.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
424
+ "model.visual_model.image_encoder.blocks.14.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
425
+ "model.visual_model.image_encoder.blocks.14.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
426
+ "model.visual_model.image_encoder.blocks.14.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
427
+ "model.visual_model.image_encoder.blocks.14.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
428
+ "model.visual_model.image_encoder.blocks.14.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
429
+ "model.visual_model.image_encoder.blocks.14.norm1.bias": "pytorch_model-00002-of-00002.bin",
430
+ "model.visual_model.image_encoder.blocks.14.norm1.weight": "pytorch_model-00002-of-00002.bin",
431
+ "model.visual_model.image_encoder.blocks.14.norm2.bias": "pytorch_model-00002-of-00002.bin",
432
+ "model.visual_model.image_encoder.blocks.14.norm2.weight": "pytorch_model-00002-of-00002.bin",
433
+ "model.visual_model.image_encoder.blocks.15.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
434
+ "model.visual_model.image_encoder.blocks.15.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
435
+ "model.visual_model.image_encoder.blocks.15.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
436
+ "model.visual_model.image_encoder.blocks.15.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
437
+ "model.visual_model.image_encoder.blocks.15.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
438
+ "model.visual_model.image_encoder.blocks.15.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
439
+ "model.visual_model.image_encoder.blocks.15.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
440
+ "model.visual_model.image_encoder.blocks.15.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
441
+ "model.visual_model.image_encoder.blocks.15.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
442
+ "model.visual_model.image_encoder.blocks.15.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
443
+ "model.visual_model.image_encoder.blocks.15.norm1.bias": "pytorch_model-00002-of-00002.bin",
444
+ "model.visual_model.image_encoder.blocks.15.norm1.weight": "pytorch_model-00002-of-00002.bin",
445
+ "model.visual_model.image_encoder.blocks.15.norm2.bias": "pytorch_model-00002-of-00002.bin",
446
+ "model.visual_model.image_encoder.blocks.15.norm2.weight": "pytorch_model-00002-of-00002.bin",
447
+ "model.visual_model.image_encoder.blocks.16.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
448
+ "model.visual_model.image_encoder.blocks.16.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
449
+ "model.visual_model.image_encoder.blocks.16.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
450
+ "model.visual_model.image_encoder.blocks.16.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
451
+ "model.visual_model.image_encoder.blocks.16.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
452
+ "model.visual_model.image_encoder.blocks.16.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
453
+ "model.visual_model.image_encoder.blocks.16.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
454
+ "model.visual_model.image_encoder.blocks.16.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
455
+ "model.visual_model.image_encoder.blocks.16.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
456
+ "model.visual_model.image_encoder.blocks.16.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
457
+ "model.visual_model.image_encoder.blocks.16.norm1.bias": "pytorch_model-00002-of-00002.bin",
458
+ "model.visual_model.image_encoder.blocks.16.norm1.weight": "pytorch_model-00002-of-00002.bin",
459
+ "model.visual_model.image_encoder.blocks.16.norm2.bias": "pytorch_model-00002-of-00002.bin",
460
+ "model.visual_model.image_encoder.blocks.16.norm2.weight": "pytorch_model-00002-of-00002.bin",
461
+ "model.visual_model.image_encoder.blocks.17.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
462
+ "model.visual_model.image_encoder.blocks.17.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
463
+ "model.visual_model.image_encoder.blocks.17.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
464
+ "model.visual_model.image_encoder.blocks.17.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
465
+ "model.visual_model.image_encoder.blocks.17.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
466
+ "model.visual_model.image_encoder.blocks.17.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
467
+ "model.visual_model.image_encoder.blocks.17.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
468
+ "model.visual_model.image_encoder.blocks.17.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
469
+ "model.visual_model.image_encoder.blocks.17.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
470
+ "model.visual_model.image_encoder.blocks.17.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
471
+ "model.visual_model.image_encoder.blocks.17.norm1.bias": "pytorch_model-00002-of-00002.bin",
472
+ "model.visual_model.image_encoder.blocks.17.norm1.weight": "pytorch_model-00002-of-00002.bin",
473
+ "model.visual_model.image_encoder.blocks.17.norm2.bias": "pytorch_model-00002-of-00002.bin",
474
+ "model.visual_model.image_encoder.blocks.17.norm2.weight": "pytorch_model-00002-of-00002.bin",
475
+ "model.visual_model.image_encoder.blocks.18.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
476
+ "model.visual_model.image_encoder.blocks.18.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
477
+ "model.visual_model.image_encoder.blocks.18.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
478
+ "model.visual_model.image_encoder.blocks.18.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
479
+ "model.visual_model.image_encoder.blocks.18.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
480
+ "model.visual_model.image_encoder.blocks.18.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
481
+ "model.visual_model.image_encoder.blocks.18.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
482
+ "model.visual_model.image_encoder.blocks.18.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
483
+ "model.visual_model.image_encoder.blocks.18.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
484
+ "model.visual_model.image_encoder.blocks.18.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
485
+ "model.visual_model.image_encoder.blocks.18.norm1.bias": "pytorch_model-00002-of-00002.bin",
486
+ "model.visual_model.image_encoder.blocks.18.norm1.weight": "pytorch_model-00002-of-00002.bin",
487
+ "model.visual_model.image_encoder.blocks.18.norm2.bias": "pytorch_model-00002-of-00002.bin",
488
+ "model.visual_model.image_encoder.blocks.18.norm2.weight": "pytorch_model-00002-of-00002.bin",
489
+ "model.visual_model.image_encoder.blocks.19.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
490
+ "model.visual_model.image_encoder.blocks.19.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
491
+ "model.visual_model.image_encoder.blocks.19.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
492
+ "model.visual_model.image_encoder.blocks.19.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
493
+ "model.visual_model.image_encoder.blocks.19.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
494
+ "model.visual_model.image_encoder.blocks.19.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
495
+ "model.visual_model.image_encoder.blocks.19.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
496
+ "model.visual_model.image_encoder.blocks.19.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
497
+ "model.visual_model.image_encoder.blocks.19.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
498
+ "model.visual_model.image_encoder.blocks.19.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
499
+ "model.visual_model.image_encoder.blocks.19.norm1.bias": "pytorch_model-00002-of-00002.bin",
500
+ "model.visual_model.image_encoder.blocks.19.norm1.weight": "pytorch_model-00002-of-00002.bin",
501
+ "model.visual_model.image_encoder.blocks.19.norm2.bias": "pytorch_model-00002-of-00002.bin",
502
+ "model.visual_model.image_encoder.blocks.19.norm2.weight": "pytorch_model-00002-of-00002.bin",
503
+ "model.visual_model.image_encoder.blocks.2.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
504
+ "model.visual_model.image_encoder.blocks.2.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
505
+ "model.visual_model.image_encoder.blocks.2.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
506
+ "model.visual_model.image_encoder.blocks.2.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
507
+ "model.visual_model.image_encoder.blocks.2.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
508
+ "model.visual_model.image_encoder.blocks.2.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
509
+ "model.visual_model.image_encoder.blocks.2.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
510
+ "model.visual_model.image_encoder.blocks.2.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
511
+ "model.visual_model.image_encoder.blocks.2.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
512
+ "model.visual_model.image_encoder.blocks.2.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
513
+ "model.visual_model.image_encoder.blocks.2.norm1.bias": "pytorch_model-00002-of-00002.bin",
514
+ "model.visual_model.image_encoder.blocks.2.norm1.weight": "pytorch_model-00002-of-00002.bin",
515
+ "model.visual_model.image_encoder.blocks.2.norm2.bias": "pytorch_model-00002-of-00002.bin",
516
+ "model.visual_model.image_encoder.blocks.2.norm2.weight": "pytorch_model-00002-of-00002.bin",
517
+ "model.visual_model.image_encoder.blocks.20.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
518
+ "model.visual_model.image_encoder.blocks.20.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
519
+ "model.visual_model.image_encoder.blocks.20.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
520
+ "model.visual_model.image_encoder.blocks.20.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
521
+ "model.visual_model.image_encoder.blocks.20.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
522
+ "model.visual_model.image_encoder.blocks.20.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
523
+ "model.visual_model.image_encoder.blocks.20.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
524
+ "model.visual_model.image_encoder.blocks.20.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
525
+ "model.visual_model.image_encoder.blocks.20.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
526
+ "model.visual_model.image_encoder.blocks.20.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
527
+ "model.visual_model.image_encoder.blocks.20.norm1.bias": "pytorch_model-00002-of-00002.bin",
528
+ "model.visual_model.image_encoder.blocks.20.norm1.weight": "pytorch_model-00002-of-00002.bin",
529
+ "model.visual_model.image_encoder.blocks.20.norm2.bias": "pytorch_model-00002-of-00002.bin",
530
+ "model.visual_model.image_encoder.blocks.20.norm2.weight": "pytorch_model-00002-of-00002.bin",
531
+ "model.visual_model.image_encoder.blocks.21.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
532
+ "model.visual_model.image_encoder.blocks.21.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
533
+ "model.visual_model.image_encoder.blocks.21.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
534
+ "model.visual_model.image_encoder.blocks.21.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
535
+ "model.visual_model.image_encoder.blocks.21.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
536
+ "model.visual_model.image_encoder.blocks.21.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
537
+ "model.visual_model.image_encoder.blocks.21.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
538
+ "model.visual_model.image_encoder.blocks.21.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
539
+ "model.visual_model.image_encoder.blocks.21.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
540
+ "model.visual_model.image_encoder.blocks.21.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
541
+ "model.visual_model.image_encoder.blocks.21.norm1.bias": "pytorch_model-00002-of-00002.bin",
542
+ "model.visual_model.image_encoder.blocks.21.norm1.weight": "pytorch_model-00002-of-00002.bin",
543
+ "model.visual_model.image_encoder.blocks.21.norm2.bias": "pytorch_model-00002-of-00002.bin",
544
+ "model.visual_model.image_encoder.blocks.21.norm2.weight": "pytorch_model-00002-of-00002.bin",
545
+ "model.visual_model.image_encoder.blocks.22.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
546
+ "model.visual_model.image_encoder.blocks.22.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
547
+ "model.visual_model.image_encoder.blocks.22.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
548
+ "model.visual_model.image_encoder.blocks.22.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
549
+ "model.visual_model.image_encoder.blocks.22.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
550
+ "model.visual_model.image_encoder.blocks.22.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
551
+ "model.visual_model.image_encoder.blocks.22.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
552
+ "model.visual_model.image_encoder.blocks.22.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
553
+ "model.visual_model.image_encoder.blocks.22.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
554
+ "model.visual_model.image_encoder.blocks.22.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
555
+ "model.visual_model.image_encoder.blocks.22.norm1.bias": "pytorch_model-00002-of-00002.bin",
556
+ "model.visual_model.image_encoder.blocks.22.norm1.weight": "pytorch_model-00002-of-00002.bin",
557
+ "model.visual_model.image_encoder.blocks.22.norm2.bias": "pytorch_model-00002-of-00002.bin",
558
+ "model.visual_model.image_encoder.blocks.22.norm2.weight": "pytorch_model-00002-of-00002.bin",
559
+ "model.visual_model.image_encoder.blocks.23.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
560
+ "model.visual_model.image_encoder.blocks.23.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
561
+ "model.visual_model.image_encoder.blocks.23.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
562
+ "model.visual_model.image_encoder.blocks.23.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
563
+ "model.visual_model.image_encoder.blocks.23.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
564
+ "model.visual_model.image_encoder.blocks.23.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
565
+ "model.visual_model.image_encoder.blocks.23.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
566
+ "model.visual_model.image_encoder.blocks.23.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
567
+ "model.visual_model.image_encoder.blocks.23.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
568
+ "model.visual_model.image_encoder.blocks.23.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
569
+ "model.visual_model.image_encoder.blocks.23.norm1.bias": "pytorch_model-00002-of-00002.bin",
570
+ "model.visual_model.image_encoder.blocks.23.norm1.weight": "pytorch_model-00002-of-00002.bin",
571
+ "model.visual_model.image_encoder.blocks.23.norm2.bias": "pytorch_model-00002-of-00002.bin",
572
+ "model.visual_model.image_encoder.blocks.23.norm2.weight": "pytorch_model-00002-of-00002.bin",
573
+ "model.visual_model.image_encoder.blocks.24.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
574
+ "model.visual_model.image_encoder.blocks.24.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
575
+ "model.visual_model.image_encoder.blocks.24.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
576
+ "model.visual_model.image_encoder.blocks.24.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
577
+ "model.visual_model.image_encoder.blocks.24.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
578
+ "model.visual_model.image_encoder.blocks.24.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
579
+ "model.visual_model.image_encoder.blocks.24.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
580
+ "model.visual_model.image_encoder.blocks.24.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
581
+ "model.visual_model.image_encoder.blocks.24.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
582
+ "model.visual_model.image_encoder.blocks.24.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
583
+ "model.visual_model.image_encoder.blocks.24.norm1.bias": "pytorch_model-00002-of-00002.bin",
584
+ "model.visual_model.image_encoder.blocks.24.norm1.weight": "pytorch_model-00002-of-00002.bin",
585
+ "model.visual_model.image_encoder.blocks.24.norm2.bias": "pytorch_model-00002-of-00002.bin",
586
+ "model.visual_model.image_encoder.blocks.24.norm2.weight": "pytorch_model-00002-of-00002.bin",
587
+ "model.visual_model.image_encoder.blocks.25.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
588
+ "model.visual_model.image_encoder.blocks.25.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
589
+ "model.visual_model.image_encoder.blocks.25.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
590
+ "model.visual_model.image_encoder.blocks.25.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
591
+ "model.visual_model.image_encoder.blocks.25.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
592
+ "model.visual_model.image_encoder.blocks.25.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
593
+ "model.visual_model.image_encoder.blocks.25.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
594
+ "model.visual_model.image_encoder.blocks.25.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
595
+ "model.visual_model.image_encoder.blocks.25.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
596
+ "model.visual_model.image_encoder.blocks.25.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
597
+ "model.visual_model.image_encoder.blocks.25.norm1.bias": "pytorch_model-00002-of-00002.bin",
598
+ "model.visual_model.image_encoder.blocks.25.norm1.weight": "pytorch_model-00002-of-00002.bin",
599
+ "model.visual_model.image_encoder.blocks.25.norm2.bias": "pytorch_model-00002-of-00002.bin",
600
+ "model.visual_model.image_encoder.blocks.25.norm2.weight": "pytorch_model-00002-of-00002.bin",
601
+ "model.visual_model.image_encoder.blocks.26.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
602
+ "model.visual_model.image_encoder.blocks.26.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
603
+ "model.visual_model.image_encoder.blocks.26.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
604
+ "model.visual_model.image_encoder.blocks.26.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
605
+ "model.visual_model.image_encoder.blocks.26.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
606
+ "model.visual_model.image_encoder.blocks.26.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
607
+ "model.visual_model.image_encoder.blocks.26.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
608
+ "model.visual_model.image_encoder.blocks.26.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
609
+ "model.visual_model.image_encoder.blocks.26.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
610
+ "model.visual_model.image_encoder.blocks.26.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
611
+ "model.visual_model.image_encoder.blocks.26.norm1.bias": "pytorch_model-00002-of-00002.bin",
612
+ "model.visual_model.image_encoder.blocks.26.norm1.weight": "pytorch_model-00002-of-00002.bin",
613
+ "model.visual_model.image_encoder.blocks.26.norm2.bias": "pytorch_model-00002-of-00002.bin",
614
+ "model.visual_model.image_encoder.blocks.26.norm2.weight": "pytorch_model-00002-of-00002.bin",
615
+ "model.visual_model.image_encoder.blocks.27.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
616
+ "model.visual_model.image_encoder.blocks.27.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
617
+ "model.visual_model.image_encoder.blocks.27.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
618
+ "model.visual_model.image_encoder.blocks.27.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
619
+ "model.visual_model.image_encoder.blocks.27.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
620
+ "model.visual_model.image_encoder.blocks.27.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
621
+ "model.visual_model.image_encoder.blocks.27.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
622
+ "model.visual_model.image_encoder.blocks.27.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
623
+ "model.visual_model.image_encoder.blocks.27.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
624
+ "model.visual_model.image_encoder.blocks.27.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
625
+ "model.visual_model.image_encoder.blocks.27.norm1.bias": "pytorch_model-00002-of-00002.bin",
626
+ "model.visual_model.image_encoder.blocks.27.norm1.weight": "pytorch_model-00002-of-00002.bin",
627
+ "model.visual_model.image_encoder.blocks.27.norm2.bias": "pytorch_model-00002-of-00002.bin",
628
+ "model.visual_model.image_encoder.blocks.27.norm2.weight": "pytorch_model-00002-of-00002.bin",
629
+ "model.visual_model.image_encoder.blocks.28.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
630
+ "model.visual_model.image_encoder.blocks.28.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
631
+ "model.visual_model.image_encoder.blocks.28.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
632
+ "model.visual_model.image_encoder.blocks.28.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
633
+ "model.visual_model.image_encoder.blocks.28.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
634
+ "model.visual_model.image_encoder.blocks.28.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
635
+ "model.visual_model.image_encoder.blocks.28.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
636
+ "model.visual_model.image_encoder.blocks.28.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
637
+ "model.visual_model.image_encoder.blocks.28.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
638
+ "model.visual_model.image_encoder.blocks.28.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
639
+ "model.visual_model.image_encoder.blocks.28.norm1.bias": "pytorch_model-00002-of-00002.bin",
640
+ "model.visual_model.image_encoder.blocks.28.norm1.weight": "pytorch_model-00002-of-00002.bin",
641
+ "model.visual_model.image_encoder.blocks.28.norm2.bias": "pytorch_model-00002-of-00002.bin",
642
+ "model.visual_model.image_encoder.blocks.28.norm2.weight": "pytorch_model-00002-of-00002.bin",
643
+ "model.visual_model.image_encoder.blocks.29.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
644
+ "model.visual_model.image_encoder.blocks.29.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
645
+ "model.visual_model.image_encoder.blocks.29.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
646
+ "model.visual_model.image_encoder.blocks.29.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
647
+ "model.visual_model.image_encoder.blocks.29.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
648
+ "model.visual_model.image_encoder.blocks.29.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
649
+ "model.visual_model.image_encoder.blocks.29.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
650
+ "model.visual_model.image_encoder.blocks.29.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
651
+ "model.visual_model.image_encoder.blocks.29.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
652
+ "model.visual_model.image_encoder.blocks.29.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
653
+ "model.visual_model.image_encoder.blocks.29.norm1.bias": "pytorch_model-00002-of-00002.bin",
654
+ "model.visual_model.image_encoder.blocks.29.norm1.weight": "pytorch_model-00002-of-00002.bin",
655
+ "model.visual_model.image_encoder.blocks.29.norm2.bias": "pytorch_model-00002-of-00002.bin",
656
+ "model.visual_model.image_encoder.blocks.29.norm2.weight": "pytorch_model-00002-of-00002.bin",
657
+ "model.visual_model.image_encoder.blocks.3.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
658
+ "model.visual_model.image_encoder.blocks.3.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
659
+ "model.visual_model.image_encoder.blocks.3.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
660
+ "model.visual_model.image_encoder.blocks.3.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
661
+ "model.visual_model.image_encoder.blocks.3.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
662
+ "model.visual_model.image_encoder.blocks.3.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
663
+ "model.visual_model.image_encoder.blocks.3.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
664
+ "model.visual_model.image_encoder.blocks.3.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
665
+ "model.visual_model.image_encoder.blocks.3.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
666
+ "model.visual_model.image_encoder.blocks.3.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
667
+ "model.visual_model.image_encoder.blocks.3.norm1.bias": "pytorch_model-00002-of-00002.bin",
668
+ "model.visual_model.image_encoder.blocks.3.norm1.weight": "pytorch_model-00002-of-00002.bin",
669
+ "model.visual_model.image_encoder.blocks.3.norm2.bias": "pytorch_model-00002-of-00002.bin",
670
+ "model.visual_model.image_encoder.blocks.3.norm2.weight": "pytorch_model-00002-of-00002.bin",
671
+ "model.visual_model.image_encoder.blocks.30.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
672
+ "model.visual_model.image_encoder.blocks.30.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
673
+ "model.visual_model.image_encoder.blocks.30.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
674
+ "model.visual_model.image_encoder.blocks.30.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
675
+ "model.visual_model.image_encoder.blocks.30.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
676
+ "model.visual_model.image_encoder.blocks.30.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
677
+ "model.visual_model.image_encoder.blocks.30.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
678
+ "model.visual_model.image_encoder.blocks.30.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
679
+ "model.visual_model.image_encoder.blocks.30.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
680
+ "model.visual_model.image_encoder.blocks.30.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
681
+ "model.visual_model.image_encoder.blocks.30.norm1.bias": "pytorch_model-00002-of-00002.bin",
682
+ "model.visual_model.image_encoder.blocks.30.norm1.weight": "pytorch_model-00002-of-00002.bin",
683
+ "model.visual_model.image_encoder.blocks.30.norm2.bias": "pytorch_model-00002-of-00002.bin",
684
+ "model.visual_model.image_encoder.blocks.30.norm2.weight": "pytorch_model-00002-of-00002.bin",
685
+ "model.visual_model.image_encoder.blocks.31.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
686
+ "model.visual_model.image_encoder.blocks.31.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
687
+ "model.visual_model.image_encoder.blocks.31.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
688
+ "model.visual_model.image_encoder.blocks.31.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
689
+ "model.visual_model.image_encoder.blocks.31.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
690
+ "model.visual_model.image_encoder.blocks.31.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
691
+ "model.visual_model.image_encoder.blocks.31.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
692
+ "model.visual_model.image_encoder.blocks.31.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
693
+ "model.visual_model.image_encoder.blocks.31.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
694
+ "model.visual_model.image_encoder.blocks.31.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
695
+ "model.visual_model.image_encoder.blocks.31.norm1.bias": "pytorch_model-00002-of-00002.bin",
696
+ "model.visual_model.image_encoder.blocks.31.norm1.weight": "pytorch_model-00002-of-00002.bin",
697
+ "model.visual_model.image_encoder.blocks.31.norm2.bias": "pytorch_model-00002-of-00002.bin",
698
+ "model.visual_model.image_encoder.blocks.31.norm2.weight": "pytorch_model-00002-of-00002.bin",
699
+ "model.visual_model.image_encoder.blocks.4.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
700
+ "model.visual_model.image_encoder.blocks.4.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
701
+ "model.visual_model.image_encoder.blocks.4.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
702
+ "model.visual_model.image_encoder.blocks.4.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
703
+ "model.visual_model.image_encoder.blocks.4.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
704
+ "model.visual_model.image_encoder.blocks.4.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
705
+ "model.visual_model.image_encoder.blocks.4.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
706
+ "model.visual_model.image_encoder.blocks.4.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
707
+ "model.visual_model.image_encoder.blocks.4.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
708
+ "model.visual_model.image_encoder.blocks.4.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
709
+ "model.visual_model.image_encoder.blocks.4.norm1.bias": "pytorch_model-00002-of-00002.bin",
710
+ "model.visual_model.image_encoder.blocks.4.norm1.weight": "pytorch_model-00002-of-00002.bin",
711
+ "model.visual_model.image_encoder.blocks.4.norm2.bias": "pytorch_model-00002-of-00002.bin",
712
+ "model.visual_model.image_encoder.blocks.4.norm2.weight": "pytorch_model-00002-of-00002.bin",
713
+ "model.visual_model.image_encoder.blocks.5.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
714
+ "model.visual_model.image_encoder.blocks.5.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
715
+ "model.visual_model.image_encoder.blocks.5.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
716
+ "model.visual_model.image_encoder.blocks.5.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
717
+ "model.visual_model.image_encoder.blocks.5.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
718
+ "model.visual_model.image_encoder.blocks.5.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
719
+ "model.visual_model.image_encoder.blocks.5.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
720
+ "model.visual_model.image_encoder.blocks.5.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
721
+ "model.visual_model.image_encoder.blocks.5.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
722
+ "model.visual_model.image_encoder.blocks.5.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
723
+ "model.visual_model.image_encoder.blocks.5.norm1.bias": "pytorch_model-00002-of-00002.bin",
724
+ "model.visual_model.image_encoder.blocks.5.norm1.weight": "pytorch_model-00002-of-00002.bin",
725
+ "model.visual_model.image_encoder.blocks.5.norm2.bias": "pytorch_model-00002-of-00002.bin",
726
+ "model.visual_model.image_encoder.blocks.5.norm2.weight": "pytorch_model-00002-of-00002.bin",
727
+ "model.visual_model.image_encoder.blocks.6.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
728
+ "model.visual_model.image_encoder.blocks.6.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
729
+ "model.visual_model.image_encoder.blocks.6.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
730
+ "model.visual_model.image_encoder.blocks.6.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
731
+ "model.visual_model.image_encoder.blocks.6.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
732
+ "model.visual_model.image_encoder.blocks.6.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
733
+ "model.visual_model.image_encoder.blocks.6.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
734
+ "model.visual_model.image_encoder.blocks.6.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
735
+ "model.visual_model.image_encoder.blocks.6.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
736
+ "model.visual_model.image_encoder.blocks.6.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
737
+ "model.visual_model.image_encoder.blocks.6.norm1.bias": "pytorch_model-00002-of-00002.bin",
738
+ "model.visual_model.image_encoder.blocks.6.norm1.weight": "pytorch_model-00002-of-00002.bin",
739
+ "model.visual_model.image_encoder.blocks.6.norm2.bias": "pytorch_model-00002-of-00002.bin",
740
+ "model.visual_model.image_encoder.blocks.6.norm2.weight": "pytorch_model-00002-of-00002.bin",
741
+ "model.visual_model.image_encoder.blocks.7.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
742
+ "model.visual_model.image_encoder.blocks.7.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
743
+ "model.visual_model.image_encoder.blocks.7.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
744
+ "model.visual_model.image_encoder.blocks.7.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
745
+ "model.visual_model.image_encoder.blocks.7.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
746
+ "model.visual_model.image_encoder.blocks.7.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
747
+ "model.visual_model.image_encoder.blocks.7.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
748
+ "model.visual_model.image_encoder.blocks.7.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
749
+ "model.visual_model.image_encoder.blocks.7.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
750
+ "model.visual_model.image_encoder.blocks.7.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
751
+ "model.visual_model.image_encoder.blocks.7.norm1.bias": "pytorch_model-00002-of-00002.bin",
752
+ "model.visual_model.image_encoder.blocks.7.norm1.weight": "pytorch_model-00002-of-00002.bin",
753
+ "model.visual_model.image_encoder.blocks.7.norm2.bias": "pytorch_model-00002-of-00002.bin",
754
+ "model.visual_model.image_encoder.blocks.7.norm2.weight": "pytorch_model-00002-of-00002.bin",
755
+ "model.visual_model.image_encoder.blocks.8.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
756
+ "model.visual_model.image_encoder.blocks.8.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
757
+ "model.visual_model.image_encoder.blocks.8.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
758
+ "model.visual_model.image_encoder.blocks.8.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
759
+ "model.visual_model.image_encoder.blocks.8.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
760
+ "model.visual_model.image_encoder.blocks.8.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
761
+ "model.visual_model.image_encoder.blocks.8.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
762
+ "model.visual_model.image_encoder.blocks.8.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
763
+ "model.visual_model.image_encoder.blocks.8.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
764
+ "model.visual_model.image_encoder.blocks.8.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
765
+ "model.visual_model.image_encoder.blocks.8.norm1.bias": "pytorch_model-00002-of-00002.bin",
766
+ "model.visual_model.image_encoder.blocks.8.norm1.weight": "pytorch_model-00002-of-00002.bin",
767
+ "model.visual_model.image_encoder.blocks.8.norm2.bias": "pytorch_model-00002-of-00002.bin",
768
+ "model.visual_model.image_encoder.blocks.8.norm2.weight": "pytorch_model-00002-of-00002.bin",
769
+ "model.visual_model.image_encoder.blocks.9.attn.proj.bias": "pytorch_model-00002-of-00002.bin",
770
+ "model.visual_model.image_encoder.blocks.9.attn.proj.weight": "pytorch_model-00002-of-00002.bin",
771
+ "model.visual_model.image_encoder.blocks.9.attn.qkv.bias": "pytorch_model-00002-of-00002.bin",
772
+ "model.visual_model.image_encoder.blocks.9.attn.qkv.weight": "pytorch_model-00002-of-00002.bin",
773
+ "model.visual_model.image_encoder.blocks.9.attn.rel_pos_h": "pytorch_model-00002-of-00002.bin",
774
+ "model.visual_model.image_encoder.blocks.9.attn.rel_pos_w": "pytorch_model-00002-of-00002.bin",
775
+ "model.visual_model.image_encoder.blocks.9.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
776
+ "model.visual_model.image_encoder.blocks.9.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
777
+ "model.visual_model.image_encoder.blocks.9.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
778
+ "model.visual_model.image_encoder.blocks.9.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
779
+ "model.visual_model.image_encoder.blocks.9.norm1.bias": "pytorch_model-00002-of-00002.bin",
780
+ "model.visual_model.image_encoder.blocks.9.norm1.weight": "pytorch_model-00002-of-00002.bin",
781
+ "model.visual_model.image_encoder.blocks.9.norm2.bias": "pytorch_model-00002-of-00002.bin",
782
+ "model.visual_model.image_encoder.blocks.9.norm2.weight": "pytorch_model-00002-of-00002.bin",
783
+ "model.visual_model.image_encoder.neck.0.weight": "pytorch_model-00002-of-00002.bin",
784
+ "model.visual_model.image_encoder.neck.1.bias": "pytorch_model-00002-of-00002.bin",
785
+ "model.visual_model.image_encoder.neck.1.weight": "pytorch_model-00002-of-00002.bin",
786
+ "model.visual_model.image_encoder.neck.2.weight": "pytorch_model-00002-of-00002.bin",
787
+ "model.visual_model.image_encoder.neck.3.bias": "pytorch_model-00002-of-00002.bin",
788
+ "model.visual_model.image_encoder.neck.3.weight": "pytorch_model-00002-of-00002.bin",
789
+ "model.visual_model.image_encoder.patch_embed.proj.bias": "pytorch_model-00002-of-00002.bin",
790
+ "model.visual_model.image_encoder.patch_embed.proj.weight": "pytorch_model-00002-of-00002.bin",
791
+ "model.visual_model.image_encoder.pos_embed": "pytorch_model-00002-of-00002.bin",
792
+ "model.visual_model.mask_decoder.iou_prediction_head.layers.0.bias": "pytorch_model-00002-of-00002.bin",
793
+ "model.visual_model.mask_decoder.iou_prediction_head.layers.0.weight": "pytorch_model-00002-of-00002.bin",
794
+ "model.visual_model.mask_decoder.iou_prediction_head.layers.1.bias": "pytorch_model-00002-of-00002.bin",
795
+ "model.visual_model.mask_decoder.iou_prediction_head.layers.1.weight": "pytorch_model-00002-of-00002.bin",
796
+ "model.visual_model.mask_decoder.iou_prediction_head.layers.2.bias": "pytorch_model-00002-of-00002.bin",
797
+ "model.visual_model.mask_decoder.iou_prediction_head.layers.2.weight": "pytorch_model-00002-of-00002.bin",
798
+ "model.visual_model.mask_decoder.iou_token.weight": "pytorch_model-00002-of-00002.bin",
799
+ "model.visual_model.mask_decoder.mask_tokens.weight": "pytorch_model-00002-of-00002.bin",
800
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.0.layers.0.bias": "pytorch_model-00002-of-00002.bin",
801
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.0.layers.0.weight": "pytorch_model-00002-of-00002.bin",
802
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.0.layers.1.bias": "pytorch_model-00002-of-00002.bin",
803
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.0.layers.1.weight": "pytorch_model-00002-of-00002.bin",
804
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.0.layers.2.bias": "pytorch_model-00002-of-00002.bin",
805
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.0.layers.2.weight": "pytorch_model-00002-of-00002.bin",
806
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.1.layers.0.bias": "pytorch_model-00002-of-00002.bin",
807
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.1.layers.0.weight": "pytorch_model-00002-of-00002.bin",
808
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.1.layers.1.bias": "pytorch_model-00002-of-00002.bin",
809
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.1.layers.1.weight": "pytorch_model-00002-of-00002.bin",
810
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.1.layers.2.bias": "pytorch_model-00002-of-00002.bin",
811
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.1.layers.2.weight": "pytorch_model-00002-of-00002.bin",
812
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.2.layers.0.bias": "pytorch_model-00002-of-00002.bin",
813
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.2.layers.0.weight": "pytorch_model-00002-of-00002.bin",
814
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.2.layers.1.bias": "pytorch_model-00002-of-00002.bin",
815
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.2.layers.1.weight": "pytorch_model-00002-of-00002.bin",
816
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.2.layers.2.bias": "pytorch_model-00002-of-00002.bin",
817
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.2.layers.2.weight": "pytorch_model-00002-of-00002.bin",
818
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.3.layers.0.bias": "pytorch_model-00002-of-00002.bin",
819
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.3.layers.0.weight": "pytorch_model-00002-of-00002.bin",
820
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.3.layers.1.bias": "pytorch_model-00002-of-00002.bin",
821
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.3.layers.1.weight": "pytorch_model-00002-of-00002.bin",
822
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.3.layers.2.bias": "pytorch_model-00002-of-00002.bin",
823
+ "model.visual_model.mask_decoder.output_hypernetworks_mlps.3.layers.2.weight": "pytorch_model-00002-of-00002.bin",
824
+ "model.visual_model.mask_decoder.output_upscaling.0.bias": "pytorch_model-00002-of-00002.bin",
825
+ "model.visual_model.mask_decoder.output_upscaling.0.weight": "pytorch_model-00002-of-00002.bin",
826
+ "model.visual_model.mask_decoder.output_upscaling.1.bias": "pytorch_model-00002-of-00002.bin",
827
+ "model.visual_model.mask_decoder.output_upscaling.1.weight": "pytorch_model-00002-of-00002.bin",
828
+ "model.visual_model.mask_decoder.output_upscaling.3.bias": "pytorch_model-00002-of-00002.bin",
829
+ "model.visual_model.mask_decoder.output_upscaling.3.weight": "pytorch_model-00002-of-00002.bin",
830
+ "model.visual_model.mask_decoder.transformer.final_attn_token_to_image.k_proj.bias": "pytorch_model-00002-of-00002.bin",
831
+ "model.visual_model.mask_decoder.transformer.final_attn_token_to_image.k_proj.weight": "pytorch_model-00002-of-00002.bin",
832
+ "model.visual_model.mask_decoder.transformer.final_attn_token_to_image.out_proj.bias": "pytorch_model-00002-of-00002.bin",
833
+ "model.visual_model.mask_decoder.transformer.final_attn_token_to_image.out_proj.weight": "pytorch_model-00002-of-00002.bin",
834
+ "model.visual_model.mask_decoder.transformer.final_attn_token_to_image.q_proj.bias": "pytorch_model-00002-of-00002.bin",
835
+ "model.visual_model.mask_decoder.transformer.final_attn_token_to_image.q_proj.weight": "pytorch_model-00002-of-00002.bin",
836
+ "model.visual_model.mask_decoder.transformer.final_attn_token_to_image.v_proj.bias": "pytorch_model-00002-of-00002.bin",
837
+ "model.visual_model.mask_decoder.transformer.final_attn_token_to_image.v_proj.weight": "pytorch_model-00002-of-00002.bin",
838
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_image_to_token.k_proj.bias": "pytorch_model-00002-of-00002.bin",
839
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_image_to_token.k_proj.weight": "pytorch_model-00002-of-00002.bin",
840
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_image_to_token.out_proj.bias": "pytorch_model-00002-of-00002.bin",
841
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_image_to_token.out_proj.weight": "pytorch_model-00002-of-00002.bin",
842
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_image_to_token.q_proj.bias": "pytorch_model-00002-of-00002.bin",
843
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_image_to_token.q_proj.weight": "pytorch_model-00002-of-00002.bin",
844
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_image_to_token.v_proj.bias": "pytorch_model-00002-of-00002.bin",
845
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_image_to_token.v_proj.weight": "pytorch_model-00002-of-00002.bin",
846
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_token_to_image.k_proj.bias": "pytorch_model-00002-of-00002.bin",
847
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_token_to_image.k_proj.weight": "pytorch_model-00002-of-00002.bin",
848
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_token_to_image.out_proj.bias": "pytorch_model-00002-of-00002.bin",
849
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_token_to_image.out_proj.weight": "pytorch_model-00002-of-00002.bin",
850
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_token_to_image.q_proj.bias": "pytorch_model-00002-of-00002.bin",
851
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_token_to_image.q_proj.weight": "pytorch_model-00002-of-00002.bin",
852
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_token_to_image.v_proj.bias": "pytorch_model-00002-of-00002.bin",
853
+ "model.visual_model.mask_decoder.transformer.layers.0.cross_attn_token_to_image.v_proj.weight": "pytorch_model-00002-of-00002.bin",
854
+ "model.visual_model.mask_decoder.transformer.layers.0.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
855
+ "model.visual_model.mask_decoder.transformer.layers.0.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
856
+ "model.visual_model.mask_decoder.transformer.layers.0.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
857
+ "model.visual_model.mask_decoder.transformer.layers.0.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
858
+ "model.visual_model.mask_decoder.transformer.layers.0.norm1.bias": "pytorch_model-00002-of-00002.bin",
859
+ "model.visual_model.mask_decoder.transformer.layers.0.norm1.weight": "pytorch_model-00002-of-00002.bin",
860
+ "model.visual_model.mask_decoder.transformer.layers.0.norm2.bias": "pytorch_model-00002-of-00002.bin",
861
+ "model.visual_model.mask_decoder.transformer.layers.0.norm2.weight": "pytorch_model-00002-of-00002.bin",
862
+ "model.visual_model.mask_decoder.transformer.layers.0.norm3.bias": "pytorch_model-00002-of-00002.bin",
863
+ "model.visual_model.mask_decoder.transformer.layers.0.norm3.weight": "pytorch_model-00002-of-00002.bin",
864
+ "model.visual_model.mask_decoder.transformer.layers.0.norm4.bias": "pytorch_model-00002-of-00002.bin",
865
+ "model.visual_model.mask_decoder.transformer.layers.0.norm4.weight": "pytorch_model-00002-of-00002.bin",
866
+ "model.visual_model.mask_decoder.transformer.layers.0.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
867
+ "model.visual_model.mask_decoder.transformer.layers.0.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
868
+ "model.visual_model.mask_decoder.transformer.layers.0.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
869
+ "model.visual_model.mask_decoder.transformer.layers.0.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
870
+ "model.visual_model.mask_decoder.transformer.layers.0.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
871
+ "model.visual_model.mask_decoder.transformer.layers.0.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
872
+ "model.visual_model.mask_decoder.transformer.layers.0.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
873
+ "model.visual_model.mask_decoder.transformer.layers.0.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
874
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_image_to_token.k_proj.bias": "pytorch_model-00002-of-00002.bin",
875
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_image_to_token.k_proj.weight": "pytorch_model-00002-of-00002.bin",
876
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_image_to_token.out_proj.bias": "pytorch_model-00002-of-00002.bin",
877
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_image_to_token.out_proj.weight": "pytorch_model-00002-of-00002.bin",
878
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_image_to_token.q_proj.bias": "pytorch_model-00002-of-00002.bin",
879
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_image_to_token.q_proj.weight": "pytorch_model-00002-of-00002.bin",
880
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_image_to_token.v_proj.bias": "pytorch_model-00002-of-00002.bin",
881
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_image_to_token.v_proj.weight": "pytorch_model-00002-of-00002.bin",
882
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_token_to_image.k_proj.bias": "pytorch_model-00002-of-00002.bin",
883
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_token_to_image.k_proj.weight": "pytorch_model-00002-of-00002.bin",
884
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_token_to_image.out_proj.bias": "pytorch_model-00002-of-00002.bin",
885
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_token_to_image.out_proj.weight": "pytorch_model-00002-of-00002.bin",
886
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_token_to_image.q_proj.bias": "pytorch_model-00002-of-00002.bin",
887
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_token_to_image.q_proj.weight": "pytorch_model-00002-of-00002.bin",
888
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_token_to_image.v_proj.bias": "pytorch_model-00002-of-00002.bin",
889
+ "model.visual_model.mask_decoder.transformer.layers.1.cross_attn_token_to_image.v_proj.weight": "pytorch_model-00002-of-00002.bin",
890
+ "model.visual_model.mask_decoder.transformer.layers.1.mlp.lin1.bias": "pytorch_model-00002-of-00002.bin",
891
+ "model.visual_model.mask_decoder.transformer.layers.1.mlp.lin1.weight": "pytorch_model-00002-of-00002.bin",
892
+ "model.visual_model.mask_decoder.transformer.layers.1.mlp.lin2.bias": "pytorch_model-00002-of-00002.bin",
893
+ "model.visual_model.mask_decoder.transformer.layers.1.mlp.lin2.weight": "pytorch_model-00002-of-00002.bin",
894
+ "model.visual_model.mask_decoder.transformer.layers.1.norm1.bias": "pytorch_model-00002-of-00002.bin",
895
+ "model.visual_model.mask_decoder.transformer.layers.1.norm1.weight": "pytorch_model-00002-of-00002.bin",
896
+ "model.visual_model.mask_decoder.transformer.layers.1.norm2.bias": "pytorch_model-00002-of-00002.bin",
897
+ "model.visual_model.mask_decoder.transformer.layers.1.norm2.weight": "pytorch_model-00002-of-00002.bin",
898
+ "model.visual_model.mask_decoder.transformer.layers.1.norm3.bias": "pytorch_model-00002-of-00002.bin",
899
+ "model.visual_model.mask_decoder.transformer.layers.1.norm3.weight": "pytorch_model-00002-of-00002.bin",
900
+ "model.visual_model.mask_decoder.transformer.layers.1.norm4.bias": "pytorch_model-00002-of-00002.bin",
901
+ "model.visual_model.mask_decoder.transformer.layers.1.norm4.weight": "pytorch_model-00002-of-00002.bin",
902
+ "model.visual_model.mask_decoder.transformer.layers.1.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
903
+ "model.visual_model.mask_decoder.transformer.layers.1.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
904
+ "model.visual_model.mask_decoder.transformer.layers.1.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
905
+ "model.visual_model.mask_decoder.transformer.layers.1.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
906
+ "model.visual_model.mask_decoder.transformer.layers.1.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
907
+ "model.visual_model.mask_decoder.transformer.layers.1.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
908
+ "model.visual_model.mask_decoder.transformer.layers.1.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
909
+ "model.visual_model.mask_decoder.transformer.layers.1.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
910
+ "model.visual_model.mask_decoder.transformer.norm_final_attn.bias": "pytorch_model-00002-of-00002.bin",
911
+ "model.visual_model.mask_decoder.transformer.norm_final_attn.weight": "pytorch_model-00002-of-00002.bin",
912
+ "model.visual_model.prompt_encoder.mask_downscaling.0.bias": "pytorch_model-00002-of-00002.bin",
913
+ "model.visual_model.prompt_encoder.mask_downscaling.0.weight": "pytorch_model-00002-of-00002.bin",
914
+ "model.visual_model.prompt_encoder.mask_downscaling.1.bias": "pytorch_model-00002-of-00002.bin",
915
+ "model.visual_model.prompt_encoder.mask_downscaling.1.weight": "pytorch_model-00002-of-00002.bin",
916
+ "model.visual_model.prompt_encoder.mask_downscaling.3.bias": "pytorch_model-00002-of-00002.bin",
917
+ "model.visual_model.prompt_encoder.mask_downscaling.3.weight": "pytorch_model-00002-of-00002.bin",
918
+ "model.visual_model.prompt_encoder.mask_downscaling.4.bias": "pytorch_model-00002-of-00002.bin",
919
+ "model.visual_model.prompt_encoder.mask_downscaling.4.weight": "pytorch_model-00002-of-00002.bin",
920
+ "model.visual_model.prompt_encoder.mask_downscaling.6.bias": "pytorch_model-00002-of-00002.bin",
921
+ "model.visual_model.prompt_encoder.mask_downscaling.6.weight": "pytorch_model-00002-of-00002.bin",
922
+ "model.visual_model.prompt_encoder.no_mask_embed.weight": "pytorch_model-00002-of-00002.bin",
923
+ "model.visual_model.prompt_encoder.not_a_point_embed.weight": "pytorch_model-00002-of-00002.bin",
924
+ "model.visual_model.prompt_encoder.pe_layer.positional_encoding_gaussian_matrix": "pytorch_model-00002-of-00002.bin",
925
+ "model.visual_model.prompt_encoder.point_embeddings.0.weight": "pytorch_model-00002-of-00002.bin",
926
+ "model.visual_model.prompt_encoder.point_embeddings.1.weight": "pytorch_model-00002-of-00002.bin",
927
+ "model.visual_model.prompt_encoder.point_embeddings.2.weight": "pytorch_model-00002-of-00002.bin",
928
+ "model.visual_model.prompt_encoder.point_embeddings.3.weight": "pytorch_model-00002-of-00002.bin"
929
+ }
930
+ }
ckpts/AffordanceVLM-7B/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
ckpts/AffordanceVLM-7B/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
ckpts/AffordanceVLM-7B/tokenizer_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<s>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "clean_up_tokenization_spaces": false,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "legacy": true,
22
+ "model_max_length": 512,
23
+ "pad_token": null,
24
+ "padding_side": "right",
25
+ "sp_model_kwargs": {},
26
+ "tokenizer_class": "LlamaTokenizer",
27
+ "unk_token": {
28
+ "__type": "AddedToken",
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ }
35
+ }
ckpts/sam_vit_h_4b8939.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7bf3b02f3ebf1267aba913ff637d9a2d5c33d3173bb679e46d9f338c26f262e
3
+ size 2564550879
client.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Client script to send an image and prompt to a Flask-based vision-language segmentation server.
4
+
5
+ from __future__ import absolute_import, print_function, division
6
+ import requests
7
+ import cv2
8
+ import base64
9
+ import numpy as np
10
+
11
# ---------------------------
# Encode image to base64 string
# ---------------------------
def img2b64(img):
    """Serialize an OpenCV BGR image into a base64 BMP string."""
    success, encoded = cv2.imencode('.bmp', img)  # BMP keeps the payload lossless
    return base64.b64encode(encoded).decode()
18
+
19
# ---------------------------
# Decode base64 string back to image
# ---------------------------
def b642img(pic_str):
    """Decode a base64 string (as produced by img2b64) into a BGR image array."""
    raw_bytes = base64.b64decode(pic_str)
    byte_buffer = np.frombuffer(raw_bytes, np.uint8)
    return cv2.imdecode(byte_buffer, cv2.IMREAD_COLOR)
27
+
28
# ---------------------------
# Send image and prompt to server, receive result and save
# ---------------------------
def post_files():
    """Send a local image plus a text prompt to the segmentation server
    and save the returned affordance mask to disk."""
    path = 'vis_output/my_workspace.JPG'  # Input image path
    image = cv2.imread(path)
    if image is None:
        print(f"Failed to read image at {path}")
        return

    payload = {
        'img': img2b64(image),
        'prompt': 'Please segment the affordance map of mug in this image.'
    }

    # Send POST request to the Flask server.
    r = requests.post('http://localhost:3200/img_mask', json=payload)
    if r.status_code != 200:
        print(f"Request failed with status code {r.status_code}")
        return

    print('Success. Received response from server.')
    result_b64 = r.json().get('img', None)
    if not result_b64:
        print("No image returned in the response.")
        return

    save_path = 'affordance_mask_result.jpg'
    cv2.imwrite(save_path, b642img(result_b64))
    print(f"Result saved to {save_path}")


# ---------------------------
# Main entry
# ---------------------------
if __name__ == '__main__':
    post_files()
67
+
data_curation/.ipynb_checkpoints/check_dataset-checkpoint.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import pickle as pkl

DATA_DIR = '/gemini/space/wrz/AffordanceNet/data'


def resolve_path(path):
    """Convert a relative dataset path (e.g. './data/...') into an absolute one."""
    if path.startswith('./data/'):
        # Strip the './data/' prefix (7 chars) and re-root at DATA_DIR.
        return os.path.join(DATA_DIR, path[7:])
    if path.startswith('./'):
        # Any other relative path is rooted at DATA_DIR's parent directory.
        return os.path.join(os.path.dirname(DATA_DIR), path[2:])
    return path


def get_data_paths():
    """Retrieve train/val/reasoning/non-reasoning pkl file paths."""
    entries = os.listdir(DATA_DIR)
    train_paths = [os.path.join(DATA_DIR, name) for name in entries if name.endswith('train.pkl')]
    val_paths = [os.path.join(DATA_DIR, name) for name in entries if name.endswith('val.pkl')]
    reasoning_paths = [os.path.join(DATA_DIR, name) for name in entries if name.endswith('reasoning_val.pkl')]
    # Reasoning files also end with 'val.pkl', so subtract them from the val set.
    non_reasoning_paths = [p for p in val_paths if p not in reasoning_paths]

    return train_paths, reasoning_paths, non_reasoning_paths


def check_file_exists(file_path, description=""):
    """Assert that the file exists, otherwise raise an error."""
    assert os.path.exists(file_path), f"{description} does not exist: {file_path}"


def check_train_data(train_path):
    """Check frame and mask paths for each sample in training data."""
    print(f"[Train] Checking: {train_path}")
    with open(train_path, "rb") as f:
        data = pkl.load(f)

    for sample in data:
        # Resolve relative paths before testing existence.
        check_file_exists(resolve_path(sample["frame_path"]), "Frame path")
        check_file_exists(resolve_path(sample["mask_path"]), "Mask path")

    print(f"[Train] ✅ Checked {train_path}. Samples: {len(data)}")


def check_val_data(val_path, reasoning=False):
    """Check validation data paths depending on reasoning mode."""
    tag = "Reasoning Val" if reasoning else "Non-Reasoning Val"
    print(f"[{tag}] Checking: {val_path}")

    with open(val_path, "rb") as f:
        data = pkl.load(f)

    if reasoning:
        # Reasoning val data is a flat list of frame/mask samples.
        for sample in data:
            check_file_exists(resolve_path(sample["frame_path"]), "Frame path")
            check_file_exists(resolve_path(sample["mask_path"]), "Mask path")
        print(f"[{tag}] ✅ Checked {val_path}. Samples: {len(data)}")
    else:
        # Non-reasoning val data maps class names to image/label path lists.
        total_images = 0
        for class_name, image_list in data.get('images', {}).items():
            for image_path in image_list:
                check_file_exists(resolve_path(image_path), "Image path")
            total_images += len(image_list)

        for class_name, label_list in data.get('labels', {}).items():
            for label_path in label_list:
                check_file_exists(resolve_path(label_path), "Label path")

        print(f"[{tag}] ✅ Checked {val_path}. Samples: {total_images}")


def main():
    """Validate every train / val / reasoning-val pickle found under DATA_DIR."""
    train_paths, reasoning_paths, non_reasoning_paths = get_data_paths()

    for train_path in train_paths:
        check_train_data(train_path)

    for val_path in non_reasoning_paths:
        check_val_data(val_path, reasoning=False)

    for val_path in reasoning_paths:
        check_val_data(val_path, reasoning=True)


if __name__ == "__main__":
    main()
data_curation/build_vlpart.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import argparse
3
+ import glob
4
+ import multiprocessing as mp
5
+ import numpy as np
6
+ import os
7
+ import tempfile
8
+ import time
9
+ import warnings
10
+ import cv2
11
+ import tqdm
12
+
13
+ from detectron2.config import get_cfg
14
+ from detectron2.data.detection_utils import read_image
15
+ from detectron2.utils.logger import setup_logger
16
+
17
+ import sys
18
+ sys.path.append('.')
19
+ from VLPart.vlpart.config import add_vlpart_config
20
+
21
+ from VLPart.demo.predictor import VisualizationDemo
22
+
23
+
24
+ # constants
25
+ WINDOW_NAME = "image demo"
26
+
27
+
28
def setup_cfg(args):
    """Build a frozen detectron2 config from the parsed CLI arguments.

    Merges the YAML config file plus any KEY VALUE overrides, then applies
    the requested confidence threshold to every score-thresholded head.
    """
    cfg = get_cfg()
    add_vlpart_config(cfg)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    # Same threshold for every builtin model family.
    threshold = args.confidence_threshold
    cfg.MODEL.RETINANET.SCORE_THRESH_TEST = threshold
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = threshold
    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = threshold
    cfg.freeze()
    return cfg
40
+
41
+
42
def get_parser():
    """Build the command-line argument parser for the VLPart demo."""
    p = argparse.ArgumentParser(description="Detectron2 demo for builtin configs")
    p.add_argument(
        "--config-file",
        default="VLPart/configs/joint/swinbase_cascade_lvis_paco_pascalpart_partimagenet.yaml",
        metavar="FILE",
        help="path to config file",
    )
    p.add_argument("--webcam", action="store_true", help="Take inputs from webcam.")
    p.add_argument("--video-input", help="Path to video file.")
    p.add_argument(
        "--input",
        nargs="+",
        default='',
        help="A list of space separated input images; "
             "or a single glob pattern such as 'directory/*.jpg'",
    )
    p.add_argument(
        "--output",
        default='',
        help="A file or directory to save output visualizations. "
             "If not given, will show output in an OpenCV window.",
    )
    p.add_argument(
        "--vocabulary",
        default="custom",
        choices=['pascal_part', 'partimagenet', 'paco',
                 'voc', 'coco', 'lvis',
                 'pascal_part_voc', 'lvis_paco', 'custom'],
        help="",
    )
    p.add_argument(
        "--custom_vocabulary",
        default="",
        help="",
    )
    p.add_argument(
        "--confidence-threshold",
        type=float,
        default=0.7,
        help="Minimum score for instance predictions to be shown",
    )
    # Trailing KEY VALUE pairs are forwarded verbatim to cfg.merge_from_list.
    p.add_argument(
        "--opts",
        help="Modify config options using the command-line 'KEY VALUE' pairs",
        default=['MODEL.WEIGHTS', "/data/VLPart/ckpts/swinbase_cascade_lvis_paco_pascalpart_partimagenet.pth", "VIS.BOX", False],
        nargs=argparse.REMAINDER,
    )
    return p
92
+
93
def build_vlpart_model(custom_vocabulary):
    """Construct a VLPart VisualizationDemo restricted to *custom_vocabulary*.

    NOTE(review): this calls parse_args() with no argv, so it also consumes
    sys.argv of the embedding process — confirm callers expect that.
    """
    mp.set_start_method("spawn", force=True)

    args = get_parser().parse_args()
    args.custom_vocabulary = custom_vocabulary

    setup_logger(name="fvcore")
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)
    demo = VisualizationDemo(cfg, args)
    return demo
data_curation/check_dataset.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import pickle as pkl

DATA_DIR = '/gemini/space/wrz/AffordanceNet/data'


def resolve_path(path):
    """Convert a relative dataset path (e.g. './data/...') into an absolute one."""
    if path.startswith('./data/'):
        # Strip the './data/' prefix (7 chars) and re-root at DATA_DIR.
        return os.path.join(DATA_DIR, path[7:])
    if path.startswith('./'):
        # Any other relative path is rooted at DATA_DIR's parent directory.
        return os.path.join(os.path.dirname(DATA_DIR), path[2:])
    return path


def get_data_paths():
    """Retrieve train/val/reasoning/non-reasoning pkl file paths."""
    entries = os.listdir(DATA_DIR)
    train_paths = [os.path.join(DATA_DIR, name) for name in entries if name.endswith('train.pkl')]
    val_paths = [os.path.join(DATA_DIR, name) for name in entries if name.endswith('val.pkl')]
    reasoning_paths = [os.path.join(DATA_DIR, name) for name in entries if name.endswith('reasoning_val.pkl')]
    # Reasoning files also end with 'val.pkl', so subtract them from the val set.
    non_reasoning_paths = [p for p in val_paths if p not in reasoning_paths]

    return train_paths, reasoning_paths, non_reasoning_paths


def check_file_exists(file_path, description=""):
    """Assert that the file exists, otherwise raise an error."""
    assert os.path.exists(file_path), f"{description} does not exist: {file_path}"


def check_train_data(train_path):
    """Check frame and mask paths for each sample in training data."""
    print(f"[Train] Checking: {train_path}")
    with open(train_path, "rb") as f:
        data = pkl.load(f)

    for sample in data:
        # Resolve relative paths before testing existence.
        check_file_exists(resolve_path(sample["frame_path"]), "Frame path")
        check_file_exists(resolve_path(sample["mask_path"]), "Mask path")

    print(f"[Train] ✅ Checked {train_path}. Samples: {len(data)}")


def check_val_data(val_path, reasoning=False):
    """Check validation data paths depending on reasoning mode."""
    tag = "Reasoning Val" if reasoning else "Non-Reasoning Val"
    print(f"[{tag}] Checking: {val_path}")

    with open(val_path, "rb") as f:
        data = pkl.load(f)

    if reasoning:
        # Reasoning val data is a flat list of frame/mask samples.
        for sample in data:
            check_file_exists(resolve_path(sample["frame_path"]), "Frame path")
            check_file_exists(resolve_path(sample["mask_path"]), "Mask path")
        print(f"[{tag}] ✅ Checked {val_path}. Samples: {len(data)}")
    else:
        # Non-reasoning val data maps class names to image/label path lists.
        total_images = 0
        for class_name, image_list in data.get('images', {}).items():
            for image_path in image_list:
                check_file_exists(resolve_path(image_path), "Image path")
            total_images += len(image_list)

        for class_name, label_list in data.get('labels', {}).items():
            for label_path in label_list:
                check_file_exists(resolve_path(label_path), "Label path")

        print(f"[{tag}] ✅ Checked {val_path}. Samples: {total_images}")


def main():
    """Validate every train / val / reasoning-val pickle found under DATA_DIR."""
    train_paths, reasoning_paths, non_reasoning_paths = get_data_paths()

    for train_path in train_paths:
        check_train_data(train_path)

    for val_path in non_reasoning_paths:
        check_val_data(val_path, reasoning=False)

    for val_path in reasoning_paths:
        check_val_data(val_path, reasoning=True)


if __name__ == "__main__":
    main()
data_curation/prompt_generation_handal_easy_reasoning.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import json
import pickle
import requests
from concurrent.futures import ThreadPoolExecutor

# Dataset name
DATASET = 'handal'

# Handle-equipped objects to filter
OBJECTS_WITH_HANDLE = [
    'strainers', 'fixed joint pliers', 'hammers', 'ladles', 'whisks', 'measuring cups',
    'locking pliers', 'power drills', 'adjustable wrenches', 'mugs', 'ratchets', 'utensils',
    'combinational wrenches', 'pots pans', 'spatulas', 'screwdrivers', 'slip joint pliers'
]

# OpenAI API settings (update key!)
API_URL = 'https://api.openai.com/v1/chat/completions'
HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': 'Bearer YOUR-API-KEY'  # Replace with your real key
}


def read_pkl_file(pkl_path):
    """Read the val pkl and collect handle-equipped samples not yet processed."""
    with open(pkl_path, 'rb') as f:
        val_data = pickle.load(f)

    pending = []
    for class_name, image_list in val_data['images'].items():
        if class_name not in OBJECTS_WITH_HANDLE:
            continue
        for idx, img in enumerate(image_list):
            class_label = val_data['class_names'][class_name][idx]
            save_path = os.path.join(
                f'./reason_affordance/{DATASET}_easy_reasoning',
                class_label,
                os.path.splitext(os.path.basename(img))[0] + ".json"
            )
            # Skip samples whose output JSON already exists (resume support).
            if not os.path.exists(save_path):
                pending.append({'img_name': img, 'class_name': class_label})
    return pending


def process_sentence(class_name):
    """Send a few-shot prompt to OpenAI and return the generated sentence."""
    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'system',
         'content': (
             'Based on several words where the first is category name, '
             'please design an instruction <1> and instruction <2> in embodied scenes. '
             'The instruction <1> must include object category name itself. '
             'The instruction <2> must include the object category name itself. '
             'The instruction <2> must belong to embodied manipulation and give action if instruction <1> provides. '
             'The instruction <2> does not exceed 50 words.'
         )},
        # Few-shot examples pairing a category with the expected <1>/<2> format.
        {'role': 'user', 'content': 'mug'},
        {'role': 'assistant',
         'content': '<1> I need a drink. Please find a mug to fill water. <2> The mug has a handle as affordance map. So the robot can hold its handle.'},
        {'role': 'user', 'content': 'knife'},
        {'role': 'assistant',
         'content': '<1> Please give me a knife to cut apple. <2> The knife has a handle, and you can use its handle to cut apple.'},
        {'role': 'user', 'content': 'hammers'},
        {'role': 'assistant',
         'content': '<1> What is the proper way to hold the hammers? <2> The correct method is to hold the hammer by its handle.'},
        {'role': 'user', 'content': 'fork'},
        {'role': 'assistant',
         'content': '<1> Kindly pick up the fork. <2> You will be holding the fork handle.'},
        {'role': 'user', 'content': 'screwdrivers'},
        {'role': 'assistant',
         'content': '<1> I need a tool to tighten or loosen screws. <2> The screwdriver is here, hold its handle to turn and control screws.'},
        {'role': 'user', 'content': class_name}
    ]

    response = requests.post(API_URL, headers=HEADERS, json={'model': 'gpt-4', 'messages': messages})
    if response.status_code == 200:
        return response.json()['choices'][0]['message']['content']
    print(f"API Error for {class_name}:", response.text)
    return None


def process_json(data):
    """Process a single data entry and save result to JSON file."""
    class_name = data["class_name"]

    # Retry up to 5 times until the model emits both <1> and <2> markers.
    result = None
    for _ in range(5):
        candidate = process_sentence(class_name)
        if candidate and '<1>' in candidate and '<2>' in candidate:
            result = candidate
            break
    if result is None:
        print(f"Failed to process: {class_name}")
        return

    print("Processed:", result)

    try:
        # <1> ... <2> ... -> question / answer split.
        question = result.split('<2>')[0].split('<1>')[-1].strip()
        answer = result.split('<2>')[-1].strip()

        save_dir = os.path.join(f'./reason_affordance/{DATASET}_easy_reasoning', class_name)
        os.makedirs(save_dir, exist_ok=True)

        save_path = os.path.join(save_dir, os.path.splitext(os.path.basename(data["img_name"]))[0] + ".json")
        output = {'img_name': data["img_name"], 'class_name': class_name, 'question': question, 'answer': answer}

        with open(save_path, 'w') as f:
            json.dump(output, f, indent=4)

    except Exception as e:
        print(f"Error saving file for {class_name}:", e)


def main():
    """Load pending samples and generate reasoning prompts in parallel."""
    pkl_file = f'./data/{DATASET}_val.pkl'
    data_list = read_pkl_file(pkl_file)

    with ThreadPoolExecutor(max_workers=2) as executor:
        executor.map(process_json, data_list)


if __name__ == "__main__":
    main()
data_curation/prompt_generation_handal_hard_reasoning.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import json
import pickle
import requests
from concurrent.futures import ThreadPoolExecutor

# Dataset configuration
DATASET = 'handal'

# Object categories with handle
OBJECTS_WITH_HANDLE = [
    'strainers', 'fixed joint pliers', 'hammers', 'ladles', 'whisks', 'measuring cups',
    'locking pliers', 'power drills', 'adjustable wrenches', 'mugs', 'ratchets', 'utensils',
    'combinational wrenches', 'pots pans', 'spatulas', 'screwdrivers', 'slip joint pliers'
]

# OpenAI API settings (update key!)
API_URL = 'https://api.openai.com/v1/chat/completions'
HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': 'Bearer YOUR-API-KEY'  # Replace with your real key
}


def read_pkl_file(pkl_path):
    """
    Load a pickle file and extract data entries containing objects with handles,
    skipping already processed samples.
    """
    with open(pkl_path, 'rb') as f:
        val_data = pickle.load(f)

    pending = []
    for class_name, img_list in val_data['images'].items():
        if class_name not in OBJECTS_WITH_HANDLE:
            continue
        for i, img_path in enumerate(img_list):
            class_label = val_data['class_names'][class_name][i]
            save_path = os.path.join(
                f'./reason_affordance/{DATASET}_hard_reasoning',
                class_label,
                os.path.splitext(os.path.basename(img_path))[0] + ".json"
            )
            # Resume support: only keep samples without an existing output JSON.
            if not os.path.exists(save_path):
                pending.append({'img_name': img_path, 'class_name': class_label})

    return pending


def process_sentence(category):
    """
    Generate reasoning instructions (<1>, <2>) from a category name using GPT.
    Instruction <1> deliberately omits the category name (hard reasoning).
    """
    payload = {
        'model': 'gpt-4',
        'messages': [
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'system',
             'content': (
                 'Based on several words where the first is category name, please design an instruction <1> and instruction <2> in embodied scenes. '
                 'The instruction <1> must not include object category name itself. '
                 'The instruction <2> must include the object category name itself. '
                 'The instruction <2> must belong to embodied manipulation and give action if instruction <1> provides. '
                 'The instruction <2> does not exceed 50 words.'
             )},
            # Few-shot examples pairing a category with the expected <1>/<2> format.
            {'role': 'user', 'content': 'microwave, open'},
            {'role': 'assistant', 'content': '<1> Heat up food quickly. <2> The microwave is closed, so it can be open to access the food inside.'},
            {'role': 'user', 'content': 'knife'},
            {'role': 'assistant', 'content': '<1> I want to cut a bread. <2> The knife has a handle, you can use its handle to cut bread.'},
            {'role': 'user', 'content': 'computer mouse'},
            {'role': 'assistant', 'content': '<1> Give me a tool to control the cursor on the screen. <2> The computer mouse is here. It has no handle, so you can grasp its whole body.'},
            {'role': 'user', 'content': 'fork'},
            {'role': 'assistant', 'content': '<1> Use to pierce and lift food. <2> The fork is here, and its handle can be grasped.'},
            {'role': 'user', 'content': 'screwdrivers'},
            {'role': 'assistant', 'content': '<1> I need a tool to tighten or loosen screws. <2> The screwdriver is here, hold its handle to turn and control screws.'},
            {'role': 'user', 'content': category}
        ]
    }

    response = requests.post(API_URL, headers=HEADERS, json=payload)
    if response.status_code == 200:
        return response.json()['choices'][0]['message']['content']
    print(f"[API Error] {category}: {response.status_code} - {response.text}")
    return None


def process_json(entry):
    """
    Process a single image/class entry by generating reasoning and saving result to file.
    """
    class_name = entry['class_name']

    # Retry up to 5 times until the model emits both <1> and <2> markers.
    result = None
    for _ in range(5):
        candidate = process_sentence(class_name)
        if candidate and '<1>' in candidate and '<2>' in candidate:
            result = candidate
            break
    if result is None:
        print(f"[Retry Failed] {class_name}")
        return

    try:
        # <1> ... <2> ... -> question / answer split.
        question = result.split('<2>')[0].split('<1>')[-1].strip()
        answer = result.split('<2>')[-1].strip()

        save_dir = os.path.join(f'./reason_affordance/{DATASET}_hard_reasoning', class_name)
        os.makedirs(save_dir, exist_ok=True)

        save_path = os.path.join(save_dir, os.path.splitext(os.path.basename(entry['img_name']))[0] + ".json")
        output = {
            'img_name': entry['img_name'],
            'class_name': class_name,
            'question': question,
            'answer': answer
        }

        with open(save_path, 'w') as f:
            json.dump(output, f, indent=4)
        print(f"[Saved] {save_path}")
    except Exception as e:
        print(f"[Error] Failed to save {class_name}: {e}")


def main():
    """
    Main execution: loads data, then processes in parallel.
    """
    pkl_path = f'./data/{DATASET}_val.pkl'
    entries = read_pkl_file(pkl_path)

    with ThreadPoolExecutor(max_workers=2) as executor:
        executor.map(process_json, entries)


if __name__ == "__main__":
    main()
data_curation/vlpart_sam2_tracking.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import torch
4
+ import pickle
5
+ import argparse
6
+ import numpy as np
7
+ import warnings
8
+ from tqdm import tqdm
9
+ from pathlib import Path
10
+ from PIL import Image
11
+
12
+ from detectron2.data.detection_utils import read_image
13
+ from supervision import Detections, BoxAnnotator, MaskAnnotator, LabelAnnotator, mask_to_xyxy
14
+
15
+ from sam2.build_sam import build_sam2_video_predictor
16
+ from VLPart.build_vlpart import build_vlpart_model
17
+
18
+
19
+ warnings.filterwarnings('ignore')
20
+
21
+ # Constants
22
+ SAM2_CONFIG = "sam2_hiera_l.yaml"
23
+ SAM2_CHECKPOINT = "./checkpoints/sam2_hiera_large.pt"
24
+ OUTPUT_ROOT = "/data/robot-merlin/mask_vlpart+sam2_tracking"
25
+ OUTPUT_ROOT_IMG = "/data/robot-merlin/mask_vlpart+sam2_tracking_with_image"
26
+
27
+ # Set up torch environment
28
+ torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
29
+ if torch.cuda.get_device_properties(0).major >= 8:
30
+ torch.backends.cuda.matmul.allow_tf32 = True
31
+ torch.backends.cudnn.allow_tf32 = True
32
+
33
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
34
+
35
+
36
def load_affordance_data(pkl_path):
    """
    Load affordance samples from a pickle file and group them by the
    directory of the video each frame belongs to.

    Args:
        pkl_path (str): Pickle file holding a list of sample dicts,
            each with at least a 'frame_path' key.

    Returns:
        dict: Maps a video directory path to the list of its samples.
    """
    with open(pkl_path, 'rb') as f:
        samples = pickle.load(f)

    grouped = {}
    for sample in samples:
        video_dir = os.path.dirname(sample['frame_path'])
        grouped.setdefault(video_dir, []).append(sample)
    return grouped
52
+
53
+
54
def init_vlpart_once(text, prev_text, vlpart_model):
    """
    Lazily (re)build the VLPart model: only rebuild when the vocabulary
    text differs from the one the current model was built with.

    Returns:
        tuple: (model matching *text*, *text* itself to use as new prev_text).
    """
    if text == prev_text:
        return vlpart_model, text
    if vlpart_model is not None:
        del vlpart_model  # release the stale model before building a new one
    vlpart_model = build_vlpart_model(text)
    return vlpart_model, text
63
+
64
+
65
def run_vlpart_on_first_frame(vlpart_model, image_path):
    """
    Detect the target part on a single image with VLPart.

    Returns the predicted boxes as a numpy array, or None unless exactly
    one instance was detected (ambiguous/missing detections are skipped).
    """
    image = read_image(image_path, format="BGR")
    predictions, _ = vlpart_model.run_on_image(image)
    instances = predictions["instances"]
    if len(instances) != 1:
        return None
    return instances.pred_boxes.tensor.cpu().numpy()
74
+
75
+
76
def run_sam2_tracking(video_dir, frame_names, sam2_predictor, boxes):
    """
    Propagate the first-frame box through the whole clip with SAM2.

    Returns:
        dict: frame index -> {object id: binary mask (numpy bool array)}.
    """
    inference_state = sam2_predictor.init_state(video_path=video_dir)
    sam2_predictor.reset_state(inference_state)

    # Seed frame 0 with the VLPart box; object id 1 is the only tracked object.
    sam2_predictor.add_new_points_or_box(
        inference_state=inference_state,
        frame_idx=0,
        obj_id=1,
        box=boxes,
    )

    segments = {}
    for frame_idx, out_ids, out_logits in sam2_predictor.propagate_in_video(inference_state):
        segments[frame_idx] = {
            obj_id: (out_logits[i] > 0).cpu().numpy()
            for i, obj_id in enumerate(out_ids)
        }
    return segments
97
+
98
+
99
def save_tracking_results(video_dir, frame_names, video_segments, object_name, output_base, vid):
    """
    Write per-frame tracking outputs: raw binary masks under *output_base*
    and annotated previews (box + label + mask) under OUTPUT_ROOT_IMG.
    """
    # Single tracked object: SAM2 object id 1 maps to the vocabulary text.
    id_to_objects = {i: obj for i, obj in enumerate([object_name], start=1)}

    mask_dir = Path(f"{output_base}/{vid:06d}")
    mask_dir.mkdir(parents=True, exist_ok=True)

    vis_dir = Path(f"{OUTPUT_ROOT_IMG}/{vid:06d}")
    vis_dir.mkdir(parents=True, exist_ok=True)

    box_annotator = BoxAnnotator()
    label_annotator = LabelAnnotator()
    mask_annotator = MaskAnnotator()

    for frame_idx, id_to_mask in video_segments.items():
        frame = cv2.imread(os.path.join(video_dir, frame_names[frame_idx]))

        obj_ids = list(id_to_mask.keys())
        mask_arr = np.concatenate(list(id_to_mask.values()), axis=0)

        detections = Detections(
            xyxy=mask_to_xyxy(mask_arr),
            mask=mask_arr,
            class_id=np.array(obj_ids, dtype=np.int32),
        )

        annotated = box_annotator.annotate(frame.copy(), detections)
        annotated = label_annotator.annotate(annotated, detections, [id_to_objects[i] for i in obj_ids])
        annotated = mask_annotator.annotate(annotated, detections)

        cv2.imwrite(str(vis_dir / frame_names[frame_idx]), annotated)
        # Only one object, so the first mask is the full result (scaled to 0/255).
        cv2.imwrite(str(mask_dir / frame_names[frame_idx]), mask_arr[0] * 255)
135
+
136
+
137
def get_sorted_frame_names(video_dir):
    """Return JPEG frame filenames in *video_dir*, sorted by numeric stem."""
    frames = [
        name for name in os.listdir(video_dir)
        if name.lower().endswith(('.jpg', '.jpeg'))
    ]
    frames.sort(key=lambda name: int(os.path.splitext(name)[0]))
    return frames
142
+
143
+
144
def main(openx_data, text_override=None):
    """
    Run VLPart detection + SAM2 tracking over every clip in the given
    dataset and dump per-frame handle masks.

    Args:
        openx_data (str): Dataset name used to locate the affordance pkl.
        text_override (str | None): Optional fixed VLPart vocabulary; when
            None, "<task class> handle" is derived per clip.
    """
    # You can reorganize the data loading logic as needed
    data_dict = load_affordance_data(f'./data/{openx_data}_for_affordance.pkl')

    # Initialize SAM2 predictor once; VLPart is rebuilt per vocabulary.
    sam2_predictor = build_sam2_video_predictor(SAM2_CONFIG, SAM2_CHECKPOINT, device=device)

    prev_text = ''
    vlpart_model = None

    for video_dir, data_list in tqdm(data_dict.items()):
        first_sample = data_list[0]
        frame_path = first_sample['frame_path']
        task_class = first_sample['task_object_class']

        # Restrict processing to handle-bearing object categories.
        if not any(key in task_class for key in ('door', 'drawer', 'knife')):
            continue

        # Rebuild VLPart only when the vocabulary actually changes.
        input_text = text_override if text_override else f"{task_class} handle"
        vlpart_model, prev_text = init_vlpart_once(input_text, prev_text, vlpart_model)

        # Detect the part on the first frame; skip ambiguous clips.
        boxes = run_vlpart_on_first_frame(vlpart_model, frame_path)
        if boxes is None:
            continue

        # Propagate through the clip and persist the masks.
        frame_names = get_sorted_frame_names(video_dir)
        segments = run_sam2_tracking(video_dir, frame_names, sam2_predictor, boxes)
        save_tracking_results(video_dir, frame_names, segments, input_text,
                              f"{OUTPUT_ROOT}/", first_sample['vid'])
        print(f"[Done] {frame_path} | {task_class}")
178
+
179
+
180
if __name__ == "__main__":
    parser = argparse.ArgumentParser("VLPart + SAM2 Tracking Demo")
    parser.add_argument("--pipeline", type=str, default="referring_expression_segmentation", help="Pipeline task")
    parser.add_argument("--text_input", type=str, default=None, help="Optional override for input text")
    parser.add_argument("--dataset", type=str, default="bridge", help="Dataset name (e.g., bridge)")
    args = parser.parse_args()

    # Bug fix: main(openx_data, text_override=None) accepts at most two
    # arguments; the old call main(args.dataset, args.pipeline, args.text_input)
    # passed three positionals and raised TypeError on every run.
    main(args.dataset, text_override=args.text_input)
docs/dataset.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Dataset
2
+
3
+ To train our affordance segmentation model, we use two types of data:
4
+ * **General Segmentation Data**: This follows [LISA](https://github.com/dvlab-research/LISA).
5
+ * **Affordance Segmentation Data**: This is a large-scale dataset that we collect.
6
+
7
+ ### General Segmentation Data
8
+ These data are organized as follows:
9
+ ```
10
+ ./data/
11
+ ├── lisa_data
12
+ │ ├── ade20k
13
+ │ ├── coco
14
+ │ ├── cocostuff
15
+ │ ├── llava_dataset
16
+ │ ├── mapillary
17
+ │ ├── reason_seg
18
+ │ ├── refer_seg
19
+ │ ├── vlpart
20
+ ```
21
+
22
+ ### Affordance Segmentation Data
23
+
24
+ We employ images from HANDAL, Open-X, GraspNet, EgoObjects, and RLBench in our affordance segmentation task.
25
+
26
+ The HANDAL data is downloaded and organized according to its official [repo](https://github.com/NVlabs/HANDAL).
27
+ Other data can be downloaded from the [Hugging Face](https://huggingface.co/datasets/Dongming97/RAGNet).
28
+
29
+ The training data is organized as follows:
30
+ ```
31
+ ./data/
32
+ ├── openx_train.pkl
33
+ ├── graspnet_train.pkl
34
+ ├── egoobjects_train.pkl
35
+ ├── rlbench_train.pkl
36
+ ├── handal_hard_reasoning_train.pkl
37
+ ├── egoobjects_easy_reasoning_train.pkl
38
+ ├── egoobjects_hard_reasoning_train.pkl
39
+ ├── HANDAL
40
+ │ ├── without_depth
41
+ │ ├── handal_dataset_adjustable_wrenches
42
+ │ ├── handal_dataset_combinational_wrenches
43
+ │ ├── handal_dataset_fixed_joint_pliers
44
+ │ ├── ...
45
+ ├── openx
46
+ │ ├── images
47
+ │ ├── fractal20220817_data
48
+ │ ├── bridge
49
+ │ ├── masks
50
+ │ ├── fractal20220817_data
51
+ │ ├── bridge
52
+ ├── graspnet
53
+ │ ├── images
54
+ │ ├── masks
55
+ │ ├── test_seen
56
+ │ ├── test_novel
57
+ ├── egoobjects
58
+ │ ├── images
59
+ │ ├── masks
60
+ ├── rlbench
61
+ │ ├── images
62
+ │ ├── masks
63
+ ├── 3doi
64
+ │ ├── images
65
+ │ ├── masks
66
+ ```
67
+
68
+ The evaluation data is also in the same directory, but with the `*_val.pkl` files instead of `*_train.pkl`.
69
+
70
+ ```
71
+ ./data/
72
+ ├── handal_mini_val.pkl
73
+ ├── graspnet_test_seen_val.pkl
74
+ ├── graspnet_test_novel_val.pkl
75
+ ├── 3doi_val.pkl
76
+ ├── handal_easy_reasoning_val.pkl
77
+ ├── handal_hard_reasoning_val.pkl
78
+ ├── 3doi_easy_reasoning_val.pkl
79
+ ```
80
+
81
+ You can use the following script to confirm if data is organized correctly:
82
+ ```bash
83
+ python data_curation/check_dataset.py
84
+ ```
85
+
86
+ ### About data curation
87
+ 1. **SAM2**: We use SAM2 to generate affordance mask if the dataset provides box annotation.
88
+ 2. **Florence-2 + SAM2**: We use Florence-2 to generate the initial segmentation masks of some complete objects, and then refine them with SAM2. Please see [Florence-2+SAM2](https://github.com/IDEA-Research/Grounded-SAM-2).
89
+ 3. **VLPart + SAM2**: We use VLPart to generate box of object part, and then refine them with SAM2. We refer to [VLPart](https://github.com/facebookresearch/VLPart).
90
+ We provide our inference demo scripts in `data_curation/build_vlpart.py` and `data_curation/vlpart_sam2_tracking.py`.
91
+ 4. **Reasoning Instruction**: We provide two example scripts to generate reasoning instructions for the affordance segmentation task:
92
+ - `data_curation/prompt_generation_handal_easy_reasoning.py`: This script generates easy reasoning instructions for the HANDAL dataset.
93
+ - `data_curation/prompt_generation_handal_hard_reasoning.py`: This script generates hard reasoning instructions for the HANDAL dataset.
docs/installation.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Installation
2
+ The environment installation mainly follows [LISA](https://github.com/dvlab-research/LISA).
3
+ ```
4
+ git clone https://github.com/wudongming97/AffordanceNet.git
5
+ cd AffordanceNet
6
+ conda create -n affordancenet python=3.9
7
+ conda activate affordancenet
8
+ pip install -r requirements.txt
9
+ pip install flash-attn --no-build-isolation
10
+ ```
docs/training_and_evaluation.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Training and Evaluation
2
+
3
+ ### Pre-trained Weights
4
+ #### LLaVA
5
+ For convenience of using pre-trained LLaVA weights, we provide a link from [Hugging Face](https://huggingface.co/Dongming97/LLaVA-Lightning-7B-v1-1).
6
+
7
+ #### SAM
8
+ Download SAM ViT-H pre-trained weights from the [link](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth).
9
+
10
+
11
+ ### Training
12
+ To train AffordanceVLM, you can use the following command.
13
+ ```
14
+ bash ./scripts/train.sh
15
+ ```
16
+ When training is finished, to get the full model weight:
17
+
18
+ ```
19
+ cd ./runs/AffordanceVLM-7B/ckpt_model && python zero_to_fp32.py . ../pytorch_model.bin
20
+ ```
21
+
22
+ ### Merge LoRA Weight
23
+ Merge the LoRA weights of `pytorch_model.bin`, save the resulting model into your desired path in the Hugging Face format:
24
+ ```
25
+ CUDA_VISIBLE_DEVICES="" python merge_lora_weights_and_save_hf_model.py \
26
+ --version="PATH_TO_LLaVA" \
27
+ --weight="PATH_TO_pytorch_model.bin" \
28
+ --save_path="PATH_TO_SAVED_MODEL"
29
+ ```
30
+
31
+ For example:
32
+ ```
33
+ CUDA_VISIBLE_DEVICES="" python3 merge_lora_weights_and_save_hf_model.py \
34
+ --version="./LLaVA/LLaVA-Lightning-7B-v1-1" \
35
+ --weight="./runs/AffordanceVLM-7B/pytorch_model.bin" \
36
+ --save_path="./exps/AffordanceVLM-7B"
37
+ ```
38
+
39
+ ### Evaluation
40
+ To evaluate AffordanceVLM on the entire [HANDAL](https://github.com/NVlabs/HANDAL) dataset, please adjust the `--dataset_dir` parameter in `evaluate.sh`.
41
+ ```
42
+ bash ./scripts/evaluate.sh
43
+ ```
44
+
45
+ To chat with [AffordanceVLM-7B](https://huggingface.co/Dongming97/AffordanceVLM):
46
+ ```
47
+ CUDA_VISIBLE_DEVICES=0 python chat.py --version=./exps/AffordanceVLM-7B
48
+ ```
49
+
50
+ ### Main Results
51
+
52
+ HANDAL:
53
+
54
+ | Method | gIoU | cIoU |
55
+ |:----------------:|:----:|-----:|
56
+ | AffordanceVLM-7B | 60.3 | 60.8 |
imgs/.ipynb_checkpoints/AffordanceNet-checkpoint.jpg ADDED

Git LFS Details

  • SHA256: 3abd71b7ead1d3353faf60d65da4ceeafed34314a4c123059b5d92f53685c797
  • Pointer size: 132 Bytes
  • Size of remote file: 1.17 MB
imgs/AffordanceNet.jpg ADDED

Git LFS Details

  • SHA256: 3abd71b7ead1d3353faf60d65da4ceeafed34314a4c123059b5d92f53685c797
  • Pointer size: 132 Bytes
  • Size of remote file: 1.17 MB
imgs/AffordanceNet.png ADDED

Git LFS Details

  • SHA256: 6c1537d2a0442b1685bdfdefbb8f028acf2cc9d90782a8f37c77037126aab550
  • Pointer size: 132 Bytes
  • Size of remote file: 1.88 MB
merge_lora_weights_and_save_hf_model.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import glob
3
+ import os
4
+ import sys
5
+
6
+ import cv2
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import transformers
11
+ from peft import LoraConfig, get_peft_model
12
+ from transformers import AutoTokenizer
13
+
14
+ from model.AffordanceVLM import AffordanceVLMForCausalLM
15
+ from utils.utils import DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN
16
+
17
+
18
def parse_args(args):
    """Build and apply the CLI parser for the LoRA-merge / HF-export script.

    Args:
        args: Sequence of raw argument strings (e.g. ``sys.argv[1:]``).

    Returns:
        argparse.Namespace with all options; ``--weight`` and ``--save_path``
        are mandatory.
    """
    p = argparse.ArgumentParser(
        description="merge lora weights and save model with hf format"
    )

    # Base model / IO locations.
    p.add_argument(
        "--version", default="liuhaotian/llava-llama-2-13b-chat-lightning-preview"
    )
    p.add_argument("--vis_save_path", default="./vis_output", type=str)
    p.add_argument("--weight", default="", type=str, required=True)
    p.add_argument("--save_path", default="./lisa_model", type=str, required=True)

    # Numeric precision for loading the checkpoint.
    p.add_argument(
        "--precision",
        default="bf16",
        type=str,
        choices=["fp32", "bf16", "fp16"],
        help="precision for inference",
    )

    # Vision / SAM configuration.
    p.add_argument("--vision_pretrained", default="PATH_TO_SAM_ViT-H", type=str)
    p.add_argument("--out_dim", default=256, type=int)
    p.add_argument("--image_size", default=1024, type=int, help="image size")
    p.add_argument(
        "--vision-tower", default="openai/clip-vit-large-patch14", type=str
    )

    # Tokenizer / conversation settings.
    p.add_argument("--model_max_length", default=512, type=int)
    p.add_argument("--use_mm_start_end", action="store_true", default=True)
    p.add_argument(
        "--conv_type",
        default="llava_v1",
        type=str,
        choices=["llava_v1", "llava_llama_2"],
    )

    # LoRA hyper-parameters (must match how the checkpoint was trained).
    p.add_argument("--lora_r", default=8, type=int)
    p.add_argument("--lora_alpha", default=16, type=int)
    p.add_argument("--lora_dropout", default=0.05, type=float)
    p.add_argument("--lora_target_modules", default="q_proj,v_proj", type=str)

    # Misc.
    p.add_argument("--local-rank", default=0, type=int, help="node rank")
    p.add_argument("--train_mask_decoder", action="store_true", default=True)

    return p.parse_args(args)
56
+
57
+
58
def main(args):
    """Merge trained LoRA weights into the base model and save an HF checkpoint.

    Steps: rebuild the exact tokenizer/model configuration used at training
    time, wrap the model with the same LoRA adapters, load the full
    `pytorch_model.bin` state dict, merge the adapters into the base weights,
    and save the result (minus the frozen CLIP vision tower) in Hugging Face
    format.

    Args:
        args: Raw CLI argument strings (typically ``sys.argv[1:]``).
    """
    args = parse_args(args)
    os.makedirs(args.vis_save_path, exist_ok=True)

    # Create model
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        args.version,
        cache_dir=None,
        model_max_length=args.model_max_length,
        padding_side="right",
        use_fast=False,
    )
    tokenizer.pad_token = tokenizer.unk_token
    # Register the special [SEG]/[AFF] tokens and remember their ids; the
    # model uses these ids to locate segmentation/affordance queries.
    num_added_tokens = tokenizer.add_tokens("[SEG]")
    args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]
    num_added_tokens = tokenizer.add_tokens("[AFF]")
    args.aff_token_idx = tokenizer("[AFF]", add_special_tokens=False).input_ids[0]

    if args.use_mm_start_end:
        tokenizer.add_tokens(
            [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
        )

    model_args = {
        "train_mask_decoder": args.train_mask_decoder,
        "out_dim": args.out_dim,
        "seg_token_idx": args.seg_token_idx,
        "aff_token_idx": args.aff_token_idx,
        "vision_tower": args.vision_tower,
    }

    # Map the precision flag to a torch dtype (fp32 is the fallback).
    torch_dtype = torch.float32
    if args.precision == "bf16":
        torch_dtype = torch.bfloat16
    elif args.precision == "fp16":
        torch_dtype = torch.half
    model = AffordanceVLMForCausalLM.from_pretrained(
        args.version, torch_dtype=torch_dtype, low_cpu_mem_usage=True, **model_args
    )
    model.config.eos_token_id = tokenizer.eos_token_id
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    # Instantiate the CLIP vision tower and the LISA-style SAM/projection
    # modules so the architecture matches the training-time module tree.
    model.get_model().initialize_vision_modules(model.get_model().config)
    vision_tower = model.get_model().get_vision_tower()
    vision_tower.to(dtype=torch_dtype)
    model.get_model().initialize_lisa_modules(model.get_model().config)

    lora_r = args.lora_r
    if lora_r > 0:

        def find_linear_layers(model, lora_target_modules):
            # Collect every nn.Linear that matches a LoRA target name but is
            # not part of the vision/projection stacks (those stay frozen).
            cls = torch.nn.Linear
            lora_module_names = set()
            for name, module in model.named_modules():
                if (
                    isinstance(module, cls)
                    and all(
                        [
                            x not in name
                            for x in [
                                "visual_model",
                                "vision_tower",
                                "mm_projector",
                                "text_hidden_fcs",
                            ]
                        ]
                    )
                    and any([x in name for x in lora_target_modules])
                ):
                    lora_module_names.add(name)
            return sorted(list(lora_module_names))

        lora_alpha = args.lora_alpha
        lora_dropout = args.lora_dropout
        lora_target_modules = find_linear_layers(
            model, args.lora_target_modules.split(",")
        )
        lora_config = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            target_modules=lora_target_modules,
            lora_dropout=lora_dropout,
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

    # Resize embeddings to account for the tokens added above, matching the
    # checkpoint's embedding-table shape before loading it strictly.
    model.resize_token_embeddings(len(tokenizer))

    state_dict = torch.load(args.weight, map_location="cpu")
    model.load_state_dict(state_dict, strict=True)

    # Fold the LoRA adapters into the base weights, then drop the vision
    # tower from the saved state dict (it is re-created from CLIP at load).
    model = model.merge_and_unload()
    state_dict = {}
    for k, v in model.state_dict().items():
        if "vision_tower" not in k:
            state_dict[k] = v
    model.save_pretrained(args.save_path, state_dict=state_dict)
    tokenizer.save_pretrained(args.save_path)


if __name__ == "__main__":
    main(sys.argv[1:])
model/AffordanceVLM.py ADDED
@@ -0,0 +1,428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from transformers import BitsAndBytesConfig, CLIPVisionModel
7
+
8
+ from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
9
+ DEFAULT_IMAGE_PATCH_TOKEN)
10
+
11
+ from .llava.model.language_model.llava_llama import (LlavaLlamaForCausalLM,
12
+ LlavaLlamaModel)
13
+ from .segment_anything import build_sam_vit_h
14
+
15
+
16
def dice_loss(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    num_masks: float,
    scale=1000,  # 100000.0,
    eps=1e-6,
):
    """
    Compute the DICE loss, similar to generalized IOU for masks.

    Args:
        inputs: Predicted mask logits, shape (num_masks, H, W); spatial dims
            are flattened internally.
        targets: Binary ground-truth masks of the same shape as ``inputs``
            (0 for the negative class and 1 for the positive class).
        num_masks: Normalizer for the summed per-mask losses.
        scale: Divisor applied to both numerator and denominator terms
            (cancels mathematically; kept for numerical range control).
        eps: Smoothing constant avoiding division by zero on empty masks.

    Returns:
        Scalar loss tensor: sum of per-mask dice losses divided by num_masks.
    """
    probs = inputs.sigmoid().flatten(1, 2)
    flat_targets = targets.flatten(1, 2)

    intersection = 2 * (probs / scale * flat_targets).sum(-1)
    union = (probs / scale).sum(-1) + (flat_targets / scale).sum(-1)

    per_mask = 1 - (intersection + eps) / (union + eps)
    return per_mask.sum() / (num_masks + 1e-8)
40
+
41
+
42
def sigmoid_ce_loss(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    num_masks: float,
):
    """
    Per-pixel binary cross-entropy on mask logits, averaged per mask.

    Args:
        inputs: Predicted mask logits, shape (num_masks, H, W).
        targets: Binary ground-truth masks of the same shape as ``inputs``
            (0 for the negative class and 1 for the positive class).
        num_masks: Normalizer for the summed per-mask losses.

    Returns:
        Scalar loss tensor: sum of per-mask mean BCE divided by num_masks.
    """
    per_pixel = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    per_mask = per_pixel.flatten(1, 2).mean(1)
    return per_mask.sum() / (num_masks + 1e-8)
60
+
61
+
62
class LisaMetaModel:
    """Mixin that attaches the SAM backbone and text-to-prompt projection.

    Designed to be combined (via MRO) with a LLaVA model class; it calls
    ``super().__init__(config)`` and then grafts a frozen SAM ViT-H plus a
    trainable projection head onto the resulting model.
    """

    def __init__(
        self,
        config,
        **kwargs,
    ):
        super(LisaMetaModel, self).__init__(config)

        self.config = config
        if not hasattr(self.config, "train_mask_decoder"):
            # Fresh config (training from a plain LLaVA checkpoint): record
            # the decoder/out_dim settings passed in via kwargs.
            self.config.train_mask_decoder = kwargs["train_mask_decoder"]
            self.config.out_dim = kwargs["out_dim"]
            self.vision_pretrained = kwargs.get("vision_pretrained", None)
        else:
            # Config already carries the LISA fields (e.g. reloading a merged
            # checkpoint): build the modules immediately.
            self.vision_pretrained = kwargs.get("vision_pretrained", None)
            self.initialize_lisa_modules(self.config)

    def initialize_lisa_modules(self, config):
        """Build the SAM visual model and the text->prompt projection head.

        SAM is fully frozen; only the mask decoder (if
        ``config.train_mask_decoder``) and the projection MLP are trainable.
        """
        # SAM
        self.visual_model = build_sam_vit_h(self.vision_pretrained)
        for param in self.visual_model.parameters():
            param.requires_grad = False
        if config.train_mask_decoder:
            self.visual_model.mask_decoder.train()
            for param in self.visual_model.mask_decoder.parameters():
                param.requires_grad = True

        # Projection layer: maps LLM hidden states (hidden_size) to SAM's
        # prompt-embedding dimension (out_dim).
        in_dim = config.hidden_size
        out_dim = config.out_dim
        text_fc = [
            nn.Linear(in_dim, in_dim),
            nn.ReLU(inplace=True),
            nn.Linear(in_dim, out_dim),
            nn.Dropout(0.0),
        ]
        self.text_hidden_fcs = nn.ModuleList([nn.Sequential(*text_fc)])
        self.text_hidden_fcs.train()
        for param in self.text_hidden_fcs.parameters():
            param.requires_grad = True
102
+
103
+
104
class LisaModel(LisaMetaModel, LlavaLlamaModel):
    """LLaVA-LLaMA backbone with the LISA SAM/projection mixin applied.

    The constructor only pins the multimodal config flags to the fixed
    settings this project assumes (square images, patch features, frozen
    mm-projector, no image-patch tokens).
    """

    def __init__(
        self,
        config,
        **kwargs,
    ):
        super(LisaModel, self).__init__(config, **kwargs)

        self.config.use_cache = False
        self.config.vision_tower = self.config.mm_vision_tower
        self.config.mm_vision_select_feature = "patch"
        self.config.image_aspect_ratio = "square"
        self.config.image_grid_pinpoints = None
        self.config.tune_mm_mlp_adapter = False
        self.config.freeze_mm_mlp_adapter = True
        self.config.pretrain_mm_mlp_adapter = None
        self.config.mm_use_im_patch_token = False
122
+
123
class AffordanceVLMForCausalLM(LlavaLlamaForCausalLM):
    """LLaVA-based causal LM that also predicts segmentation masks.

    Hidden states at the positions of the special ``[SEG]``/``[AFF]`` tokens
    are projected into SAM prompt embeddings, and SAM's mask decoder turns
    them into per-query masks. Training combines the LM cross-entropy loss
    with BCE + dice mask losses.
    """

    def __init__(
        self,
        config,
        **kwargs,
    ):
        if not hasattr(config, "train_mask_decoder"):
            # Training from a plain LLaVA checkpoint: take the multimodal and
            # loss-weight settings from kwargs.
            config.mm_use_im_start_end = kwargs.pop("use_mm_start_end", True)
            config.mm_vision_tower = kwargs.get(
                "vision_tower", "openai/clip-vit-large-patch14"
            )
            self.ce_loss_weight = kwargs.pop("ce_loss_weight", None)
            self.dice_loss_weight = kwargs.pop("dice_loss_weight", None)
            self.bce_loss_weight = kwargs.pop("bce_loss_weight", None)
        else:
            # Reloading a merged checkpoint: config already carries everything.
            config.mm_vision_tower = config.vision_tower

        # Token ids of "[SEG]" and "[AFF]"; both mark mask-query positions.
        self.seg_token_idx = kwargs.pop("seg_token_idx")
        self.aff_token_idx = kwargs.pop("aff_token_idx")

        super().__init__(config)

        self.model = LisaModel(config, **kwargs)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_visual_embs(self, pixel_values: torch.FloatTensor):
        """Encode each image through the (frozen) SAM image encoder.

        Images are encoded one at a time (with cache clearing) to bound peak
        GPU memory; results are concatenated along the batch dimension.
        No gradients flow through the encoder.
        """
        with torch.no_grad():
            image_embeddings_list = []
            for i in range(pixel_values.shape[0]):
                torch.cuda.empty_cache()
                image_embeddings = self.model.visual_model.image_encoder(
                    pixel_values[i].unsqueeze(0)
                )
                image_embeddings_list.append(image_embeddings)
            torch.cuda.empty_cache()
            image_embeddings = torch.cat(image_embeddings_list, 0)
        return image_embeddings

    def forward(self, **kwargs):
        # During generation (past_key_values present) behave like plain LLaVA;
        # otherwise run the full segmentation-aware forward pass.
        if "past_key_values" in kwargs:
            return super().forward(**kwargs)
        return self.model_forward(**kwargs)

    def model_forward(
        self,
        images: torch.FloatTensor,
        images_clip: torch.FloatTensor,
        input_ids: torch.LongTensor,
        labels: torch.LongTensor,
        attention_masks: torch.LongTensor,
        offset: torch.LongTensor,
        masks_list: List[torch.FloatTensor],
        label_list: List[torch.Tensor],
        resize_list: List[tuple],
        inference: bool = False,
        **kwargs,
    ):
        """Full forward pass producing masks and (in training) losses.

        Args:
            images: SAM-preprocessed images (one per image in the batch).
            images_clip: CLIP-preprocessed images for the LLaVA tower.
            input_ids / labels / attention_masks: tokenized conversations;
                several conversations may share one image.
            offset: Cumulative counts mapping conversations to images
                (len(offset) == num_images + 1).
            masks_list: Ground-truth masks per image.
            label_list: Per-image tensors whose .shape gives the original size.
            resize_list: SAM input sizes used to undo padding/resizing.
            inference: If True, skip loss computation and return predictions.
        """
        image_embeddings = self.get_visual_embs(images)
        batch_size = image_embeddings.shape[0]
        assert batch_size == len(offset) - 1

        # Elementwise OR (via +) of the [SEG] and [AFF] positions, shifted by
        # one so the mask indexes the hidden state *producing* the token.
        seg_token_mask = (input_ids[:, 1:] == self.seg_token_idx) + (input_ids[:, 1:] == self.aff_token_idx)
        seg_token_mask = torch.cat(
            [
                seg_token_mask,
                torch.zeros((seg_token_mask.shape[0], 1)).bool().cuda(),
            ],
            dim=1,
        )
        # hack for IMAGE_TOKEN_INDEX (we suppose that there is only one image, and it is in the front)
        seg_token_mask = torch.cat(
            [torch.zeros((seg_token_mask.shape[0], 255)).bool().cuda(), seg_token_mask],
            dim=1,
        )

        if inference:
            # Inference path assumes a single image shared by all
            # conversations; the CLIP image is broadcast across them.
            n_batch = 1
            length = input_ids.shape[0]
            assert images_clip.shape[0] == 1
            images_clip_extend = images_clip.expand(length, -1, -1, -1).contiguous()

            output_hidden_states = []
            for i in range(n_batch):
                start_i, end_i = i * length, min((i + 1) * length, input_ids.shape[0])
                output_i = super().forward(
                    images=images_clip_extend[: end_i - start_i],
                    attention_mask=attention_masks[start_i:end_i],
                    input_ids=input_ids[start_i:end_i],
                    output_hidden_states=True,
                )
                output_hidden_states.append(output_i.hidden_states)
                torch.cuda.empty_cache()

            output_hidden_states_list = []
            output_hidden_states_level = torch.cat(output_hidden_states, dim=0)
            output_hidden_states_list.append(output_hidden_states_level)
            output_hidden_states = output_hidden_states_list
            output = None

        else:
            # Training path: duplicate each CLIP image for every conversation
            # that refers to it, as described by `offset`.
            images_clip_list = []
            for i in range(len(offset) - 1):
                start_i, end_i = offset[i], offset[i + 1]
                images_clip_i = (
                    images_clip[i]
                    .unsqueeze(0)
                    .expand(end_i - start_i, -1, -1, -1)
                    .contiguous()
                )
                images_clip_list.append(images_clip_i)
            images_clip = torch.cat(images_clip_list, dim=0)

            output = super().forward(
                images=images_clip,
                attention_mask=attention_masks,
                input_ids=input_ids,
                labels=labels,
                output_hidden_states=True,
            )
            output_hidden_states = output.hidden_states

        hidden_states = []

        # Project last-layer hidden states to SAM's prompt dimension.
        assert len(self.model.text_hidden_fcs) == 1
        hidden_states.append(self.model.text_hidden_fcs[0](output_hidden_states[-1]))

        last_hidden_state = torch.stack(hidden_states, dim=-1).sum(dim=-1)
        pred_embeddings = last_hidden_state[seg_token_mask]
        seg_token_counts = seg_token_mask.int().sum(-1)  # [bs, ]

        # Regroup the flat per-token embeddings back into per-image lists.
        seg_token_offset = seg_token_counts.cumsum(-1)
        seg_token_offset = torch.cat(
            [torch.zeros(1).long().cuda(), seg_token_offset], dim=0
        )

        seg_token_offset = seg_token_offset[offset]

        pred_embeddings_ = []
        for i in range(len(seg_token_offset) - 1):
            start_i, end_i = seg_token_offset[i], seg_token_offset[i + 1]
            pred_embeddings_.append(pred_embeddings[start_i:end_i])
        pred_embeddings = pred_embeddings_

        # Decode one mask per query embedding via SAM's prompt encoder +
        # mask decoder, then upsample to the original image size.
        multimask_output = False
        pred_masks = []
        for i in range(len(pred_embeddings)):
            (
                sparse_embeddings,
                dense_embeddings,
            ) = self.model.visual_model.prompt_encoder(
                points=None,
                boxes=None,
                masks=None,
                text_embeds=pred_embeddings[i].unsqueeze(1),
            )
            sparse_embeddings = sparse_embeddings.to(pred_embeddings[i].dtype)
            low_res_masks, iou_predictions = self.model.visual_model.mask_decoder(
                image_embeddings=image_embeddings[i].unsqueeze(0),
                image_pe=self.model.visual_model.prompt_encoder.get_dense_pe(),
                sparse_prompt_embeddings=sparse_embeddings,
                dense_prompt_embeddings=dense_embeddings,
                multimask_output=multimask_output,
            )
            pred_mask = self.model.visual_model.postprocess_masks(
                low_res_masks,
                input_size=resize_list[i],
                original_size=label_list[i].shape,
            )
            pred_masks.append(pred_mask[:, 0])

        model_output = output
        gt_masks = masks_list

        if inference:
            return {
                "pred_masks": pred_masks,
                "gt_masks": gt_masks,
            }

        output = model_output.logits

        # Total loss = weighted LM cross-entropy + weighted BCE/dice mask
        # losses, with the mask losses normalized by the total mask count.
        ce_loss = model_output.loss
        ce_loss = ce_loss * self.ce_loss_weight
        mask_bce_loss = 0
        mask_dice_loss = 0
        num_masks = 0
        for batch_idx in range(len(pred_masks)):
            gt_mask = gt_masks[batch_idx]
            pred_mask = pred_masks[batch_idx]

            assert (
                gt_mask.shape[0] == pred_mask.shape[0]
            ), "gt_mask.shape: {}, pred_mask.shape: {}".format(
                gt_mask.shape, pred_mask.shape
            )
            mask_bce_loss += (
                sigmoid_ce_loss(pred_mask, gt_mask, num_masks=gt_mask.shape[0])
                * gt_mask.shape[0]
            )
            mask_dice_loss += (
                dice_loss(pred_mask, gt_mask, num_masks=gt_mask.shape[0])
                * gt_mask.shape[0]
            )
            num_masks += gt_mask.shape[0]

        mask_bce_loss = self.bce_loss_weight * mask_bce_loss / (num_masks + 1e-8)
        mask_dice_loss = self.dice_loss_weight * mask_dice_loss / (num_masks + 1e-8)
        mask_loss = mask_bce_loss + mask_dice_loss

        loss = ce_loss + mask_loss

        return {
            "loss": loss,
            "ce_loss": ce_loss,
            "mask_bce_loss": mask_bce_loss,
            "mask_dice_loss": mask_dice_loss,
            "mask_loss": mask_loss,
        }

    def evaluate(
        self,
        images_clip,
        images,
        input_ids,
        resize_list,
        original_size_list,
        max_new_tokens=32,
        tokenizer=None,
    ):
        """Generate text autoregressively, then decode masks for every
        [SEG]/[AFF] token that appears in the generated sequence.

        Returns:
            (output_ids, pred_masks): generated token ids and a list of
            per-sample mask tensors at original image resolution.
        """
        with torch.no_grad():
            outputs = self.generate(
                images=images_clip,
                input_ids=input_ids,
                max_new_tokens=max_new_tokens,
                num_beams=1,
                output_hidden_states=True,
                return_dict_in_generate=True,
            )
            output_hidden_states = outputs.hidden_states[-1]
            output_ids = outputs.sequences

            # Elementwise OR (via +) of [SEG] and [AFF] positions in the
            # generated sequence (shifted by one token).
            seg_token_mask = (output_ids[:, 1:] == self.seg_token_idx) + (output_ids[:, 1:] == self.aff_token_idx)
            # hack for IMAGE_TOKEN_INDEX (we suppose that there is only one image, and it is in the front)
            seg_token_mask = torch.cat(
                [
                    torch.zeros((seg_token_mask.shape[0], 255)).bool().cuda(),
                    seg_token_mask,
                ],
                dim=1,
            )

            hidden_states = []

            assert len(self.model.text_hidden_fcs) == 1
            hidden_states.append(self.model.text_hidden_fcs[0](output_hidden_states))

            last_hidden_state = torch.stack(hidden_states, dim=-1).sum(dim=-1)
            pred_embeddings = last_hidden_state[seg_token_mask]

            seg_token_counts = seg_token_mask.int().sum(-1)  # [bs, ]
            seg_token_offset = seg_token_counts.cumsum(-1)
            seg_token_offset = torch.cat(
                [torch.zeros(1).long().cuda(), seg_token_offset], dim=0
            )

            # Split the flat embedding tensor back into one chunk per sample.
            pred_embeddings_ = []
            for i in range(len(seg_token_offset) - 1):
                start_i, end_i = seg_token_offset[i], seg_token_offset[i + 1]
                pred_embeddings_.append(pred_embeddings[start_i:end_i])
            pred_embeddings = pred_embeddings_

            image_embeddings = self.get_visual_embs(images)

            # Same SAM decoding loop as model_forward, but using the caller's
            # original_size_list instead of ground-truth label shapes.
            multimask_output = False
            pred_masks = []
            for i in range(len(pred_embeddings)):
                (
                    sparse_embeddings,
                    dense_embeddings,
                ) = self.model.visual_model.prompt_encoder(
                    points=None,
                    boxes=None,
                    masks=None,
                    text_embeds=pred_embeddings[i].unsqueeze(1),
                )

                sparse_embeddings = sparse_embeddings.to(pred_embeddings[i].dtype)
                low_res_masks, iou_predictions = self.model.visual_model.mask_decoder(
                    image_embeddings=image_embeddings[i].unsqueeze(0),
                    image_pe=self.model.visual_model.prompt_encoder.get_dense_pe(),
                    sparse_prompt_embeddings=sparse_embeddings,
                    dense_prompt_embeddings=dense_embeddings,
                    multimask_output=multimask_output,
                )
                pred_mask = self.model.visual_model.postprocess_masks(
                    low_res_masks,
                    input_size=resize_list[i],
                    original_size=original_size_list[i],
                )
                pred_masks.append(pred_mask[:, 0])

        return output_ids, pred_masks
model/__pycache__/AffordanceVLM.cpython-39.pyc ADDED
Binary file (9.71 kB). View file
 
model/llava/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .model import LlavaLlamaForCausalLM
model/llava/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (192 Bytes). View file
 
model/llava/__pycache__/constants.cpython-39.pyc ADDED
Binary file (454 Bytes). View file
 
model/llava/__pycache__/conversation.cpython-39.pyc ADDED
Binary file (10.4 kB). View file
 
model/llava/__pycache__/mm_utils.cpython-39.pyc ADDED
Binary file (3.4 kB). View file
 
model/llava/constants.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
2
+ WORKER_HEART_BEAT_INTERVAL = 15
3
+
4
+ LOGDIR = "."
5
+
6
+ # Model Constants
7
+ IGNORE_INDEX = -100
8
+ IMAGE_TOKEN_INDEX = -200
9
+ DEFAULT_IMAGE_TOKEN = "<image>"
10
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
11
+ DEFAULT_IM_START_TOKEN = "<im_start>"
12
+ DEFAULT_IM_END_TOKEN = "<im_end>"
model/llava/conversation.py ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ from enum import Enum, auto
3
+ from typing import List, Tuple
4
+
5
+
6
class SeparatorStyle(Enum):
    """Different separator style.

    Selects how Conversation.get_prompt() joins system text, roles and
    messages into a single prompt string.
    """

    SINGLE = auto()   # one separator after every message
    TWO = auto()      # alternate sep / sep2 between turns
    MPT = auto()      # role and message concatenated directly, one separator
    PLAIN = auto()    # messages only (no role prefixes), alternating seps
    LLAMA_2 = auto()  # [INST] ... [/INST] wrapping with <<SYS>> system block
15
+
16
+ @dataclasses.dataclass
17
+ class Conversation:
18
+ """A class that keeps all conversation history."""
19
+
20
+ system: str
21
+ roles: List[str]
22
+ messages: List[List[str]]
23
+ offset: int
24
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
25
+ sep: str = "###"
26
+ sep2: str = None
27
+ version: str = "Unknown"
28
+
29
+ skip_next: bool = False
30
+
31
    def get_prompt(self):
        """Render the whole conversation into a single prompt string.

        The layout is selected by ``self.sep_style``; placement of the
        ``<image>`` placeholder in the first turn depends on ``self.version``.

        Returns:
            str: the fully formatted prompt, ending with an open role tag
            when the last message is empty/None (inviting the model to reply).
        """
        messages = self.messages
        # When the first user turn is an image tuple (text, image, process_mode),
        # work on a copy of the message list and normalize where the "<image>"
        # placeholder sits inside the first text message.
        if len(messages) > 0 and type(messages[0][1]) is tuple:
            messages = self.messages.copy()
            init_role, init_msg = messages[0].copy()
            init_msg = init_msg[0].replace("<image>", "").strip()
            if "mmtag" in self.version:
                # mmtag templates present the image as its own wrapped turn,
                # acknowledged by the assistant with "Received.".
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, "<image>\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            # "system###role: msg###...": a single separator after every turn.
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        # Image tuples carry (text, image, process_mode); only
                        # the text participates in the prompt.
                        message, _, _ = message
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            # Two alternating separators (typically " " and an EOS token),
            # chosen by turn parity.
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.MPT:
            # MPT role strings already end with "\n", so no ": " glue is added.
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.LLAMA_2:
            # Llama-2 chat format: "[INST] ... [/INST] answer </s>" with the
            # system prompt folded into the first user message via <<SYS>> tags.
            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
            ret = ""

            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    if i == 0:
                        message = wrap_sys(self.system) + message
                    if i % 2 == 0:
                        # User turn.
                        message = wrap_inst(message)
                        ret += self.sep + message
                    else:
                        # Assistant turn.
                        ret += " " + message + " " + self.sep2
                else:
                    ret += ""
            # Drop the leading "<s>" so the very first token is not duplicated.
            ret = ret.lstrip(self.sep)
        elif self.sep_style == SeparatorStyle.PLAIN:
            # Bare message bodies joined by alternating separators; roles are
            # omitted entirely (used for plain captioning-style pretraining).
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += message + seps[i % 2]
                else:
                    ret += ""
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret
108
+
109
+ def append_message(self, role, message):
110
+ self.messages.append([role, message])
111
+
112
    def get_images(self, return_pil=False):
        """Collect the images attached to the conversation's turns.

        Only even-indexed messages past ``self.offset`` are inspected (in the
        standard alternating layout these are the user turns). Each image is
        first transformed according to its stored ``image_process_mode`` and
        then downscaled so the shortest edge is at most 400px and the longest
        at most 800px.

        Args:
            return_pil: if True, return PIL image objects; otherwise return
                base64-encoded PNG strings.

        Raises:
            ValueError: on an unknown ``image_process_mode``.
        """
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                # Image-bearing messages are tuples: (text, image, process_mode).
                if type(msg) is tuple:
                    # Imported lazily so text-only conversations never need PIL.
                    import base64
                    from io import BytesIO

                    from PIL import Image

                    msg, image, image_process_mode = msg
                    if image_process_mode == "Pad":

                        def expand2square(pil_img, background_color=(122, 116, 104)):
                            # Pad the shorter side with a neutral gray so the
                            # image becomes square without distortion.
                            width, height = pil_img.size
                            if width == height:
                                return pil_img
                            elif width > height:
                                result = Image.new(
                                    pil_img.mode, (width, width), background_color
                                )
                                result.paste(pil_img, (0, (width - height) // 2))
                                return result
                            else:
                                result = Image.new(
                                    pil_img.mode, (height, height), background_color
                                )
                                result.paste(pil_img, ((height - width) // 2, 0))
                                return result

                        image = expand2square(image)
                    elif image_process_mode == "Crop":
                        # "Crop" is a no-op here; any cropping happens elsewhere.
                        pass
                    elif image_process_mode == "Resize":
                        image = image.resize((336, 336))
                    else:
                        raise ValueError(
                            f"Invalid image_process_mode: {image_process_mode}"
                        )
                    # Cap the size: short edge <= 400, long edge <= 800,
                    # preserving aspect ratio (and never upscaling past min_hw).
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if H > W:
                        H, W = longest_edge, shortest_edge
                    else:
                        H, W = shortest_edge, longest_edge
                    image = image.resize((W, H))
                    if return_pil:
                        images.append(image)
                    else:
                        buffered = BytesIO()
                        image.save(buffered, format="PNG")
                        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                        images.append(img_b64_str)
        return images
170
+
171
    def to_gradio_chatbot(self):
        """Convert the conversation to Gradio chatbot format.

        Returns:
            list[list]: ``[user_content, assistant_content]`` pairs, where an
            uploaded image becomes an inline ``<img>`` HTML tag followed (when
            non-empty) by its accompanying text as a separate pair.
        """
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    # Imported lazily; only needed when images are present.
                    import base64
                    from io import BytesIO

                    msg, image, image_process_mode = msg
                    # Downscale the same way as get_images(): short edge <= 400,
                    # long edge <= 800, aspect ratio preserved.
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if H > W:
                        H, W = longest_edge, shortest_edge
                    else:
                        H, W = shortest_edge, longest_edge
                    image = image.resize((W, H))
                    buffered = BytesIO()
                    # NOTE(review): the bytes are saved as JPEG while the data
                    # URI below claims image/png — apparently harmless since
                    # browsers sniff the real content type, but worth confirming.
                    image.save(buffered, format="JPEG")
                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                    img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
                    ret.append([img_str, None])
                    msg = msg.replace("<image>", "").strip()
                    if len(msg) > 0:
                        ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                # Assistant turn: fill the reply slot of the most recent pair.
                ret[-1][-1] = msg
        return ret
204
+
205
+ def copy(self):
206
+ return Conversation(
207
+ system=self.system,
208
+ roles=self.roles,
209
+ messages=[[x, y] for x, y in self.messages],
210
+ offset=self.offset,
211
+ sep_style=self.sep_style,
212
+ sep=self.sep,
213
+ sep2=self.sep2,
214
+ version=self.version,
215
+ )
216
+
217
+ def dict(self):
218
+ if len(self.get_images()) > 0:
219
+ return {
220
+ "system": self.system,
221
+ "roles": self.roles,
222
+ "messages": [
223
+ [x, y[0] if type(y) is tuple else y] for x, y in self.messages
224
+ ],
225
+ "offset": self.offset,
226
+ "sep": self.sep,
227
+ "sep2": self.sep2,
228
+ }
229
+ return {
230
+ "system": self.system,
231
+ "roles": self.roles,
232
+ "messages": self.messages,
233
+ "offset": self.offset,
234
+ "sep": self.sep,
235
+ "sep2": self.sep2,
236
+ }
237
+
238
+
239
# ---------------------------------------------------------------------------
# Prebuilt conversation templates. These are module-level singletons; callers
# are expected to use ``.copy()`` before appending turns so the templates stay
# pristine.
# ---------------------------------------------------------------------------

# Vicuna v0: "###"-separated Human/Assistant turns, seeded with a two-turn
# few-shot example (hence offset=2, which hides it from get_images/chatbot).
conv_vicuna_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        (
            "Human",
            "What are the key differences between renewable and non-renewable energy sources?",
        ),
        (
            "Assistant",
            "Renewable energy sources are those that can be replenished naturally in a relatively "
            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
            "renewable and non-renewable energy sources:\n"
            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
            "energy sources are finite and will eventually run out.\n"
            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
            "and other negative effects.\n"
            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
            "have lower operational costs than non-renewable sources.\n"
            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
            "locations than non-renewable sources.\n"
            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n",
        ),
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

# Vicuna v1: USER/ASSISTANT with " " between turns and "</s>" closing each
# assistant reply (SeparatorStyle.TWO alternates the two separators).
conv_vicuna_v1 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

# Llama-2 chat format with the official safety-oriented system prompt.
conv_llama_2 = Conversation(
    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

# Llama-2 format with a vision-assistant system prompt (for LLaVA-style use).
conv_llava_llama_2 = Conversation(
    system="You are a helpful language and vision assistant. "
    "You are able to understand the visual content that the user provides, "
    "and assist the user with a variety of tasks using natural language.",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

# MPT/ChatML-style template: role markers carry their own newlines and turns
# end with "<|im_end|>".
conv_mpt = Conversation(
    system="""<|im_start|>system
A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

# Plain template: no system prompt, no role tags; messages joined by newlines.
conv_llava_plain = Conversation(
    system="",
    roles=("", ""),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="\n",
)

# LLaVA v0: like Vicuna v0 but seeded only with a greeting exchange.
conv_llava_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(("Human", "Hi!"), ("Assistant", "Hi there! How can I help you today?")),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

# LLaVA v0 with mmtag image markup (<Image>...</Image>); see get_prompt().
conv_llava_v0_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
    "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("Human", "Assistant"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
    version="v0_mmtag",
)

# LLaVA v1: Vicuna-v1-style separators with the LLaVA system prompt.
conv_llava_v1 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

# LLaVA v1 with mmtag image markup.
conv_llava_v1_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
    "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
    version="v1_mmtag",
)
379
+
380
# Template used when the caller does not specify one.
default_conversation = conv_vicuna_v0

# Name -> template registry; callers select a template by one of these keys.
conv_templates = {
    "default": conv_vicuna_v0,
    "v0": conv_vicuna_v0,
    "v1": conv_vicuna_v1,
    "vicuna_v1": conv_vicuna_v1,
    "llama_2": conv_llama_2,
    "plain": conv_llava_plain,
    "v0_plain": conv_llava_plain,
    "llava_v0": conv_llava_v0,
    "v0_mmtag": conv_llava_v0_mmtag,
    "llava_v1": conv_llava_v1,
    "v1_mmtag": conv_llava_v1_mmtag,
    "llava_llama_2": conv_llava_llama_2,
    "mpt": conv_mpt,
}


if __name__ == "__main__":
    # Quick smoke test: render the default template's prompt.
    print(default_conversation.get_prompt())
model/llava/mm_utils.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from io import BytesIO
3
+
4
+ import torch
5
+ from PIL import Image
6
+ from transformers import StoppingCriteria
7
+
8
+ from .constants import IMAGE_TOKEN_INDEX
9
+
10
+
11
def load_image_from_base64(image):
    """Decode a base64-encoded image string into a PIL image object."""
    raw_bytes = base64.b64decode(image)
    return Image.open(BytesIO(raw_bytes))
13
+
14
+
15
def process_images(images, image_processor, model_cfg):
    """Run the image processor over ``images`` and return pixel tensors.

    ``model_cfg`` is accepted for API compatibility with other LLaVA
    preprocessing variants but is unused here.
    """
    processed = image_processor(images, return_tensors="pt")
    return processed["pixel_values"]
17
+
18
+
19
def tokenizer_image_token(
    prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None
):
    """Tokenize ``prompt``, splicing ``image_token_index`` where each
    "<image>" placeholder appears.

    The surrounding text is tokenized chunk by chunk. A leading BOS token is
    emitted exactly once at the start; the duplicate BOS the tokenizer adds to
    every chunk is dropped.

    Returns:
        A list of token ids, or a 1-D ``torch.long`` tensor when
        ``return_tensors == "pt"``.

    Raises:
        ValueError: for any unsupported ``return_tensors`` value.
    """
    chunks = [tokenizer(piece).input_ids for piece in prompt.split("<image>")]

    token_ids = []
    skip = 0
    if (
        len(chunks) > 0
        and len(chunks[0]) > 0
        and chunks[0][0] == tokenizer.bos_token_id
    ):
        # Keep BOS once up front; skip the first token of every chunk below.
        skip = 1
        token_ids.append(chunks[0][0])

    for idx, chunk in enumerate(chunks):
        if idx:
            # One image token between consecutive text chunks.
            token_ids.append(image_token_index)
        token_ids.extend(chunk[skip:])

    if return_tensors is None:
        return token_ids
    if return_tensors == "pt":
        return torch.tensor(token_ids, dtype=torch.long)
    raise ValueError(f"Unsupported tensor type: {return_tensors}")
45
+
46
+
47
def get_model_name_from_path(model_path):
    """Derive a display name for a model from its filesystem path.

    For checkpoint directories ("...model/checkpoint-N") the parent directory
    name is prepended so distinct runs stay distinguishable.
    """
    parts = model_path.strip("/").split("/")
    last = parts[-1]
    if last.startswith("checkpoint-"):
        return f"{parts[-2]}_{last}"
    return last
54
+
55
+
56
class KeywordsStoppingCriteria(StoppingCriteria):
    """Stop generation once any of the given keyword strings is produced.

    Each step, matching is attempted two ways: (1) an exact token-id suffix
    match against the pre-tokenized keywords, and (2) a substring search in
    the decoded tail of the newly generated text (covers keywords that
    tokenize differently depending on context).
    """

    def __init__(self, keywords, tokenizer, input_ids):
        """
        Args:
            keywords: list of strings whose appearance should stop generation.
            tokenizer: tokenizer used to pre-encode the keywords and to decode
                the generated tail.
            input_ids: prompt ids of shape (1, prompt_len); only the length is
                used, to know where generation starts.
        """
        self.keywords = keywords
        self.keyword_ids = []
        for keyword in keywords:
            cur_keyword_ids = tokenizer(keyword).input_ids
            # Strip a leading BOS token so the keyword can match mid-sequence.
            if (
                len(cur_keyword_ids) > 1
                and cur_keyword_ids[0] == tokenizer.bos_token_id
            ):
                cur_keyword_ids = cur_keyword_ids[1:]
            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
        self.tokenizer = tokenizer
        self.start_len = input_ids.shape[1]

    def __call__(
        self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)"  # TODO
        # Decode at most the last 3 newly generated tokens for the text search.
        offset = min(output_ids.shape[1] - self.start_len, 3)
        self.keyword_ids = [
            keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids
        ]
        for keyword_id in self.keyword_ids:
            # BUGFIX: the original `if output_ids[0, -n:] == keyword_id:` is an
            # elementwise comparison — for multi-token keywords the resulting
            # bool tensor raises "Boolean value of Tensor with more than one
            # element is ambiguous" inside `if`, and a shorter-than-keyword
            # output makes the shapes mismatch. torch.equal does a whole-tensor
            # comparison and simply returns False when shapes differ.
            if torch.equal(output_ids[0, -keyword_id.shape[0] :], keyword_id):
                return True
        outputs = self.tokenizer.batch_decode(
            output_ids[:, -offset:], skip_special_tokens=True
        )[0]
        for keyword in self.keywords:
            if keyword in outputs:
                return True
        return False
+ return False