UWGZQ committed on Feb 17

Commit

5dbdc31

verified ·

1 Parent(s): 2a493ae

Upload folder using huggingface_hub

Browse files

Files changed (32) hide show

.gitattributes +1 -0
added_tokens.json +26 -0
chat_template.jinja +7 -0
config.json +150 -0
example/2401075277.mp4 +3 -0
example/2401075277_rle.json +0 -0
generation_config.json +17 -0
inference.py +213 -0
merges.txt +0 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +896 -0
modeling_traser.py +179 -0
qwen_vl_vsg_utils/src/qwen_vl_utils/__init__.py +7 -0
qwen_vl_vsg_utils/src/qwen_vl_utils/__pycache__/__init__.cpython-310.pyc +0 -0
qwen_vl_vsg_utils/src/qwen_vl_utils/__pycache__/vision_process.cpython-310.pyc +0 -0
qwen_vl_vsg_utils/src/qwen_vl_utils/vision_process.py +432 -0
resampler_utils/__pycache__/token_arrangement.cpython-310.pyc +0 -0
resampler_utils/__pycache__/token_insert_1017_multi_resampler.cpython-310.pyc +0 -0
resampler_utils/__pycache__/token_insert_1020_multi_two_resampler.cpython-310.pyc +0 -0
resampler_utils/__pycache__/token_insert_new.cpython-310.pyc +0 -0
resampler_utils/__pycache__/token_insert_no_resampler.cpython-310.pyc +0 -0
resampler_utils/__pycache__/token_insert_single_resampler.cpython-310.pyc +0 -0
resampler_utils/__pycache__/token_insert_temporal.cpython-310.pyc +0 -0
resampler_utils/__pycache__/token_selection.cpython-310.pyc +0 -0
resampler_utils/__pycache__/token_selection_bbox.cpython-310.pyc +0 -0
resampler_utils/__pycache__/token_selection_temporal.cpython-310.pyc +0 -0
resampler_utils/token_arrangement.py +640 -0
resampler_utils/token_selection.py +101 -0
special_tokens_map.json +45 -0
tokenizer_config.json +226 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+example/2401075277.mp4 filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "</tool_call>": 151658,
+  "<obj_traj_end>": 151666,
+  "<obj_traj_start>": 151665,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

config.json ADDED Viewed

	@@ -0,0 +1,150 @@

+{
+  "architectures": [
+    "Qwen2_5_VLForConditionalGeneration_Insert"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "image_token_id": 151655,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 128000,
+  "max_window_layers": 70,
+  "model_type": "qwen2_5_vl",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
+  "obj_traj_end_id": 151666,
+  "obj_traj_start_id": 151665,
+  "resampler_depth": 3,
+  "temporal_resampler_n_latents": 32,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "mrope_section": [
+      16,
+      24,
+      24
+    ],
+    "rope_type": "default",
+    "type": "default"
+  },
+  "rope_theta": 1000000.0,
+  "object_resampler_n_latents": 32,
+  "sliding_window": 32768,
+  "text_config": {
+    "architectures": [
+      "Qwen2_5_VLForConditionalGeneration"
+    ],
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "eos_token_id": 151645,
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "image_token_id": null,
+    "initializer_range": 0.02,
+    "intermediate_size": 11008,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 128000,
+    "max_window_layers": 70,
+    "model_type": "qwen2_5_vl_text",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 2,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": {
+      "mrope_section": [
+        16,
+        24,
+        24
+      ],
+      "rope_type": "default",
+      "type": "default"
+    },
+    "rope_theta": 1000000.0,
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16",
+    "use_cache": true,
+    "use_sliding_window": false,
+    "video_token_id": null,
+    "vision_end_token_id": 151653,
+    "vision_start_token_id": 151652,
+    "vision_token_id": 151654,
+    "vocab_size": 151667
+  },
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.54.0",
+  "object_resampler": true,
+  "use_cache": false,
+  "use_resampler": true,
+  "use_sliding_window": false,
+  "video_token_id": 151656,
+  "vision_config": {
+    "depth": 32,
+    "fullatt_block_indexes": [
+      7,
+      15,
+      23,
+      31
+    ],
+    "hidden_act": "silu",
+    "hidden_size": 1280,
+    "in_channels": 3,
+    "in_chans": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 3420,
+    "model_type": "qwen2_5_vl",
+    "num_heads": 16,
+    "out_hidden_size": 2048,
+    "patch_size": 14,
+    "spatial_merge_size": 2,
+    "spatial_patch_size": 14,
+    "temporal_patch_size": 2,
+    "tokens_per_second": 2,
+    "torch_dtype": "bfloat16",
+    "window_size": 112
+  },
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652,
+  "vision_token_id": 151654,
+  "vocab_size": 151667
+}

example/2401075277.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bea771d46e14045b24a554333dbc07d27292f5927b15a2b3f2dc4ab4572329aa
+size 3966614

example/2401075277_rle.json ADDED Viewed

The diff for this file is too large to render. See raw diff

generation_config.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.05,
+  "resampler_depth": 3,
+  "temporal_resampler_n_latents": 32,
+  "object_resampler_n_latents": 32,
+  "temperature": 1e-06,
+  "transformers_version": "4.54.0",
+  "object_resampler": true,
+  "use_resampler": true
+}

inference.py ADDED Viewed

	@@ -0,0 +1,213 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Inference example for Qwen2.5-VL TRASER model.
+Usage:
+    python inference.py \
+        --model_path . \
+        --video_path /path/to/video.mp4 \
+        --mask_path /path/to/mask.json \
+        --out_dir ./output \
+        --max_objects 40
+"""
+import os
+import json
+import argparse
+import random
+import torch
+import numpy as np
+from transformers import AutoProcessor, AutoTokenizer
+# Import Custom Model
+from modeling_traser import TRASER
+# Import Utils
+from qwen_vl_vsg_utils.src.qwen_vl_utils import process_vision_info
+from resampler_utils.token_selection import select_tokens
+from resampler_utils.token_arrangement import rearrange_token
+from pycocotools import mask as maskUtils
+import math
+import torch.nn.functional as F
+def set_seed(seed: int):  # Seed the Python `random`, NumPy and PyTorch (CPU + every CUDA device) RNGs with `seed`.
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+def load_mask_data(mask_json_path):  # Read the JSON file at `mask_json_path` and return the parsed object unchanged.
+    with open(mask_json_path, "r") as f:
+        return json.load(f)
+def has_any_mask(mask_data, obj_id):  # True if at least one frame in `mask_data` holds a truthy "counts" value at index `obj_id`.
+    for frame in mask_data:
+        if not frame or obj_id >= len(frame): continue  # skip empty/None frames and frames too short to contain this object slot
+        if frame[obj_id] and frame[obj_id].get("counts"): return True  # first usable mask is enough
+    return False
+def build_obj_masks_tensor(mask_data, obj_ids, sampled_idx, H_rz, W_rz, device):  # Decode and resize per-object masks for the sampled frames; returns (obj_masks, keep_idx).
+    O, N = len(obj_ids), len(sampled_idx)  # O objects x N sampled frames
+    obj_masks = torch.zeros((O, N, H_rz, W_rz), dtype=torch.float32, device=device)  # all-zero slice == no mask for that (object, frame)
+    for o_i, oid in enumerate(obj_ids):
+        for n_idx, fidx in enumerate(sampled_idx):
+            if fidx < len(mask_data):  # sampled frames beyond the mask file keep their zero slice
+                frame_objs = mask_data[fidx]
+                if frame_objs and oid < len(frame_objs):
+                    rle = frame_objs[oid]
+                    if rle:
+                        m = maskUtils.decode({"size": rle["size"], "counts": rle["counts"]})  # pycocotools decode; requires both "size" and "counts" keys
+                        if m.ndim == 3: m = m[:, :, 0]  # a 3-D result is reduced to its first channel
+                        m_t = torch.from_numpy(m.astype(np.uint8)).unsqueeze(0).unsqueeze(0).float().to(device)  # (1, 1, H, W) as F.interpolate expects
+                        m_rz = F.interpolate(m_t, size=(H_rz, W_rz), mode="nearest")[0, 0]  # nearest-neighbour resize to (H_rz, W_rz)
+                        obj_masks[o_i, n_idx] = (m_rz > 0.5).float()  # re-binarise to 0.0 / 1.0
+    keep_idx = (obj_masks.view(O, -1).sum(dim=1) > 0).nonzero(as_tuple=False).squeeze(1).tolist()  # rows with at least one non-zero pixel over all frames
+    if len(keep_idx) < O: obj_masks = obj_masks[keep_idx]  # drop objects whose masks are empty in every sampled frame
+    return obj_masks, keep_idx  # keep_idx holds positions into `obj_ids`, not the object ids themselves
+def run_single_video(model, processor, video_path, mask_path, out_dir, device, args):  # Run the model on one video + mask file, print the decoded text and write it to <out_dir>/output.txt.
+    mask_data = load_mask_data(mask_path)
+    all_ids = range(min(len(mask_data[0]),args.max_objects))  # NOTE(review): candidates are already capped at max_objects here and taken from frame 0 only (which must be a sized, non-None entry)
+    eligible = [oid for oid in all_ids if has_any_mask(mask_data, oid)]
+    if len(eligible) > args.max_objects:  # NOTE(review): cannot be true given the cap on all_ids above, so the shuffle branch never runs — confirm intent
+        random.shuffle(eligible)
+        selected_obj_ids = sorted(eligible[:args.max_objects])
+    else:
+        selected_obj_ids = sorted(eligible)
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": [
+            {"type": "text", "text": "Output the video Scene Graph from the video and object trajectories:\n"},
+            {"type": "video", "video": video_path}
+        ]}
+    ]
+    prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs, fps, selected_frame_idx = process_vision_info(messages, return_video_kwargs=True)  # `fps` is not used below
+    proc_inputs = processor(
+        text=[prompt_text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt", fps=1  # NOTE(review): fps is hard-coded to 1 rather than taken from process_vision_info — confirm
+    ).to(device)
+    video_grid_thw = proc_inputs["video_grid_thw"]
+    if isinstance(video_grid_thw, list): video_grid_thw = torch.stack([x.to(device) for x in video_grid_thw])  # normalise a list of tensors to one stacked tensor
+    else: video_grid_thw = video_grid_thw.to(device)
+    T_grid = int(video_grid_thw[0, 0].item())  # grid sizes are read from the first (only) video
+    H_patch, W_patch = int(video_grid_thw[0, 1].item()), int(video_grid_thw[0, 2].item())
+    # Calculate mask resize dimensions
+    patch_size = 14  # hard-coded; same value as vision_config.patch_size in the accompanying config.json
+    H_rz, W_rz = H_patch * patch_size, W_patch * patch_size  # masks are resized to the patch grid expressed in pixels
+    # Build Masks
+    sampled_idx = selected_frame_idx[0]  # frame indices reported by process_vision_info for the first video
+    obj_masks, keep_idx = build_obj_masks_tensor(mask_data, selected_obj_ids, sampled_idx, H_rz, W_rz, device)
+    selected_obj_ids = [selected_obj_ids[i] for i in keep_idx]  # keep the id list aligned with the surviving rows of obj_masks
+    # Select Tokens
+    per_union_idx, per_obj_idx, _ = select_tokens(  # per_union_idx is not used below
+        obj_masks=obj_masks,
+        grid_thw=(T_grid, H_patch, W_patch),
+        patch_size=patch_size,
+        device=device
+    )
+    # Prepare Input
+    per_obj_idx_batch = [per_obj_idx]  # wrap as a batch of one sample
+    # Prepare text labels
+    text_token_ids_per_sample = []
+    label_template = "Object {i}: "
+    additional_texts = [label_template.format(i=(k + 1)) for k in range(len(per_obj_idx))]  # 1-based labels: "Object 1: ", "Object 2: ", ...
+    enc = processor.tokenizer(additional_texts, add_special_tokens=False)["input_ids"]
+    text_token_ids_per_sample.append([torch.tensor(x, dtype=torch.long) for x in enc])
+    # Prepare timestamps
+    sec_per_window = torch.arange(0, T_grid) * 2.0  # one entry per temporal grid; only its length is used below
+    temporal_window_length = 4.0
+    grids_per_window = int(temporal_window_length / 2.0)  # == 2; presumably 2.0 s per temporal grid — TODO confirm against the frame sampler
+    timestamp_token_ids_per_batch = []
+    grids_per_window_batch = []
+    temporal_text_list = []
+    num_windows = math.ceil(len(sec_per_window) / grids_per_window)  # ceil so a trailing partial window still gets a label
+    for w_id in range(num_windows):
+        s, e = w_id * temporal_window_length, (w_id + 1) * temporal_window_length
+        temporal_text_list.append(f"<{int(s)} - {int(e)} sec>")  # e.g. "<0 - 4 sec>", "<4 - 8 sec>"
+    enc_ts = processor.tokenizer(temporal_text_list, add_special_tokens=False)["input_ids"]
+    timestamp_token_ids_per_batch.append([torch.tensor(x) for x in enc_ts])
+    grids_per_window_batch.append(grids_per_window)
+    # Rearrange and Generate
+    with torch.no_grad():
+        new_emb, new_pid, new_mask, rope_deltas, cache_pos, _, _ = rearrange_token(  # cache_pos and the last two return values are not used below
+            model=model,
+            input_ids=proc_inputs["input_ids"],
+            attention_mask=proc_inputs["attention_mask"],
+            pixel_values_videos=proc_inputs["pixel_values_videos"],
+            video_grid_thw=video_grid_thw,
+            image_grid_thw=None, pixel_values=None, second_per_grid_ts=None,  # video-only call: no image inputs are passed
+            obj_token_indices_per_sample=per_obj_idx_batch,
+            obj_traj_start_id=args.obj_traj_start_id,
+            obj_traj_end_id=args.obj_traj_end_id,
+            text_token_ids_per_sample=text_token_ids_per_sample,
+            timestamp_token_ids_per_batch=timestamp_token_ids_per_batch,
+            grids_per_temporal_window_per_batch=grids_per_window_batch,
+            use_resampler=True
+        )
+        gen_out = model.generate(
+            inputs_embeds=new_emb,
+            position_ids=new_pid,
+            attention_mask=new_mask.long(),
+            rope_deltas=rope_deltas,
+            max_new_tokens=8192,
+            do_sample=True,  # sampling is enabled, but with the near-zero temperature set below
+            top_p=0.9,
+            temperature=1e-6,
+            repetition_penalty=1.05
+        )
+    decoded = processor.tokenizer.decode(gen_out[0], skip_special_tokens=True)  # decode the first returned sequence only
+    print(f"Generated Output:\n{decoded}")
+    if out_dir:
+        with open(os.path.join(out_dir, "output.txt"), "w") as f:  # overwrites any existing output.txt; out_dir must already exist
+            f.write(decoded)
+def main():  # CLI entry point: parse arguments, seed RNGs, load model/processor/tokenizer and run inference on one video.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", type=str, required=True, help="Path to model or HF repo")
+    parser.add_argument("--video_path", type=str, required=True)
+    parser.add_argument("--mask_path", type=str, required=True)
+    parser.add_argument("--out_dir", type=str, default="./output")
+    parser.add_argument("--max_objects", type=int, default=40)
+    parser.add_argument("--obj_traj_start_id", type=int, default=151665)  # same id as "<obj_traj_start>" in the accompanying added_tokens.json
+    parser.add_argument("--obj_traj_end_id", type=int, default=151666)  # same id as "<obj_traj_end>" in the accompanying added_tokens.json
+    args = parser.parse_args()
+    set_seed(42)  # fixed seed; not configurable from the command line
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    if args.out_dir:
+        os.makedirs(args.out_dir, exist_ok=True)  # run_single_video writes output.txt here
+    # Load Model (Using the separate class)
+    # Note: If trust_remote_code=True works, you can use AutoModel.
+    # For this example, we explicitly load TRASER to ensure it works with local weights.
+    model = TRASER.from_pretrained(args.model_path, torch_dtype=torch.bfloat16).to(device)
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")  # processor always comes from this hub id, independent of --model_path
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+    processor.tokenizer = tokenizer  # swap in the tokenizer loaded from --model_path
+    run_single_video(model, processor, args.video_path, args.mask_path, args.out_dir, device, args)
+if __name__ == "__main__":
+    main()

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7e1d223559703e608b90365e2f245ee07372d41572d7d8c80cf39186efa2944a
+size 4996648936

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a4b8c3ab4f120fcefc2d012648c40340348938dc7c47e1b0120787dff0eff2e9
+size 3210291272

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,896 @@

+{
+  "metadata": {
+    "total_parameters": 927744,
+    "total_size": 8206839808
+  },
+  "weight_map": {
+    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.norm.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.latents": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.0.input_context_norm.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.0.input_latents_norm.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.0.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.0.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.0.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.0.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.0.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.1.input_context_norm.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.1.input_latents_norm.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.1.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.1.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.1.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.1.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.1.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.2.input_context_norm.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.2.input_latents_norm.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.2.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.2.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.2.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.2.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.2.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "perceiver_resampler.norm.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.latents": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.0.input_context_norm.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.0.input_latents_norm.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.0.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.0.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.0.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.0.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.0.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.1.input_context_norm.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.1.input_latents_norm.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.1.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.1.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.1.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.1.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.1.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.2.input_context_norm.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.2.input_latents_norm.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.2.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.2.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.2.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.2.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.2.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "second_perceiver_resampler.norm.weight": "model-00002-of-00002.safetensors",
+    "visual.blocks.0.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.2.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.20.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.22.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.24.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.24.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.24.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.24.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.24.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.24.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.25.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.25.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.25.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.25.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.25.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.25.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.26.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.26.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.26.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.26.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.26.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.26.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.27.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.27.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.27.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.27.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.27.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.27.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.28.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.28.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.28.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.28.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.28.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.28.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.29.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.29.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.29.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.29.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.29.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.29.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.30.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.30.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.30.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.30.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.30.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.30.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.31.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.31.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.31.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.31.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.31.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.31.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.4.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.attn.proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.attn.qkv.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.norm1.weight": "model-00001-of-00002.safetensors",
+    "visual.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
+    "visual.merger.ln_q.weight": "model-00001-of-00002.safetensors",
+    "visual.merger.mlp.0.bias": "model-00001-of-00002.safetensors",
+    "visual.merger.mlp.0.weight": "model-00001-of-00002.safetensors",
+    "visual.merger.mlp.2.bias": "model-00001-of-00002.safetensors",
+    "visual.merger.mlp.2.weight": "model-00001-of-00002.safetensors",
+    "visual.patch_embed.proj.weight": "model-00001-of-00002.safetensors"
+  }
+}

modeling_traser.py ADDED Viewed

	@@ -0,0 +1,179 @@

+import torch
+import torch.nn as nn
+from typing import List, Tuple, Optional, Any, Dict
+from dataclasses import dataclass
+from transformers import Qwen2_5_VLForConditionalGeneration
+from transformers.modeling_outputs import ModelOutput
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig
+from transformers.models.idefics2.modeling_idefics2 import Idefics2PerceiverResampler
+from transformers.models.idefics2.configuration_idefics2 import Idefics2PerceiverConfig
+from transformers.utils import ModelOutput
+from transformers.processing_utils import Unpack
+@dataclass
+class TRASEROutput(ModelOutput):
+    """Container returned by ``TRASER.forward``.
+
+    Every field defaults to ``None``; ``forward`` fills them from the language-model
+    outputs, the LM head and ``self.model.rope_deltas``.
+    """
+    # Result of ``self.loss_function``; stays ``None`` when ``forward`` gets no ``labels``.
+    loss: Optional[torch.FloatTensor] = None
+    # ``lm_head`` applied to the language model's ``last_hidden_state``.
+    logits: Optional[torch.FloatTensor] = None
+    # The next three fields are copied unchanged from the language-model outputs.
+    past_key_values: Optional[List[torch.FloatTensor]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    # Value of ``self.model.rope_deltas`` at the moment ``forward`` returns.
+    rope_deltas: Optional[torch.LongTensor] = None
+class TRASER(Qwen2_5_VLForConditionalGeneration):
+    """Qwen2.5-VL generation model with one or two Idefics2 perceiver resamplers attached.
+
+    ``perceiver_resampler`` is always created; ``second_perceiver_resampler`` is created
+    unless ``config.object_resampler`` is falsy. ``forward`` in this class never calls the
+    resamplers (or any vision module): it only drives ``self.model.language_model`` and
+    ``self.lm_head``.
+    NOTE(review): presumably the caller runs the vision tower / resamplers and passes the
+    fused result in as ``inputs_embeds`` for the prefill step -- confirm against inference.py.
+    """
+    def __init__(self, config: Qwen2_5_VLConfig, **kwargs):
+        """Build the base model, then attach the resampler modules.
+
+        Args:
+            config: Model configuration; also read for the resampler hyper-parameters.
+            **kwargs: Fallback config values. Each key is copied onto ``config`` only if
+                ``config`` does not already have an attribute of that name.
+        """
+        super().__init__(config)
+        # Update config with kwargs if provided (fallback mechanism)
+        # Runs after the base constructor, so these values can only influence code that
+        # reads the config from here on (e.g. ``_build_perceiver``).
+        for k, v in kwargs.items():
+            if not hasattr(config, k):
+                setattr(config, k, v)
+        self.config = config
+        self._build_perceiver(dtype=config.torch_dtype, attn_impl=config._attn_implementation)
+        # NOTE(review): the base constructor presumably already calls post_init(); confirm
+        # that invoking it again after adding the resamplers is intended.
+        self.post_init()
+    def _build_perceiver(self, dtype: torch.dtype, attn_impl: str) -> None:
+        """Create the perceiver resampler submodule(s) from ``self.config``.
+
+        Config attributes read (with the default used when the attribute is missing):
+        ``hidden_size`` (2048), ``temporal_resampler_n_latents`` (64), ``resampler_depth`` (3),
+        ``object_resampler`` (True) and ``object_resampler_n_latents`` (32).
+
+        Args:
+            dtype: Passed to the perceiver config as ``torch_dtype``.
+            attn_impl: Passed to the perceiver config as ``_attn_implementation``.
+        """
+        h = int(getattr(self.config, "hidden_size", 2048))
+        n_latents = int(getattr(self.config, "temporal_resampler_n_latents", 64))
+        depth = int(getattr(self.config, "resampler_depth", 3))
+        perceiver_cfg = Idefics2PerceiverConfig(
+            hidden_size=h,
+            resampler_n_latents=n_latents,
+            resampler_depth=depth,
+            _attn_implementation=attn_impl,
+            torch_dtype=dtype,
+        )
+        self.perceiver_resampler = Idefics2PerceiverResampler(perceiver_cfg)
+        # The second resampler differs from the first only in its number of latents.
+        if getattr(self.config, "object_resampler", True):
+            second_n_latents = int(getattr(self.config, "object_resampler_n_latents", 32))
+            second_perceiver_cfg = Idefics2PerceiverConfig(
+                hidden_size=h,
+                resampler_n_latents=second_n_latents,
+                resampler_depth=depth,
+                _attn_implementation=attn_impl,
+                torch_dtype=dtype,
+            )
+            self.second_perceiver_resampler = Idefics2PerceiverResampler(second_perceiver_cfg)
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        pixel_values=None,
+        pixel_values_videos=None,
+        image_grid_thw=None,
+        video_grid_thw=None,
+        second_per_grid_ts=None,
+        **kwargs,
+    ):
+        """Assemble the per-step model inputs used during generation.
+
+        Delegates to the parent implementation with all arguments forwarded, then
+        post-processes the returned dict:
+
+        * ``position_ids`` is replaced by the value the caller passed in, whatever the
+          parent put there.
+        * When ``cache_position`` is given and its first element is non-zero,
+          ``pixel_values``, ``pixel_values_videos`` and ``position_ids`` are set to ``None``.
+
+        Returns:
+            The dict produced by the parent implementation with the adjustments above.
+        """
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            second_per_grid_ts=second_per_grid_ts,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        model_inputs["position_ids"] = position_ids
+        # A non-zero first cache position is treated as "not the first step": the visual
+        # inputs are dropped and position_ids cleared (forward() rebuilds positions itself
+        # on its non-prefill path).
+        if cache_position is not None and cache_position[0] != 0:
+            model_inputs["pixel_values"] = None
+            model_inputs["pixel_values_videos"] = None
+            model_inputs["position_ids"] = None
+        return model_inputs
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        rope_deltas: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[Any],
+    ) -> TRASEROutput:
+        """Run the language model and LM head on either a prefill or a follow-up step.
+
+        Two paths, selected by ``is_prefill``:
+
+        * Prefill -- ``inputs_embeds`` is given and the cache is ``None`` or reports a
+          sequence length of 0: the supplied ``inputs_embeds`` and ``position_ids`` are fed
+          to the language model as-is. ``**kwargs`` is not forwarded on this path.
+        * Otherwise: embeddings are looked up from ``input_ids`` (any ``inputs_embeds``
+          passed in is overwritten), the ``position_ids`` argument is ignored, and positions
+          are rebuilt from ``cache_position`` and ``self.model.rope_deltas``.
+
+        Args:
+            rope_deltas: When not ``None``, stored on ``self.model.rope_deltas`` before
+                anything else runs.
+            labels: When not ``None``, a loss is computed with ``self.loss_function``.
+            **kwargs: Forwarded to the language model on the non-prefill path only.
+
+        Returns:
+            A ``TRASEROutput`` with loss, logits, the language model's cache / hidden states /
+            attentions, and the current ``self.model.rope_deltas``.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        if rope_deltas is not None:
+            self.model.rope_deltas = rope_deltas
+        # A cache object without ``get_seq_length`` is never considered empty here, so such
+        # a call falls through to the non-prefill path.
+        is_prefill = (inputs_embeds is not None) and (
+            past_key_values is None or (hasattr(past_key_values, "get_seq_length") and past_key_values.get_seq_length() == 0)
+        )
+        if is_prefill:
+            outputs = self.model.language_model(
+                input_ids=None,
+                inputs_embeds=inputs_embeds,
+                position_ids=position_ids,
+                attention_mask=attention_mask,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                cache_position=cache_position,
+                return_dict=True,
+            )
+        else:
+            # NOTE(review): this path needs input_ids; it raises inside the embedding
+            # lookup if only inputs_embeds was supplied together with a non-empty cache.
+            inputs_embeds = self.model.get_input_embeddings()(input_ids)
+            batch_size, seq_length, _ = inputs_embeds.shape
+            # Offset for the new tokens: first cache position plus the stored rope deltas.
+            # NOTE(review): fails if cache_position is given while self.model.rope_deltas
+            # is still None -- confirm callers always set rope_deltas beforehand.
+            delta = (
+                (cache_position[0] + self.model.rope_deltas).to(inputs_embeds.device)
+                if cache_position is not None
+                else 0
+            )
+            # pos is (batch_size, seq_length): 0..seq_length-1 repeated for every row.
+            pos = torch.arange(seq_length, device=inputs_embeds.device).view(1, -1).expand(batch_size, -1)
+            if cache_position is not None:
+                # Repeat delta along dim 0 so it lines up with the batch; the int ``0``
+                # fallback above has no shape, hence the guard.
+                # assumes delta's leading dim divides batch_size -- TODO confirm
+                delta = delta.repeat_interleave(max(1, batch_size // delta.shape[0]), dim=0)
+            # Result is (3, batch_size, seq_length) with the same positions in all 3 rows;
+            # presumably one row per multimodal RoPE axis -- confirm.
+            pos = pos.add(delta).unsqueeze(0).expand(3, -1, -1)
+            # NOTE(review): unlike the prefill call, return_dict is not set explicitly here;
+            # the attribute access on ``outputs`` below relies on kwargs/config providing it.
+            outputs = self.model.language_model(
+                input_ids=None,
+                position_ids=pos,
+                attention_mask=attention_mask,
+                past_key_values=past_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                cache_position=cache_position,
+                **kwargs,
+            )
+        hidden_states = outputs.last_hidden_state
+        logits = self.lm_head(hidden_states)
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)
+        return TRASEROutput(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            rope_deltas=self.model.rope_deltas,
+        )

qwen_vl_vsg_utils/src/qwen_vl_utils/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from .vision_process import (
+    extract_vision_info,
+    fetch_image,
+    fetch_video,
+    process_vision_info,
+    smart_resize,
+)

qwen_vl_vsg_utils/src/qwen_vl_utils/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (327 Bytes). View file

qwen_vl_vsg_utils/src/qwen_vl_utils/__pycache__/vision_process.cpython-310.pyc ADDED Viewed

Binary file (12.9 kB). View file

qwen_vl_vsg_utils/src/qwen_vl_utils/vision_process.py ADDED Viewed

	@@ -0,0 +1,432 @@

+from __future__ import annotations
+import base64
+import copy
+import logging
+import math
+import os
+import sys
+import time
+import warnings
+from functools import lru_cache
+from io import BytesIO
+from typing import Optional
+import requests
+import torch
+import torchvision
+from packaging import version
+from PIL import Image
+from torchvision import io, transforms
+from torchvision.transforms import InterpolationMode
+logger = logging.getLogger(__name__)  # module-level logger used by the helpers in this file
+IMAGE_FACTOR = 28  # default `factor` of smart_resize: output sides are multiples of this
+MIN_PIXELS = 4 * 28 * 28  # default lower bound on image area (fetch_image / smart_resize)
+MAX_PIXELS = 16384 * 28 * 28  # default upper bound on image area (fetch_image / smart_resize)
+MAX_RATIO = 200  # smart_resize rejects aspect ratios above this value
+VIDEO_MAX_PIXELS = 768 * 28 * 28  # per-frame area cap applied in fetch_video
+FRAME_FACTOR = 2  # sampled frame counts are kept at multiples of this
+FPS_MIN_FRAMES = 4  # default `min_frames` in smart_nframes
+FPS_MAX_FRAMES = 768  # default ceiling for `max_frames` in smart_nframes
+VIDEO_MIN_PIXELS = 64 * 28 * 28  # default lower bound on per-frame area in fetch_video
+FPS = 1  # default sampling rate (`fps`) in smart_nframes
+VIDEO_TOTAL_PIXELS = int(float(os.environ.get('VIDEO_MAX_PIXELS', 128000 * 28 * 28 * 0.9)))  # default `total_pixels` budget in fetch_video; NOTE(review): the env var read here is named VIDEO_MAX_PIXELS, not VIDEO_TOTAL_PIXELS -- confirm this is intended
+logger.info(f"set VIDEO_TOTAL_PIXELS: {VIDEO_TOTAL_PIXELS}")
+def round_by_factor(number: int, factor: int) -> int:
+    """Return the multiple of 'factor' nearest to 'number' (ties follow Python's round())."""
+    multiple_count = round(number / factor)
+    return multiple_count * factor
+def ceil_by_factor(number: int, factor: int) -> int:
+    """Return the smallest multiple of 'factor' that is not below 'number'."""
+    multiple_count = math.ceil(number / factor)
+    return multiple_count * factor
+def floor_by_factor(number: int, factor: int) -> int:
+    """Return the largest multiple of 'factor' that does not exceed 'number'."""
+    multiple_count = math.floor(number / factor)
+    return multiple_count * factor
+def smart_resize(
+    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
+) -> tuple[int, int]:
+    """Pick a (height, width) whose sides are multiples of `factor` and whose area is bounded.
+
+    Both sides are first rounded to the nearest multiple of `factor` (at least `factor`).
+    If the rounded area exceeds `max_pixels` the original size is scaled down and floored;
+    if it falls below `min_pixels` the original size is scaled up and ceiled.
+
+    Raises:
+        ValueError: if the long/short side ratio is larger than MAX_RATIO.
+    """
+    aspect_ratio = max(height, width) / min(height, width)
+    if aspect_ratio > MAX_RATIO:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {aspect_ratio}"
+        )
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    rounded_area = h_bar * w_bar
+    if rounded_area > max_pixels:
+        # Too large: shrink the original size by a common ratio, rounding down.
+        shrink = math.sqrt((height * width) / max_pixels)
+        return (
+            max(factor, floor_by_factor(height / shrink, factor)),
+            max(factor, floor_by_factor(width / shrink, factor)),
+        )
+    if rounded_area < min_pixels:
+        # Too small: grow the original size by a common ratio, rounding up.
+        grow = math.sqrt(min_pixels / (height * width))
+        return ceil_by_factor(height * grow, factor), ceil_by_factor(width * grow, factor)
+    return h_bar, w_bar
+def to_rgb(pil_image: Image.Image) -> Image.Image:
+    """Return an RGB version of `pil_image`; RGBA inputs are composited over white."""
+    if pil_image.mode != 'RGBA':
+        return pil_image.convert("RGB")
+    canvas = Image.new("RGB", pil_image.size, (255, 255, 255))
+    alpha_band = pil_image.split()[3]
+    # The alpha band is used as the paste mask.
+    canvas.paste(pil_image, mask=alpha_band)
+    return canvas
+def fetch_image(ele: dict[str, str | Image.Image], size_factor: int = IMAGE_FACTOR) -> Image.Image:
+    """Load the image described by `ele`, convert it to RGB and resize it.
+
+    The source is `ele["image"]` when present, otherwise `ele["image_url"]`. It may be a
+    PIL image, an http(s) URL, a `file://` URI, a base64 `data:image` URI or a local path.
+    The target size comes from `smart_resize`, fed either with the explicit
+    `resized_height`/`resized_width` pair or with the decoded image size bounded by
+    `min_pixels`/`max_pixels` (module defaults when absent).
+
+    Raises:
+        ValueError: if no image object could be produced from the input.
+    """
+    source = ele["image"] if "image" in ele else ele["image_url"]
+    loaded = None
+    if isinstance(source, Image.Image):
+        loaded = source
+    elif source.startswith(("http://", "https://")):
+        # NOTE(review): no timeout is passed to requests.get.
+        with requests.get(source, stream=True) as response:
+            response.raise_for_status()
+            with BytesIO(response.content) as bio:
+                loaded = copy.deepcopy(Image.open(bio))
+    elif source.startswith("file://"):
+        loaded = Image.open(source[7:])
+    elif source.startswith("data:image"):
+        # A data URI without a "base64," marker leaves `loaded` unset and is rejected below.
+        _, marker, payload = source.partition("base64,")
+        if marker:
+            with BytesIO(base64.b64decode(payload)) as bio:
+                loaded = copy.deepcopy(Image.open(bio))
+    else:
+        loaded = Image.open(source)
+    if loaded is None:
+        raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {source}")
+    rgb = to_rgb(loaded)
+    if "resized_height" in ele and "resized_width" in ele:
+        target_h, target_w = smart_resize(
+            ele["resized_height"],
+            ele["resized_width"],
+            factor=size_factor,
+        )
+    else:
+        width, height = rgb.size
+        target_h, target_w = smart_resize(
+            height,
+            width,
+            factor=size_factor,
+            min_pixels=ele.get("min_pixels", MIN_PIXELS),
+            max_pixels=ele.get("max_pixels", MAX_PIXELS),
+        )
+    return rgb.resize((target_w, target_h))
+def smart_nframes(
+    ele: dict,
+    total_frames: int,
+    video_fps: int | float,
+) -> int:
+    """Decide how many frames to sample from a video.
+
+    With `ele["nframes"]` the count is that value rounded to a multiple of FRAME_FACTOR.
+    Otherwise it is derived from `ele["fps"]` (default FPS) and the video duration,
+    clamped to [`min_frames`, `max_frames`] and to `total_frames`, then floored to a
+    multiple of FRAME_FACTOR.
+
+    Raises:
+        ValueError: if the result is outside [FRAME_FACTOR, total_frames].
+    """
+    assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"
+    if "nframes" in ele:
+        nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)
+    else:
+        target_fps = ele.get("fps", FPS)
+        lower = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
+        upper = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR)
+        # Duration in seconds times the requested sampling rate.
+        nframes = total_frames / video_fps * target_fps
+        if nframes > total_frames:
+            logger.warning(f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]")
+        clamped = min(max(nframes, lower), upper, total_frames)
+        nframes = floor_by_factor(clamped, FRAME_FACTOR)
+    if nframes < FRAME_FACTOR or nframes > total_frames:
+        raise ValueError(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.")
+    return nframes
+def _read_video_torchvision(
+    ele: dict,
+) -> tuple[torch.Tensor, float, list[int]]:
+    """Read a video with torchvision.io.read_video and subsample its frames.
+
+    Args:
+        ele (dict): video configuration. Keys read here:
+            - video: path or URI of the video.
+            - video_start / video_end: optional time range in seconds.
+            - fps / nframes / min_frames / max_frames: consumed by smart_nframes.
+    Returns:
+        tuple: (frames in TCHW layout, effective sampling fps, sampled frame indices).
+            The indices are relative to the frames returned by read_video.
+    """
+    video_path = ele["video"]
+    if version.parse(torchvision.__version__) < version.parse("0.19.0"):
+        # Test the scheme as a prefix: a substring match would also hit paths that
+        # merely contain "file://" and then chop 7 unrelated leading characters.
+        if video_path.startswith(("http://", "https://")):
+            warnings.warn("torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.")
+        if video_path.startswith("file://"):
+            video_path = video_path[7:]
+    st = time.time()
+    video, audio, info = io.read_video(
+        video_path,
+        start_pts=ele.get("video_start", 0.0),
+        end_pts=ele.get("video_end", None),
+        pts_unit="sec",
+        output_format="TCHW",
+    )
+    total_frames, video_fps = video.size(0), info["video_fps"]
+    logger.info(f"torchvision:  {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
+    nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
+    # Evenly spaced indices over the decoded clip.
+    idx = torch.linspace(0, total_frames - 1, nframes).round().long()
+    sample_fps = nframes / max(total_frames, 1e-6) * video_fps
+    video = video[idx]
+    return video, sample_fps, idx.tolist()
+def is_decord_available() -> bool:
+    """Report whether the optional `decord` package can be located (it is not imported)."""
+    from importlib.util import find_spec
+    decord_spec = find_spec("decord")
+    return decord_spec is not None
+def calculate_video_frame_range(
+    ele: dict,
+    total_frames: int,
+    video_fps: float,
+) -> tuple[int, int, int]:
+    """
+    Translate an optional time window (seconds) into inclusive frame indices.
+    Args:
+        ele (dict): May carry 'video_start' and/or 'video_end' in seconds; both are clamped
+            to [0, total_frames / video_fps].
+        total_frames (int): Number of frames in the video.
+        video_fps (float): Frame rate of the video.
+    Returns:
+        tuple: (start_frame, end_frame, frame_count), with frame_count = end - start + 1.
+            Without any window the whole video (0, total_frames - 1, total_frames) is returned.
+    Raises:
+        ValueError: If video_fps or total_frames is not positive, or if the resulting
+            start frame is not strictly before the end frame.
+    """
+    if video_fps <= 0:
+        raise ValueError("video_fps must be a positive number")
+    if total_frames <= 0:
+        raise ValueError("total_frames must be a positive integer")
+    video_start = ele.get("video_start", None)
+    video_end = ele.get("video_end", None)
+    last_frame = total_frames - 1
+    if video_start is None and video_end is None:
+        return 0, last_frame, total_frames
+    duration_s = total_frames / video_fps
+    # Defaults describe "from the first frame" / "to the last frame".
+    start_frame, start_s = 0, 0
+    end_frame, end_s = last_frame, duration_s
+    if video_start is not None:
+        start_s = max(0.0, min(video_start, duration_s))
+        start_frame = math.ceil(start_s * video_fps)
+    if video_end is not None:
+        end_s = max(0.0, min(video_end, duration_s))
+        end_frame = min(math.floor(end_s * video_fps), last_frame)
+    if start_frame >= end_frame:
+        raise ValueError(
+            f"Invalid time range: Start frame {start_frame} (at {start_s}s) "
+            f"exceeds end frame {end_frame} (at {end_s}s). "
+            f"Video duration: {duration_s:.2f}s ({total_frames} frames @ {video_fps}fps)"
+        )
+    logger.info(f"calculate video frame range: {start_frame=}, {end_frame=}, {total_frames=} from {video_start=}, {video_end=}, {video_fps=:.3f}")
+    return start_frame, end_frame, end_frame - start_frame + 1
+def _read_video_decord(
+    ele: dict,
+) -> tuple[torch.Tensor, float, list[int]]:
+    """Read a video with decord.VideoReader and subsample its frames.
+    Args:
+        ele (dict): a dict contains the configuration of video.
+        support keys:
+            - video: the path of video, passed unchanged to decord.VideoReader.
+            - video_start: the start time of video (seconds).
+            - video_end: the end time of video (seconds).
+            - fps / nframes / min_frames / max_frames: consumed by smart_nframes.
+    Returns:
+        tuple: (video tensor with shape (T, C, H, W), effective sampling fps,
+            list of sampled frame indices into the original video).
+    """
+    import decord
+    video_path = ele["video"]
+    st = time.time()
+    vr = decord.VideoReader(video_path)
+    total_frames, video_fps = len(vr), vr.get_avg_fps()
+    # From here on total_frames is the frame count of the selected time window.
+    start_frame, end_frame, total_frames = calculate_video_frame_range(
+        ele,
+        total_frames,
+        video_fps,
+    )
+    nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
+    idx = torch.linspace(start_frame, end_frame, nframes).round().long().tolist()
+    video = vr.get_batch(idx).asnumpy()
+    video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
+    logger.info(f"decord:  {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
+    sample_fps = nframes / max(total_frames, 1e-6) * video_fps
+    return video, sample_fps, idx
+def is_torchcodec_available() -> bool:
+    """Report whether torchcodec is installed and its VideoDecoder can be imported.
+
+    Any exception raised while probing is treated as "not available".
+    """
+    import importlib.util
+    try:
+        if importlib.util.find_spec("torchcodec") is None:
+            return False
+        # Import for its side effect only: a broken install fails here.
+        from torchcodec.decoders import VideoDecoder
+    except Exception:
+        return False
+    return True
+def _read_video_torchcodec(
+    ele: dict,
+) -> tuple[torch.Tensor, float, list[int]]:
+    """Read a video with torchcodec's VideoDecoder and subsample its frames.
+
+    The number of ffmpeg threads is taken from the TORCHCODEC_NUM_THREADS environment
+    variable (default 8).
+
+    Args:
+        ele (dict): video configuration. Keys read here:
+            - video: path of the video, passed unchanged to VideoDecoder.
+            - video_start / video_end: optional time range in seconds.
+            - fps / nframes / min_frames / max_frames: consumed by smart_nframes.
+    Returns:
+        tuple: (decoded frames as returned by `get_frames_at(...).data`, effective
+            sampling fps, list of sampled frame indices into the original video).
+    """
+    from torchcodec.decoders import VideoDecoder
+    TORCHCODEC_NUM_THREADS = int(os.environ.get('TORCHCODEC_NUM_THREADS', 8))
+    logger.info(f"set TORCHCODEC_NUM_THREADS: {TORCHCODEC_NUM_THREADS}")
+    video_path = ele["video"]
+    st = time.time()
+    decoder = VideoDecoder(video_path, num_ffmpeg_threads=TORCHCODEC_NUM_THREADS)
+    video_fps = decoder.metadata.average_fps
+    total_frames = decoder.metadata.num_frames
+    # From here on total_frames is the frame count of the selected time window.
+    start_frame, end_frame, total_frames = calculate_video_frame_range(
+        ele,
+        total_frames,
+        video_fps,
+    )
+    nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
+    idx = torch.linspace(start_frame, end_frame, nframes).round().long().tolist()
+    sample_fps = nframes / max(total_frames, 1e-6) * video_fps
+    video = decoder.get_frames_at(indices=idx).data
+    logger.info(f"torchcodec:  {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
+    return video, sample_fps, idx
+VIDEO_READER_BACKENDS = {  # backend name -> reader; each reader returns (video, sample_fps, sampled frame indices)
+    "decord": _read_video_decord,
+    "torchvision": _read_video_torchvision,
+    "torchcodec": _read_video_torchcodec,
+}
+FORCE_QWENVL_VIDEO_READER = os.getenv("FORCE_QWENVL_VIDEO_READER", None)  # optional override, checked first by get_video_reader_backend
+@lru_cache(maxsize=1)
+def get_video_reader_backend() -> str:
+    """Choose the video reader backend name; the result is cached after the first call.
+
+    FORCE_QWENVL_VIDEO_READER wins when set; otherwise torchcodec, then decord, then
+    torchvision is picked based on availability. The choice is printed to stderr.
+    """
+    backend = FORCE_QWENVL_VIDEO_READER
+    if backend is None:
+        if is_torchcodec_available():
+            backend = "torchcodec"
+        elif is_decord_available():
+            backend = "decord"
+        else:
+            backend = "torchvision"
+    print(f"qwen-vl-utils using {backend} to read video.", file=sys.stderr)
+    return backend
+def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR, return_video_sample_fps: bool = False) -> torch.Tensor | list[Image.Image]:
+    """Load and resize a video given either as a path/URI or as a list of frames.
+
+    Path input (`ele["video"]` is a str): the video is decoded with the selected backend
+    (falling back to torchvision on any error), resized with bicubic interpolation and
+    returned as a float tensor.
+        - return_video_sample_fps=True  -> (video, sample_fps, sampled_frame_idx_list)
+        - return_video_sample_fps=False -> (video, sampled_frame_idx_list)
+
+    Frame-list input (`ele["video"]` is a list/tuple): every element is loaded through
+    fetch_image and the list is padded with its last frame up to a multiple of FRAME_FACTOR.
+        - return_video_sample_fps=True  -> (images, fps, sampled_frame_idx_list), where the
+          indices are positions in the given list (padding repeats the last position).
+          This matches the three values process_vision_info unpacks.
+        - return_video_sample_fps=False -> images
+          NOTE(review): unlike the path branch, no frame indices are returned here; kept
+          as-is for existing callers.
+    """
+    if isinstance(ele["video"], str):
+        video_reader_backend = get_video_reader_backend()
+        try:
+            video, sample_fps, sampled_frame_idx_list = VIDEO_READER_BACKENDS[video_reader_backend](ele)
+        except Exception as e:
+            logger.warning(f"video_reader_backend {video_reader_backend} error, use torchvision as default, msg: {e}")
+            video, sample_fps, sampled_frame_idx_list = VIDEO_READER_BACKENDS["torchvision"](ele)
+        nframes, _, height, width = video.shape
+        min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
+        total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
+        # Per-frame budget: share the total budget across frames, cap it at
+        # VIDEO_MAX_PIXELS and keep it slightly above min_pixels.
+        max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05))
+        max_pixels_supposed = ele.get("max_pixels", max_pixels)
+        if max_pixels_supposed > max_pixels:
+            logger.warning(f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}].")
+        max_pixels = min(max_pixels_supposed, max_pixels)
+        if "resized_height" in ele and "resized_width" in ele:
+            resized_height, resized_width = smart_resize(
+                ele["resized_height"],
+                ele["resized_width"],
+                factor=image_factor,
+            )
+        else:
+            resized_height, resized_width = smart_resize(
+                height,
+                width,
+                factor=image_factor,
+                min_pixels=min_pixels,
+                max_pixels=max_pixels,
+            )
+        video = transforms.functional.resize(
+            video,
+            [resized_height, resized_width],
+            interpolation=InterpolationMode.BICUBIC,
+            antialias=True,
+        ).float()
+        if return_video_sample_fps:
+            return video, sample_fps, sampled_frame_idx_list
+        return video, sampled_frame_idx_list
+    else:
+        assert isinstance(ele["video"], (list, tuple))
+        process_info = ele.copy()
+        process_info.pop("type", None)
+        process_info.pop("video", None)
+        images = [
+            fetch_image({"image": video_element, **process_info}, size_factor=image_factor)
+            for video_element in ele["video"]
+        ]
+        n_given = len(images)
+        nframes = ceil_by_factor(n_given, FRAME_FACTOR)
+        if n_given < nframes:
+            images.extend([images[-1]] * (nframes - n_given))
+        if return_video_sample_fps:
+            # The 3-tuple shape matches the path branch, so callers that unpack three
+            # values no longer fail on frame-list videos.
+            sampled_frame_idx_list = list(range(n_given)) + [n_given - 1] * (nframes - n_given)
+            return images, process_info.pop("fps", 2.0), sampled_frame_idx_list
+        return images
+def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[dict]:
+    """Collect every image/video content element from one or several conversations.
+
+    Args:
+        conversations: a single conversation (list of message dicts) or a list of
+            conversations. An empty list yields an empty result.
+    Returns:
+        The content elements, in order of appearance, that have an "image", "image_url"
+        or "video" key, or whose "type" is one of those names. Messages whose "content"
+        is not a list are skipped.
+    """
+    # Guard the empty case: indexing conversations[0] below would raise IndexError.
+    if not conversations:
+        return []
+    if isinstance(conversations[0], dict):
+        conversations = [conversations]
+    vision_keys = ("image", "image_url", "video")
+    vision_infos = []
+    for conversation in conversations:
+        for message in conversation:
+            content = message["content"]
+            if not isinstance(content, list):
+                continue
+            for ele in content:
+                if any(key in ele for key in vision_keys) or ele.get("type", "") in vision_keys:
+                    vision_infos.append(ele)
+    return vision_infos
+def process_vision_info(
+    conversations: list[dict] | list[list[dict]],
+    return_video_kwargs: bool = False,
+) -> tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None, Optional[dict]]:
+    """Load every image and video referenced in `conversations`.
+
+    Returns:
+        - return_video_kwargs=False: (image_inputs, video_inputs, video_sampled_frame_idx_list)
+        - return_video_kwargs=True:  (image_inputs, video_inputs, {'fps': [...]},
+          video_sampled_frame_idx_list)
+        `image_inputs` / `video_inputs` are None when no such element was found.
+
+    Raises:
+        ValueError: if a collected element has none of the image, image_url or video keys.
+    """
+    image_inputs = []
+    video_inputs = []
+    video_sample_fps_list = []
+    video_sampled_frame_idx_list = []
+    for vision_info in extract_vision_info(conversations):
+        if "image" in vision_info or "image_url" in vision_info:
+            image_inputs.append(fetch_image(vision_info))
+            continue
+        if "video" not in vision_info:
+            raise ValueError("image, image_url or video should in content.")
+        video_input, video_sample_fps, sampled_frame_idx_list = fetch_video(vision_info, return_video_sample_fps=True)
+        video_sample_fps_list.append(video_sample_fps)
+        video_inputs.append(video_input)
+        video_sampled_frame_idx_list.append(sampled_frame_idx_list)
+    # Empty collections are reported as None.
+    image_inputs = image_inputs if image_inputs else None
+    video_inputs = video_inputs if video_inputs else None
+    if return_video_kwargs:
+        return image_inputs, video_inputs, {'fps': video_sample_fps_list}, video_sampled_frame_idx_list
+    return image_inputs, video_inputs, video_sampled_frame_idx_list

resampler_utils/__pycache__/token_arrangement.cpython-310.pyc ADDED Viewed

Binary file (14 kB). View file

resampler_utils/__pycache__/token_insert_1017_multi_resampler.cpython-310.pyc ADDED Viewed

Binary file (13.3 kB). View file

resampler_utils/__pycache__/token_insert_1020_multi_two_resampler.cpython-310.pyc ADDED Viewed

Binary file (15 kB). View file

resampler_utils/__pycache__/token_insert_new.cpython-310.pyc ADDED Viewed

Binary file (11.2 kB). View file

resampler_utils/__pycache__/token_insert_no_resampler.cpython-310.pyc ADDED Viewed

Binary file (9.06 kB). View file

resampler_utils/__pycache__/token_insert_single_resampler.cpython-310.pyc ADDED Viewed

Binary file (11.2 kB). View file

resampler_utils/__pycache__/token_insert_temporal.cpython-310.pyc ADDED Viewed

Binary file (12.6 kB). View file

resampler_utils/__pycache__/token_selection.cpython-310.pyc ADDED Viewed

Binary file (2.88 kB). View file

resampler_utils/__pycache__/token_selection_bbox.cpython-310.pyc ADDED Viewed

Binary file (9.38 kB). View file

resampler_utils/__pycache__/token_selection_temporal.cpython-310.pyc ADDED Viewed

Binary file (3.29 kB). View file

resampler_utils/token_arrangement.py ADDED Viewed

	@@ -0,0 +1,640 @@

+import torch
+import torch.nn.functional as F
+from typing import List, Optional, Tuple
+import math
+def rearrange_token(
+    model,
+    input_ids: torch.LongTensor,           # [B, L]
+    attention_mask: torch.LongTensor,      # [B, L]
+    pixel_values: Optional[torch.FloatTensor],            # unused here (image path kept for API compatibility)
+    image_grid_thw: Optional[torch.LongTensor],           # unused here (image path kept for API compatibility)
+    pixel_values_videos: Optional[torch.FloatTensor],     # may be None
+    video_grid_thw: Optional[torch.LongTensor],           # may be None
+    second_per_grid_ts: Optional[torch.Tensor],           # may be None
+    # Per-sample list of objects; each object is a 1D LongTensor of relative video-token indices (in the original video token stream)
+    obj_token_indices_per_sample: List[List[torch.Tensor]],
+    # Only mode3_traj_and_text is kept:
+    obj_traj_start_id: Optional[int] = None,
+    obj_traj_end_id: Optional[int] = None,
+    # Required: List[sample][object] -> 1D LongTensor(ids)
+    text_token_ids_per_sample: Optional[List[List[torch.Tensor]]] = None,
+    timestamp_token_ids_per_batch=None,  # List[sample][1D LongTensor(ids)]
+    grids_per_temporal_window_per_batch=None,  # List[sample] number of grids per temporal window
+    labels: Optional[torch.LongTensor] = None,
+    IGNORE_ID: int = -100,
+    use_resampler: bool = True,             # True → per-object resampling + linear (1D) positions
+    use_second_resampler: bool = True,
+    add_timestamp_token: bool = True,       # whether to add timestamp token for each object window
+):
+    """
+    Fixed simplifications:
+      - insert_where: only "in_order" (no argument kept)
+      - insertion_mode: only "mode3_traj_and_text"
+      - perceiver_injection: only "visuals" (no time tokens injected into resampler)
+    Returns:
+      new_inputs_embeds:  [B, Lmax, D]
+      new_position_ids:   [3, B, Lmax] (int32)
+      new_attention_mask: [B, Lmax] (bool)
+      rope_deltas:        [B, 1] (long)
+      cache_position:     [Lmax] (int32)
+      new_input_ids:      [B, Lmax] (long)
+      new_labels:         [B, Lmax] or None (long)
+    """
+    dev = input_ids.device
+    B, L = input_ids.shape
+    cpu = torch.device("cpu")
+    assert text_token_ids_per_sample is not None and len(text_token_ids_per_sample) == B, \
+        "mode3_traj_and_text requires text_token_ids_per_sample with length B."
+    if add_timestamp_token:
+        assert timestamp_token_ids_per_batch is not None and len(timestamp_token_ids_per_batch) == B, \
+            "add_timestamp_token=True requires timestamp_token_ids_per_batch with length B."
+        assert grids_per_temporal_window_per_batch is not None and len(grids_per_temporal_window_per_batch) == B, \
+            "add_timestamp_token=True requires grids_per_temporal_window_per_batch with length B."
+    else:
+        # still needed for window indexing if use_resampler path uses temporal windows
+        assert grids_per_temporal_window_per_batch is not None and len(grids_per_temporal_window_per_batch) == B, \
+            "grids_per_temporal_window_per_batch is required."
+    tok_embed = model.get_input_embeddings()
+    vt_id = int(model.config.video_token_id)
+    vs_id = getattr(model.config, "vision_start_token_id", None)
+    ve_id = getattr(model.config, "vision_end_token_id", None)
+    pad_id = 151643  # align with original implementation
+    # ---- (0+) temporal window meta ----
+    assert video_grid_thw is not None, "video_grid_thw is required for temporal windowing"
+    assert video_grid_thw.shape[0] == B and video_grid_thw.shape[1] == 3, \
+        f"video_grid_thw should be ({B},3), got {video_grid_thw.shape}"
+    grid_area_batch: List[int] = []  # per-sample spatial token count (H*W/4)
+    temporal_window_size_batch = grids_per_temporal_window_per_batch
+    # ---- (0) Compute visual features (with grad) ----
+    video_embeds = None
+    if pixel_values_videos is not None:
+        _vid = model.model.get_video_features(
+            pixel_values_videos.type(model.model.visual.dtype), video_grid_thw
+        )
+        video_embeds = torch.cat(_vid, dim=0) if isinstance(_vid, (list, tuple)) else _vid  # [N_vid, D]
+        del pixel_values_videos, _vid
+    # ---- (0.1) Resamplers ----
+    resampler = None
+    resampler_num_latents = None
+    second_resampler = None
+    second_resampler_num_latents = None
+    if use_resampler:
+        if not hasattr(model, "perceiver_resampler"):
+            raise RuntimeError("use_resampler=True, but model.perceiver_resampler not found.")
+        resampler = model.perceiver_resampler
+        resampler_num_latents = int(resampler.n_latents)
+        if use_second_resampler:
+            if not hasattr(model, "second_perceiver_resampler"):
+                raise RuntimeError("use_second_resampler=True, but model.second_perceiver_resampler not found.")
+            second_resampler = model.second_perceiver_resampler
+            second_resampler_num_latents = int(second_resampler.n_latents)
+    # ---- (1) Position ids preparation ----
+    need_3d_rope = (not use_resampler)
+    if need_3d_rope:
+        with torch.no_grad():
+            position_ids_full, _ = model.model.get_rope_index(
+                input_ids=input_ids,
+                image_grid_thw=image_grid_thw,
+                video_grid_thw=video_grid_thw,
+                second_per_grid_ts=second_per_grid_ts,
+                attention_mask=attention_mask,
+            ).to(cpu)  # (3, B, L)
+    else:
+        position_ids_full = None
+    # ---- (2) Move to CPU for sequence planning ----
+    attn_cpu = attention_mask.to(cpu, dtype=torch.bool)
+    ids_cpu = input_ids.to(cpu)
+    pid_cpu = position_ids_full.to(cpu, dtype=torch.int32) if need_3d_rope else None
+    lbls_cpu = labels.to(cpu) if labels is not None else None
+    eff_lens: List[int] = []
+    vid_idx_list: List[torch.Tensor] = []
+    for b in range(B):
+        video_grid_thw_b = video_grid_thw[b]
+        # H*W/4 as integer
+        grid_area = (int(video_grid_thw_b[1].item()) * int(video_grid_thw_b[2].item())) // 4
+        grid_area_batch.append(int(grid_area))
+        nz = torch.nonzero(attn_cpu[b], as_tuple=False).flatten()
+        L_eff = int(nz[-1].item()) + 1 if nz.numel() > 0 else 0
+        eff_lens.append(L_eff)
+        if L_eff > 0:
+            ids_b_eff = ids_cpu[b, :L_eff]
+            vid_idx = torch.nonzero(ids_b_eff == vt_id, as_tuple=False).flatten()
+            vid_idx_list.append(vid_idx)
+        else:
+            vid_idx_list.append(torch.empty(0, dtype=torch.long))
+    # ---- Global offsets into concatenated video_embeds for each sample ----
+    vid_counts = [int(v.numel()) for v in vid_idx_list]
+    vid_offsets: List[int] = [0] * B
+    running = 0
+    for b in range(B):
+        vid_offsets[b] = running
+        running += vid_counts[b]
+    # ---- (3) Length planning ----
+    def _object_block_len(b: int, obj_i: int, sel_latent_len: int, rel_temporal_window_idx: torch.Tensor) -> int:
+        """
+        mode3_traj_and_text block length:
+          [<traj_start>?] + [text] + [<VS>?] + [<ts>* + <vt_latents>*] + [<VE>?] + [<traj_end>?]
+        where <ts>* and <vt_latents>* repeat per non-empty temporal window (resampler path),
+        or raw selected video tokens (non-resampler path).
+        """
+        add = 0
+        if obj_traj_start_id is not None:
+            add += 1
+        # text
+        tlen = int(text_token_ids_per_sample[b][obj_i].numel())
+        add += tlen
+        # VS
+        if vs_id is not None:
+            add += 1
+        # timestamps per unique window (if enabled)
+        if add_timestamp_token and timestamp_token_ids_per_batch is not None:
+            locs = rel_temporal_window_idx.unique()
+            for loc in locs:
+                loc_i = int(loc.item())
+                if loc_i < len(timestamp_token_ids_per_batch[b]):
+                    add += int(timestamp_token_ids_per_batch[b][loc_i].numel())
+                else:
+                    add += int(timestamp_token_ids_per_batch[b][-1].numel())
+        # visual placeholder length (either resampled latents or raw selected tokens)
+        add += int(sel_latent_len)
+        # VE
+        if ve_id is not None:
+            add += 1
+        if obj_traj_end_id is not None:
+            add += 1
+        return add
+    L_new_each: List[int] = []
+    for b in range(B):
+        L_eff = eff_lens[b]
+        ids_b = ids_cpu[b, :L_eff]
+        vid_idx = vid_idx_list[b]
+        if L_eff == 0:
+            L_new_each.append(0)
+            continue
+        if vid_idx.numel() == 0:
+            L_new_each.append(L_eff)
+            continue
+        v_s = int(vid_idx[0].item())
+        v_e = int(vid_idx[-1].item())
+        has_vs = (vs_id is not None and v_s - 1 >= 0 and ids_b[v_s - 1].item() == vs_id)
+        has_ve = (ve_id is not None and v_e + 1 < L_eff and ids_b[v_e + 1].item() == ve_id)
+        if has_vs:
+            v_s -= 1
+        if has_ve:
+            v_e += 1
+        prefix_len = v_s
+        suffix_len = L_eff - (v_e + 1)
+        sel_lists = obj_token_indices_per_sample[b]
+        Nv = int(vid_idx.numel())
+        cur_total = 0
+        for i, rel in enumerate(sel_lists):
+            rel = rel.to(cpu, dtype=torch.long)
+            sel_len = int(rel.numel())
+            if use_resampler:
+                tokens_per_window = int(grid_area_batch[b] * int(temporal_window_size_batch[b]))
+                rel_temporal_window_idx = rel // tokens_per_window if (tokens_per_window > 0) else torch.zeros_like(rel)
+                nonempty_windows = int(rel_temporal_window_idx.unique().numel())
+                if use_second_resampler and second_resampler_num_latents is not None:
+                    sel_len = int(second_resampler_num_latents) + int(resampler_num_latents) * nonempty_windows
+                else:
+                    sel_len = int(resampler_num_latents) * nonempty_windows
+            else:
+                # Non-resampler: keep raw selected video tokens count
+                tokens_per_window = int(grid_area_batch[b] * int(temporal_window_size_batch[b]))
+                rel_temporal_window_idx = rel // tokens_per_window if (tokens_per_window > 0) else torch.zeros_like(rel)
+            cur_total += _object_block_len(b, i, sel_len, rel_temporal_window_idx)
+        L_new_each.append(prefix_len + cur_total + suffix_len)
+    Lmax = max(L_new_each) if len(L_new_each) > 0 else 0
+    # ---- (4) Allocate new sequence tensors on CPU and fill per-sample ----
+    new_input_ids_cpu = torch.full((B, Lmax), pad_id, dtype=torch.long, device=cpu)
+    new_attention_mask_cpu = torch.zeros((B, Lmax), dtype=torch.bool, device=cpu)
+    new_position_ids_cpu = torch.zeros((3, B, Lmax), dtype=torch.int32, device=cpu)
+    new_labels_cpu = None
+    if labels is not None:
+        new_labels_cpu = torch.full((B, Lmax), IGNORE_ID, dtype=torch.long, device=cpu)
+    rows_for_video: List[torch.Tensor] = [torch.empty(0, dtype=torch.long) for _ in range(B)]
+    batched_obj_rows: List[torch.Tensor] = []  # each: rows into video_embeds (visual-only)
+    batched_obj_pos: List[torch.Tensor] = []   # each: destination positions [R]
+    batched_obj_bids: List[int] = []
+    batched_obj_lens: List[int] = []           # visual token lengths per (object-window)
+    batched_second_rows: List[torch.Tensor] = []
+    batched_second_pos: List[torch.Tensor] = []
+    batched_second_bids: List[int] = []
+    batched_second_oids: List[int] = []
+    def _text_pos_block(start_scalar: int, length: int, dtype=torch.int32) -> torch.Tensor:
+        """Create 1D-linear positions replicated across 3 RoPE dims."""
+        if length <= 0:
+            return torch.empty(3, 0, dtype=dtype, device=cpu)
+        ar = torch.arange(start_scalar, start_scalar + length, device=cpu, dtype=dtype)
+        return torch.stack([ar, ar, ar], dim=0)
+    for b in range(B):
+        L_eff = eff_lens[b]
+        if L_eff == 0:
+            continue
+        ids_b = ids_cpu[b, :L_eff]
+        msk_b = attn_cpu[b, :L_eff]
+        labs_b = lbls_cpu[b, :L_eff] if lbls_cpu is not None else None
+        vid_idx = vid_idx_list[b]
+        dst = 0
+        # No video tokens: copy through
+        if vid_idx.numel() == 0:
+            new_input_ids_cpu[b, :L_eff] = ids_b
+            new_attention_mask_cpu[b, :L_eff] = msk_b
+            if new_labels_cpu is not None and labs_b is not None:
+                new_labels_cpu[b, :L_eff] = labs_b
+            if need_3d_rope:
+                new_position_ids_cpu[:, b, :L_eff] = pid_cpu[:, b, :L_eff]
+            else:
+                new_position_ids_cpu[:, b, :L_eff] = _text_pos_block(0, L_eff, dtype=torch.int32)
+            continue
+        v_s = int(vid_idx[0].item())
+        v_e = int(vid_idx[-1].item())
+        has_vs = (vs_id is not None and v_s - 1 >= 0 and ids_b[v_s - 1].item() == vs_id)
+        has_ve = (ve_id is not None and v_e + 1 < L_eff and ids_b[v_e + 1].item() == ve_id)
+        if has_vs:
+            v_s -= 1
+        if has_ve:
+            v_e += 1
+        prefix_len = v_s
+        suffix_len = L_eff - (v_e + 1)
+        if need_3d_rope:
+            pid_b = pid_cpu[:, b, :L_eff]
+            pos_scalar = pid_b.max(dim=0).values
+            first_video_scalar = int(pos_scalar[v_s + (1 if has_vs else 0)].item())
+            last_video_scalar = int(pos_scalar[v_e - (1 if has_ve else 0)].item())
+            vs_scalar = int(pos_scalar[v_s].item()) if has_vs else None
+            min_video_scalar_base = int(first_video_scalar)
+            max_video_scalar_base = int(last_video_scalar)
+        # prefix
+        if prefix_len > 0:
+            new_input_ids_cpu[b, dst:dst + prefix_len] = ids_b[:prefix_len]
+            new_attention_mask_cpu[b, dst:dst + prefix_len] = msk_b[:prefix_len]
+            if new_labels_cpu is not None and labs_b is not None:
+                new_labels_cpu[b, dst:dst + prefix_len] = labs_b[:prefix_len]
+            if need_3d_rope:
+                new_position_ids_cpu[:, b, dst:dst + prefix_len] = pid_b[:, :prefix_len]
+            else:
+                new_position_ids_cpu[:, b, dst:dst + prefix_len] = _text_pos_block(dst, prefix_len, dtype=torch.int32)
+            dst += prefix_len
+        # in_order only:
+        if need_3d_rope:
+            cursor = int(vs_scalar) if has_vs else int(first_video_scalar)
+        else:
+            cursor = dst
+        Nv = int(vid_idx.numel())
+        pos2rank = torch.full((L_eff,), -1, dtype=torch.long, device=cpu)
+        if Nv > 0:
+            pos2rank[vid_idx] = torch.arange(Nv, dtype=torch.long, device=cpu)
+        vid_offset = int(vid_offsets[b])
+        sel_lists = obj_token_indices_per_sample[b]
+        for i, rel in enumerate(sel_lists):
+            rel = rel.to(cpu, dtype=torch.long)
+            if rel.numel() > 0:
+                rel.clamp_(0, Nv - 1)
+            g = vid_idx.index_select(0, rel) if (Nv > 0 and rel.numel() > 0) else torch.empty(0, dtype=torch.long, device=cpu)
+            # (1) <obj_traj_start> (optional)
+            if obj_traj_start_id is not None:
+                new_input_ids_cpu[b, dst] = int(obj_traj_start_id)
+                new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(cursor if need_3d_rope else dst, 1, dtype=torch.int32)
+                if new_labels_cpu is not None:
+                    new_labels_cpu[b, dst] = IGNORE_ID
+                new_attention_mask_cpu[b, dst] = True
+                dst += 1
+                if need_3d_rope:
+                    cursor += 1
+            # (2) text tokens (required)
+            txt_ids = text_token_ids_per_sample[b][i].to(cpu, dtype=torch.long)
+            k = int(txt_ids.numel())
+            if k > 0:
+                new_input_ids_cpu[b, dst:dst + k] = txt_ids
+                new_position_ids_cpu[:, b, dst:dst + k] = _text_pos_block(cursor if need_3d_rope else dst, k, dtype=torch.int32)
+                if new_labels_cpu is not None:
+                    new_labels_cpu[b, dst:dst + k] = IGNORE_ID
+                new_attention_mask_cpu[b, dst:dst + k] = True
+                dst += k
+                if need_3d_rope:
+                    cursor += k
+            # (3) <VS> (optional)
+            if vs_id is not None:
+                new_input_ids_cpu[b, dst] = int(vs_id)
+                new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(cursor if need_3d_rope else dst, 1, dtype=torch.int32)
+                if new_labels_cpu is not None:
+                    new_labels_cpu[b, dst] = IGNORE_ID
+                new_attention_mask_cpu[b, dst] = True
+                dst += 1
+                if need_3d_rope:
+                    cursor += 1
+            # (4) video tokens
+            if g.numel() > 0:
+                if use_resampler:
+                    tokens_per_window = int(grid_area_batch[b] * int(temporal_window_size_batch[b]))
+                    rel_temporal_window_idx = rel // tokens_per_window if (tokens_per_window > 0) else torch.zeros_like(rel)
+                    # Loop only over windows that actually appear in rel (robust)
+                    W_eff = int(rel_temporal_window_idx.max().item()) + 1 if rel_temporal_window_idx.numel() > 0 else 0
+                    all_rows_list = []
+                    for w in range(W_eff):
+                        m_w = (rel_temporal_window_idx == w)
+                        if not torch.any(m_w):
+                            all_rows_list.append(torch.empty(0, dtype=torch.long, device=cpu))
+                            continue
+                        rel_w = rel[m_w]
+                        rows_w = rel_w + vid_offset
+                        all_rows_list.append(rows_w)
+                    # second resampler: global object summary
+                    if use_second_resampler and second_resampler is not None:
+                        rows_all = torch.cat([x for x in all_rows_list if x.numel() > 0], dim=0) if any(x.numel() > 0 for x in all_rows_list) \
+                            else torch.empty(0, dtype=torch.long, device=cpu)
+                        if rows_all.numel() > 0:
+                            R2 = int(second_resampler_num_latents)
+                            new_input_ids_cpu[b, dst:dst + R2] = int(vt_id)
+                            new_position_ids_cpu[:, b, dst:dst + R2] = _text_pos_block(cursor if need_3d_rope else dst, R2, dtype=torch.int32)
+                            if new_labels_cpu is not None:
+                                new_labels_cpu[b, dst:dst + R2] = IGNORE_ID
+                            new_attention_mask_cpu[b, dst:dst + R2] = True
+                            pos_idx2 = torch.arange(dst, dst + R2, dtype=torch.long, device=cpu)
+                            batched_second_rows.append(rows_all)
+                            batched_second_pos.append(pos_idx2)
+                            batched_second_bids.append(b)
+                            batched_second_oids.append(i)
+                            dst += R2
+                            if need_3d_rope:
+                                cursor += R2
+                    R = int(resampler_num_latents)
+                    for w in range(W_eff):
+                        m_w = (rel_temporal_window_idx == w)
+                        if not torch.any(m_w):
+                            continue
+                        # timestamp tokens (text-only; NOT injected into resampler)
+                        if add_timestamp_token and (timestamp_token_ids_per_batch is not None):
+                            loc = w
+                            if loc < len(timestamp_token_ids_per_batch[b]):
+                                ts_ids = timestamp_token_ids_per_batch[b][loc].to(cpu, dtype=torch.long)
+                            else:
+                                ts_ids = timestamp_token_ids_per_batch[b][-1].to(cpu, dtype=torch.long)
+                            kt = int(ts_ids.numel())
+                            assert kt > 0, "Timestamp token ids should not be empty."
+                            new_input_ids_cpu[b, dst:dst + kt] = ts_ids
+                            new_position_ids_cpu[:, b, dst:dst + kt] = _text_pos_block(cursor if need_3d_rope else dst, kt, dtype=torch.int32)
+                            if new_labels_cpu is not None:
+                                new_labels_cpu[b, dst:dst + kt] = IGNORE_ID
+                            new_attention_mask_cpu[b, dst:dst + kt] = True
+                            dst += kt
+                            if need_3d_rope:
+                                cursor += kt
+                        # reserve R vt slots for resampled latents
+                        new_input_ids_cpu[b, dst:dst + R] = int(vt_id)
+                        new_position_ids_cpu[:, b, dst:dst + R] = _text_pos_block(cursor if need_3d_rope else dst, R, dtype=torch.int32)
+                        if new_labels_cpu is not None:
+                            new_labels_cpu[b, dst:dst + R] = IGNORE_ID
+                        new_attention_mask_cpu[b, dst:dst + R] = True
+                        rel_w = rel[m_w]
+                        rows_w = rel_w + vid_offset
+                        pos_idx = torch.arange(dst, dst + R, dtype=torch.long, device=cpu)
+                        batched_obj_rows.append(rows_w)
+                        batched_obj_pos.append(pos_idx)
+                        batched_obj_bids.append(b)
+                        batched_obj_lens.append(int(rows_w.numel()))  # visuals-only
+                        dst += R
+                        if need_3d_rope:
+                            cursor += R
+                else:
+                    # Non-resampler: 3D RoPE positions for selected raw video tokens
+                    assert need_3d_rope, "Non-resampler path requires 3D RoPE positions."
+                    pid_vid = pid_b.index_select(1, g)  # (3, Lv_sel)
+                    # in_order only: shift selected pid by delta
+                    delta = int(cursor - min_video_scalar_base)
+                    if delta != 0:
+                        pid_vid = pid_vid + delta
+                        cursor = max_video_scalar_base + delta + 1
+                    Lv_sel = int(g.numel())
+                    new_input_ids_cpu[b, dst:dst + Lv_sel] = int(vt_id)
+                    new_position_ids_cpu[:, b, dst:dst + Lv_sel] = pid_vid
+                    if new_labels_cpu is not None:
+                        new_labels_cpu[b, dst:dst + Lv_sel] = IGNORE_ID
+                    new_attention_mask_cpu[b, dst:dst + Lv_sel] = True
+                    ranks = pos2rank.index_select(0, g)
+                    rows = ranks + vid_offset
+                    rows_for_video[b] = torch.cat([rows_for_video[b], rows], dim=0)
+                    dst += Lv_sel
+            # (5) <VE> (optional)
+            if ve_id is not None:
+                new_input_ids_cpu[b, dst] = int(ve_id)
+                new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(cursor if need_3d_rope else dst, 1, dtype=torch.int32)
+                if new_labels_cpu is not None:
+                    new_labels_cpu[b, dst] = IGNORE_ID
+                new_attention_mask_cpu[b, dst] = True
+                dst += 1
+                if need_3d_rope:
+                    cursor += 1
+            # (6) <obj_traj_end> (optional)
+            if obj_traj_end_id is not None:
+                new_input_ids_cpu[b, dst] = int(obj_traj_end_id)
+                new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(cursor if need_3d_rope else dst, 1, dtype=torch.int32)
+                if new_labels_cpu is not None:
+                    new_labels_cpu[b, dst] = IGNORE_ID
+                new_attention_mask_cpu[b, dst] = True
+                dst += 1
+                if need_3d_rope:
+                    cursor += 1
+        # suffix
+        if suffix_len > 0:
+            src_lo = v_e + 1
+            src_hi = L_eff
+            seg = src_hi - src_lo
+            new_input_ids_cpu[b, dst:dst + seg] = ids_b[src_lo:src_hi]
+            new_attention_mask_cpu[b, dst:dst + seg] = msk_b[src_lo:src_hi]
+            if new_labels_cpu is not None and labs_b is not None:
+                new_labels_cpu[b, dst:dst + seg] = labs_b[src_lo:src_hi]
+            new_position_ids_cpu[:, b, dst:dst + seg] = _text_pos_block(dst, seg, dtype=torch.int32) if not need_3d_rope else _text_pos_block(cursor, seg, dtype=torch.int32)
+            dst += seg
+        assert dst == L_new_each[b], f"sample {b}: dst={dst}, L_new={L_new_each[b]}"
+    # ---- (5) Move back to device, build inputs_embeds, and paste visual features ----
+    new_input_ids = new_input_ids_cpu.to(dev, non_blocking=True)
+    new_position_ids = new_position_ids_cpu.to(dev, non_blocking=True)
+    new_attention_mask = new_attention_mask_cpu.to(dev, non_blocking=True)
+    new_labels = None if new_labels_cpu is None else new_labels_cpu.to(dev, non_blocking=True)
+    base = tok_embed(new_input_ids)
+    new_inputs_embeds = base.clone()
+    # Non-resampler: copy raw video features at vt positions
+    if (video_embeds is not None) and (not use_resampler) and any(r.numel() > 0 for r in rows_for_video):
+        vemb = video_embeds.to(dev, dtype=new_inputs_embeds.dtype, non_blocking=True)
+        for b in range(B):
+            rows = rows_for_video[b]
+            if rows.numel() == 0:
+                continue
+            vt_pos = torch.nonzero(new_input_ids[b] == vt_id, as_tuple=False).flatten()
+            assert vt_pos.numel() == rows.numel(), f"video rows mismatch for sample {b}"
+            new_inputs_embeds[b].index_copy_(0, vt_pos.to(dev), vemb.index_select(0, rows.to(dev)))
+    # ---- (5.1) second resampler: object-level global summary ----
+    if use_resampler and use_second_resampler and len(batched_second_rows) > 0:
+        if video_embeds is None:
+            raise RuntimeError("use_second_resampler=True but video_embeds is None.")
+        dev_emb = video_embeds.device
+        dtype_emb = video_embeds.dtype
+        D = video_embeds.shape[-1]
+        N_obj2 = len(batched_second_rows)
+        seqs2 = []
+        lens2 = []
+        for rows_all in batched_second_rows:
+            if rows_all.numel() == 0:
+                seqs2.append(torch.zeros(0, D, device=dev_emb, dtype=dtype_emb))
+                lens2.append(0)
+            else:
+                seqs2.append(video_embeds.index_select(0, rows_all.to(dev_emb)))
+                lens2.append(int(rows_all.numel()))
+        x2 = torch.nn.utils.rnn.pad_sequence(seqs2, batch_first=True) if len(seqs2) > 0 else torch.zeros(0, 0, D, device=dev_emb, dtype=dtype_emb)
+        L2_max = x2.size(1) if x2.numel() > 0 else 0
+        lens2_t = torch.tensor(lens2, device=dev_emb, dtype=torch.long) if len(lens2) > 0 else torch.zeros(0, device=dev_emb, dtype=torch.long)
+        ar2 = torch.arange(L2_max, device=dev_emb).unsqueeze(0) if L2_max > 0 else torch.zeros(1, 0, device=dev_emb, dtype=torch.long)
+        mask2 = (ar2 < lens2_t.unsqueeze(1)) if L2_max > 0 else torch.zeros(0, 0, device=dev_emb, dtype=torch.bool)
+        y2 = second_resampler(x2, attention_mask=mask2)  # [N_obj2, R2, D]
+        y2 = y2.to(new_inputs_embeds.dtype)
+        for j in range(N_obj2):
+            b_cur = batched_second_bids[j]
+            pos2 = batched_second_pos[j].to(dev)
+            new_inputs_embeds[b_cur, pos2] = y2[j]
+    # ---- (5.2) main resampler: visuals-only ----
+    if use_resampler and len(batched_obj_rows) > 0:
+        if video_embeds is None:
+            raise RuntimeError("use_resampler=True but video_embeds is None.")
+        dev_emb = video_embeds.device
+        dtype_emb = video_embeds.dtype
+        D = video_embeds.shape[-1]
+        N_obj = len(batched_obj_rows)
+        lens = torch.tensor(batched_obj_lens, device=dev_emb, dtype=torch.long)  # [N_obj]
+        L_max = int(lens.max().item()) if lens.numel() > 0 else 0
+        seqs = []
+        for rows in batched_obj_rows:
+            if rows.numel() == 0:
+                seqs.append(torch.zeros(0, D, device=dev_emb, dtype=dtype_emb))
+            else:
+                seqs.append(video_embeds.index_select(0, rows.to(dev_emb)))  # [Lv_sel, D]
+        x = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True) if len(seqs) > 0 else torch.zeros(0, 0, D, device=dev_emb, dtype=dtype_emb)
+        ar = torch.arange(L_max, device=dev_emb).unsqueeze(0) if L_max > 0 else torch.zeros(1, 0, device=dev_emb, dtype=torch.long)
+        mask = (ar < lens.unsqueeze(1)) if L_max > 0 else torch.zeros(0, 0, device=dev_emb, dtype=torch.bool)
+        y = resampler(x, attention_mask=mask)  # [N_obj, R, D]
+        y = y.to(new_inputs_embeds.dtype)
+        per_b_indices: List[List[int]] = [[] for _ in range(B)]
+        for i in range(N_obj):
+            per_b_indices[batched_obj_bids[i]].append(i)
+        for b in range(B):
+            if not per_b_indices[b]:
+                continue
+            pos_list = []
+            emb_list = []
+            for i in per_b_indices[b]:
+                pos_list.append(batched_obj_pos[i].to(dev))
+                emb_list.append(y[i])
+            pos_b = torch.cat(pos_list, dim=0)
+            emb_b = torch.cat(emb_list, dim=0)
+            new_inputs_embeds[b, pos_b] = emb_b
+    # ---- (6) rope_deltas / cache_position ----
+    maxpos = new_position_ids.max(dim=0)[0].max(dim=1, keepdim=True)[0]  # [B,1]
+    rope_deltas = (maxpos + 1 - new_inputs_embeds.shape[1]).to(dtype=torch.long, device=dev)
+    cache_position = torch.arange(new_inputs_embeds.shape[1], device=dev, dtype=torch.int32)
+    return new_inputs_embeds, new_position_ids, new_attention_mask, rope_deltas, cache_position, new_input_ids, new_labels

resampler_utils/token_selection.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import torch
+import torch.nn.functional as F
+from typing import Literal, Optional, Tuple
+@torch.no_grad()
+def select_tokens(
+    obj_masks: torch.Tensor,
+    grid_thw: Tuple[int,int,int],
+    *,
+    patch_size: int = 14,
+    spatial_merge_size: int = 2,
+    temporal_patch_size: int = 2,
+    coverage_thresh: float = 0.7,
+    time_reduce: Literal["mean","max","all"] = "max",
+    device: str | torch.device = "cpu",
+    retry_step: float = 0.1,
+    retry_times: int = 1,
+    ensure_at_least_one: bool = True,
+    dtype: torch.dtype = torch.float32,
+):
+    if obj_masks.dim() == 3:
+        obj_masks = obj_masks.unsqueeze(0)
+    O, N, H_rz, W_rz = obj_masks.shape
+    T, H, W = grid_thw
+    m, g = spatial_merge_size, temporal_patch_size
+    if N != T*g:
+        if N < T * g:
+            pad = T*g - N
+            last = obj_masks[:,-1:,:,:].repeat(1, pad, 1, 1)
+            obj_masks = torch.cat([obj_masks, last], dim=1)
+            N = T * g
+        else:
+            obj_masks = obj_masks[:, :T * g, :, :]
+            N = T * g
+    Hm, Wm = H // m, W // m
+    pix_h, pix_w = m * patch_size, m * patch_size
+    assert H_rz % pix_h == 0 and W_rz % pix_w == 0, "resized // (28×28)"
+    M = obj_masks.to(device=device, dtype=dtype).clamp(0, 1)
+    M_flat = M.view(O*N, 1, H_rz, W_rz)
+    cov_hw = F.avg_pool2d(M_flat, kernel_size=(pix_h, pix_w), stride=(pix_h, pix_w))  # (O*N,1,Hm,Wm)
+    cov_hw = cov_hw.view(O, N, Hm, Wm)
+    cov_hw = cov_hw.view(O, T, g, Hm, Wm)
+    if time_reduce == "mean":
+        cov_thw = cov_hw.mean(dim=2)
+    elif time_reduce == "max":
+        cov_thw = cov_hw.max(dim=2).values
+    elif time_reduce == "all":
+        cov_thw = cov_hw.min(dim=2).values
+    else:
+        raise ValueError("time_reduce ∈ {'mean','max','all'}")
+    per_obj_idx = []
+    per_t = Hm * Wm
+    for o in range(O):
+        nz = torch.empty(0, 3, dtype=torch.long, device=device)
+        tried = 0
+        thr = coverage_thresh
+        while tried <= retry_times:
+            thr_eff = max(0.0, float(thr))
+            sel = (cov_thw[o] >= thr_eff)
+            nz = torch.nonzero(sel, as_tuple=False)
+            if nz.numel() > 0:
+                break
+            tried += 1
+            thr -= retry_step
+        if nz.numel() == 0:
+            if ensure_at_least_one:
+                flat = cov_thw[o].reshape(-1)
+                arg = torch.argmax(flat)
+                t = arg // (Hm * Wm)
+                rem = arg % (Hm * Wm)
+                hp = rem // Wm
+                wp = rem % Wm
+                idx = (t * per_t + hp * Wm + wp).view(1)
+                per_obj_idx.append(idx.to(device=device, dtype=torch.long))
+            else:
+                per_obj_idx.append(torch.empty(0, dtype=torch.long, device=device))
+        else:
+            t = nz[:, 0]
+            hp = nz[:, 1]
+            wp = nz[:, 2]
+            idx = t * per_t + hp * Wm + wp
+            per_obj_idx.append(idx.to(device=device, dtype=torch.long))
+    if len(per_obj_idx) == 0:
+        union_idx = torch.empty(0, dtype=torch.long, device=device)
+    else:
+        union_idx = torch.unique(torch.cat(per_obj_idx, dim=0)) if per_obj_idx[0].numel() else torch.empty(0, dtype=torch.long, device=device)
+    union_idx_cpu = union_idx.cpu()
+    per_obj_idx_cpu = [idx.cpu() for idx in per_obj_idx]
+    cov_thw_cpu = cov_thw.cpu()
+    del M, M_flat, cov_hw, cov_thw, per_obj_idx, union_idx
+    if O > 0:
+        del sel, nz
+    return union_idx_cpu, per_obj_idx_cpu, cov_thw_cpu

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    {
+      "content": "<obj_traj_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<obj_traj_end>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,226 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<obj_traj_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "<obj_traj_end>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<obj_traj_start>",
+    "<obj_traj_end>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 128000,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff