Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- added_tokens.json +26 -0
- chat_template.jinja +7 -0
- config.json +150 -0
- example/2401075277.mp4 +3 -0
- example/2401075277_rle.json +0 -0
- generation_config.json +17 -0
- inference.py +213 -0
- merges.txt +0 -0
- model-00001-of-00002.safetensors +3 -0
- model-00002-of-00002.safetensors +3 -0
- model.safetensors.index.json +896 -0
- modeling_traser.py +179 -0
- qwen_vl_vsg_utils/src/qwen_vl_utils/__init__.py +7 -0
- qwen_vl_vsg_utils/src/qwen_vl_utils/__pycache__/__init__.cpython-310.pyc +0 -0
- qwen_vl_vsg_utils/src/qwen_vl_utils/__pycache__/vision_process.cpython-310.pyc +0 -0
- qwen_vl_vsg_utils/src/qwen_vl_utils/vision_process.py +432 -0
- resampler_utils/__pycache__/token_arrangement.cpython-310.pyc +0 -0
- resampler_utils/__pycache__/token_insert_1017_multi_resampler.cpython-310.pyc +0 -0
- resampler_utils/__pycache__/token_insert_1020_multi_two_resampler.cpython-310.pyc +0 -0
- resampler_utils/__pycache__/token_insert_new.cpython-310.pyc +0 -0
- resampler_utils/__pycache__/token_insert_no_resampler.cpython-310.pyc +0 -0
- resampler_utils/__pycache__/token_insert_single_resampler.cpython-310.pyc +0 -0
- resampler_utils/__pycache__/token_insert_temporal.cpython-310.pyc +0 -0
- resampler_utils/__pycache__/token_selection.cpython-310.pyc +0 -0
- resampler_utils/__pycache__/token_selection_bbox.cpython-310.pyc +0 -0
- resampler_utils/__pycache__/token_selection_temporal.cpython-310.pyc +0 -0
- resampler_utils/token_arrangement.py +640 -0
- resampler_utils/token_selection.py +101 -0
- special_tokens_map.json +45 -0
- tokenizer_config.json +226 -0
- vocab.json +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
example/2401075277.mp4 filter=lfs diff=lfs merge=lfs -text
|
added_tokens.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</tool_call>": 151658,
|
| 3 |
+
"<obj_traj_end>": 151666,
|
| 4 |
+
"<obj_traj_start>": 151665,
|
| 5 |
+
"<tool_call>": 151657,
|
| 6 |
+
"<|box_end|>": 151649,
|
| 7 |
+
"<|box_start|>": 151648,
|
| 8 |
+
"<|endoftext|>": 151643,
|
| 9 |
+
"<|file_sep|>": 151664,
|
| 10 |
+
"<|fim_middle|>": 151660,
|
| 11 |
+
"<|fim_pad|>": 151662,
|
| 12 |
+
"<|fim_prefix|>": 151659,
|
| 13 |
+
"<|fim_suffix|>": 151661,
|
| 14 |
+
"<|im_end|>": 151645,
|
| 15 |
+
"<|im_start|>": 151644,
|
| 16 |
+
"<|image_pad|>": 151655,
|
| 17 |
+
"<|object_ref_end|>": 151647,
|
| 18 |
+
"<|object_ref_start|>": 151646,
|
| 19 |
+
"<|quad_end|>": 151651,
|
| 20 |
+
"<|quad_start|>": 151650,
|
| 21 |
+
"<|repo_name|>": 151663,
|
| 22 |
+
"<|video_pad|>": 151656,
|
| 23 |
+
"<|vision_end|>": 151653,
|
| 24 |
+
"<|vision_pad|>": 151654,
|
| 25 |
+
"<|vision_start|>": 151652
|
| 26 |
+
}
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
|
| 2 |
+
You are a helpful assistant.<|im_end|>
|
| 3 |
+
{% endif %}<|im_start|>{{ message['role'] }}
|
| 4 |
+
{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
|
| 5 |
+
{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
|
| 6 |
+
{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
|
| 7 |
+
{% endif %}
|
config.json
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen2_5_VLForConditionalGeneration_Insert"
|
| 4 |
+
],
|
| 5 |
+
"attention_dropout": 0.0,
|
| 6 |
+
"bos_token_id": 151643,
|
| 7 |
+
"eos_token_id": 151645,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 2048,
|
| 10 |
+
"image_token_id": 151655,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": 11008,
|
| 13 |
+
"max_position_embeddings": 128000,
|
| 14 |
+
"max_window_layers": 70,
|
| 15 |
+
"model_type": "qwen2_5_vl",
|
| 16 |
+
"num_attention_heads": 16,
|
| 17 |
+
"num_hidden_layers": 36,
|
| 18 |
+
"num_key_value_heads": 2,
|
| 19 |
+
"obj_traj_end_id": 151666,
|
| 20 |
+
"obj_traj_start_id": 151665,
|
| 21 |
+
"resampler_depth": 3,
|
| 22 |
+
"temporal_resampler_n_latents": 32,
|
| 23 |
+
"rms_norm_eps": 1e-06,
|
| 24 |
+
"rope_scaling": {
|
| 25 |
+
"mrope_section": [
|
| 26 |
+
16,
|
| 27 |
+
24,
|
| 28 |
+
24
|
| 29 |
+
],
|
| 30 |
+
"rope_type": "default",
|
| 31 |
+
"type": "default"
|
| 32 |
+
},
|
| 33 |
+
"rope_theta": 1000000.0,
|
| 34 |
+
"object_resampler_n_latents": 32,
|
| 35 |
+
"sliding_window": 32768,
|
| 36 |
+
"text_config": {
|
| 37 |
+
"architectures": [
|
| 38 |
+
"Qwen2_5_VLForConditionalGeneration"
|
| 39 |
+
],
|
| 40 |
+
"attention_dropout": 0.0,
|
| 41 |
+
"bos_token_id": 151643,
|
| 42 |
+
"eos_token_id": 151645,
|
| 43 |
+
"hidden_act": "silu",
|
| 44 |
+
"hidden_size": 2048,
|
| 45 |
+
"image_token_id": null,
|
| 46 |
+
"initializer_range": 0.02,
|
| 47 |
+
"intermediate_size": 11008,
|
| 48 |
+
"layer_types": [
|
| 49 |
+
"full_attention",
|
| 50 |
+
"full_attention",
|
| 51 |
+
"full_attention",
|
| 52 |
+
"full_attention",
|
| 53 |
+
"full_attention",
|
| 54 |
+
"full_attention",
|
| 55 |
+
"full_attention",
|
| 56 |
+
"full_attention",
|
| 57 |
+
"full_attention",
|
| 58 |
+
"full_attention",
|
| 59 |
+
"full_attention",
|
| 60 |
+
"full_attention",
|
| 61 |
+
"full_attention",
|
| 62 |
+
"full_attention",
|
| 63 |
+
"full_attention",
|
| 64 |
+
"full_attention",
|
| 65 |
+
"full_attention",
|
| 66 |
+
"full_attention",
|
| 67 |
+
"full_attention",
|
| 68 |
+
"full_attention",
|
| 69 |
+
"full_attention",
|
| 70 |
+
"full_attention",
|
| 71 |
+
"full_attention",
|
| 72 |
+
"full_attention",
|
| 73 |
+
"full_attention",
|
| 74 |
+
"full_attention",
|
| 75 |
+
"full_attention",
|
| 76 |
+
"full_attention",
|
| 77 |
+
"full_attention",
|
| 78 |
+
"full_attention",
|
| 79 |
+
"full_attention",
|
| 80 |
+
"full_attention",
|
| 81 |
+
"full_attention",
|
| 82 |
+
"full_attention",
|
| 83 |
+
"full_attention",
|
| 84 |
+
"full_attention"
|
| 85 |
+
],
|
| 86 |
+
"max_position_embeddings": 128000,
|
| 87 |
+
"max_window_layers": 70,
|
| 88 |
+
"model_type": "qwen2_5_vl_text",
|
| 89 |
+
"num_attention_heads": 16,
|
| 90 |
+
"num_hidden_layers": 36,
|
| 91 |
+
"num_key_value_heads": 2,
|
| 92 |
+
"rms_norm_eps": 1e-06,
|
| 93 |
+
"rope_scaling": {
|
| 94 |
+
"mrope_section": [
|
| 95 |
+
16,
|
| 96 |
+
24,
|
| 97 |
+
24
|
| 98 |
+
],
|
| 99 |
+
"rope_type": "default",
|
| 100 |
+
"type": "default"
|
| 101 |
+
},
|
| 102 |
+
"rope_theta": 1000000.0,
|
| 103 |
+
"sliding_window": null,
|
| 104 |
+
"tie_word_embeddings": true,
|
| 105 |
+
"torch_dtype": "bfloat16",
|
| 106 |
+
"use_cache": true,
|
| 107 |
+
"use_sliding_window": false,
|
| 108 |
+
"video_token_id": null,
|
| 109 |
+
"vision_end_token_id": 151653,
|
| 110 |
+
"vision_start_token_id": 151652,
|
| 111 |
+
"vision_token_id": 151654,
|
| 112 |
+
"vocab_size": 151667
|
| 113 |
+
},
|
| 114 |
+
"torch_dtype": "bfloat16",
|
| 115 |
+
"transformers_version": "4.54.0",
|
| 116 |
+
"object_resampler": true,
|
| 117 |
+
"use_cache": false,
|
| 118 |
+
"use_resampler": true,
|
| 119 |
+
"use_sliding_window": false,
|
| 120 |
+
"video_token_id": 151656,
|
| 121 |
+
"vision_config": {
|
| 122 |
+
"depth": 32,
|
| 123 |
+
"fullatt_block_indexes": [
|
| 124 |
+
7,
|
| 125 |
+
15,
|
| 126 |
+
23,
|
| 127 |
+
31
|
| 128 |
+
],
|
| 129 |
+
"hidden_act": "silu",
|
| 130 |
+
"hidden_size": 1280,
|
| 131 |
+
"in_channels": 3,
|
| 132 |
+
"in_chans": 3,
|
| 133 |
+
"initializer_range": 0.02,
|
| 134 |
+
"intermediate_size": 3420,
|
| 135 |
+
"model_type": "qwen2_5_vl",
|
| 136 |
+
"num_heads": 16,
|
| 137 |
+
"out_hidden_size": 2048,
|
| 138 |
+
"patch_size": 14,
|
| 139 |
+
"spatial_merge_size": 2,
|
| 140 |
+
"spatial_patch_size": 14,
|
| 141 |
+
"temporal_patch_size": 2,
|
| 142 |
+
"tokens_per_second": 2,
|
| 143 |
+
"torch_dtype": "bfloat16",
|
| 144 |
+
"window_size": 112
|
| 145 |
+
},
|
| 146 |
+
"vision_end_token_id": 151653,
|
| 147 |
+
"vision_start_token_id": 151652,
|
| 148 |
+
"vision_token_id": 151654,
|
| 149 |
+
"vocab_size": 151667
|
| 150 |
+
}
|
example/2401075277.mp4
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bea771d46e14045b24a554333dbc07d27292f5927b15a2b3f2dc4ab4572329aa
|
| 3 |
+
size 3966614
|
example/2401075277_rle.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
generation_config.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 151643,
|
| 3 |
+
"do_sample": true,
|
| 4 |
+
"eos_token_id": [
|
| 5 |
+
151645,
|
| 6 |
+
151643
|
| 7 |
+
],
|
| 8 |
+
"pad_token_id": 151643,
|
| 9 |
+
"repetition_penalty": 1.05,
|
| 10 |
+
"resampler_depth": 3,
|
| 11 |
+
"temporal_resampler_n_latents": 32,
|
| 12 |
+
"object_resampler_n_latents": 32,
|
| 13 |
+
"temperature": 1e-06,
|
| 14 |
+
"transformers_version": "4.54.0",
|
| 15 |
+
"object_resampler": true,
|
| 16 |
+
"use_resampler": true
|
| 17 |
+
}
|
inference.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
Inference example for Qwen2.5-VL TRASER model.
|
| 5 |
+
Usage:
|
| 6 |
+
python inference.py \
|
| 7 |
+
--model_path . \
|
| 8 |
+
--video_path /path/to/video.mp4 \
|
| 9 |
+
--mask_path /path/to/mask.json \
|
| 10 |
+
--max_objects 40 \
|
| 11 |
+
--out_dir ./output
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import json
|
| 16 |
+
import argparse
|
| 17 |
+
import random
|
| 18 |
+
import torch
|
| 19 |
+
import numpy as np
|
| 20 |
+
from transformers import AutoProcessor, AutoTokenizer
|
| 21 |
+
|
| 22 |
+
# Import Custom Model
|
| 23 |
+
from modeling_traser import TRASER
|
| 24 |
+
|
| 25 |
+
# Import Utils
|
| 26 |
+
from qwen_vl_vsg_utils.src.qwen_vl_utils import process_vision_info
|
| 27 |
+
from resampler_utils.token_selection import select_tokens
|
| 28 |
+
from resampler_utils.token_arrangement import rearrange_token
|
| 29 |
+
from pycocotools import mask as maskUtils
|
| 30 |
+
import math
|
| 31 |
+
import torch.nn.functional as F
|
| 32 |
+
|
| 33 |
+
def set_seed(seed: int):
    """Seed every random number generator used by this script.

    Seeds, in order: Python's ``random``, NumPy's global RNG, PyTorch's
    default generator, and PyTorch's generators for all CUDA devices.

    Args:
        seed: The seed value handed to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def load_mask_data(mask_json_path):
    """Load the mask annotation file and return its parsed JSON content.

    The file is read as UTF-8 (the JSON interchange encoding) rather than the
    platform-default encoding, so the result does not depend on the locale.

    Args:
        mask_json_path: Path of the JSON mask file.

    Returns:
        The deserialized JSON value.
    """
    with open(mask_json_path, "r", encoding="utf-8") as f:
        return json.load(f)
|
| 43 |
+
|
| 44 |
+
def has_any_mask(mask_data, obj_id):
    """Tell whether an object has mask data in at least one frame.

    Args:
        mask_data: Iterable of frames; each frame is a sequence of per-object
            entries (dict-like with a "counts" key) or a falsy value.
        obj_id: Index of the object inside each frame's sequence.

    Returns:
        True if some frame holds a truthy entry at ``obj_id`` whose "counts"
        value is truthy, otherwise False.
    """

    def _frame_has_mask(frame):
        # Frames that are empty/None or too short cannot contain this object.
        if not frame or obj_id >= len(frame):
            return False
        entry = frame[obj_id]
        return bool(entry and entry.get("counts"))

    return any(_frame_has_mask(frame) for frame in mask_data)
|
| 49 |
+
|
| 50 |
+
def build_obj_masks_tensor(mask_data, obj_ids, sampled_idx, H_rz, W_rz, device):
    """Decode per-object RLE masks for the sampled frames into one dense tensor.

    Args:
        mask_data: Per-frame list; each frame is a list (or a falsy value) of
            per-object RLE dicts with "size" and "counts" keys, indexed by
            object id.
        obj_ids: Object ids (indices into each frame's list) to decode.
        sampled_idx: Frame indices into ``mask_data`` to decode, in order.
        H_rz: Target mask height.
        W_rz: Target mask width.
        device: Torch device on which the result is allocated.

    Returns:
        ``(obj_masks, keep_idx)``. ``obj_masks`` is a float32 tensor of shape
        ``(K, len(sampled_idx), H_rz, W_rz)`` with 0/1 values for the K
        objects that have at least one foreground pixel; ``keep_idx`` lists
        the positions within ``obj_ids`` of those K objects.
    """
    num_objs, num_frames = len(obj_ids), len(sampled_idx)
    obj_masks = torch.zeros((num_objs, num_frames, H_rz, W_rz), dtype=torch.float32, device=device)

    for o_i, oid in enumerate(obj_ids):
        for n_idx, fidx in enumerate(sampled_idx):
            # Sampled frame lies beyond the annotated frames: leave zeros.
            if fidx >= len(mask_data):
                continue
            frame_objs = mask_data[fidx]
            if not frame_objs or oid >= len(frame_objs):
                continue
            rle = frame_objs[oid]
            # Skip missing entries and entries without RLE data, the same
            # condition has_any_mask() uses to decide an object is absent.
            if not rle or not rle.get("counts"):
                continue

            m = maskUtils.decode({"size": rle["size"], "counts": rle["counts"]})
            if m.ndim == 3:
                m = m[:, :, 0]
            # interpolate() needs a (batch, channel, H, W) float tensor.
            m_t = torch.from_numpy(m.astype(np.uint8)).unsqueeze(0).unsqueeze(0).float().to(device)
            m_rz = F.interpolate(m_t, size=(H_rz, W_rz), mode="nearest")[0, 0]
            obj_masks[o_i, n_idx] = (m_rz > 0.5).float()

    # Reduce over explicit dims instead of view(O, -1): a -1 view is ambiguous
    # (RuntimeError) when the tensor has zero elements, e.g. no objects.
    has_pixels = obj_masks.sum(dim=(1, 2, 3)) > 0
    keep_idx = has_pixels.nonzero(as_tuple=False).squeeze(1).tolist()
    if len(keep_idx) < num_objs:
        obj_masks = obj_masks[keep_idx]
    return obj_masks, keep_idx
|
| 69 |
+
|
| 70 |
+
def run_single_video(model, processor, video_path, mask_path, out_dir, device, args):
    """Generate a video scene graph for one video and its object masks.

    Loads the mask JSON, keeps the objects that have at least one mask,
    builds the chat prompt and video inputs, maps the object masks to
    visual-token indices, rearranges the token sequence with
    ``rearrange_token`` and runs ``model.generate``. The decoded text is
    printed and, when ``out_dir`` is truthy, written to
    ``<out_dir>/output.txt``.

    Args:
        model: Model passed to ``rearrange_token`` and used for generation.
        processor: Processor providing ``apply_chat_template``, ``tokenizer``
            and the call that builds the model inputs.
        video_path: Path of the input video.
        mask_path: Path of the JSON mask file.
        out_dir: Directory for ``output.txt`` (must already exist); falsy to
            skip writing.
        device: Torch device for inputs and masks.
        args: Parsed CLI namespace; reads ``max_objects``,
            ``obj_traj_start_id`` and ``obj_traj_end_id``.
    """
    mask_data = load_mask_data(mask_path)

    # Candidate ids are the first `max_objects` entries of frame 0; because the
    # cap is applied here, the filtered list can never exceed `max_objects`,
    # so no random sub-sampling step is needed afterwards.
    # NOTE(review): if random sub-sampling of all eligible objects was the
    # intent, the cap would have to move after the filter — confirm.
    all_ids = range(min(len(mask_data[0]), args.max_objects))
    selected_obj_ids = [oid for oid in all_ids if has_any_mask(mask_data, oid)]

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {"type": "text", "text": "Output the video Scene Graph from the video and object trajectories:\n"},
            {"type": "video", "video": video_path}
        ]}
    ]

    prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # NOTE(review): the fps returned here is not forwarded; the processor is
    # called with fps=1 below — confirm this is intended.
    image_inputs, video_inputs, fps, selected_frame_idx = process_vision_info(messages, return_video_kwargs=True)

    proc_inputs = processor(
        text=[prompt_text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt", fps=1
    ).to(device)

    video_grid_thw = proc_inputs["video_grid_thw"]
    if isinstance(video_grid_thw, list):
        video_grid_thw = torch.stack([x.to(device) for x in video_grid_thw])
    else:
        video_grid_thw = video_grid_thw.to(device)

    T_grid = int(video_grid_thw[0, 0].item())
    H_patch, W_patch = int(video_grid_thw[0, 1].item()), int(video_grid_thw[0, 2].item())

    # Calculate mask resize dimensions (one patch covers patch_size pixels per side)
    patch_size = 14
    H_rz, W_rz = H_patch * patch_size, W_patch * patch_size

    # Build masks for the frames actually sampled from the video
    sampled_idx = selected_frame_idx[0]
    obj_masks, keep_idx = build_obj_masks_tensor(mask_data, selected_obj_ids, sampled_idx, H_rz, W_rz, device)
    selected_obj_ids = [selected_obj_ids[i] for i in keep_idx]

    # Select tokens (only the per-object indices are used below)
    _, per_obj_idx, _ = select_tokens(
        obj_masks=obj_masks,
        grid_thw=(T_grid, H_patch, W_patch),
        patch_size=patch_size,
        device=device
    )

    # Prepare input: batch of one sample
    per_obj_idx_batch = [per_obj_idx]

    # Prepare text labels ("Object 1: ", "Object 2: ", ...) for each object
    label_template = "Object {i}: "
    additional_texts = [label_template.format(i=(k + 1)) for k in range(len(per_obj_idx))]
    enc = processor.tokenizer(additional_texts, add_special_tokens=False)["input_ids"]
    text_token_ids_per_sample = [[torch.tensor(x, dtype=torch.long) for x in enc]]

    # Prepare timestamps.
    # NOTE(review): presumably each temporal grid step covers 2 seconds — confirm.
    seconds_per_grid = 2.0
    temporal_window_length = 4.0
    grids_per_window = int(temporal_window_length / seconds_per_grid)
    num_windows = math.ceil(T_grid / grids_per_window)

    temporal_text_list = [
        f"<{int(w_id * temporal_window_length)} - {int((w_id + 1) * temporal_window_length)} sec>"
        for w_id in range(num_windows)
    ]

    enc_ts = processor.tokenizer(temporal_text_list, add_special_tokens=False)["input_ids"]
    timestamp_token_ids_per_batch = [[torch.tensor(x) for x in enc_ts]]
    grids_per_window_batch = [grids_per_window]

    # Rearrange and Generate
    with torch.no_grad():
        new_emb, new_pid, new_mask, rope_deltas, cache_pos, _, _ = rearrange_token(
            model=model,
            input_ids=proc_inputs["input_ids"],
            attention_mask=proc_inputs["attention_mask"],
            pixel_values_videos=proc_inputs["pixel_values_videos"],
            video_grid_thw=video_grid_thw,
            image_grid_thw=None, pixel_values=None, second_per_grid_ts=None,
            obj_token_indices_per_sample=per_obj_idx_batch,
            obj_traj_start_id=args.obj_traj_start_id,
            obj_traj_end_id=args.obj_traj_end_id,
            text_token_ids_per_sample=text_token_ids_per_sample,
            timestamp_token_ids_per_batch=timestamp_token_ids_per_batch,
            grids_per_temporal_window_per_batch=grids_per_window_batch,
            use_resampler=True
        )

        gen_out = model.generate(
            inputs_embeds=new_emb,
            position_ids=new_pid,
            attention_mask=new_mask.long(),
            rope_deltas=rope_deltas,
            max_new_tokens=8192,
            do_sample=True,
            top_p=0.9,
            temperature=1e-6,
            repetition_penalty=1.05
        )

    decoded = processor.tokenizer.decode(gen_out[0], skip_special_tokens=True)
    print(f"Generated Output:\n{decoded}")

    if out_dir:
        # Explicit UTF-8 so non-ASCII model output cannot fail on platforms
        # whose default encoding is narrower.
        with open(os.path.join(out_dir, "output.txt"), "w", encoding="utf-8") as f:
            f.write(decoded)
|
| 184 |
+
|
| 185 |
+
def main():
    """Command-line entry point: parse arguments, load the model, run one video.

    Parses the CLI flags, seeds the RNGs with 42, creates the output
    directory when one is given, loads the TRASER weights in bfloat16 together
    with the Qwen2.5-VL processor (whose tokenizer is replaced by the one
    stored with the model), and calls ``run_single_video``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--model_path", type=str, required=True, help="Path to model or HF repo")
    for path_flag in ("--video_path", "--mask_path"):
        cli.add_argument(path_flag, type=str, required=True)
    cli.add_argument("--out_dir", type=str, default="./output")
    int_flags = (
        ("--max_objects", 40),
        ("--obj_traj_start_id", 151665),
        ("--obj_traj_end_id", 151666),
    )
    for int_flag, default_value in int_flags:
        cli.add_argument(int_flag, type=int, default=default_value)
    args = cli.parse_args()

    set_seed(42)
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    if args.out_dir:
        os.makedirs(args.out_dir, exist_ok=True)

    # The TRASER class is loaded explicitly (instead of AutoModel with
    # trust_remote_code=True) so that local weights are guaranteed to work.
    model = TRASER.from_pretrained(args.model_path, torch_dtype=torch.bfloat16)
    model = model.to(device)

    # Base processor from the hub, paired with the tokenizer shipped with the model.
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
    processor.tokenizer = AutoTokenizer.from_pretrained(args.model_path)

    run_single_video(model, processor, args.video_path, args.mask_path, args.out_dir, device, args)
|
| 211 |
+
|
| 212 |
+
if __name__ == "__main__":
|
| 213 |
+
main()
|
merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
model-00001-of-00002.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e1d223559703e608b90365e2f245ee07372d41572d7d8c80cf39186efa2944a
|
| 3 |
+
size 4996648936
|
model-00002-of-00002.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4b8c3ab4f120fcefc2d012648c40340348938dc7c47e1b0120787dff0eff2e9
|
| 3 |
+
size 3210291272
|
model.safetensors.index.json
ADDED
|
@@ -0,0 +1,896 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_parameters": 927744,
|
| 4 |
+
"total_size": 8206839808
|
| 5 |
+
},
|
| 6 |
+
"weight_map": {
|
| 7 |
+
"model.embed_tokens.weight": "model-00001-of-00002.safetensors",
|
| 8 |
+
"model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 9 |
+
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 10 |
+
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 11 |
+
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 12 |
+
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 13 |
+
"model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 14 |
+
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 15 |
+
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 16 |
+
"model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 17 |
+
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 18 |
+
"model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 19 |
+
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 20 |
+
"model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 21 |
+
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 22 |
+
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 23 |
+
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 24 |
+
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 25 |
+
"model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 26 |
+
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 27 |
+
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 28 |
+
"model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 29 |
+
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 30 |
+
"model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 31 |
+
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 32 |
+
"model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 33 |
+
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 34 |
+
"model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 35 |
+
"model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 36 |
+
"model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 37 |
+
"model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 38 |
+
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 39 |
+
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 40 |
+
"model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 41 |
+
"model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 42 |
+
"model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 43 |
+
"model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 44 |
+
"model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 45 |
+
"model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 46 |
+
"model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 47 |
+
"model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 48 |
+
"model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 49 |
+
"model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 50 |
+
"model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 51 |
+
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 52 |
+
"model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 53 |
+
"model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 54 |
+
"model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 55 |
+
"model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 56 |
+
"model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 57 |
+
"model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 58 |
+
"model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 59 |
+
"model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 60 |
+
"model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 61 |
+
"model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 62 |
+
"model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 63 |
+
"model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 64 |
+
"model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 65 |
+
"model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 66 |
+
"model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 67 |
+
"model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 68 |
+
"model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 69 |
+
"model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 70 |
+
"model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 71 |
+
"model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 72 |
+
"model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 73 |
+
"model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 74 |
+
"model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 75 |
+
"model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 76 |
+
"model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 77 |
+
"model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 78 |
+
"model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 79 |
+
"model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 80 |
+
"model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 81 |
+
"model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 82 |
+
"model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 83 |
+
"model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 84 |
+
"model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 85 |
+
"model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 86 |
+
"model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 87 |
+
"model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 88 |
+
"model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 89 |
+
"model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 90 |
+
"model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 91 |
+
"model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 92 |
+
"model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 93 |
+
"model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 94 |
+
"model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 95 |
+
"model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 96 |
+
"model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 97 |
+
"model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 98 |
+
"model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 99 |
+
"model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 100 |
+
"model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 101 |
+
"model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 102 |
+
"model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 103 |
+
"model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 104 |
+
"model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 105 |
+
"model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 106 |
+
"model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 107 |
+
"model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 108 |
+
"model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 109 |
+
"model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 110 |
+
"model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 111 |
+
"model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 112 |
+
"model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 113 |
+
"model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 114 |
+
"model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 115 |
+
"model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 116 |
+
"model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 117 |
+
"model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 118 |
+
"model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 119 |
+
"model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 120 |
+
"model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 121 |
+
"model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 122 |
+
"model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 123 |
+
"model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 124 |
+
"model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 125 |
+
"model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 126 |
+
"model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 127 |
+
"model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 128 |
+
"model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 129 |
+
"model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 130 |
+
"model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 131 |
+
"model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 132 |
+
"model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 133 |
+
"model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 134 |
+
"model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 135 |
+
"model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 136 |
+
"model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 137 |
+
"model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 138 |
+
"model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 139 |
+
"model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 140 |
+
"model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 141 |
+
"model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 142 |
+
"model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 143 |
+
"model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 144 |
+
"model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 145 |
+
"model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 146 |
+
"model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 147 |
+
"model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 148 |
+
"model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 149 |
+
"model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 150 |
+
"model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 151 |
+
"model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 152 |
+
"model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 153 |
+
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 154 |
+
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 155 |
+
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 156 |
+
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 157 |
+
"model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 158 |
+
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 159 |
+
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 160 |
+
"model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 161 |
+
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 162 |
+
"model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 163 |
+
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 164 |
+
"model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 165 |
+
"model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 166 |
+
"model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 167 |
+
"model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 168 |
+
"model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 169 |
+
"model.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 170 |
+
"model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 171 |
+
"model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 172 |
+
"model.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 173 |
+
"model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 174 |
+
"model.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 175 |
+
"model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 176 |
+
"model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 177 |
+
"model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 178 |
+
"model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 179 |
+
"model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 180 |
+
"model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 181 |
+
"model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 182 |
+
"model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 183 |
+
"model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 184 |
+
"model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 185 |
+
"model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 186 |
+
"model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 187 |
+
"model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 188 |
+
"model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 189 |
+
"model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 190 |
+
"model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 191 |
+
"model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 192 |
+
"model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 193 |
+
"model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 194 |
+
"model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 195 |
+
"model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 196 |
+
"model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 197 |
+
"model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 198 |
+
"model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 199 |
+
"model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 200 |
+
"model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 201 |
+
"model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 202 |
+
"model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 203 |
+
"model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 204 |
+
"model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 205 |
+
"model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 206 |
+
"model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 207 |
+
"model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 208 |
+
"model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 209 |
+
"model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 210 |
+
"model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 211 |
+
"model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 212 |
+
"model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 213 |
+
"model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 214 |
+
"model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 215 |
+
"model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 216 |
+
"model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 217 |
+
"model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 218 |
+
"model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 219 |
+
"model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 220 |
+
"model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 221 |
+
"model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 222 |
+
"model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 223 |
+
"model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 224 |
+
"model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 225 |
+
"model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 226 |
+
"model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 227 |
+
"model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 228 |
+
"model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 229 |
+
"model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 230 |
+
"model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 231 |
+
"model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 232 |
+
"model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 233 |
+
"model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 234 |
+
"model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 235 |
+
"model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 236 |
+
"model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 237 |
+
"model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 238 |
+
"model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 239 |
+
"model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 240 |
+
"model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 241 |
+
"model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 242 |
+
"model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 243 |
+
"model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 244 |
+
"model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 245 |
+
"model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 246 |
+
"model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 247 |
+
"model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 248 |
+
"model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 249 |
+
"model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 250 |
+
"model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 251 |
+
"model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 252 |
+
"model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 253 |
+
"model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 254 |
+
"model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 255 |
+
"model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 256 |
+
"model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 257 |
+
"model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 258 |
+
"model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 259 |
+
"model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 260 |
+
"model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 261 |
+
"model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 262 |
+
"model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 263 |
+
"model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 264 |
+
"model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 265 |
+
"model.layers.28.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 266 |
+
"model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 267 |
+
"model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 268 |
+
"model.layers.28.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 269 |
+
"model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 270 |
+
"model.layers.28.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 271 |
+
"model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 272 |
+
"model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 273 |
+
"model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 274 |
+
"model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 275 |
+
"model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 276 |
+
"model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 277 |
+
"model.layers.29.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 278 |
+
"model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 279 |
+
"model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 280 |
+
"model.layers.29.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 281 |
+
"model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 282 |
+
"model.layers.29.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 283 |
+
"model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 284 |
+
"model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 285 |
+
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 286 |
+
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 287 |
+
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 288 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 289 |
+
"model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 290 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 291 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 292 |
+
"model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 293 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 294 |
+
"model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 295 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 296 |
+
"model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 297 |
+
"model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 298 |
+
"model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 299 |
+
"model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 300 |
+
"model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 301 |
+
"model.layers.30.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 302 |
+
"model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 303 |
+
"model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 304 |
+
"model.layers.30.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 305 |
+
"model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 306 |
+
"model.layers.30.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 307 |
+
"model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 308 |
+
"model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 309 |
+
"model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 310 |
+
"model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 311 |
+
"model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 312 |
+
"model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 313 |
+
"model.layers.31.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 314 |
+
"model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 315 |
+
"model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 316 |
+
"model.layers.31.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 317 |
+
"model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 318 |
+
"model.layers.31.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 319 |
+
"model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 320 |
+
"model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 321 |
+
"model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 322 |
+
"model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 323 |
+
"model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 324 |
+
"model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 325 |
+
"model.layers.32.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 326 |
+
"model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 327 |
+
"model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 328 |
+
"model.layers.32.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 329 |
+
"model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 330 |
+
"model.layers.32.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 331 |
+
"model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 332 |
+
"model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 333 |
+
"model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 334 |
+
"model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 335 |
+
"model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 336 |
+
"model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 337 |
+
"model.layers.33.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 338 |
+
"model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 339 |
+
"model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 340 |
+
"model.layers.33.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 341 |
+
"model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 342 |
+
"model.layers.33.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 343 |
+
"model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 344 |
+
"model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 345 |
+
"model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 346 |
+
"model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 347 |
+
"model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 348 |
+
"model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 349 |
+
"model.layers.34.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 350 |
+
"model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 351 |
+
"model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 352 |
+
"model.layers.34.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 353 |
+
"model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 354 |
+
"model.layers.34.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 355 |
+
"model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 356 |
+
"model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 357 |
+
"model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 358 |
+
"model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 359 |
+
"model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 360 |
+
"model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 361 |
+
"model.layers.35.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 362 |
+
"model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 363 |
+
"model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 364 |
+
"model.layers.35.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 365 |
+
"model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 366 |
+
"model.layers.35.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 367 |
+
"model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 368 |
+
"model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 369 |
+
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 370 |
+
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 371 |
+
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 372 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 373 |
+
"model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 374 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 375 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 376 |
+
"model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 377 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 378 |
+
"model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 379 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 380 |
+
"model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 381 |
+
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 382 |
+
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 383 |
+
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 384 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 385 |
+
"model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 386 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 387 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 388 |
+
"model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 389 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 390 |
+
"model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 391 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 392 |
+
"model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 393 |
+
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 394 |
+
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 395 |
+
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 396 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 397 |
+
"model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 398 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 399 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 400 |
+
"model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 401 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 402 |
+
"model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 403 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 404 |
+
"model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 405 |
+
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 406 |
+
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 407 |
+
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 408 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 409 |
+
"model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 410 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 411 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 412 |
+
"model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 413 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 414 |
+
"model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 415 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 416 |
+
"model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 417 |
+
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 418 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 419 |
+
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 420 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 421 |
+
"model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 422 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 423 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 424 |
+
"model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 425 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 426 |
+
"model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 427 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 428 |
+
"model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 429 |
+
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 430 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 431 |
+
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 432 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 433 |
+
"model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 434 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 435 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 436 |
+
"model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 437 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 438 |
+
"model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 439 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 440 |
+
"model.norm.weight": "model-00002-of-00002.safetensors",
|
| 441 |
+
"perceiver_resampler.latents": "model-00002-of-00002.safetensors",
|
| 442 |
+
"perceiver_resampler.layers.0.input_context_norm.weight": "model-00002-of-00002.safetensors",
|
| 443 |
+
"perceiver_resampler.layers.0.input_latents_norm.weight": "model-00002-of-00002.safetensors",
|
| 444 |
+
"perceiver_resampler.layers.0.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 445 |
+
"perceiver_resampler.layers.0.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 446 |
+
"perceiver_resampler.layers.0.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 447 |
+
"perceiver_resampler.layers.0.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 448 |
+
"perceiver_resampler.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 449 |
+
"perceiver_resampler.layers.0.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 450 |
+
"perceiver_resampler.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 451 |
+
"perceiver_resampler.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 452 |
+
"perceiver_resampler.layers.1.input_context_norm.weight": "model-00002-of-00002.safetensors",
|
| 453 |
+
"perceiver_resampler.layers.1.input_latents_norm.weight": "model-00002-of-00002.safetensors",
|
| 454 |
+
"perceiver_resampler.layers.1.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 455 |
+
"perceiver_resampler.layers.1.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 456 |
+
"perceiver_resampler.layers.1.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 457 |
+
"perceiver_resampler.layers.1.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 458 |
+
"perceiver_resampler.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 459 |
+
"perceiver_resampler.layers.1.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 460 |
+
"perceiver_resampler.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 461 |
+
"perceiver_resampler.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 462 |
+
"perceiver_resampler.layers.2.input_context_norm.weight": "model-00002-of-00002.safetensors",
|
| 463 |
+
"perceiver_resampler.layers.2.input_latents_norm.weight": "model-00002-of-00002.safetensors",
|
| 464 |
+
"perceiver_resampler.layers.2.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 465 |
+
"perceiver_resampler.layers.2.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 466 |
+
"perceiver_resampler.layers.2.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 467 |
+
"perceiver_resampler.layers.2.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 468 |
+
"perceiver_resampler.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 469 |
+
"perceiver_resampler.layers.2.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 470 |
+
"perceiver_resampler.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 471 |
+
"perceiver_resampler.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 472 |
+
"perceiver_resampler.norm.weight": "model-00002-of-00002.safetensors",
|
| 473 |
+
"second_perceiver_resampler.latents": "model-00002-of-00002.safetensors",
|
| 474 |
+
"second_perceiver_resampler.layers.0.input_context_norm.weight": "model-00002-of-00002.safetensors",
|
| 475 |
+
"second_perceiver_resampler.layers.0.input_latents_norm.weight": "model-00002-of-00002.safetensors",
|
| 476 |
+
"second_perceiver_resampler.layers.0.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 477 |
+
"second_perceiver_resampler.layers.0.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 478 |
+
"second_perceiver_resampler.layers.0.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 479 |
+
"second_perceiver_resampler.layers.0.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 480 |
+
"second_perceiver_resampler.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 481 |
+
"second_perceiver_resampler.layers.0.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 482 |
+
"second_perceiver_resampler.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 483 |
+
"second_perceiver_resampler.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 484 |
+
"second_perceiver_resampler.layers.1.input_context_norm.weight": "model-00002-of-00002.safetensors",
|
| 485 |
+
"second_perceiver_resampler.layers.1.input_latents_norm.weight": "model-00002-of-00002.safetensors",
|
| 486 |
+
"second_perceiver_resampler.layers.1.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 487 |
+
"second_perceiver_resampler.layers.1.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 488 |
+
"second_perceiver_resampler.layers.1.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 489 |
+
"second_perceiver_resampler.layers.1.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 490 |
+
"second_perceiver_resampler.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 491 |
+
"second_perceiver_resampler.layers.1.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 492 |
+
"second_perceiver_resampler.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 493 |
+
"second_perceiver_resampler.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 494 |
+
"second_perceiver_resampler.layers.2.input_context_norm.weight": "model-00002-of-00002.safetensors",
|
| 495 |
+
"second_perceiver_resampler.layers.2.input_latents_norm.weight": "model-00002-of-00002.safetensors",
|
| 496 |
+
"second_perceiver_resampler.layers.2.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 497 |
+
"second_perceiver_resampler.layers.2.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 498 |
+
"second_perceiver_resampler.layers.2.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 499 |
+
"second_perceiver_resampler.layers.2.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 500 |
+
"second_perceiver_resampler.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 501 |
+
"second_perceiver_resampler.layers.2.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 502 |
+
"second_perceiver_resampler.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 503 |
+
"second_perceiver_resampler.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 504 |
+
"second_perceiver_resampler.norm.weight": "model-00002-of-00002.safetensors",
|
| 505 |
+
"visual.blocks.0.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 506 |
+
"visual.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 507 |
+
"visual.blocks.0.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 508 |
+
"visual.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 509 |
+
"visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 510 |
+
"visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 511 |
+
"visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 512 |
+
"visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 513 |
+
"visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 514 |
+
"visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 515 |
+
"visual.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
|
| 516 |
+
"visual.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
|
| 517 |
+
"visual.blocks.1.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 518 |
+
"visual.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 519 |
+
"visual.blocks.1.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 520 |
+
"visual.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 521 |
+
"visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 522 |
+
"visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 523 |
+
"visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 524 |
+
"visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 525 |
+
"visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 526 |
+
"visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 527 |
+
"visual.blocks.1.norm1.weight": "model-00001-of-00002.safetensors",
|
| 528 |
+
"visual.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
|
| 529 |
+
"visual.blocks.10.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 530 |
+
"visual.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 531 |
+
"visual.blocks.10.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 532 |
+
"visual.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 533 |
+
"visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 534 |
+
"visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 535 |
+
"visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 536 |
+
"visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 537 |
+
"visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 538 |
+
"visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 539 |
+
"visual.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
|
| 540 |
+
"visual.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
|
| 541 |
+
"visual.blocks.11.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 542 |
+
"visual.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 543 |
+
"visual.blocks.11.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 544 |
+
"visual.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 545 |
+
"visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 546 |
+
"visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 547 |
+
"visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 548 |
+
"visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 549 |
+
"visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 550 |
+
"visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 551 |
+
"visual.blocks.11.norm1.weight": "model-00001-of-00002.safetensors",
|
| 552 |
+
"visual.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
|
| 553 |
+
"visual.blocks.12.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 554 |
+
"visual.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 555 |
+
"visual.blocks.12.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 556 |
+
"visual.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 557 |
+
"visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 558 |
+
"visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 559 |
+
"visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 560 |
+
"visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 561 |
+
"visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 562 |
+
"visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 563 |
+
"visual.blocks.12.norm1.weight": "model-00001-of-00002.safetensors",
|
| 564 |
+
"visual.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
|
| 565 |
+
"visual.blocks.13.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 566 |
+
"visual.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 567 |
+
"visual.blocks.13.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 568 |
+
"visual.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 569 |
+
"visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 570 |
+
"visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 571 |
+
"visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 572 |
+
"visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 573 |
+
"visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 574 |
+
"visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 575 |
+
"visual.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
|
| 576 |
+
"visual.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
|
| 577 |
+
"visual.blocks.14.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 578 |
+
"visual.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 579 |
+
"visual.blocks.14.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 580 |
+
"visual.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 581 |
+
"visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 582 |
+
"visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 583 |
+
"visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 584 |
+
"visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 585 |
+
"visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 586 |
+
"visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 587 |
+
"visual.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
|
| 588 |
+
"visual.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
|
| 589 |
+
"visual.blocks.15.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 590 |
+
"visual.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 591 |
+
"visual.blocks.15.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 592 |
+
"visual.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 593 |
+
"visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 594 |
+
"visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 595 |
+
"visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 596 |
+
"visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 597 |
+
"visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 598 |
+
"visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 599 |
+
"visual.blocks.15.norm1.weight": "model-00001-of-00002.safetensors",
|
| 600 |
+
"visual.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
|
| 601 |
+
"visual.blocks.16.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 602 |
+
"visual.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 603 |
+
"visual.blocks.16.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 604 |
+
"visual.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 605 |
+
"visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 606 |
+
"visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 607 |
+
"visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 608 |
+
"visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 609 |
+
"visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 610 |
+
"visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 611 |
+
"visual.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
|
| 612 |
+
"visual.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
|
| 613 |
+
"visual.blocks.17.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 614 |
+
"visual.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 615 |
+
"visual.blocks.17.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 616 |
+
"visual.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 617 |
+
"visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 618 |
+
"visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 619 |
+
"visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 620 |
+
"visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 621 |
+
"visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 622 |
+
"visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 623 |
+
"visual.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
|
| 624 |
+
"visual.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
|
| 625 |
+
"visual.blocks.18.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 626 |
+
"visual.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 627 |
+
"visual.blocks.18.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 628 |
+
"visual.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 629 |
+
"visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 630 |
+
"visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 631 |
+
"visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 632 |
+
"visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 633 |
+
"visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 634 |
+
"visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 635 |
+
"visual.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
|
| 636 |
+
"visual.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
|
| 637 |
+
"visual.blocks.19.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 638 |
+
"visual.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 639 |
+
"visual.blocks.19.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 640 |
+
"visual.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 641 |
+
"visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 642 |
+
"visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 643 |
+
"visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 644 |
+
"visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 645 |
+
"visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 646 |
+
"visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 647 |
+
"visual.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
|
| 648 |
+
"visual.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
|
| 649 |
+
"visual.blocks.2.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 650 |
+
"visual.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 651 |
+
"visual.blocks.2.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 652 |
+
"visual.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 653 |
+
"visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 654 |
+
"visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 655 |
+
"visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 656 |
+
"visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 657 |
+
"visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 658 |
+
"visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 659 |
+
"visual.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
|
| 660 |
+
"visual.blocks.2.norm2.weight": "model-00001-of-00002.safetensors",
|
| 661 |
+
"visual.blocks.20.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 662 |
+
"visual.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 663 |
+
"visual.blocks.20.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 664 |
+
"visual.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 665 |
+
"visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 666 |
+
"visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 667 |
+
"visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 668 |
+
"visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 669 |
+
"visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 670 |
+
"visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 671 |
+
"visual.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
|
| 672 |
+
"visual.blocks.20.norm2.weight": "model-00001-of-00002.safetensors",
|
| 673 |
+
"visual.blocks.21.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 674 |
+
"visual.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 675 |
+
"visual.blocks.21.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 676 |
+
"visual.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 677 |
+
"visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 678 |
+
"visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 679 |
+
"visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 680 |
+
"visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 681 |
+
"visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 682 |
+
"visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 683 |
+
"visual.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
|
| 684 |
+
"visual.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
|
| 685 |
+
"visual.blocks.22.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 686 |
+
"visual.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 687 |
+
"visual.blocks.22.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 688 |
+
"visual.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 689 |
+
"visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 690 |
+
"visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 691 |
+
"visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 692 |
+
"visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 693 |
+
"visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 694 |
+
"visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 695 |
+
"visual.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
|
| 696 |
+
"visual.blocks.22.norm2.weight": "model-00001-of-00002.safetensors",
|
| 697 |
+
"visual.blocks.23.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 698 |
+
"visual.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 699 |
+
"visual.blocks.23.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 700 |
+
"visual.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 701 |
+
"visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 702 |
+
"visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 703 |
+
"visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 704 |
+
"visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 705 |
+
"visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 706 |
+
"visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 707 |
+
"visual.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
|
| 708 |
+
"visual.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
|
| 709 |
+
"visual.blocks.24.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 710 |
+
"visual.blocks.24.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 711 |
+
"visual.blocks.24.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 712 |
+
"visual.blocks.24.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 713 |
+
"visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 714 |
+
"visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 715 |
+
"visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 716 |
+
"visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 717 |
+
"visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 718 |
+
"visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 719 |
+
"visual.blocks.24.norm1.weight": "model-00001-of-00002.safetensors",
|
| 720 |
+
"visual.blocks.24.norm2.weight": "model-00001-of-00002.safetensors",
|
| 721 |
+
"visual.blocks.25.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 722 |
+
"visual.blocks.25.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 723 |
+
"visual.blocks.25.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 724 |
+
"visual.blocks.25.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 725 |
+
"visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 726 |
+
"visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 727 |
+
"visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 728 |
+
"visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 729 |
+
"visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 730 |
+
"visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 731 |
+
"visual.blocks.25.norm1.weight": "model-00001-of-00002.safetensors",
|
| 732 |
+
"visual.blocks.25.norm2.weight": "model-00001-of-00002.safetensors",
|
| 733 |
+
"visual.blocks.26.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 734 |
+
"visual.blocks.26.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 735 |
+
"visual.blocks.26.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 736 |
+
"visual.blocks.26.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 737 |
+
"visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 738 |
+
"visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 739 |
+
"visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 740 |
+
"visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 741 |
+
"visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 742 |
+
"visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 743 |
+
"visual.blocks.26.norm1.weight": "model-00001-of-00002.safetensors",
|
| 744 |
+
"visual.blocks.26.norm2.weight": "model-00001-of-00002.safetensors",
|
| 745 |
+
"visual.blocks.27.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 746 |
+
"visual.blocks.27.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 747 |
+
"visual.blocks.27.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 748 |
+
"visual.blocks.27.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 749 |
+
"visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 750 |
+
"visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 751 |
+
"visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 752 |
+
"visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 753 |
+
"visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 754 |
+
"visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 755 |
+
"visual.blocks.27.norm1.weight": "model-00001-of-00002.safetensors",
|
| 756 |
+
"visual.blocks.27.norm2.weight": "model-00001-of-00002.safetensors",
|
| 757 |
+
"visual.blocks.28.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 758 |
+
"visual.blocks.28.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 759 |
+
"visual.blocks.28.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 760 |
+
"visual.blocks.28.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 761 |
+
"visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 762 |
+
"visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 763 |
+
"visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 764 |
+
"visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 765 |
+
"visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 766 |
+
"visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 767 |
+
"visual.blocks.28.norm1.weight": "model-00001-of-00002.safetensors",
|
| 768 |
+
"visual.blocks.28.norm2.weight": "model-00001-of-00002.safetensors",
|
| 769 |
+
"visual.blocks.29.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 770 |
+
"visual.blocks.29.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 771 |
+
"visual.blocks.29.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 772 |
+
"visual.blocks.29.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 773 |
+
"visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 774 |
+
"visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 775 |
+
"visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 776 |
+
"visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 777 |
+
"visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 778 |
+
"visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 779 |
+
"visual.blocks.29.norm1.weight": "model-00001-of-00002.safetensors",
|
| 780 |
+
"visual.blocks.29.norm2.weight": "model-00001-of-00002.safetensors",
|
| 781 |
+
"visual.blocks.3.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 782 |
+
"visual.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 783 |
+
"visual.blocks.3.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 784 |
+
"visual.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 785 |
+
"visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 786 |
+
"visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 787 |
+
"visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 788 |
+
"visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 789 |
+
"visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 790 |
+
"visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 791 |
+
"visual.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
|
| 792 |
+
"visual.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
|
| 793 |
+
"visual.blocks.30.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 794 |
+
"visual.blocks.30.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 795 |
+
"visual.blocks.30.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 796 |
+
"visual.blocks.30.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 797 |
+
"visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 798 |
+
"visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 799 |
+
"visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 800 |
+
"visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 801 |
+
"visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 802 |
+
"visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 803 |
+
"visual.blocks.30.norm1.weight": "model-00001-of-00002.safetensors",
|
| 804 |
+
"visual.blocks.30.norm2.weight": "model-00001-of-00002.safetensors",
|
| 805 |
+
"visual.blocks.31.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 806 |
+
"visual.blocks.31.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 807 |
+
"visual.blocks.31.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 808 |
+
"visual.blocks.31.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 809 |
+
"visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 810 |
+
"visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 811 |
+
"visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 812 |
+
"visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 813 |
+
"visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 814 |
+
"visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 815 |
+
"visual.blocks.31.norm1.weight": "model-00001-of-00002.safetensors",
|
| 816 |
+
"visual.blocks.31.norm2.weight": "model-00001-of-00002.safetensors",
|
| 817 |
+
"visual.blocks.4.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 818 |
+
"visual.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 819 |
+
"visual.blocks.4.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 820 |
+
"visual.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 821 |
+
"visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 822 |
+
"visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 823 |
+
"visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 824 |
+
"visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 825 |
+
"visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 826 |
+
"visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 827 |
+
"visual.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
|
| 828 |
+
"visual.blocks.4.norm2.weight": "model-00001-of-00002.safetensors",
|
| 829 |
+
"visual.blocks.5.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 830 |
+
"visual.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 831 |
+
"visual.blocks.5.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 832 |
+
"visual.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 833 |
+
"visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 834 |
+
"visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 835 |
+
"visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 836 |
+
"visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 837 |
+
"visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 838 |
+
"visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 839 |
+
"visual.blocks.5.norm1.weight": "model-00001-of-00002.safetensors",
|
| 840 |
+
"visual.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
|
| 841 |
+
"visual.blocks.6.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 842 |
+
"visual.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 843 |
+
"visual.blocks.6.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 844 |
+
"visual.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 845 |
+
"visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 846 |
+
"visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 847 |
+
"visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 848 |
+
"visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 849 |
+
"visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 850 |
+
"visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 851 |
+
"visual.blocks.6.norm1.weight": "model-00001-of-00002.safetensors",
|
| 852 |
+
"visual.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
|
| 853 |
+
"visual.blocks.7.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 854 |
+
"visual.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 855 |
+
"visual.blocks.7.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 856 |
+
"visual.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 857 |
+
"visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 858 |
+
"visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 859 |
+
"visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 860 |
+
"visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 861 |
+
"visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 862 |
+
"visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 863 |
+
"visual.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
|
| 864 |
+
"visual.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
|
| 865 |
+
"visual.blocks.8.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 866 |
+
"visual.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 867 |
+
"visual.blocks.8.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 868 |
+
"visual.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 869 |
+
"visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 870 |
+
"visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 871 |
+
"visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 872 |
+
"visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 873 |
+
"visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 874 |
+
"visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 875 |
+
"visual.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
|
| 876 |
+
"visual.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
|
| 877 |
+
"visual.blocks.9.attn.proj.bias": "model-00001-of-00002.safetensors",
|
| 878 |
+
"visual.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 879 |
+
"visual.blocks.9.attn.qkv.bias": "model-00001-of-00002.safetensors",
|
| 880 |
+
"visual.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 881 |
+
"visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
|
| 882 |
+
"visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 883 |
+
"visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
|
| 884 |
+
"visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 885 |
+
"visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
|
| 886 |
+
"visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 887 |
+
"visual.blocks.9.norm1.weight": "model-00001-of-00002.safetensors",
|
| 888 |
+
"visual.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
|
| 889 |
+
"visual.merger.ln_q.weight": "model-00001-of-00002.safetensors",
|
| 890 |
+
"visual.merger.mlp.0.bias": "model-00001-of-00002.safetensors",
|
| 891 |
+
"visual.merger.mlp.0.weight": "model-00001-of-00002.safetensors",
|
| 892 |
+
"visual.merger.mlp.2.bias": "model-00001-of-00002.safetensors",
|
| 893 |
+
"visual.merger.mlp.2.weight": "model-00001-of-00002.safetensors",
|
| 894 |
+
"visual.patch_embed.proj.weight": "model-00001-of-00002.safetensors"
|
| 895 |
+
}
|
| 896 |
+
}
|
modeling_traser.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from typing import List, Tuple, Optional, Any, Dict
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
|
| 6 |
+
from transformers import Qwen2_5_VLForConditionalGeneration
|
| 7 |
+
from transformers.modeling_outputs import ModelOutput
|
| 8 |
+
from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig
|
| 9 |
+
from transformers.models.idefics2.modeling_idefics2 import Idefics2PerceiverResampler
|
| 10 |
+
from transformers.models.idefics2.configuration_idefics2 import Idefics2PerceiverConfig
|
| 11 |
+
from transformers.utils import ModelOutput
|
| 12 |
+
from transformers.processing_utils import Unpack
|
| 13 |
+
|
| 14 |
+
@dataclass
class TRASEROutput(ModelOutput):
    """Output container returned by ``TRASER.forward``.

    All fields default to ``None``; ``TRASER.forward`` fills them from the
    language-model outputs and the LM head.
    """

    # Set by TRASER.forward only when `labels` is provided; otherwise None.
    loss: Optional[torch.FloatTensor] = None
    # Result of applying the LM head to the language model's last hidden state.
    logits: Optional[torch.FloatTensor] = None
    # Cache object passed through unchanged from the language-model outputs.
    past_key_values: Optional[List[torch.FloatTensor]] = None
    # Passed through from the language-model outputs (`outputs.hidden_states`).
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Passed through from the language-model outputs (`outputs.attentions`).
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    # Copy of `model.rope_deltas` at the time forward() returns.
    rope_deltas: Optional[torch.LongTensor] = None
|
| 22 |
+
|
| 23 |
+
class TRASER(Qwen2_5_VLForConditionalGeneration):
    """Qwen2.5-VL conditional-generation model with extra perceiver resamplers.

    On top of the parent model this class registers one (optionally two)
    ``Idefics2PerceiverResampler`` modules and overrides ``forward`` /
    ``prepare_inputs_for_generation``.

    NOTE(review): ``forward`` in this class never calls the resamplers itself;
    presumably external code applies them while building ``inputs_embeds`` --
    confirm against the inference script.
    """

    def __init__(self, config: Qwen2_5_VLConfig, **kwargs):
        """Build the parent model, then attach the perceiver resampler(s).

        Args:
            config: Model configuration; also read for the resampler settings.
            **kwargs: Extra settings copied onto ``config`` only when ``config``
                does not already define an attribute of the same name.
        """
        super().__init__(config)
        # Update config with kwargs if provided (fallback mechanism):
        # existing config attributes always win over kwargs.
        for k, v in kwargs.items():
            if not hasattr(config, k):
                setattr(config, k, v)

        self.config = config
        self._build_perceiver(dtype=config.torch_dtype, attn_impl=config._attn_implementation)
        self.post_init()

    def _build_perceiver(self, dtype: torch.dtype, attn_impl: str) -> None:
        """Create ``perceiver_resampler`` and, optionally, ``second_perceiver_resampler``.

        Sizes are read from ``self.config`` with fallbacks: ``hidden_size``
        (2048), ``temporal_resampler_n_latents`` (64), ``resampler_depth`` (3),
        ``object_resampler`` (True) and ``object_resampler_n_latents`` (32).

        Args:
            dtype: Passed as ``torch_dtype`` to the perceiver configs.
            attn_impl: Passed as ``_attn_implementation`` to the perceiver configs.
        """
        h = int(getattr(self.config, "hidden_size", 2048))
        n_latents = int(getattr(self.config, "temporal_resampler_n_latents", 64))
        depth = int(getattr(self.config, "resampler_depth", 3))

        perceiver_cfg = Idefics2PerceiverConfig(
            hidden_size=h,
            resampler_n_latents=n_latents,
            resampler_depth=depth,
            _attn_implementation=attn_impl,
            torch_dtype=dtype,
        )
        self.perceiver_resampler = Idefics2PerceiverResampler(perceiver_cfg)

        # The second resampler is built unless the config explicitly disables it;
        # it shares hidden size and depth with the first and differs only in latent count.
        if getattr(self.config, "object_resampler", True):
            second_n_latents = int(getattr(self.config, "object_resampler_n_latents", 32))

            second_perceiver_cfg = Idefics2PerceiverConfig(
                hidden_size=h,
                resampler_n_latents=second_n_latents,
                resampler_depth=depth,
                _attn_implementation=attn_impl,
                torch_dtype=dtype,
            )
            self.second_perceiver_resampler = Idefics2PerceiverResampler(second_perceiver_cfg)

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        pixel_values=None,
        pixel_values_videos=None,
        image_grid_thw=None,
        video_grid_thw=None,
        second_per_grid_ts=None,
        **kwargs,
    ):
        """Delegate to the parent implementation, then override a few entries.

        After the parent call, ``position_ids`` is reset to the value the caller
        supplied. When ``cache_position`` is given and its first element is
        non-zero, ``pixel_values``, ``pixel_values_videos`` and ``position_ids``
        are all set to ``None`` in the returned dict.

        Returns:
            The ``model_inputs`` mapping produced by the parent, with the
            overrides described above.
        """
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            position_ids=position_ids,
            pixel_values=pixel_values,
            pixel_values_videos=pixel_values_videos,
            image_grid_thw=image_grid_thw,
            video_grid_thw=video_grid_thw,
            second_per_grid_ts=second_per_grid_ts,
            use_cache=use_cache,
            **kwargs,
        )

        # Keep the caller's position_ids rather than whatever the parent computed.
        model_inputs["position_ids"] = position_ids
        # A non-zero first cache position is treated as "not the first step":
        # drop the visual inputs and let forward() rebuild positions itself.
        if cache_position is not None and cache_position[0] != 0:
            model_inputs["pixel_values"] = None
            model_inputs["pixel_values_videos"] = None
            model_inputs["position_ids"] = None
        return model_inputs

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[Any],
    ) -> TRASEROutput:
        """Run the language model and LM head on embeddings or token ids.

        Two paths exist:

        * Prefill -- ``inputs_embeds`` is given and the cache is absent or
          reports length 0: the embeddings and the supplied ``position_ids``
          go straight to the language model. ``**kwargs`` is not forwarded here.
        * Otherwise -- ``input_ids`` is embedded (any ``inputs_embeds`` passed
          in is replaced) and position ids are rebuilt from ``cache_position``
          and ``self.model.rope_deltas``; the supplied ``position_ids`` is
          ignored. ``**kwargs`` is forwarded to the language model.

        Args:
            rope_deltas: When not ``None``, stored on ``self.model.rope_deltas``
                before anything else runs.
            labels: When not ``None``, a loss is computed via ``self.loss_function``.

        Returns:
            A ``TRASEROutput`` with loss, logits and the pass-through
            language-model outputs, plus the current ``self.model.rope_deltas``.
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states

        if rope_deltas is not None:
            self.model.rope_deltas = rope_deltas

        # Prefill requires precomputed embeddings AND an empty (or missing) cache.
        is_prefill = (inputs_embeds is not None) and (
            past_key_values is None or (hasattr(past_key_values, "get_seq_length") and past_key_values.get_seq_length() == 0)
        )

        if is_prefill:
            outputs = self.model.language_model(
                input_ids=None,
                inputs_embeds=inputs_embeds,
                position_ids=position_ids,
                attention_mask=attention_mask,
                past_key_values=past_key_values,
                use_cache=use_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                cache_position=cache_position,
                return_dict=True,
            )
        else:
            inputs_embeds = self.model.get_input_embeddings()(input_ids)
            batch_size, seq_length, _ = inputs_embeds.shape
            # Offset = first cache position + stored rope deltas.
            # NOTE(review): assumes self.model.rope_deltas is already set (not None)
            # whenever cache_position is provided -- confirm against the caller.
            delta = (
                (cache_position[0] + self.model.rope_deltas).to(inputs_embeds.device)
                if cache_position is not None
                else 0
            )
            pos = torch.arange(seq_length, device=inputs_embeds.device).view(1, -1).expand(batch_size, -1)
            if cache_position is not None:
                # Otherwise `delta` is the plain int 0 and needs no expansion.
                delta = delta.repeat_interleave(max(1, batch_size // delta.shape[0]), dim=0)
            # Same positions are replicated along a new leading axis of size 3.
            pos = pos.add(delta).unsqueeze(0).expand(3, -1, -1)

            outputs = self.model.language_model(
                input_ids=None,
                position_ids=pos,
                attention_mask=attention_mask,
                past_key_values=past_key_values,
                inputs_embeds=inputs_embeds,
                use_cache=use_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = outputs.last_hidden_state
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)

        return TRASEROutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            rope_deltas=self.model.rope_deltas,
        )
|
qwen_vl_vsg_utils/src/qwen_vl_utils/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .vision_process import (
|
| 2 |
+
extract_vision_info,
|
| 3 |
+
fetch_image,
|
| 4 |
+
fetch_video,
|
| 5 |
+
process_vision_info,
|
| 6 |
+
smart_resize,
|
| 7 |
+
)
|
qwen_vl_vsg_utils/src/qwen_vl_utils/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (327 Bytes). View file
|
|
|
qwen_vl_vsg_utils/src/qwen_vl_utils/__pycache__/vision_process.cpython-310.pyc
ADDED
|
Binary file (12.9 kB). View file
|
|
|
qwen_vl_vsg_utils/src/qwen_vl_utils/vision_process.py
ADDED
|
@@ -0,0 +1,432 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import base64
|
| 4 |
+
import copy
|
| 5 |
+
import logging
|
| 6 |
+
import math
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
import time
|
| 10 |
+
import warnings
|
| 11 |
+
from functools import lru_cache
|
| 12 |
+
from io import BytesIO
|
| 13 |
+
from typing import Optional
|
| 14 |
+
|
| 15 |
+
import requests
|
| 16 |
+
import torch
|
| 17 |
+
import torchvision
|
| 18 |
+
from packaging import version
|
| 19 |
+
from PIL import Image
|
| 20 |
+
from torchvision import io, transforms
|
| 21 |
+
from torchvision.transforms import InterpolationMode
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger(__name__)

# Default `factor` for smart_resize and `size_factor` for fetch_image:
# resized sides are multiples of this value.
IMAGE_FACTOR = 28
# Default lower / upper bounds on the resized pixel area (height * width).
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
# smart_resize rejects inputs whose long-side / short-side ratio exceeds this.
MAX_RATIO = 200

# Video pixel budgets. Not referenced by the image helpers;
# presumably consumed by the video loaders further down the file -- confirm.
VIDEO_MAX_PIXELS = 768 * 28 * 28
# smart_nframes rounds frame counts to multiples of this value.
FRAME_FACTOR = 2
# Defaults used by smart_nframes for `min_frames` / `max_frames`.
FPS_MIN_FRAMES = 4
FPS_MAX_FRAMES = 768
VIDEO_MIN_PIXELS = 64 * 28 * 28
# Default sampling rate used by smart_nframes when `fps` is not given.
FPS = 1
# NOTE(review): the environment variable read here is named VIDEO_MAX_PIXELS, but it
# overrides VIDEO_TOTAL_PIXELS (not the VIDEO_MAX_PIXELS constant above) -- confirm intent.
VIDEO_TOTAL_PIXELS = int(float(os.environ.get('VIDEO_MAX_PIXELS', 128000 * 28 * 28 * 0.9)))
logger.info(f"set VIDEO_TOTAL_PIXELS: {VIDEO_TOTAL_PIXELS}")
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def round_by_factor(number: int, factor: int) -> int:
    """Snap ``number`` to the nearest multiple of ``factor``.

    Ties follow Python's built-in ``round`` (round-half-to-even on the quotient).
    """
    multiples = round(number / factor)
    return multiples * factor
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def ceil_by_factor(number: int, factor: int) -> int:
    """Round ``number`` up to the next multiple of ``factor`` (exact multiples are kept)."""
    multiples = math.ceil(number / factor)
    return multiples * factor
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def floor_by_factor(number: int, factor: int) -> int:
    """Round ``number`` down to the previous multiple of ``factor`` (exact multiples are kept)."""
    multiples = math.floor(number / factor)
    return multiples * factor
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def smart_resize(
    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
) -> tuple[int, int]:
    """Compute a target size whose sides are multiples of ``factor``.

    Both sides are first rounded to the nearest multiple of ``factor`` (never
    below ``factor``). If the resulting area exceeds ``max_pixels`` the size is
    scaled down and floored to multiples of ``factor``; if it falls below
    ``min_pixels`` it is scaled up and ceiled instead. Scaling uses the same
    factor on both sides, so the aspect ratio is approximately preserved.

    Args:
        height: Source height in pixels; must be positive.
        width: Source width in pixels; must be positive.
        factor: Both returned sides are multiples of this value.
        min_pixels: Lower bound on the target area that triggers up-scaling.
        max_pixels: Upper bound on the target area that triggers down-scaling.

    Returns:
        ``(resized_height, resized_width)``.

    Raises:
        ValueError: If a dimension is not positive, or if the long/short side
            ratio exceeds ``MAX_RATIO``.
    """
    # Guard first: a zero side would otherwise surface as a bare ZeroDivisionError
    # in the ratio check below, and negative sides would slip past it entirely.
    if height <= 0 or width <= 0:
        raise ValueError(f"height and width must be positive, got height={height}, width={width}")
    if max(height, width) / min(height, width) > MAX_RATIO:
        raise ValueError(
            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
        )
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Too large: shrink both sides by the same ratio, rounding down.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = max(factor, floor_by_factor(height / beta, factor))
        w_bar = max(factor, floor_by_factor(width / beta, factor))
    elif h_bar * w_bar < min_pixels:
        # Too small: grow both sides by the same ratio, rounding up.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    return h_bar, w_bar
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def to_rgb(pil_image: Image.Image) -> Image.Image:
    """Return an RGB version of ``pil_image``.

    RGBA images are pasted onto a white canvas using their alpha band as the
    mask; every other mode goes through ``Image.convert("RGB")``.
    """
    if pil_image.mode != 'RGBA':
        return pil_image.convert("RGB")

    canvas = Image.new("RGB", pil_image.size, (255, 255, 255))
    alpha_band = pil_image.split()[3]
    canvas.paste(pil_image, mask=alpha_band)
    return canvas
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def fetch_image(ele: dict[str, str | Image.Image], size_factor: int = IMAGE_FACTOR) -> Image.Image:
    """Load the image described by ``ele``, convert it to RGB and resize it.

    The source is taken from ``ele["image"]`` (or ``ele["image_url"]`` when
    ``"image"`` is absent) and may be a ``PIL.Image``, an http(s) URL, a
    ``file://`` path, a ``data:image`` base64 URI or a plain local path.

    The target size comes from ``smart_resize``: applied to
    ``resized_height`` / ``resized_width`` when both keys are present,
    otherwise to the image's own size bounded by the optional ``min_pixels`` /
    ``max_pixels`` entries.

    Args:
        ele: Description of the image and optional resize hints.
        size_factor: Passed to ``smart_resize`` as ``factor``.

    Returns:
        The resized RGB image.

    Raises:
        ValueError: If no image could be obtained from the given source.
    """
    source = ele["image"] if "image" in ele else ele["image_url"]

    loaded = None
    if isinstance(source, Image.Image):
        loaded = source
    elif source.startswith("http://") or source.startswith("https://"):
        with requests.get(source, stream=True) as response:
            response.raise_for_status()
            with BytesIO(response.content) as buffer:
                # Deep-copy so the image stays usable after the buffer closes.
                loaded = copy.deepcopy(Image.open(buffer))
    elif source.startswith("file://"):
        loaded = Image.open(source[7:])
    elif source.startswith("data:image"):
        # A data URI without a base64 payload leaves `loaded` as None.
        if "base64," in source:
            _, payload = source.split("base64,", 1)
            decoded = base64.b64decode(payload)
            with BytesIO(decoded) as buffer:
                loaded = copy.deepcopy(Image.open(buffer))
    else:
        loaded = Image.open(source)

    if loaded is None:
        raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {source}")

    rgb_image = to_rgb(loaded)
    if "resized_height" in ele and "resized_width" in ele:
        target_h, target_w = smart_resize(
            ele["resized_height"],
            ele["resized_width"],
            factor=size_factor,
        )
    else:
        width, height = rgb_image.size
        lower_bound = ele.get("min_pixels", MIN_PIXELS)
        upper_bound = ele.get("max_pixels", MAX_PIXELS)
        target_h, target_w = smart_resize(
            height,
            width,
            factor=size_factor,
            min_pixels=lower_bound,
            max_pixels=upper_bound,
        )

    return rgb_image.resize((target_w, target_h))
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def smart_nframes(
    ele: dict,
    total_frames: int,
    video_fps: int | float,
) -> int:
    """Decide how many frames to sample from a video.

    With an explicit ``nframes`` entry the value is rounded to a multiple of
    ``FRAME_FACTOR``. Otherwise the count is derived from the requested ``fps``
    (default ``FPS``), clamped to ``min_frames`` / ``max_frames`` and to
    ``total_frames``, then floored to a multiple of ``FRAME_FACTOR``.

    Args:
        ele: Video description; may carry ``fps`` or ``nframes`` (not both),
            plus optional ``min_frames`` / ``max_frames``.
        total_frames: Number of frames available in the video.
        video_fps: Native frame rate of the video.

    Returns:
        The number of frames to sample.

    Raises:
        ValueError: If the result lies outside ``[FRAME_FACTOR, total_frames]``.
    """
    assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"
    if "nframes" not in ele:
        target_fps = ele.get("fps", FPS)
        lower = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
        upper = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR)
        # Frames needed to cover the video's duration at the requested rate.
        estimate = total_frames / video_fps * target_fps
        if estimate > total_frames:
            logger.warning(f"smart_nframes: nframes[{estimate}] > total_frames[{total_frames}]")
        clamped = min(min(max(estimate, lower), upper), total_frames)
        nframes = floor_by_factor(clamped, FRAME_FACTOR)
    else:
        nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)

    if nframes < FRAME_FACTOR or nframes > total_frames:
        raise ValueError(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.")
    return nframes
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _read_video_torchvision(
    ele: dict,
) -> tuple[torch.Tensor, float, list[int]]:
    """Decode a video with torchvision and uniformly sample frames from it.

    Args:
        ele: Video description. ``ele["video"]`` is the path or URL; optional
            ``video_start`` / ``video_end`` (seconds) bound the decoded range.
            Sampling options are forwarded to ``smart_nframes``.

    Returns:
        A tuple ``(video, sample_fps, frame_indices)``: the sampled frames
        (decoded with ``output_format="TCHW"``), the effective sampling rate,
        and the indices of the sampled frames within the decoded clip.

    Raises:
        ValueError: Propagated from ``smart_nframes`` when no valid frame count
            can be chosen.
    """
    video_path = ele["video"]
    if version.parse(torchvision.__version__) < version.parse("0.19.0"):
        if "http://" in video_path or "https://" in video_path:
            warnings.warn("torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.")
        # Strip the scheme only when it really is a prefix; a substring test
        # would chop the first characters off unrelated paths.
        if video_path.startswith("file://"):
            video_path = video_path[len("file://"):]
    st = time.time()
    # The audio stream is decoded by read_video but not needed here.
    video, _, info = io.read_video(
        video_path,
        start_pts=ele.get("video_start", 0.0),
        end_pts=ele.get("video_end", None),
        pts_unit="sec",
        output_format="TCHW",
    )
    total_frames, video_fps = video.size(0), info["video_fps"]
    logger.info(f"torchvision: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
    nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
    # Evenly spaced indices covering the first through the last decoded frame.
    idx = torch.linspace(0, total_frames - 1, nframes).round().long()
    sample_fps = nframes / max(total_frames, 1e-6) * video_fps
    video = video[idx]
    return video, sample_fps, idx.tolist()
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def is_decord_available() -> bool:
    """Report whether the ``decord`` package can be located, without importing it."""
    from importlib import util as importlib_util

    spec = importlib_util.find_spec("decord")
    return spec is not None
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def calculate_video_frame_range(
    ele: dict,
    total_frames: int,
    video_fps: float,
) -> tuple[int, int, int]:
    """
    Calculate the start and end frame indices based on the given time range.

    Args:
        ele (dict): A dictionary containing optional 'video_start' and 'video_end' keys (in seconds).
        total_frames (int): Total number of frames in the video.
        video_fps (float): Frames per second of the video.

    Returns:
        tuple: A tuple containing (start_frame, end_frame, frame_count), where
        both frame indices are inclusive.

    Raises:
        ValueError: If input parameters are invalid or the time range is inconsistent
            (the start frame is not strictly before the end frame).
    """
    if video_fps <= 0:
        raise ValueError("video_fps must be a positive number")
    if total_frames <= 0:
        raise ValueError("total_frames must be a positive integer")

    video_start = ele.get("video_start", None)
    video_end = ele.get("video_end", None)
    if video_start is None and video_end is None:
        return 0, total_frames - 1, total_frames

    # time * fps can land a hair off an exact frame number because of binary
    # floating point (0.1 * 30 == 3.0000000000000004). Without a tolerance,
    # ceil/floor would then skip or drop the frame sitting exactly on the
    # requested boundary.
    frame_eps = 1e-6

    max_duration = total_frames / video_fps
    if video_start is not None:
        video_start_clamped = max(0.0, min(video_start, max_duration))
        start_frame = math.ceil(video_start_clamped * video_fps - frame_eps)
    else:
        start_frame = 0
    if video_end is not None:
        video_end_clamped = max(0.0, min(video_end, max_duration))
        end_frame = math.floor(video_end_clamped * video_fps + frame_eps)
        # The last valid index is total_frames - 1 even when video_end covers the full duration.
        end_frame = min(end_frame, total_frames - 1)
    else:
        end_frame = total_frames - 1

    if start_frame >= end_frame:
        raise ValueError(
            f"Invalid time range: Start frame {start_frame} (at {video_start_clamped if video_start is not None else 0}s) "
            f"exceeds end frame {end_frame} (at {video_end_clamped if video_end is not None else max_duration}s). "
            f"Video duration: {max_duration:.2f}s ({total_frames} frames @ {video_fps}fps)"
        )

    logger.info(f"calculate video frame range: {start_frame=}, {end_frame=}, {total_frames=} from {video_start=}, {video_end=}, {video_fps=:.3f}")
    return start_frame, end_frame, end_frame - start_frame + 1
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def _read_video_decord(
    ele: dict,
) -> tuple[torch.Tensor, float, list[int]]:
    """read video using decord.VideoReader

    Args:
        ele (dict): a dict contains the configuration of video.
        support keys:
            - video: the path of video. support "file://", "http://", "https://" and local path.
            - video_start: the start time of video.
            - video_end: the end time of video.
    Returns:
        tuple: ``(video, sample_fps, frame_indices)``.
            - video (torch.Tensor): the video tensor with shape (T, C, H, W).
            - sample_fps (float): frame rate of the sampled clip.
            - frame_indices (list[int]): indices of the sampled frames, spaced
              evenly between the start and end frame of the requested range.
    """
    import decord
    video_path = ele["video"]
    st = time.time()
    vr = decord.VideoReader(video_path)

    total_frames, video_fps = len(vr), vr.get_avg_fps()
    # From here on total_frames is the frame count of the requested range,
    # not of the whole file.
    start_frame, end_frame, total_frames = calculate_video_frame_range(
        ele,
        total_frames,
        video_fps,
    )
    nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
    idx = torch.linspace(start_frame, end_frame, nframes).round().long().tolist()
    video = vr.get_batch(idx).asnumpy()
    video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
    logger.info(f"decord: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
    sample_fps = nframes / max(total_frames, 1e-6) * video_fps
    return video, sample_fps, idx
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def is_torchcodec_available() -> bool:
    """Report whether ``torchcodec`` is installed and its ``VideoDecoder`` can be imported.

    Returns:
        bool: True only if the package is found and importing
        ``torchcodec.decoders.VideoDecoder`` succeeds; False on any failure.
    """
    try:
        import importlib.util

        if importlib.util.find_spec("torchcodec") is None:
            return False
        # Importing is the actual probe: the package may be installed yet
        # fail to load, so finding the spec alone is not enough.
        from torchcodec.decoders import VideoDecoder  # noqa: F401
    except Exception:
        # Deliberately broad: any import-time failure means "not usable".
        # (ImportError and AttributeError are already subclasses of Exception.)
        return False
    return True
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def _read_video_torchcodec(
    ele: dict,
) -> tuple[torch.Tensor, float, list[int]]:
    """Read a video using torchcodec's ``VideoDecoder`` and sample frames from it.

    The number of ffmpeg threads is taken from the ``TORCHCODEC_NUM_THREADS``
    environment variable (default 8).

    Args:
        ele (dict): a dict contains the configuration of video.
        support keys:
            - video: the path of video.
            - video_start: the start time of video.
            - video_end: the end time of video.
    Returns:
        tuple: ``(video, sample_fps, frame_indices)``.
            - video: the ``.data`` of the frames returned by ``get_frames_at``.
            - sample_fps (float): frame rate of the sampled clip.
            - frame_indices (list[int]): indices of the sampled frames, spaced
              evenly between the start and end frame of the requested range.
    """
    from torchcodec.decoders import VideoDecoder
    TORCHCODEC_NUM_THREADS = int(os.environ.get('TORCHCODEC_NUM_THREADS', 8))
    logger.info(f"set TORCHCODEC_NUM_THREADS: {TORCHCODEC_NUM_THREADS}")
    video_path = ele["video"]
    st = time.time()
    decoder = VideoDecoder(video_path, num_ffmpeg_threads=TORCHCODEC_NUM_THREADS)
    video_fps = decoder.metadata.average_fps
    total_frames = decoder.metadata.num_frames
    # From here on total_frames is the frame count of the requested range,
    # not of the whole file.
    start_frame, end_frame, total_frames = calculate_video_frame_range(
        ele,
        total_frames,
        video_fps,
    )
    nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
    idx = torch.linspace(start_frame, end_frame, nframes).round().long().tolist()
    sample_fps = nframes / max(total_frames, 1e-6) * video_fps
    video = decoder.get_frames_at(indices=idx).data
    logger.info(f"torchcodec: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
    return video, sample_fps, idx
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
# Registry mapping a backend name to the function that reads a video with it.
VIDEO_READER_BACKENDS = {
    "decord": _read_video_decord,
    "torchvision": _read_video_torchvision,
    "torchcodec": _read_video_torchcodec,
}

# Optional override read once at import time; None when the variable is unset.
FORCE_QWENVL_VIDEO_READER = os.environ.get("FORCE_QWENVL_VIDEO_READER")
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
@lru_cache(maxsize=1)
def get_video_reader_backend() -> str:
    """Pick the video reader backend name and announce the choice on stderr.

    A value in ``FORCE_QWENVL_VIDEO_READER`` wins unconditionally; otherwise
    torchcodec is preferred, then decord, with torchvision as the last resort.
    The result is cached, so detection and the message happen once.
    """
    if FORCE_QWENVL_VIDEO_READER is None:
        if is_torchcodec_available():
            chosen_backend = "torchcodec"
        else:
            chosen_backend = "decord" if is_decord_available() else "torchvision"
    else:
        chosen_backend = FORCE_QWENVL_VIDEO_READER
    print(f"qwen-vl-utils using {chosen_backend} to read video.", file=sys.stderr)
    return chosen_backend
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR, return_video_sample_fps: bool = False) -> tuple | list[Image.Image]:
    """Load a video (from a path/URL or from a list of frames) and resize it.

    Args:
        ele (dict): video configuration. ``ele["video"]`` is either a string
            (path/URL, decoded by the selected reader backend) or a list/tuple
            of frames (each loaded through ``fetch_image``). Optional keys read
            here: ``min_pixels``, ``total_pixels``, ``max_pixels``,
            ``resized_height``, ``resized_width`` and, for frame lists, ``fps``.
        image_factor (int): factor passed to ``smart_resize`` / ``fetch_image``.
        return_video_sample_fps (bool): also return the sampling frame rate.

    Returns:
        For a string video: ``(video, sampled_frame_idx_list)``, or
        ``(video, sample_fps, sampled_frame_idx_list)`` when
        ``return_video_sample_fps`` is True.
        For a frame list: ``images`` alone, or
        ``(images, fps, sampled_frame_idx_list)`` when
        ``return_video_sample_fps`` is True, so that the flagged form has the
        same three-element shape for both input kinds.
    """
    if isinstance(ele["video"], str):
        video_reader_backend = get_video_reader_backend()
        try:
            video, sample_fps, sampled_frame_idx_list = VIDEO_READER_BACKENDS[video_reader_backend](ele)
        except Exception as e:
            # Best-effort fallback: any backend failure (including an unknown
            # backend name) is retried with the torchvision reader.
            logger.warning(f"video_reader_backend {video_reader_backend} error, use torchvision as default, msg: {e}")
            video, sample_fps, sampled_frame_idx_list = VIDEO_READER_BACKENDS["torchvision"](ele)

        nframes, _, height, width = video.shape
        min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
        total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
        # Per-frame pixel budget: share total_pixels across the frames, capped
        # by VIDEO_MAX_PIXELS and kept slightly above min_pixels.
        max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05))
        max_pixels_supposed = ele.get("max_pixels", max_pixels)
        if max_pixels_supposed > max_pixels:
            logger.warning(f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}].")
        max_pixels = min(max_pixels_supposed, max_pixels)
        if "resized_height" in ele and "resized_width" in ele:
            resized_height, resized_width = smart_resize(
                ele["resized_height"],
                ele["resized_width"],
                factor=image_factor,
            )
        else:
            resized_height, resized_width = smart_resize(
                height,
                width,
                factor=image_factor,
                min_pixels=min_pixels,
                max_pixels=max_pixels,
            )
        video = transforms.functional.resize(
            video,
            [resized_height, resized_width],
            interpolation=InterpolationMode.BICUBIC,
            antialias=True,
        ).float()
        if return_video_sample_fps:
            return video, sample_fps, sampled_frame_idx_list
        return video, sampled_frame_idx_list
    else:
        assert isinstance(ele["video"], (list, tuple))
        process_info = ele.copy()
        process_info.pop("type", None)
        process_info.pop("video", None)
        images = [
            fetch_image({"image": video_element, **process_info}, size_factor=image_factor)
            for video_element in ele["video"]
        ]
        num_given = len(images)
        nframes = ceil_by_factor(num_given, FRAME_FACTOR)
        # Every supplied frame is kept, so the indices are simply 0..n-1.
        sampled_frame_idx_list = list(range(num_given))
        if num_given < nframes:
            # Pad by repeating the last frame; the padding maps back to the
            # index of the frame it duplicates.
            pad = nframes - num_given
            images.extend([images[-1]] * pad)
            sampled_frame_idx_list.extend([num_given - 1] * pad)
        if return_video_sample_fps:
            # Three values, matching the string-video branch; process_vision_info
            # unpacks exactly three and previously failed on frame lists.
            return images, process_info.pop("fps", 2.0), sampled_frame_idx_list
        return images
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[dict]:
|
| 390 |
+
vision_infos = []
|
| 391 |
+
if isinstance(conversations[0], dict):
|
| 392 |
+
conversations = [conversations]
|
| 393 |
+
for conversation in conversations:
|
| 394 |
+
for message in conversation:
|
| 395 |
+
if isinstance(message["content"], list):
|
| 396 |
+
for ele in message["content"]:
|
| 397 |
+
if (
|
| 398 |
+
"image" in ele
|
| 399 |
+
or "image_url" in ele
|
| 400 |
+
or "video" in ele
|
| 401 |
+
or ele.get("type","") in ("image", "image_url", "video")
|
| 402 |
+
):
|
| 403 |
+
vision_infos.append(ele)
|
| 404 |
+
return vision_infos
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
def process_vision_info(
    conversations: list[dict] | list[list[dict]],
    return_video_kwargs: bool = False,
) -> (
    tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None, list[list[int]]]
    | tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None, dict, list[list[int]]]
):
    """Load all images and videos referenced in the conversations.

    Args:
        conversations: a single conversation or a list of conversations, as
            accepted by ``extract_vision_info``.
        return_video_kwargs (bool): also return ``{'fps': [...]}`` holding the
            sampling frame rate of every video.

    Returns:
        ``(image_inputs, video_inputs, video_sampled_frame_idx_list)``, or
        ``(image_inputs, video_inputs, {'fps': ...}, video_sampled_frame_idx_list)``
        when ``return_video_kwargs`` is True. ``image_inputs`` / ``video_inputs``
        are None when no image / video was found; the index list has one entry
        per video.

    Raises:
        ValueError: if a vision element has none of the keys ``image``,
            ``image_url`` or ``video``.
    """
    vision_infos = extract_vision_info(conversations)
    image_inputs = []
    video_inputs = []
    video_sample_fps_list = []
    video_sampled_frame_idx_list = []
    for vision_info in vision_infos:
        if "image" in vision_info or "image_url" in vision_info:
            image_inputs.append(fetch_image(vision_info))
        elif "video" in vision_info:
            fetched = fetch_video(vision_info, return_video_sample_fps=True)
            if len(fetched) == 3:
                video_input, video_sample_fps, sampled_frame_idx_list = fetched
            else:
                # Two-element result (frames, fps): no index list was supplied,
                # so fall back to the positions within the returned frame list
                # instead of failing on the unpack.
                video_input, video_sample_fps = fetched
                sampled_frame_idx_list = list(range(len(video_input)))
            video_sample_fps_list.append(video_sample_fps)
            video_inputs.append(video_input)
            video_sampled_frame_idx_list.append(sampled_frame_idx_list)
        else:
            raise ValueError("image, image_url or video should in content.")
    if len(image_inputs) == 0:
        image_inputs = None
    if len(video_inputs) == 0:
        video_inputs = None
    if return_video_kwargs:
        return image_inputs, video_inputs, {'fps': video_sample_fps_list}, video_sampled_frame_idx_list
    return image_inputs, video_inputs, video_sampled_frame_idx_list
|
resampler_utils/__pycache__/token_arrangement.cpython-310.pyc
ADDED
|
Binary file (14 kB). View file
|
|
|
resampler_utils/__pycache__/token_insert_1017_multi_resampler.cpython-310.pyc
ADDED
|
Binary file (13.3 kB). View file
|
|
|
resampler_utils/__pycache__/token_insert_1020_multi_two_resampler.cpython-310.pyc
ADDED
|
Binary file (15 kB). View file
|
|
|
resampler_utils/__pycache__/token_insert_new.cpython-310.pyc
ADDED
|
Binary file (11.2 kB). View file
|
|
|
resampler_utils/__pycache__/token_insert_no_resampler.cpython-310.pyc
ADDED
|
Binary file (9.06 kB). View file
|
|
|
resampler_utils/__pycache__/token_insert_single_resampler.cpython-310.pyc
ADDED
|
Binary file (11.2 kB). View file
|
|
|
resampler_utils/__pycache__/token_insert_temporal.cpython-310.pyc
ADDED
|
Binary file (12.6 kB). View file
|
|
|
resampler_utils/__pycache__/token_selection.cpython-310.pyc
ADDED
|
Binary file (2.88 kB). View file
|
|
|
resampler_utils/__pycache__/token_selection_bbox.cpython-310.pyc
ADDED
|
Binary file (9.38 kB). View file
|
|
|
resampler_utils/__pycache__/token_selection_temporal.cpython-310.pyc
ADDED
|
Binary file (3.29 kB). View file
|
|
|
resampler_utils/token_arrangement.py
ADDED
|
@@ -0,0 +1,640 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn.functional as F
|
| 3 |
+
from typing import List, Optional, Tuple
|
| 4 |
+
import math
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def rearrange_token(
|
| 8 |
+
model,
|
| 9 |
+
input_ids: torch.LongTensor, # [B, L]
|
| 10 |
+
attention_mask: torch.LongTensor, # [B, L]
|
| 11 |
+
pixel_values: Optional[torch.FloatTensor], # unused here (image path kept for API compatibility)
|
| 12 |
+
image_grid_thw: Optional[torch.LongTensor], # unused here (image path kept for API compatibility)
|
| 13 |
+
pixel_values_videos: Optional[torch.FloatTensor], # may be None
|
| 14 |
+
video_grid_thw: Optional[torch.LongTensor], # may be None
|
| 15 |
+
second_per_grid_ts: Optional[torch.Tensor], # may be None
|
| 16 |
+
|
| 17 |
+
# Per-sample list of objects; each object is a 1D LongTensor of relative video-token indices (in the original video token stream)
|
| 18 |
+
obj_token_indices_per_sample: List[List[torch.Tensor]],
|
| 19 |
+
|
| 20 |
+
# Only mode3_traj_and_text is kept:
|
| 21 |
+
obj_traj_start_id: Optional[int] = None,
|
| 22 |
+
obj_traj_end_id: Optional[int] = None,
|
| 23 |
+
|
| 24 |
+
# Required: List[sample][object] -> 1D LongTensor(ids)
|
| 25 |
+
text_token_ids_per_sample: Optional[List[List[torch.Tensor]]] = None,
|
| 26 |
+
|
| 27 |
+
timestamp_token_ids_per_batch=None, # List[sample][1D LongTensor(ids)]
|
| 28 |
+
grids_per_temporal_window_per_batch=None, # List[sample] number of grids per temporal window
|
| 29 |
+
|
| 30 |
+
labels: Optional[torch.LongTensor] = None,
|
| 31 |
+
IGNORE_ID: int = -100,
|
| 32 |
+
|
| 33 |
+
use_resampler: bool = True, # True → per-object resampling + linear (1D) positions
|
| 34 |
+
use_second_resampler: bool = True,
|
| 35 |
+
add_timestamp_token: bool = True, # whether to add timestamp token for each object window
|
| 36 |
+
):
|
| 37 |
+
"""
|
| 38 |
+
Fixed simplifications:
|
| 39 |
+
- insert_where: only "in_order" (no argument kept)
|
| 40 |
+
- insertion_mode: only "mode3_traj_and_text"
|
| 41 |
+
- perceiver_injection: only "visuals" (no time tokens injected into resampler)
|
| 42 |
+
|
| 43 |
+
Returns:
|
| 44 |
+
new_inputs_embeds: [B, Lmax, D]
|
| 45 |
+
new_position_ids: [3, B, Lmax] (int32)
|
| 46 |
+
new_attention_mask: [B, Lmax] (bool)
|
| 47 |
+
rope_deltas: [B, 1] (long)
|
| 48 |
+
cache_position: [Lmax] (int32)
|
| 49 |
+
new_input_ids: [B, Lmax] (long)
|
| 50 |
+
new_labels: [B, Lmax] or None (long)
|
| 51 |
+
"""
|
| 52 |
+
dev = input_ids.device
|
| 53 |
+
B, L = input_ids.shape
|
| 54 |
+
cpu = torch.device("cpu")
|
| 55 |
+
|
| 56 |
+
assert text_token_ids_per_sample is not None and len(text_token_ids_per_sample) == B, \
|
| 57 |
+
"mode3_traj_and_text requires text_token_ids_per_sample with length B."
|
| 58 |
+
|
| 59 |
+
if add_timestamp_token:
|
| 60 |
+
assert timestamp_token_ids_per_batch is not None and len(timestamp_token_ids_per_batch) == B, \
|
| 61 |
+
"add_timestamp_token=True requires timestamp_token_ids_per_batch with length B."
|
| 62 |
+
assert grids_per_temporal_window_per_batch is not None and len(grids_per_temporal_window_per_batch) == B, \
|
| 63 |
+
"add_timestamp_token=True requires grids_per_temporal_window_per_batch with length B."
|
| 64 |
+
else:
|
| 65 |
+
# still needed for window indexing if use_resampler path uses temporal windows
|
| 66 |
+
assert grids_per_temporal_window_per_batch is not None and len(grids_per_temporal_window_per_batch) == B, \
|
| 67 |
+
"grids_per_temporal_window_per_batch is required."
|
| 68 |
+
|
| 69 |
+
tok_embed = model.get_input_embeddings()
|
| 70 |
+
vt_id = int(model.config.video_token_id)
|
| 71 |
+
vs_id = getattr(model.config, "vision_start_token_id", None)
|
| 72 |
+
ve_id = getattr(model.config, "vision_end_token_id", None)
|
| 73 |
+
pad_id = 151643 # align with original implementation
|
| 74 |
+
|
| 75 |
+
# ---- (0+) temporal window meta ----
|
| 76 |
+
assert video_grid_thw is not None, "video_grid_thw is required for temporal windowing"
|
| 77 |
+
assert video_grid_thw.shape[0] == B and video_grid_thw.shape[1] == 3, \
|
| 78 |
+
f"video_grid_thw should be ({B},3), got {video_grid_thw.shape}"
|
| 79 |
+
|
| 80 |
+
grid_area_batch: List[int] = [] # per-sample spatial token count (H*W/4)
|
| 81 |
+
temporal_window_size_batch = grids_per_temporal_window_per_batch
|
| 82 |
+
|
| 83 |
+
# ---- (0) Compute visual features (with grad) ----
|
| 84 |
+
video_embeds = None
|
| 85 |
+
if pixel_values_videos is not None:
|
| 86 |
+
_vid = model.model.get_video_features(
|
| 87 |
+
pixel_values_videos.type(model.model.visual.dtype), video_grid_thw
|
| 88 |
+
)
|
| 89 |
+
video_embeds = torch.cat(_vid, dim=0) if isinstance(_vid, (list, tuple)) else _vid # [N_vid, D]
|
| 90 |
+
del pixel_values_videos, _vid
|
| 91 |
+
|
| 92 |
+
# ---- (0.1) Resamplers ----
|
| 93 |
+
resampler = None
|
| 94 |
+
resampler_num_latents = None
|
| 95 |
+
second_resampler = None
|
| 96 |
+
second_resampler_num_latents = None
|
| 97 |
+
if use_resampler:
|
| 98 |
+
if not hasattr(model, "perceiver_resampler"):
|
| 99 |
+
raise RuntimeError("use_resampler=True, but model.perceiver_resampler not found.")
|
| 100 |
+
resampler = model.perceiver_resampler
|
| 101 |
+
resampler_num_latents = int(resampler.n_latents)
|
| 102 |
+
if use_second_resampler:
|
| 103 |
+
if not hasattr(model, "second_perceiver_resampler"):
|
| 104 |
+
raise RuntimeError("use_second_resampler=True, but model.second_perceiver_resampler not found.")
|
| 105 |
+
second_resampler = model.second_perceiver_resampler
|
| 106 |
+
second_resampler_num_latents = int(second_resampler.n_latents)
|
| 107 |
+
|
| 108 |
+
# ---- (1) Position ids preparation ----
|
| 109 |
+
need_3d_rope = (not use_resampler)
|
| 110 |
+
if need_3d_rope:
|
| 111 |
+
with torch.no_grad():
|
| 112 |
+
position_ids_full, _ = model.model.get_rope_index(
|
| 113 |
+
input_ids=input_ids,
|
| 114 |
+
image_grid_thw=image_grid_thw,
|
| 115 |
+
video_grid_thw=video_grid_thw,
|
| 116 |
+
second_per_grid_ts=second_per_grid_ts,
|
| 117 |
+
attention_mask=attention_mask,
|
| 118 |
+
).to(cpu) # (3, B, L)
|
| 119 |
+
else:
|
| 120 |
+
position_ids_full = None
|
| 121 |
+
|
| 122 |
+
# ---- (2) Move to CPU for sequence planning ----
|
| 123 |
+
attn_cpu = attention_mask.to(cpu, dtype=torch.bool)
|
| 124 |
+
ids_cpu = input_ids.to(cpu)
|
| 125 |
+
pid_cpu = position_ids_full.to(cpu, dtype=torch.int32) if need_3d_rope else None
|
| 126 |
+
lbls_cpu = labels.to(cpu) if labels is not None else None
|
| 127 |
+
|
| 128 |
+
eff_lens: List[int] = []
|
| 129 |
+
vid_idx_list: List[torch.Tensor] = []
|
| 130 |
+
for b in range(B):
|
| 131 |
+
video_grid_thw_b = video_grid_thw[b]
|
| 132 |
+
# H*W/4 as integer
|
| 133 |
+
grid_area = (int(video_grid_thw_b[1].item()) * int(video_grid_thw_b[2].item())) // 4
|
| 134 |
+
grid_area_batch.append(int(grid_area))
|
| 135 |
+
|
| 136 |
+
nz = torch.nonzero(attn_cpu[b], as_tuple=False).flatten()
|
| 137 |
+
L_eff = int(nz[-1].item()) + 1 if nz.numel() > 0 else 0
|
| 138 |
+
eff_lens.append(L_eff)
|
| 139 |
+
|
| 140 |
+
if L_eff > 0:
|
| 141 |
+
ids_b_eff = ids_cpu[b, :L_eff]
|
| 142 |
+
vid_idx = torch.nonzero(ids_b_eff == vt_id, as_tuple=False).flatten()
|
| 143 |
+
vid_idx_list.append(vid_idx)
|
| 144 |
+
else:
|
| 145 |
+
vid_idx_list.append(torch.empty(0, dtype=torch.long))
|
| 146 |
+
|
| 147 |
+
# ---- Global offsets into concatenated video_embeds for each sample ----
|
| 148 |
+
vid_counts = [int(v.numel()) for v in vid_idx_list]
|
| 149 |
+
vid_offsets: List[int] = [0] * B
|
| 150 |
+
running = 0
|
| 151 |
+
for b in range(B):
|
| 152 |
+
vid_offsets[b] = running
|
| 153 |
+
running += vid_counts[b]
|
| 154 |
+
|
| 155 |
+
# ---- (3) Length planning ----
|
| 156 |
+
def _object_block_len(b: int, obj_i: int, sel_latent_len: int, rel_temporal_window_idx: torch.Tensor) -> int:
|
| 157 |
+
"""
|
| 158 |
+
mode3_traj_and_text block length:
|
| 159 |
+
[<traj_start>?] + [text] + [<VS>?] + [<ts>* + <vt_latents>*] + [<VE>?] + [<traj_end>?]
|
| 160 |
+
where <ts>* and <vt_latents>* repeat per non-empty temporal window (resampler path),
|
| 161 |
+
or raw selected video tokens (non-resampler path).
|
| 162 |
+
"""
|
| 163 |
+
add = 0
|
| 164 |
+
|
| 165 |
+
if obj_traj_start_id is not None:
|
| 166 |
+
add += 1
|
| 167 |
+
|
| 168 |
+
# text
|
| 169 |
+
tlen = int(text_token_ids_per_sample[b][obj_i].numel())
|
| 170 |
+
add += tlen
|
| 171 |
+
|
| 172 |
+
# VS
|
| 173 |
+
if vs_id is not None:
|
| 174 |
+
add += 1
|
| 175 |
+
|
| 176 |
+
# timestamps per unique window (if enabled)
|
| 177 |
+
if add_timestamp_token and timestamp_token_ids_per_batch is not None:
|
| 178 |
+
locs = rel_temporal_window_idx.unique()
|
| 179 |
+
for loc in locs:
|
| 180 |
+
loc_i = int(loc.item())
|
| 181 |
+
if loc_i < len(timestamp_token_ids_per_batch[b]):
|
| 182 |
+
add += int(timestamp_token_ids_per_batch[b][loc_i].numel())
|
| 183 |
+
else:
|
| 184 |
+
add += int(timestamp_token_ids_per_batch[b][-1].numel())
|
| 185 |
+
|
| 186 |
+
# visual placeholder length (either resampled latents or raw selected tokens)
|
| 187 |
+
add += int(sel_latent_len)
|
| 188 |
+
|
| 189 |
+
# VE
|
| 190 |
+
if ve_id is not None:
|
| 191 |
+
add += 1
|
| 192 |
+
|
| 193 |
+
if obj_traj_end_id is not None:
|
| 194 |
+
add += 1
|
| 195 |
+
|
| 196 |
+
return add
|
| 197 |
+
|
| 198 |
+
L_new_each: List[int] = []
|
| 199 |
+
|
| 200 |
+
for b in range(B):
|
| 201 |
+
L_eff = eff_lens[b]
|
| 202 |
+
ids_b = ids_cpu[b, :L_eff]
|
| 203 |
+
vid_idx = vid_idx_list[b]
|
| 204 |
+
|
| 205 |
+
if L_eff == 0:
|
| 206 |
+
L_new_each.append(0)
|
| 207 |
+
continue
|
| 208 |
+
if vid_idx.numel() == 0:
|
| 209 |
+
L_new_each.append(L_eff)
|
| 210 |
+
continue
|
| 211 |
+
|
| 212 |
+
v_s = int(vid_idx[0].item())
|
| 213 |
+
v_e = int(vid_idx[-1].item())
|
| 214 |
+
|
| 215 |
+
has_vs = (vs_id is not None and v_s - 1 >= 0 and ids_b[v_s - 1].item() == vs_id)
|
| 216 |
+
has_ve = (ve_id is not None and v_e + 1 < L_eff and ids_b[v_e + 1].item() == ve_id)
|
| 217 |
+
if has_vs:
|
| 218 |
+
v_s -= 1
|
| 219 |
+
if has_ve:
|
| 220 |
+
v_e += 1
|
| 221 |
+
|
| 222 |
+
prefix_len = v_s
|
| 223 |
+
suffix_len = L_eff - (v_e + 1)
|
| 224 |
+
|
| 225 |
+
sel_lists = obj_token_indices_per_sample[b]
|
| 226 |
+
Nv = int(vid_idx.numel())
|
| 227 |
+
|
| 228 |
+
cur_total = 0
|
| 229 |
+
for i, rel in enumerate(sel_lists):
|
| 230 |
+
rel = rel.to(cpu, dtype=torch.long)
|
| 231 |
+
sel_len = int(rel.numel())
|
| 232 |
+
|
| 233 |
+
if use_resampler:
|
| 234 |
+
tokens_per_window = int(grid_area_batch[b] * int(temporal_window_size_batch[b]))
|
| 235 |
+
rel_temporal_window_idx = rel // tokens_per_window if (tokens_per_window > 0) else torch.zeros_like(rel)
|
| 236 |
+
nonempty_windows = int(rel_temporal_window_idx.unique().numel())
|
| 237 |
+
|
| 238 |
+
if use_second_resampler and second_resampler_num_latents is not None:
|
| 239 |
+
sel_len = int(second_resampler_num_latents) + int(resampler_num_latents) * nonempty_windows
|
| 240 |
+
else:
|
| 241 |
+
sel_len = int(resampler_num_latents) * nonempty_windows
|
| 242 |
+
else:
|
| 243 |
+
# Non-resampler: keep raw selected video tokens count
|
| 244 |
+
tokens_per_window = int(grid_area_batch[b] * int(temporal_window_size_batch[b]))
|
| 245 |
+
rel_temporal_window_idx = rel // tokens_per_window if (tokens_per_window > 0) else torch.zeros_like(rel)
|
| 246 |
+
|
| 247 |
+
cur_total += _object_block_len(b, i, sel_len, rel_temporal_window_idx)
|
| 248 |
+
|
| 249 |
+
L_new_each.append(prefix_len + cur_total + suffix_len)
|
| 250 |
+
|
| 251 |
+
Lmax = max(L_new_each) if len(L_new_each) > 0 else 0
|
| 252 |
+
|
| 253 |
+
# ---- (4) Allocate new sequence tensors on CPU and fill per-sample ----
|
| 254 |
+
new_input_ids_cpu = torch.full((B, Lmax), pad_id, dtype=torch.long, device=cpu)
|
| 255 |
+
new_attention_mask_cpu = torch.zeros((B, Lmax), dtype=torch.bool, device=cpu)
|
| 256 |
+
new_position_ids_cpu = torch.zeros((3, B, Lmax), dtype=torch.int32, device=cpu)
|
| 257 |
+
new_labels_cpu = None
|
| 258 |
+
if labels is not None:
|
| 259 |
+
new_labels_cpu = torch.full((B, Lmax), IGNORE_ID, dtype=torch.long, device=cpu)
|
| 260 |
+
|
| 261 |
+
rows_for_video: List[torch.Tensor] = [torch.empty(0, dtype=torch.long) for _ in range(B)]
|
| 262 |
+
|
| 263 |
+
batched_obj_rows: List[torch.Tensor] = [] # each: rows into video_embeds (visual-only)
|
| 264 |
+
batched_obj_pos: List[torch.Tensor] = [] # each: destination positions [R]
|
| 265 |
+
batched_obj_bids: List[int] = []
|
| 266 |
+
batched_obj_lens: List[int] = [] # visual token lengths per (object-window)
|
| 267 |
+
|
| 268 |
+
batched_second_rows: List[torch.Tensor] = []
|
| 269 |
+
batched_second_pos: List[torch.Tensor] = []
|
| 270 |
+
batched_second_bids: List[int] = []
|
| 271 |
+
batched_second_oids: List[int] = []
|
| 272 |
+
|
| 273 |
+
def _text_pos_block(start_scalar: int, length: int, dtype=torch.int32) -> torch.Tensor:
|
| 274 |
+
"""Create 1D-linear positions replicated across 3 RoPE dims."""
|
| 275 |
+
if length <= 0:
|
| 276 |
+
return torch.empty(3, 0, dtype=dtype, device=cpu)
|
| 277 |
+
ar = torch.arange(start_scalar, start_scalar + length, device=cpu, dtype=dtype)
|
| 278 |
+
return torch.stack([ar, ar, ar], dim=0)
|
| 279 |
+
|
| 280 |
+
for b in range(B):
|
| 281 |
+
L_eff = eff_lens[b]
|
| 282 |
+
if L_eff == 0:
|
| 283 |
+
continue
|
| 284 |
+
|
| 285 |
+
ids_b = ids_cpu[b, :L_eff]
|
| 286 |
+
msk_b = attn_cpu[b, :L_eff]
|
| 287 |
+
labs_b = lbls_cpu[b, :L_eff] if lbls_cpu is not None else None
|
| 288 |
+
vid_idx = vid_idx_list[b]
|
| 289 |
+
|
| 290 |
+
dst = 0
|
| 291 |
+
|
| 292 |
+
# No video tokens: copy through
|
| 293 |
+
if vid_idx.numel() == 0:
|
| 294 |
+
new_input_ids_cpu[b, :L_eff] = ids_b
|
| 295 |
+
new_attention_mask_cpu[b, :L_eff] = msk_b
|
| 296 |
+
if new_labels_cpu is not None and labs_b is not None:
|
| 297 |
+
new_labels_cpu[b, :L_eff] = labs_b
|
| 298 |
+
if need_3d_rope:
|
| 299 |
+
new_position_ids_cpu[:, b, :L_eff] = pid_cpu[:, b, :L_eff]
|
| 300 |
+
else:
|
| 301 |
+
new_position_ids_cpu[:, b, :L_eff] = _text_pos_block(0, L_eff, dtype=torch.int32)
|
| 302 |
+
continue
|
| 303 |
+
|
| 304 |
+
v_s = int(vid_idx[0].item())
|
| 305 |
+
v_e = int(vid_idx[-1].item())
|
| 306 |
+
has_vs = (vs_id is not None and v_s - 1 >= 0 and ids_b[v_s - 1].item() == vs_id)
|
| 307 |
+
has_ve = (ve_id is not None and v_e + 1 < L_eff and ids_b[v_e + 1].item() == ve_id)
|
| 308 |
+
if has_vs:
|
| 309 |
+
v_s -= 1
|
| 310 |
+
if has_ve:
|
| 311 |
+
v_e += 1
|
| 312 |
+
|
| 313 |
+
prefix_len = v_s
|
| 314 |
+
suffix_len = L_eff - (v_e + 1)
|
| 315 |
+
|
| 316 |
+
if need_3d_rope:
|
| 317 |
+
pid_b = pid_cpu[:, b, :L_eff]
|
| 318 |
+
pos_scalar = pid_b.max(dim=0).values
|
| 319 |
+
first_video_scalar = int(pos_scalar[v_s + (1 if has_vs else 0)].item())
|
| 320 |
+
last_video_scalar = int(pos_scalar[v_e - (1 if has_ve else 0)].item())
|
| 321 |
+
vs_scalar = int(pos_scalar[v_s].item()) if has_vs else None
|
| 322 |
+
|
| 323 |
+
min_video_scalar_base = int(first_video_scalar)
|
| 324 |
+
max_video_scalar_base = int(last_video_scalar)
|
| 325 |
+
|
| 326 |
+
# prefix
|
| 327 |
+
if prefix_len > 0:
|
| 328 |
+
new_input_ids_cpu[b, dst:dst + prefix_len] = ids_b[:prefix_len]
|
| 329 |
+
new_attention_mask_cpu[b, dst:dst + prefix_len] = msk_b[:prefix_len]
|
| 330 |
+
if new_labels_cpu is not None and labs_b is not None:
|
| 331 |
+
new_labels_cpu[b, dst:dst + prefix_len] = labs_b[:prefix_len]
|
| 332 |
+
if need_3d_rope:
|
| 333 |
+
new_position_ids_cpu[:, b, dst:dst + prefix_len] = pid_b[:, :prefix_len]
|
| 334 |
+
else:
|
| 335 |
+
new_position_ids_cpu[:, b, dst:dst + prefix_len] = _text_pos_block(dst, prefix_len, dtype=torch.int32)
|
| 336 |
+
dst += prefix_len
|
| 337 |
+
|
| 338 |
+
# in_order only:
|
| 339 |
+
if need_3d_rope:
|
| 340 |
+
cursor = int(vs_scalar) if has_vs else int(first_video_scalar)
|
| 341 |
+
else:
|
| 342 |
+
cursor = dst
|
| 343 |
+
|
| 344 |
+
Nv = int(vid_idx.numel())
|
| 345 |
+
pos2rank = torch.full((L_eff,), -1, dtype=torch.long, device=cpu)
|
| 346 |
+
if Nv > 0:
|
| 347 |
+
pos2rank[vid_idx] = torch.arange(Nv, dtype=torch.long, device=cpu)
|
| 348 |
+
|
| 349 |
+
vid_offset = int(vid_offsets[b])
|
| 350 |
+
|
| 351 |
+
sel_lists = obj_token_indices_per_sample[b]
|
| 352 |
+
for i, rel in enumerate(sel_lists):
|
| 353 |
+
rel = rel.to(cpu, dtype=torch.long)
|
| 354 |
+
if rel.numel() > 0:
|
| 355 |
+
rel.clamp_(0, Nv - 1)
|
| 356 |
+
|
| 357 |
+
g = vid_idx.index_select(0, rel) if (Nv > 0 and rel.numel() > 0) else torch.empty(0, dtype=torch.long, device=cpu)
|
| 358 |
+
|
| 359 |
+
# (1) <obj_traj_start> (optional)
|
| 360 |
+
if obj_traj_start_id is not None:
|
| 361 |
+
new_input_ids_cpu[b, dst] = int(obj_traj_start_id)
|
| 362 |
+
new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(cursor if need_3d_rope else dst, 1, dtype=torch.int32)
|
| 363 |
+
if new_labels_cpu is not None:
|
| 364 |
+
new_labels_cpu[b, dst] = IGNORE_ID
|
| 365 |
+
new_attention_mask_cpu[b, dst] = True
|
| 366 |
+
dst += 1
|
| 367 |
+
if need_3d_rope:
|
| 368 |
+
cursor += 1
|
| 369 |
+
|
| 370 |
+
# (2) text tokens (required)
|
| 371 |
+
txt_ids = text_token_ids_per_sample[b][i].to(cpu, dtype=torch.long)
|
| 372 |
+
k = int(txt_ids.numel())
|
| 373 |
+
if k > 0:
|
| 374 |
+
new_input_ids_cpu[b, dst:dst + k] = txt_ids
|
| 375 |
+
new_position_ids_cpu[:, b, dst:dst + k] = _text_pos_block(cursor if need_3d_rope else dst, k, dtype=torch.int32)
|
| 376 |
+
if new_labels_cpu is not None:
|
| 377 |
+
new_labels_cpu[b, dst:dst + k] = IGNORE_ID
|
| 378 |
+
new_attention_mask_cpu[b, dst:dst + k] = True
|
| 379 |
+
dst += k
|
| 380 |
+
if need_3d_rope:
|
| 381 |
+
cursor += k
|
| 382 |
+
|
| 383 |
+
# (3) <VS> (optional)
|
| 384 |
+
if vs_id is not None:
|
| 385 |
+
new_input_ids_cpu[b, dst] = int(vs_id)
|
| 386 |
+
new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(cursor if need_3d_rope else dst, 1, dtype=torch.int32)
|
| 387 |
+
if new_labels_cpu is not None:
|
| 388 |
+
new_labels_cpu[b, dst] = IGNORE_ID
|
| 389 |
+
new_attention_mask_cpu[b, dst] = True
|
| 390 |
+
dst += 1
|
| 391 |
+
if need_3d_rope:
|
| 392 |
+
cursor += 1
|
| 393 |
+
|
| 394 |
+
# (4) video tokens
|
| 395 |
+
if g.numel() > 0:
|
| 396 |
+
if use_resampler:
|
| 397 |
+
tokens_per_window = int(grid_area_batch[b] * int(temporal_window_size_batch[b]))
|
| 398 |
+
rel_temporal_window_idx = rel // tokens_per_window if (tokens_per_window > 0) else torch.zeros_like(rel)
|
| 399 |
+
|
| 400 |
+
# Loop only over windows that actually appear in rel (robust)
|
| 401 |
+
W_eff = int(rel_temporal_window_idx.max().item()) + 1 if rel_temporal_window_idx.numel() > 0 else 0
|
| 402 |
+
|
| 403 |
+
all_rows_list = []
|
| 404 |
+
for w in range(W_eff):
|
| 405 |
+
m_w = (rel_temporal_window_idx == w)
|
| 406 |
+
if not torch.any(m_w):
|
| 407 |
+
all_rows_list.append(torch.empty(0, dtype=torch.long, device=cpu))
|
| 408 |
+
continue
|
| 409 |
+
rel_w = rel[m_w]
|
| 410 |
+
rows_w = rel_w + vid_offset
|
| 411 |
+
all_rows_list.append(rows_w)
|
| 412 |
+
|
| 413 |
+
# second resampler: global object summary
|
| 414 |
+
if use_second_resampler and second_resampler is not None:
|
| 415 |
+
rows_all = torch.cat([x for x in all_rows_list if x.numel() > 0], dim=0) if any(x.numel() > 0 for x in all_rows_list) \
|
| 416 |
+
else torch.empty(0, dtype=torch.long, device=cpu)
|
| 417 |
+
|
| 418 |
+
if rows_all.numel() > 0:
|
| 419 |
+
R2 = int(second_resampler_num_latents)
|
| 420 |
+
new_input_ids_cpu[b, dst:dst + R2] = int(vt_id)
|
| 421 |
+
new_position_ids_cpu[:, b, dst:dst + R2] = _text_pos_block(cursor if need_3d_rope else dst, R2, dtype=torch.int32)
|
| 422 |
+
if new_labels_cpu is not None:
|
| 423 |
+
new_labels_cpu[b, dst:dst + R2] = IGNORE_ID
|
| 424 |
+
new_attention_mask_cpu[b, dst:dst + R2] = True
|
| 425 |
+
|
| 426 |
+
pos_idx2 = torch.arange(dst, dst + R2, dtype=torch.long, device=cpu)
|
| 427 |
+
batched_second_rows.append(rows_all)
|
| 428 |
+
batched_second_pos.append(pos_idx2)
|
| 429 |
+
batched_second_bids.append(b)
|
| 430 |
+
batched_second_oids.append(i)
|
| 431 |
+
|
| 432 |
+
dst += R2
|
| 433 |
+
if need_3d_rope:
|
| 434 |
+
cursor += R2
|
| 435 |
+
|
| 436 |
+
R = int(resampler_num_latents)
|
| 437 |
+
|
| 438 |
+
for w in range(W_eff):
|
| 439 |
+
m_w = (rel_temporal_window_idx == w)
|
| 440 |
+
if not torch.any(m_w):
|
| 441 |
+
continue
|
| 442 |
+
|
| 443 |
+
# timestamp tokens (text-only; NOT injected into resampler)
|
| 444 |
+
if add_timestamp_token and (timestamp_token_ids_per_batch is not None):
|
| 445 |
+
loc = w
|
| 446 |
+
if loc < len(timestamp_token_ids_per_batch[b]):
|
| 447 |
+
ts_ids = timestamp_token_ids_per_batch[b][loc].to(cpu, dtype=torch.long)
|
| 448 |
+
else:
|
| 449 |
+
ts_ids = timestamp_token_ids_per_batch[b][-1].to(cpu, dtype=torch.long)
|
| 450 |
+
kt = int(ts_ids.numel())
|
| 451 |
+
assert kt > 0, "Timestamp token ids should not be empty."
|
| 452 |
+
|
| 453 |
+
new_input_ids_cpu[b, dst:dst + kt] = ts_ids
|
| 454 |
+
new_position_ids_cpu[:, b, dst:dst + kt] = _text_pos_block(cursor if need_3d_rope else dst, kt, dtype=torch.int32)
|
| 455 |
+
if new_labels_cpu is not None:
|
| 456 |
+
new_labels_cpu[b, dst:dst + kt] = IGNORE_ID
|
| 457 |
+
new_attention_mask_cpu[b, dst:dst + kt] = True
|
| 458 |
+
dst += kt
|
| 459 |
+
if need_3d_rope:
|
| 460 |
+
cursor += kt
|
| 461 |
+
|
| 462 |
+
# reserve R vt slots for resampled latents
|
| 463 |
+
new_input_ids_cpu[b, dst:dst + R] = int(vt_id)
|
| 464 |
+
new_position_ids_cpu[:, b, dst:dst + R] = _text_pos_block(cursor if need_3d_rope else dst, R, dtype=torch.int32)
|
| 465 |
+
if new_labels_cpu is not None:
|
| 466 |
+
new_labels_cpu[b, dst:dst + R] = IGNORE_ID
|
| 467 |
+
new_attention_mask_cpu[b, dst:dst + R] = True
|
| 468 |
+
|
| 469 |
+
rel_w = rel[m_w]
|
| 470 |
+
rows_w = rel_w + vid_offset
|
| 471 |
+
pos_idx = torch.arange(dst, dst + R, dtype=torch.long, device=cpu)
|
| 472 |
+
|
| 473 |
+
batched_obj_rows.append(rows_w)
|
| 474 |
+
batched_obj_pos.append(pos_idx)
|
| 475 |
+
batched_obj_bids.append(b)
|
| 476 |
+
batched_obj_lens.append(int(rows_w.numel())) # visuals-only
|
| 477 |
+
|
| 478 |
+
dst += R
|
| 479 |
+
if need_3d_rope:
|
| 480 |
+
cursor += R
|
| 481 |
+
|
| 482 |
+
else:
|
| 483 |
+
# Non-resampler: 3D RoPE positions for selected raw video tokens
|
| 484 |
+
assert need_3d_rope, "Non-resampler path requires 3D RoPE positions."
|
| 485 |
+
pid_vid = pid_b.index_select(1, g) # (3, Lv_sel)
|
| 486 |
+
|
| 487 |
+
# in_order only: shift selected pid by delta
|
| 488 |
+
delta = int(cursor - min_video_scalar_base)
|
| 489 |
+
if delta != 0:
|
| 490 |
+
pid_vid = pid_vid + delta
|
| 491 |
+
cursor = max_video_scalar_base + delta + 1
|
| 492 |
+
|
| 493 |
+
Lv_sel = int(g.numel())
|
| 494 |
+
new_input_ids_cpu[b, dst:dst + Lv_sel] = int(vt_id)
|
| 495 |
+
new_position_ids_cpu[:, b, dst:dst + Lv_sel] = pid_vid
|
| 496 |
+
if new_labels_cpu is not None:
|
| 497 |
+
new_labels_cpu[b, dst:dst + Lv_sel] = IGNORE_ID
|
| 498 |
+
new_attention_mask_cpu[b, dst:dst + Lv_sel] = True
|
| 499 |
+
|
| 500 |
+
ranks = pos2rank.index_select(0, g)
|
| 501 |
+
rows = ranks + vid_offset
|
| 502 |
+
rows_for_video[b] = torch.cat([rows_for_video[b], rows], dim=0)
|
| 503 |
+
dst += Lv_sel
|
| 504 |
+
|
| 505 |
+
# (5) <VE> (optional)
|
| 506 |
+
if ve_id is not None:
|
| 507 |
+
new_input_ids_cpu[b, dst] = int(ve_id)
|
| 508 |
+
new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(cursor if need_3d_rope else dst, 1, dtype=torch.int32)
|
| 509 |
+
if new_labels_cpu is not None:
|
| 510 |
+
new_labels_cpu[b, dst] = IGNORE_ID
|
| 511 |
+
new_attention_mask_cpu[b, dst] = True
|
| 512 |
+
dst += 1
|
| 513 |
+
if need_3d_rope:
|
| 514 |
+
cursor += 1
|
| 515 |
+
|
| 516 |
+
# (6) <obj_traj_end> (optional)
|
| 517 |
+
if obj_traj_end_id is not None:
|
| 518 |
+
new_input_ids_cpu[b, dst] = int(obj_traj_end_id)
|
| 519 |
+
new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(cursor if need_3d_rope else dst, 1, dtype=torch.int32)
|
| 520 |
+
if new_labels_cpu is not None:
|
| 521 |
+
new_labels_cpu[b, dst] = IGNORE_ID
|
| 522 |
+
new_attention_mask_cpu[b, dst] = True
|
| 523 |
+
dst += 1
|
| 524 |
+
if need_3d_rope:
|
| 525 |
+
cursor += 1
|
| 526 |
+
|
| 527 |
+
# suffix
|
| 528 |
+
if suffix_len > 0:
|
| 529 |
+
src_lo = v_e + 1
|
| 530 |
+
src_hi = L_eff
|
| 531 |
+
seg = src_hi - src_lo
|
| 532 |
+
new_input_ids_cpu[b, dst:dst + seg] = ids_b[src_lo:src_hi]
|
| 533 |
+
new_attention_mask_cpu[b, dst:dst + seg] = msk_b[src_lo:src_hi]
|
| 534 |
+
if new_labels_cpu is not None and labs_b is not None:
|
| 535 |
+
new_labels_cpu[b, dst:dst + seg] = labs_b[src_lo:src_hi]
|
| 536 |
+
new_position_ids_cpu[:, b, dst:dst + seg] = _text_pos_block(dst, seg, dtype=torch.int32) if not need_3d_rope else _text_pos_block(cursor, seg, dtype=torch.int32)
|
| 537 |
+
dst += seg
|
| 538 |
+
|
| 539 |
+
assert dst == L_new_each[b], f"sample {b}: dst={dst}, L_new={L_new_each[b]}"
|
| 540 |
+
|
| 541 |
+
# ---- (5) Move back to device, build inputs_embeds, and paste visual features ----
|
| 542 |
+
new_input_ids = new_input_ids_cpu.to(dev, non_blocking=True)
|
| 543 |
+
new_position_ids = new_position_ids_cpu.to(dev, non_blocking=True)
|
| 544 |
+
new_attention_mask = new_attention_mask_cpu.to(dev, non_blocking=True)
|
| 545 |
+
new_labels = None if new_labels_cpu is None else new_labels_cpu.to(dev, non_blocking=True)
|
| 546 |
+
|
| 547 |
+
base = tok_embed(new_input_ids)
|
| 548 |
+
new_inputs_embeds = base.clone()
|
| 549 |
+
|
| 550 |
+
# Non-resampler: copy raw video features at vt positions
|
| 551 |
+
if (video_embeds is not None) and (not use_resampler) and any(r.numel() > 0 for r in rows_for_video):
|
| 552 |
+
vemb = video_embeds.to(dev, dtype=new_inputs_embeds.dtype, non_blocking=True)
|
| 553 |
+
for b in range(B):
|
| 554 |
+
rows = rows_for_video[b]
|
| 555 |
+
if rows.numel() == 0:
|
| 556 |
+
continue
|
| 557 |
+
vt_pos = torch.nonzero(new_input_ids[b] == vt_id, as_tuple=False).flatten()
|
| 558 |
+
assert vt_pos.numel() == rows.numel(), f"video rows mismatch for sample {b}"
|
| 559 |
+
new_inputs_embeds[b].index_copy_(0, vt_pos.to(dev), vemb.index_select(0, rows.to(dev)))
|
| 560 |
+
|
| 561 |
+
# ---- (5.1) second resampler: object-level global summary ----
|
| 562 |
+
if use_resampler and use_second_resampler and len(batched_second_rows) > 0:
|
| 563 |
+
if video_embeds is None:
|
| 564 |
+
raise RuntimeError("use_second_resampler=True but video_embeds is None.")
|
| 565 |
+
dev_emb = video_embeds.device
|
| 566 |
+
dtype_emb = video_embeds.dtype
|
| 567 |
+
D = video_embeds.shape[-1]
|
| 568 |
+
N_obj2 = len(batched_second_rows)
|
| 569 |
+
|
| 570 |
+
seqs2 = []
|
| 571 |
+
lens2 = []
|
| 572 |
+
for rows_all in batched_second_rows:
|
| 573 |
+
if rows_all.numel() == 0:
|
| 574 |
+
seqs2.append(torch.zeros(0, D, device=dev_emb, dtype=dtype_emb))
|
| 575 |
+
lens2.append(0)
|
| 576 |
+
else:
|
| 577 |
+
seqs2.append(video_embeds.index_select(0, rows_all.to(dev_emb)))
|
| 578 |
+
lens2.append(int(rows_all.numel()))
|
| 579 |
+
x2 = torch.nn.utils.rnn.pad_sequence(seqs2, batch_first=True) if len(seqs2) > 0 else torch.zeros(0, 0, D, device=dev_emb, dtype=dtype_emb)
|
| 580 |
+
L2_max = x2.size(1) if x2.numel() > 0 else 0
|
| 581 |
+
lens2_t = torch.tensor(lens2, device=dev_emb, dtype=torch.long) if len(lens2) > 0 else torch.zeros(0, device=dev_emb, dtype=torch.long)
|
| 582 |
+
ar2 = torch.arange(L2_max, device=dev_emb).unsqueeze(0) if L2_max > 0 else torch.zeros(1, 0, device=dev_emb, dtype=torch.long)
|
| 583 |
+
mask2 = (ar2 < lens2_t.unsqueeze(1)) if L2_max > 0 else torch.zeros(0, 0, device=dev_emb, dtype=torch.bool)
|
| 584 |
+
|
| 585 |
+
y2 = second_resampler(x2, attention_mask=mask2) # [N_obj2, R2, D]
|
| 586 |
+
y2 = y2.to(new_inputs_embeds.dtype)
|
| 587 |
+
|
| 588 |
+
for j in range(N_obj2):
|
| 589 |
+
b_cur = batched_second_bids[j]
|
| 590 |
+
pos2 = batched_second_pos[j].to(dev)
|
| 591 |
+
new_inputs_embeds[b_cur, pos2] = y2[j]
|
| 592 |
+
|
| 593 |
+
# ---- (5.2) main resampler: visuals-only ----
|
| 594 |
+
if use_resampler and len(batched_obj_rows) > 0:
|
| 595 |
+
if video_embeds is None:
|
| 596 |
+
raise RuntimeError("use_resampler=True but video_embeds is None.")
|
| 597 |
+
dev_emb = video_embeds.device
|
| 598 |
+
dtype_emb = video_embeds.dtype
|
| 599 |
+
D = video_embeds.shape[-1]
|
| 600 |
+
|
| 601 |
+
N_obj = len(batched_obj_rows)
|
| 602 |
+
lens = torch.tensor(batched_obj_lens, device=dev_emb, dtype=torch.long) # [N_obj]
|
| 603 |
+
L_max = int(lens.max().item()) if lens.numel() > 0 else 0
|
| 604 |
+
|
| 605 |
+
seqs = []
|
| 606 |
+
for rows in batched_obj_rows:
|
| 607 |
+
if rows.numel() == 0:
|
| 608 |
+
seqs.append(torch.zeros(0, D, device=dev_emb, dtype=dtype_emb))
|
| 609 |
+
else:
|
| 610 |
+
seqs.append(video_embeds.index_select(0, rows.to(dev_emb))) # [Lv_sel, D]
|
| 611 |
+
x = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True) if len(seqs) > 0 else torch.zeros(0, 0, D, device=dev_emb, dtype=dtype_emb)
|
| 612 |
+
|
| 613 |
+
ar = torch.arange(L_max, device=dev_emb).unsqueeze(0) if L_max > 0 else torch.zeros(1, 0, device=dev_emb, dtype=torch.long)
|
| 614 |
+
mask = (ar < lens.unsqueeze(1)) if L_max > 0 else torch.zeros(0, 0, device=dev_emb, dtype=torch.bool)
|
| 615 |
+
|
| 616 |
+
y = resampler(x, attention_mask=mask) # [N_obj, R, D]
|
| 617 |
+
y = y.to(new_inputs_embeds.dtype)
|
| 618 |
+
|
| 619 |
+
per_b_indices: List[List[int]] = [[] for _ in range(B)]
|
| 620 |
+
for i in range(N_obj):
|
| 621 |
+
per_b_indices[batched_obj_bids[i]].append(i)
|
| 622 |
+
|
| 623 |
+
for b in range(B):
|
| 624 |
+
if not per_b_indices[b]:
|
| 625 |
+
continue
|
| 626 |
+
pos_list = []
|
| 627 |
+
emb_list = []
|
| 628 |
+
for i in per_b_indices[b]:
|
| 629 |
+
pos_list.append(batched_obj_pos[i].to(dev))
|
| 630 |
+
emb_list.append(y[i])
|
| 631 |
+
pos_b = torch.cat(pos_list, dim=0)
|
| 632 |
+
emb_b = torch.cat(emb_list, dim=0)
|
| 633 |
+
new_inputs_embeds[b, pos_b] = emb_b
|
| 634 |
+
|
| 635 |
+
# ---- (6) rope_deltas / cache_position ----
|
| 636 |
+
maxpos = new_position_ids.max(dim=0)[0].max(dim=1, keepdim=True)[0] # [B,1]
|
| 637 |
+
rope_deltas = (maxpos + 1 - new_inputs_embeds.shape[1]).to(dtype=torch.long, device=dev)
|
| 638 |
+
cache_position = torch.arange(new_inputs_embeds.shape[1], device=dev, dtype=torch.int32)
|
| 639 |
+
|
| 640 |
+
return new_inputs_embeds, new_position_ids, new_attention_mask, rope_deltas, cache_position, new_input_ids, new_labels
|
resampler_utils/token_selection.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn.functional as F
|
| 3 |
+
from typing import Literal, Optional, Tuple
|
| 4 |
+
|
| 5 |
+
@torch.no_grad()
|
| 6 |
+
def select_tokens(
|
| 7 |
+
obj_masks: torch.Tensor,
|
| 8 |
+
grid_thw: Tuple[int,int,int],
|
| 9 |
+
*,
|
| 10 |
+
patch_size: int = 14,
|
| 11 |
+
spatial_merge_size: int = 2,
|
| 12 |
+
temporal_patch_size: int = 2,
|
| 13 |
+
coverage_thresh: float = 0.7,
|
| 14 |
+
time_reduce: Literal["mean","max","all"] = "max",
|
| 15 |
+
device: str | torch.device = "cpu",
|
| 16 |
+
retry_step: float = 0.1,
|
| 17 |
+
retry_times: int = 1,
|
| 18 |
+
ensure_at_least_one: bool = True,
|
| 19 |
+
dtype: torch.dtype = torch.float32,
|
| 20 |
+
):
|
| 21 |
+
if obj_masks.dim() == 3:
|
| 22 |
+
obj_masks = obj_masks.unsqueeze(0)
|
| 23 |
+
O, N, H_rz, W_rz = obj_masks.shape
|
| 24 |
+
T, H, W = grid_thw
|
| 25 |
+
m, g = spatial_merge_size, temporal_patch_size
|
| 26 |
+
if N != T*g:
|
| 27 |
+
if N < T * g:
|
| 28 |
+
pad = T*g - N
|
| 29 |
+
last = obj_masks[:,-1:,:,:].repeat(1, pad, 1, 1)
|
| 30 |
+
obj_masks = torch.cat([obj_masks, last], dim=1)
|
| 31 |
+
N = T * g
|
| 32 |
+
else:
|
| 33 |
+
obj_masks = obj_masks[:, :T * g, :, :]
|
| 34 |
+
N = T * g
|
| 35 |
+
Hm, Wm = H // m, W // m
|
| 36 |
+
pix_h, pix_w = m * patch_size, m * patch_size
|
| 37 |
+
assert H_rz % pix_h == 0 and W_rz % pix_w == 0, "resized // (28×28)"
|
| 38 |
+
|
| 39 |
+
M = obj_masks.to(device=device, dtype=dtype).clamp(0, 1)
|
| 40 |
+
|
| 41 |
+
M_flat = M.view(O*N, 1, H_rz, W_rz)
|
| 42 |
+
cov_hw = F.avg_pool2d(M_flat, kernel_size=(pix_h, pix_w), stride=(pix_h, pix_w)) # (O*N,1,Hm,Wm)
|
| 43 |
+
cov_hw = cov_hw.view(O, N, Hm, Wm)
|
| 44 |
+
|
| 45 |
+
cov_hw = cov_hw.view(O, T, g, Hm, Wm)
|
| 46 |
+
if time_reduce == "mean":
|
| 47 |
+
cov_thw = cov_hw.mean(dim=2)
|
| 48 |
+
elif time_reduce == "max":
|
| 49 |
+
cov_thw = cov_hw.max(dim=2).values
|
| 50 |
+
elif time_reduce == "all":
|
| 51 |
+
cov_thw = cov_hw.min(dim=2).values
|
| 52 |
+
else:
|
| 53 |
+
raise ValueError("time_reduce ∈ {'mean','max','all'}")
|
| 54 |
+
|
| 55 |
+
per_obj_idx = []
|
| 56 |
+
per_t = Hm * Wm
|
| 57 |
+
for o in range(O):
|
| 58 |
+
nz = torch.empty(0, 3, dtype=torch.long, device=device)
|
| 59 |
+
tried = 0
|
| 60 |
+
thr = coverage_thresh
|
| 61 |
+
while tried <= retry_times:
|
| 62 |
+
thr_eff = max(0.0, float(thr))
|
| 63 |
+
sel = (cov_thw[o] >= thr_eff)
|
| 64 |
+
nz = torch.nonzero(sel, as_tuple=False)
|
| 65 |
+
if nz.numel() > 0:
|
| 66 |
+
break
|
| 67 |
+
tried += 1
|
| 68 |
+
thr -= retry_step
|
| 69 |
+
if nz.numel() == 0:
|
| 70 |
+
if ensure_at_least_one:
|
| 71 |
+
flat = cov_thw[o].reshape(-1)
|
| 72 |
+
arg = torch.argmax(flat)
|
| 73 |
+
t = arg // (Hm * Wm)
|
| 74 |
+
rem = arg % (Hm * Wm)
|
| 75 |
+
hp = rem // Wm
|
| 76 |
+
wp = rem % Wm
|
| 77 |
+
idx = (t * per_t + hp * Wm + wp).view(1)
|
| 78 |
+
per_obj_idx.append(idx.to(device=device, dtype=torch.long))
|
| 79 |
+
else:
|
| 80 |
+
per_obj_idx.append(torch.empty(0, dtype=torch.long, device=device))
|
| 81 |
+
else:
|
| 82 |
+
t = nz[:, 0]
|
| 83 |
+
hp = nz[:, 1]
|
| 84 |
+
wp = nz[:, 2]
|
| 85 |
+
idx = t * per_t + hp * Wm + wp
|
| 86 |
+
per_obj_idx.append(idx.to(device=device, dtype=torch.long))
|
| 87 |
+
|
| 88 |
+
if len(per_obj_idx) == 0:
|
| 89 |
+
union_idx = torch.empty(0, dtype=torch.long, device=device)
|
| 90 |
+
else:
|
| 91 |
+
union_idx = torch.unique(torch.cat(per_obj_idx, dim=0)) if per_obj_idx[0].numel() else torch.empty(0, dtype=torch.long, device=device)
|
| 92 |
+
|
| 93 |
+
union_idx_cpu = union_idx.cpu()
|
| 94 |
+
per_obj_idx_cpu = [idx.cpu() for idx in per_obj_idx]
|
| 95 |
+
cov_thw_cpu = cov_thw.cpu()
|
| 96 |
+
|
| 97 |
+
del M, M_flat, cov_hw, cov_thw, per_obj_idx, union_idx
|
| 98 |
+
if O > 0:
|
| 99 |
+
del sel, nz
|
| 100 |
+
|
| 101 |
+
return union_idx_cpu, per_obj_idx_cpu, cov_thw_cpu
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>",
|
| 16 |
+
{
|
| 17 |
+
"content": "<obj_traj_start>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"content": "<obj_traj_end>",
|
| 25 |
+
"lstrip": false,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
}
|
| 30 |
+
],
|
| 31 |
+
"eos_token": {
|
| 32 |
+
"content": "<|im_end|>",
|
| 33 |
+
"lstrip": false,
|
| 34 |
+
"normalized": false,
|
| 35 |
+
"rstrip": false,
|
| 36 |
+
"single_word": false
|
| 37 |
+
},
|
| 38 |
+
"pad_token": {
|
| 39 |
+
"content": "<|endoftext|>",
|
| 40 |
+
"lstrip": false,
|
| 41 |
+
"normalized": false,
|
| 42 |
+
"rstrip": false,
|
| 43 |
+
"single_word": false
|
| 44 |
+
}
|
| 45 |
+
}
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
},
|
| 181 |
+
"151665": {
|
| 182 |
+
"content": "<obj_traj_start>",
|
| 183 |
+
"lstrip": false,
|
| 184 |
+
"normalized": false,
|
| 185 |
+
"rstrip": false,
|
| 186 |
+
"single_word": false,
|
| 187 |
+
"special": true
|
| 188 |
+
},
|
| 189 |
+
"151666": {
|
| 190 |
+
"content": "<obj_traj_end>",
|
| 191 |
+
"lstrip": false,
|
| 192 |
+
"normalized": false,
|
| 193 |
+
"rstrip": false,
|
| 194 |
+
"single_word": false,
|
| 195 |
+
"special": true
|
| 196 |
+
}
|
| 197 |
+
},
|
| 198 |
+
"additional_special_tokens": [
|
| 199 |
+
"<|im_start|>",
|
| 200 |
+
"<|im_end|>",
|
| 201 |
+
"<|object_ref_start|>",
|
| 202 |
+
"<|object_ref_end|>",
|
| 203 |
+
"<|box_start|>",
|
| 204 |
+
"<|box_end|>",
|
| 205 |
+
"<|quad_start|>",
|
| 206 |
+
"<|quad_end|>",
|
| 207 |
+
"<|vision_start|>",
|
| 208 |
+
"<|vision_end|>",
|
| 209 |
+
"<|vision_pad|>",
|
| 210 |
+
"<|image_pad|>",
|
| 211 |
+
"<|video_pad|>",
|
| 212 |
+
"<obj_traj_start>",
|
| 213 |
+
"<obj_traj_end>"
|
| 214 |
+
],
|
| 215 |
+
"bos_token": null,
|
| 216 |
+
"clean_up_tokenization_spaces": false,
|
| 217 |
+
"eos_token": "<|im_end|>",
|
| 218 |
+
"errors": "replace",
|
| 219 |
+
"extra_special_tokens": {},
|
| 220 |
+
"model_max_length": 128000,
|
| 221 |
+
"pad_token": "<|endoftext|>",
|
| 222 |
+
"padding_side": "right",
|
| 223 |
+
"split_special_tokens": false,
|
| 224 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 225 |
+
"unk_token": null
|
| 226 |
+
}
|
vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|