jun-1001 committed on Apr 7

Commit

2e7f2ce

verified ·

1 Parent(s): fa9ea4b

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +4 -0
circulant_merged/added_tokens.json +24 -0
circulant_merged/chat_template.json +3 -0
circulant_merged/config.json +51 -0
circulant_merged/generation_config.json +12 -0
circulant_merged/merges.txt +0 -0
circulant_merged/model-00001-of-00008.safetensors +3 -0
circulant_merged/model-00002-of-00008.safetensors +3 -0
circulant_merged/model-00003-of-00008.safetensors +3 -0
circulant_merged/model-00004-of-00008.safetensors +3 -0
circulant_merged/model-00005-of-00008.safetensors +3 -0
circulant_merged/model-00006-of-00008.safetensors +3 -0
circulant_merged/model-00007-of-00008.safetensors +3 -0
circulant_merged/model-00008-of-00008.safetensors +3 -0
circulant_merged/model.safetensors.index.json +831 -0
circulant_merged/preprocessor_config.json +29 -0
circulant_merged/special_tokens_map.json +31 -0
circulant_merged/tokenizer.json +3 -0
circulant_merged/tokenizer_config.json +210 -0
circulant_merged/vocab.json +0 -0
data/eval_qwenvl.jsonl +0 -0
data/eval_vora.jsonl +0 -0
eval/eval_qwen_baseline.py +222 -0
eval/eval_qwen_vl.py +341 -0
eval/eval_vora.py +430 -0
eval/run_eval.sh +213 -0
generation_files/added_tokens.json +24 -0
generation_files/chat_template.json +3 -0
generation_files/generation_config.json +14 -0
generation_files/merges.txt +0 -0
generation_files/preprocessor_config.json +19 -0
generation_files/processing_vora.py +150 -0
generation_files/processor_config.json +6 -0
generation_files/special_tokens_map.json +31 -0
generation_files/tokenizer.json +3 -0
generation_files/tokenizer_config.json +209 -0
generation_files/vocab.json +0 -0
generation_files/vora_generation_utils.py +101 -0
lora_merged/added_tokens.json +24 -0
lora_merged/chat_template.json +3 -0
lora_merged/config.json +51 -0
lora_merged/generation_config.json +12 -0
lora_merged/merges.txt +0 -0
lora_merged/model-00001-of-00008.safetensors +3 -0
lora_merged/model-00002-of-00008.safetensors +3 -0
lora_merged/model-00003-of-00008.safetensors +3 -0
lora_merged/model-00004-of-00008.safetensors +3 -0
lora_merged/model-00005-of-00008.safetensors +3 -0
lora_merged/model-00006-of-00008.safetensors +3 -0
lora_merged/model-00007-of-00008.safetensors +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+circulant_merged/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+generation_files/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+lora_merged/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+vora_merged_250/tokenizer.json filter=lfs diff=lfs merge=lfs -text

circulant_merged/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

circulant_merged/chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+}

circulant_merged/config.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "_name_or_path": "Qwen2.5-VL-3B-Instruct",
+  "architectures": [
+    "Qwen2_5_VLForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "image_token_id": 151655,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 128000,
+  "max_window_layers": 70,
+  "model_type": "qwen2_5_vl",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "mrope_section": [
+      16,
+      24,
+      24
+    ],
+    "rope_type": "default",
+    "type": "default"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "video_token_id": 151656,
+  "vision_config": {
+    "hidden_size": 1280,
+    "in_chans": 3,
+    "model_type": "qwen2_5_vl",
+    "out_hidden_size": 2048,
+    "spatial_patch_size": 14,
+    "tokens_per_second": 2,
+    "torch_dtype": "float32"
+  },
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652,
+  "vision_token_id": 151654,
+  "vocab_size": 151936
+}

circulant_merged/generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.05,
+  "temperature": 1e-06,
+  "transformers_version": "4.49.0"
+}

circulant_merged/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

circulant_merged/model-00001-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a88d3bd0a1ee8f0d26e039c20c14dcf0286a86bfe4592c8a64f105e44552c02
+size 997996256

circulant_merged/model-00002-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:477d86c384a7671e0a2a5181afbe4433a235de7d0106115f383bc6545c8cd5fa
+size 980624160

circulant_merged/model-00003-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e04dbcd85cebc93cadfc9a7af020bbc3545d996990029d1e787dc90979756ab0
+size 970020872

circulant_merged/model-00004-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7dc2cca7ab7494ca77c3d4842950ba4d0a609cae47bda5e8efb55b3b898d345
+size 970020904

circulant_merged/model-00005-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8491735ed17f37c7ad1b06732c1badf8179a6175e6881d878b2a325e8762eb6d
+size 988909632

circulant_merged/model-00006-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:310e84bec8d3312e702e561ee769e64a80ef45c3e198452e9a754a279c52857b
+size 970020944

circulant_merged/model-00007-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d94936fb64842ed59c634660010738a11ad810c1b310225734b83cddc9526e52
+size 970020936

circulant_merged/model-00008-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cfa2f969bf4caf7d68d44a9804d1ef31203c1bf0b9baaef95b2525d64a82f5b0
+size 661722864

circulant_merged/model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,831 @@

+{
+  "metadata": {
+    "total_size": 7509245952
+  },
+  "weight_map": {
+    "model.embed_tokens.weight": "model-00002-of-00008.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00003-of-00008.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00002-of-00008.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00002-of-00008.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00008.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00008.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00008.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00002-of-00008.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00008.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00003-of-00008.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00004-of-00008.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00004-of-00008.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00005-of-00008.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00005-of-00008.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00005-of-00008.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00005-of-00008.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00005-of-00008.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00005-of-00008.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00005-of-00008.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00006-of-00008.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00005-of-00008.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00003-of-00008.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00006-of-00008.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00006-of-00008.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00006-of-00008.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00006-of-00008.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00006-of-00008.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00007-of-00008.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00006-of-00008.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00007-of-00008.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00007-of-00008.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00007-of-00008.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
+    "model.layers.28.self_attn.k_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.28.self_attn.q_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.28.self_attn.v_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00007-of-00008.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
+    "model.layers.29.self_attn.k_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.29.self_attn.q_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.29.self_attn.v_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00003-of-00008.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00007-of-00008.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
+    "model.layers.30.self_attn.k_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.30.self_attn.q_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.30.self_attn.v_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00008-of-00008.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00008-of-00008.safetensors",
+    "model.layers.31.self_attn.k_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.31.self_attn.q_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.31.self_attn.v_proj.bias": "model-00007-of-00008.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
+    "model.layers.32.input_layernorm.weight": "model-00008-of-00008.safetensors",
+    "model.layers.32.mlp.down_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.32.mlp.gate_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.32.mlp.up_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.32.post_attention_layernorm.weight": "model-00008-of-00008.safetensors",
+    "model.layers.32.self_attn.k_proj.bias": "model-00008-of-00008.safetensors",
+    "model.layers.32.self_attn.k_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.32.self_attn.o_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.32.self_attn.q_proj.bias": "model-00008-of-00008.safetensors",
+    "model.layers.32.self_attn.q_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.32.self_attn.v_proj.bias": "model-00008-of-00008.safetensors",
+    "model.layers.32.self_attn.v_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.33.input_layernorm.weight": "model-00008-of-00008.safetensors",
+    "model.layers.33.mlp.down_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.33.mlp.gate_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.33.mlp.up_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.33.post_attention_layernorm.weight": "model-00008-of-00008.safetensors",
+    "model.layers.33.self_attn.k_proj.bias": "model-00008-of-00008.safetensors",
+    "model.layers.33.self_attn.k_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.33.self_attn.o_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.33.self_attn.q_proj.bias": "model-00008-of-00008.safetensors",
+    "model.layers.33.self_attn.q_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.33.self_attn.v_proj.bias": "model-00008-of-00008.safetensors",
+    "model.layers.33.self_attn.v_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.34.input_layernorm.weight": "model-00008-of-00008.safetensors",
+    "model.layers.34.mlp.down_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.34.mlp.gate_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.34.mlp.up_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.34.post_attention_layernorm.weight": "model-00008-of-00008.safetensors",
+    "model.layers.34.self_attn.k_proj.bias": "model-00008-of-00008.safetensors",
+    "model.layers.34.self_attn.k_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.34.self_attn.o_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.34.self_attn.q_proj.bias": "model-00008-of-00008.safetensors",
+    "model.layers.34.self_attn.q_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.34.self_attn.v_proj.bias": "model-00008-of-00008.safetensors",
+    "model.layers.34.self_attn.v_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.35.input_layernorm.weight": "model-00008-of-00008.safetensors",
+    "model.layers.35.mlp.down_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.35.mlp.gate_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.35.mlp.up_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.35.post_attention_layernorm.weight": "model-00008-of-00008.safetensors",
+    "model.layers.35.self_attn.k_proj.bias": "model-00008-of-00008.safetensors",
+    "model.layers.35.self_attn.k_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.35.self_attn.o_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.35.self_attn.q_proj.bias": "model-00008-of-00008.safetensors",
+    "model.layers.35.self_attn.q_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.35.self_attn.v_proj.bias": "model-00008-of-00008.safetensors",
+    "model.layers.35.self_attn.v_proj.weight": "model-00008-of-00008.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00003-of-00008.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00003-of-00008.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00004-of-00008.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00008.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00004-of-00008.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.7.self_attn.q_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.7.self_attn.v_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00004-of-00008.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
+    "model.layers.8.self_attn.k_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.8.self_attn.q_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.8.self_attn.v_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00004-of-00008.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
+    "model.layers.9.self_attn.k_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.9.self_attn.q_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
+    "model.layers.9.self_attn.v_proj.bias": "model-00004-of-00008.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
+    "model.norm.weight": "model-00008-of-00008.safetensors",
+    "visual.blocks.0.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.0.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.0.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.0.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.0.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.0.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.1.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.1.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.1.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.1.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.1.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.1.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.10.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.10.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.10.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.10.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.10.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.10.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.11.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.11.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.11.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.11.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.11.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.11.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.12.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.12.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.12.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.12.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.12.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.12.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.13.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.13.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.13.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.13.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.13.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.13.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.14.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.14.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.14.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.14.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.14.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.14.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.15.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.15.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.15.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.15.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.15.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.15.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.16.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.16.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.16.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.16.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.16.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.16.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.17.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.17.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.17.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.17.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.17.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.17.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.18.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.18.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.18.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.18.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.18.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.18.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.19.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.19.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.19.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.19.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.19.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.19.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.2.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.2.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.2.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.2.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.2.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.2.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.20.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.20.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.20.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.20.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.20.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.20.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.21.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.21.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.21.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.21.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.21.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.21.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.22.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.22.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.22.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.22.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.22.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.22.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.23.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.23.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.23.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.23.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.23.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.23.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.24.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.24.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.24.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.24.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.24.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.24.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.25.attn.proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.25.attn.proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.25.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.25.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.25.mlp.down_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.25.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.25.mlp.gate_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.25.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.25.mlp.up_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.25.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.25.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.25.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.26.attn.proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.26.attn.proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.26.attn.qkv.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.26.attn.qkv.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.26.mlp.down_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.26.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.26.mlp.gate_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.26.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.26.mlp.up_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.26.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.26.norm1.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.26.norm2.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.27.attn.proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.27.attn.proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.27.attn.qkv.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.27.attn.qkv.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.27.mlp.down_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.27.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.27.mlp.gate_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.27.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.27.mlp.up_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.27.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.27.norm1.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.27.norm2.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.28.attn.proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.28.attn.proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.28.attn.qkv.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.28.attn.qkv.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.28.mlp.down_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.28.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.28.mlp.gate_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.28.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.28.mlp.up_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.28.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.28.norm1.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.28.norm2.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.29.attn.proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.29.attn.proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.29.attn.qkv.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.29.attn.qkv.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.29.mlp.down_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.29.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.29.mlp.gate_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.29.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.29.mlp.up_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.29.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.29.norm1.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.29.norm2.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.3.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.3.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.3.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.3.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.3.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.3.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.30.attn.proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.30.attn.proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.30.attn.qkv.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.30.attn.qkv.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.30.mlp.down_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.30.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.30.mlp.gate_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.30.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.30.mlp.up_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.30.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.30.norm1.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.30.norm2.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.31.attn.proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.31.attn.proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.31.attn.qkv.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.31.attn.qkv.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.31.mlp.down_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.31.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.31.mlp.gate_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.31.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.31.mlp.up_proj.bias": "model-00002-of-00008.safetensors",
+    "visual.blocks.31.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.31.norm1.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.31.norm2.weight": "model-00002-of-00008.safetensors",
+    "visual.blocks.4.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.4.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.4.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.4.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.4.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.4.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.5.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.5.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.5.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.5.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.5.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.5.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.6.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.6.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.6.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.6.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.6.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.6.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.7.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.7.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.7.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.7.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.7.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.7.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.8.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.8.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.8.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.8.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.8.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.8.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.9.attn.proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.9.attn.proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.9.attn.qkv.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.9.attn.qkv.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
+    "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.9.norm1.weight": "model-00001-of-00008.safetensors",
+    "visual.blocks.9.norm2.weight": "model-00001-of-00008.safetensors",
+    "visual.merger.ln_q.weight": "model-00002-of-00008.safetensors",
+    "visual.merger.mlp.0.bias": "model-00002-of-00008.safetensors",
+    "visual.merger.mlp.0.weight": "model-00002-of-00008.safetensors",
+    "visual.merger.mlp.2.bias": "model-00002-of-00008.safetensors",
+    "visual.merger.mlp.2.weight": "model-00002-of-00008.safetensors",
+    "visual.patch_embed.proj.weight": "model-00001-of-00008.safetensors"
+  }
+}

circulant_merged/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Qwen2VLImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "max_pixels": 589824,
+  "merge_size": 2,
+  "min_pixels": 3136,
+  "patch_size": 14,
+  "processor_class": "Qwen2_5_VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
+  },
+  "temporal_patch_size": 2
+}

circulant_merged/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

circulant_merged/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

circulant_merged/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,210 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "processor_class": "Qwen2_5_VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

circulant_merged/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data/eval_qwenvl.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

data/eval_vora.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval/eval_qwen_baseline.py ADDED Viewed

	@@ -0,0 +1,222 @@

+"""
+Qwen2.5-7B Text-Only Baseline Evaluation
+Computes perplexity on the same held-out caption data WITHOUT images.
+This serves as a baseline: a pure text LLM shouldn't predict image captions well.
+Usage:
+  python eval/eval_qwen_baseline.py \
+      --model-path qwen_models/models--Qwen--Qwen2.5-7B-Instruct/snapshots/a09a35458c702b33eeacc393d103063234e8bc28 \
+      --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl
+"""
+import argparse
+import json
+import math
+import os
+import sys
+import torch
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+IGNORE_INDEX = -100
+def load_eval_data(eval_path, max_samples=None):
+    """Read evaluation samples from a JSON Lines file.
+    Args:
+        eval_path: Path to a UTF-8 JSONL file holding one JSON value per line.
+        max_samples: Optional cap on the number of samples to load; a falsy
+            value (None or 0) loads the whole file.
+    Returns:
+        A list with one parsed JSON value per non-blank line, in file order.
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    data = []
+    # Decode as UTF-8 explicitly so non-ASCII captions do not depend on the locale.
+    with open(eval_path, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                # Blank lines (e.g. a trailing newline at EOF) carry no sample.
+                continue
+            data.append(json.loads(line))
+            if max_samples and len(data) >= max_samples:
+                break
+    print(f"Loaded {len(data)} evaluation samples")
+    return data
+def build_text_only_batch(tokenizer, caption, device):
+    """Assemble a single-sample batch for the image-free baseline.
+    The prompt is a fixed system turn followed by a user turn that carries the
+    plain instruction "Describe this image." (no image is involved). The
+    caption tokens plus a closing <|im_end|> form the supervised
+    continuation; every prompt position is masked with IGNORE_INDEX.
+    Returns:
+        (batch, n_target_tokens): batch holds "input_ids", "attention_mask"
+        and "labels" tensors with a leading batch dimension of 1, moved to
+        ``device``; n_target_tokens is the caption token count plus one for
+        the end-of-turn token.
+    """
+    prompt = "".join([
+        "<|im_start|>system\n",
+        "You are a helpful assistant.",
+        "<|im_end|>",
+        "\n<|im_start|>user\n",
+        "Describe this image.",
+        "<|im_end|>\n<|im_start|>assistant\n",
+    ])
+    prefix_ids = tokenizer.encode(prompt)
+    # Supervised part: caption tokens followed by the end-of-turn token.
+    target_ids = tokenizer.encode(caption) + [tokenizer.convert_tokens_to_ids("<|im_end|>")]
+    sequence = prefix_ids + target_ids
+    masked_labels = [IGNORE_INDEX] * len(prefix_ids) + target_ids
+    ids_tensor = torch.tensor([sequence], dtype=torch.long).to(device)
+    mask_tensor = torch.ones(1, len(sequence), dtype=torch.long).to(device)
+    label_tensor = torch.tensor([masked_labels], dtype=torch.long).to(device)
+    batch = {
+        "input_ids": ids_tensor,
+        "attention_mask": mask_tensor,
+        "labels": label_tensor,
+    }
+    return batch, len(target_ids)
+@torch.no_grad()
+def evaluate_perplexity(model, tokenizer, eval_data, device):
+    """Token-weighted perplexity of the text-only model on the reference captions.
+    Each sample's loss is multiplied by its number of supervised tokens so the
+    final average is per token rather than per sample. Samples that raise are
+    counted as errors and skipped; only the first five messages are printed.
+    Returns:
+        exp(average cross-entropy), or float("inf") if no sample succeeded.
+    """
+    model.eval()
+    loss_sum = 0.0
+    token_count = 0
+    errors = 0
+    for idx, sample in enumerate(tqdm(eval_data, desc="Qwen Baseline Perplexity")):
+        caption = sample["text"]
+        try:
+            batch, n_targets = build_text_only_batch(tokenizer, caption, device)
+            weighted_loss = model(**batch).loss.item() * n_targets
+            loss_sum += weighted_loss
+            token_count += n_targets
+        except Exception as exc:
+            errors += 1
+            if errors <= 5:
+                print(f"  Error on sample {idx}: {exc}")
+    if not token_count:
+        print("No valid samples!")
+        return float("inf")
+    avg_loss = loss_sum / token_count
+    perplexity = math.exp(avg_loss)
+    n_total = len(eval_data)
+    print(f"\n=== Qwen2.5-7B Text-Only Baseline ===")
+    print(f"Samples evaluated: {n_total - errors}/{n_total}")
+    print(f"Errors: {errors}")
+    print(f"Average cross-entropy loss: {avg_loss:.4f}")
+    print(f"Perplexity: {perplexity:.2f}")
+    return perplexity
+@torch.no_grad()
+def evaluate_caption(model, tokenizer, eval_data, device, max_new_tokens=256):
+    """Generate captions without any image (text-only baseline) and score them.
+    The same fixed chat prompt is decoded greedily once per evaluation sample
+    and the output is compared with that sample's reference caption.
+    Args:
+        model: Causal LM exposing ``generate``.
+        tokenizer: Tokenizer matching ``model``.
+        eval_data: Iterable of dicts with a "text" reference caption.
+        device: Device the prompt tensors are moved to.
+        max_new_tokens: Generation length limit per sample.
+    Returns:
+        Dict of metric name -> score from ``_compute_metrics``, or an empty
+        dict when no sample could be generated.
+    """
+    model.eval()
+    predictions = []
+    references = []
+    errors = 0
+    system_start = "<|im_start|>system\n"
+    system_message = "You are a helpful assistant."
+    system_end = "<|im_end|>"
+    user_start = "\n<|im_start|>user\n"
+    user_end = "<|im_end|>\n<|im_start|>assistant\n"
+    prompt = (system_start + system_message + system_end +
+              user_start + "Describe this image." + user_end)
+    prompt_ids = tokenizer.encode(prompt)
+    eos_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
+    for i, item in enumerate(tqdm(eval_data, desc="Qwen Baseline Caption")):
+        try:
+            input_ids = torch.tensor([prompt_ids], dtype=torch.long).to(device)
+            attention_mask = torch.ones_like(input_ids)
+            outputs = model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                max_new_tokens=max_new_tokens,
+                do_sample=False,
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=eos_id,
+            )
+            # Keep only the newly generated tokens, not the echoed prompt.
+            generated = outputs[0][len(prompt_ids):]
+            text = tokenizer.decode(generated, skip_special_tokens=True)
+            predictions.append(text)
+            references.append(item["text"])
+        except Exception as e:
+            # Still best-effort, but count and report failures the same way
+            # evaluate_perplexity does instead of dropping them silently.
+            errors += 1
+            if errors <= 5:
+                print(f"  Error on sample {i}: {e}")
+            continue
+    if not predictions:
+        print("No valid samples!")
+        return {}
+    metrics = _compute_metrics(predictions, references)
+    print(f"\n=== Qwen Baseline Caption Results ===")
+    print(f"Samples: {len(predictions)}/{len(eval_data)}")
+    print(f"Errors: {errors}")
+    for k, v in metrics.items():
+        print(f"{k}: {v:.4f}")
+    print(f"\n--- Sample Outputs (first 3) ---")
+    for i in range(min(3, len(predictions))):
+        print(f"[{i}] Generated: {predictions[i][:200]}")
+        print(f"[{i}] Reference: {references[i][:200]}")
+        print()
+    return metrics
+def _compute_metrics(predictions, references):
+    """Compute BLEU-1, BLEU-4 and ROUGE-L for generated captions.
+    Both metric packages are optional: when nltk or rouge_score cannot be
+    imported the corresponding scores are left out and a note is printed, so
+    a missing metric in the results is explained rather than silent.
+    Args:
+        predictions: List of generated caption strings.
+        references: List of reference caption strings, aligned with
+            ``predictions``.
+    Returns:
+        Dict mapping metric name to a float score; empty when there is
+        nothing to score or neither package is installed.
+    """
+    metrics = {}
+    if not predictions:
+        # Nothing to score; also avoids dividing by zero in the ROUGE-L mean.
+        return metrics
+    try:
+        from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
+        smooth = SmoothingFunction().method1
+        # Whitespace tokenisation; each hypothesis has one reference.
+        refs = [[ref.split()] for ref in references]
+        preds = [pred.split() for pred in predictions]
+        metrics["BLEU-1"] = corpus_bleu(refs, preds, weights=(1, 0, 0, 0), smoothing_function=smooth)
+        metrics["BLEU-4"] = corpus_bleu(refs, preds, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)
+    except ImportError:
+        print("  nltk is not installed; skipping BLEU-1/BLEU-4")
+    try:
+        from rouge_score import rouge_scorer
+        scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
+        scores = [scorer.score(ref, pred)["rougeL"].fmeasure for pred, ref in zip(predictions, references)]
+        metrics["ROUGE-L"] = sum(scores) / len(scores)
+    except ImportError:
+        print("  rouge_score is not installed; skipping ROUGE-L")
+    return metrics
+def main():
+    """Command-line entry point for the text-only Qwen baseline.
+    Parses the arguments, loads the causal LM and its tokenizer, runs the
+    evaluations selected by --mode and, when --output is given, writes the
+    collected results to that path as JSON.
+    """
+    cli = argparse.ArgumentParser(description="Qwen2.5-7B Text-Only Baseline")
+    cli.add_argument("--mode", type=str, default="all",
+                     choices=["perplexity", "caption", "all"])
+    cli.add_argument("--model-path", type=str, required=True,
+                     help="Path to Qwen2.5-7B-Instruct")
+    cli.add_argument("--eval-data", type=str, required=True)
+    cli.add_argument("--max-samples", type=int, default=None)
+    cli.add_argument("--max-new-tokens", type=int, default=256)
+    cli.add_argument("--dtype", type=str, default="float16",
+                     choices=["float16", "bfloat16"])
+    cli.add_argument("--output", type=str, default=None)
+    args = cli.parse_args()
+    # argparse restricts --dtype to exactly these two names.
+    dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[args.dtype]
+    print(f"Loading Qwen2.5-7B from {args.model_path} ...")
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_path,
+        torch_dtype=dtype,
+        device_map="auto",
+        trust_remote_code=True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
+    model.eval()
+    # Inputs are placed on the device of the model's first parameter.
+    device = next(model.parameters()).device
+    print(f"Model loaded on {device}")
+    eval_data = load_eval_data(args.eval_data, max_samples=args.max_samples)
+    results = {
+        "model": "Qwen2.5-7B-Instruct (text-only)",
+        "num_samples": len(eval_data),
+    }
+    want_perplexity = args.mode in ("perplexity", "all")
+    want_caption = args.mode in ("caption", "all")
+    if want_perplexity:
+        results["perplexity"] = evaluate_perplexity(model, tokenizer, eval_data, device)
+    if want_caption:
+        results.update(evaluate_caption(
+            model, tokenizer, eval_data, device, max_new_tokens=args.max_new_tokens))
+    if args.output:
+        # A bare filename has no directory part; fall back to the current directory.
+        out_dir = os.path.dirname(args.output) or "."
+        os.makedirs(out_dir, exist_ok=True)
+        with open(args.output, "w") as f:
+            json.dump(results, f, indent=2, ensure_ascii=False)
+        print(f"\nResults saved to {args.output}")
+if __name__ == "__main__":
+    main()

eval/eval_qwen_vl.py ADDED Viewed

	@@ -0,0 +1,341 @@

+"""
+Qwen2.5-VL-3B Evaluation Script
+Evaluates the original Qwen2.5-VL-3B-Instruct (with vision) on held-out caption data.
+Also supports evaluating LoRA / block-circulant finetuned versions if checkpoints exist.
+Usage:
+  # Original model
+  python eval/eval_qwen_vl.py --mode all \
+      --model-path Finetune-Qwen2.5-VL/Qwen2.5-VL-3B-Instruct \
+      --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl
+  # With LoRA adapter
+  python eval/eval_qwen_vl.py --mode all \
+      --model-path Finetune-Qwen2.5-VL/Qwen2.5-VL-3B-Instruct \
+      --adapter-path Finetune-Qwen2.5-VL/saves/Qwen2.5-VL-3B-Instruct/lora \
+      --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl
+"""
+import argparse
+import json
+import math
+import os
+import sys
+import torch
+from PIL import Image
+from tqdm import tqdm
+from transformers import (
+    AutoModelForCausalLM,
+    AutoProcessor,
+    AutoTokenizer,
+    Qwen2VLForConditionalGeneration,
+)
+IGNORE_INDEX = -100
+# ============================================================
+# Data loading
+# ============================================================
+def load_eval_data(eval_path, max_samples=None):
+    """Read evaluation samples from a JSON Lines file.
+    Args:
+        eval_path: Path to a UTF-8 JSONL file holding one JSON value per line.
+        max_samples: Optional cap on the number of samples to load; a falsy
+            value (None or 0) loads the whole file.
+    Returns:
+        A list with one parsed JSON value per non-blank line, in file order.
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    data = []
+    # Decode as UTF-8 explicitly so non-ASCII captions do not depend on the locale.
+    with open(eval_path, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                # Blank lines (e.g. a trailing newline at EOF) carry no sample.
+                continue
+            data.append(json.loads(line))
+            if max_samples and len(data) >= max_samples:
+                break
+    print(f"Loaded {len(data)} evaluation samples")
+    return data
+# ============================================================
+# Build inputs for Qwen2.5-VL
+# ============================================================
+def build_messages(image_path, caption=None):
+    """Return the chat turns used to caption one image with Qwen2.5-VL.
+    The conversation always has a system turn and a user turn; the user turn
+    references the image as a file:// URI of its absolute path, followed by
+    the "Describe this image." instruction. When ``caption`` is not None an
+    assistant turn containing it is appended (used for perplexity scoring).
+    """
+    def text_part(value):
+        # One text entry in the structured "content" list format.
+        return {"type": "text", "text": value}
+    image_uri = f"file://{os.path.abspath(image_path)}"
+    conversation = [
+        {"role": "system", "content": [text_part("You are a helpful assistant.")]},
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image_uri},
+                text_part("Describe this image."),
+            ],
+        },
+    ]
+    if caption is None:
+        return conversation
+    conversation.append({"role": "assistant", "content": [text_part(caption)]})
+    return conversation
+def prepare_perplexity_inputs(processor, image_path, caption, device):
+    """Prepare model inputs (with labels) for scoring a caption given an image.
+    The conversation is rendered twice: once including the reference caption
+    as the assistant turn and once as a prompt only. The token length of the
+    prompt-only rendering marks where supervision starts, so every earlier
+    position is masked with IGNORE_INDEX.
+    Args:
+        processor: Processor providing ``apply_chat_template`` and a callable
+            text+image preprocessing interface.
+        image_path: Path of the image file to load.
+        caption: Reference caption placed in the assistant turn.
+        device: Device the full inputs are moved to.
+    Returns:
+        (inputs, n_caption_tokens): the processed full inputs with a "labels"
+        entry added, and the number of label positions left unmasked.
+    """
+    # Full messages with the ground truth caption as assistant response
+    messages_full = build_messages(image_path, caption=caption)
+    text_full = processor.apply_chat_template(
+        messages_full, tokenize=False, add_generation_prompt=False)
+    # Prompt-only (no assistant response) to find where caption starts
+    messages_prompt = build_messages(image_path, caption=None)
+    text_prompt = processor.apply_chat_template(
+        messages_prompt, tokenize=False, add_generation_prompt=True)
+    # convert() returns a fully loaded copy, so the file handle can be
+    # released as soon as the conversion is done.
+    with Image.open(image_path) as raw_image:
+        image = raw_image.convert("RGB")
+    inputs_full = processor(
+        text=[text_full], images=[image], padding=True, return_tensors="pt"
+    ).to(device)
+    # Only the token length of the prompt is needed, so these inputs are not
+    # moved to the device.
+    inputs_prompt = processor(
+        text=[text_prompt], images=[image], padding=True, return_tensors="pt"
+    )
+    # Create labels: mask out prompt tokens
+    input_ids = inputs_full["input_ids"]
+    prompt_len = inputs_prompt["input_ids"].shape[1]
+    labels = input_ids.clone()
+    labels[:, :prompt_len] = IGNORE_INDEX
+    # NOTE(review): everything after the prompt is supervised, including any
+    # end-of-turn text the chat template appends after the caption - confirm
+    # this matches how the other evaluation scripts count caption tokens.
+    n_caption_tokens = int((labels != IGNORE_INDEX).sum().item())
+    inputs_full["labels"] = labels
+    return inputs_full, n_caption_tokens
+def prepare_generation_inputs(processor, image_path, device):
+    """Prepare processor inputs for generating a caption for one image.
+    Args:
+        processor: Processor providing ``apply_chat_template`` and a callable
+            text+image preprocessing interface.
+        image_path: Path of the image file to load.
+        device: Device the processed inputs are moved to.
+    Returns:
+        The processed inputs for the prompt-only conversation (system and
+        user turns plus the generation prompt), moved to ``device``.
+    """
+    messages = build_messages(image_path, caption=None)
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True)
+    # convert() returns a fully loaded copy, so the file handle can be
+    # released as soon as the conversion is done.
+    with Image.open(image_path) as raw_image:
+        image = raw_image.convert("RGB")
+    inputs = processor(
+        text=[text], images=[image], padding=True, return_tensors="pt"
+    ).to(device)
+    return inputs
+# ============================================================
+# Evaluation: Perplexity
+# ============================================================
+@torch.no_grad()
+def evaluate_perplexity(model, processor, eval_data, device):
+    """Token-weighted perplexity of the vision-language model on reference captions.
+    Samples whose image file does not exist, or whose processing raises, are
+    counted as errors and skipped; only the first five exception messages are
+    printed. Each sample's loss is weighted by its supervised token count.
+    Returns:
+        exp(average cross-entropy), or float("inf") if no sample succeeded.
+    """
+    model.eval()
+    loss_sum = 0.0
+    token_count = 0
+    errors = 0
+    for idx, sample in enumerate(tqdm(eval_data, desc="Qwen-VL Perplexity")):
+        image_path = sample["image"]
+        caption = sample["text"]
+        if os.path.exists(image_path):
+            try:
+                inputs, n_tokens = prepare_perplexity_inputs(
+                    processor, image_path, caption, device)
+                weighted_loss = model(**inputs).loss.item() * n_tokens
+                loss_sum += weighted_loss
+                token_count += n_tokens
+            except Exception as exc:
+                errors += 1
+                if errors <= 5:
+                    print(f"  Error on sample {idx}: {exc}")
+        else:
+            # Missing image files count as errors without a message.
+            errors += 1
+    if not token_count:
+        print("No valid samples!")
+        return float("inf")
+    avg_loss = loss_sum / token_count
+    perplexity = math.exp(avg_loss)
+    n_total = len(eval_data)
+    print(f"\n=== Qwen2.5-VL Perplexity Results ===")
+    print(f"Samples: {n_total - errors}/{n_total}")
+    print(f"Errors: {errors}")
+    print(f"Average CE loss: {avg_loss:.4f}")
+    print(f"Perplexity: {perplexity:.2f}")
+    return perplexity
+# ============================================================
+# Evaluation: Caption Generation
+# ============================================================
+@torch.no_grad()
+def evaluate_caption(model, processor, eval_data, device, max_new_tokens=256):
+    model.eval()
+    predictions = []
+    references = []
+    errors = 0
+    for i, item in enumerate(tqdm(eval_data, desc="Qwen-VL Caption")):
+        image_path = item["image"]
+        caption = item["text"]
+        if not os.path.exists(image_path):
+            errors += 1
+            continue
+        try:
+            inputs = prepare_generation_inputs(processor, image_path, device)
+            prompt_len = inputs["input_ids"].shape[1]
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=max_new_tokens,
+                do_sample=False,
+            )
+            generated = outputs[0][prompt_len:]
+            text = processor.tokenizer.decode(generated, skip_special_tokens=True)
+            predictions.append(text)
+            references.append(caption)
+        except Exception as e:
+            errors += 1
+            if errors <= 5:
+                print(f"  Error on sample {i}: {e}")
+            continue
+    if not predictions:
+        print("No valid samples!")
+        return {}
+    metrics = _compute_metrics(predictions, references)
+    print(f"\n=== Qwen2.5-VL Caption Results ===")
+    print(f"Samples: {len(predictions)}/{len(eval_data)}")
+    print(f"Errors: {errors}")
+    for k, v in metrics.items():
+        print(f"{k}: {v:.4f}")
+    print(f"\n--- Sample Outputs (first 5) ---")
+    for i in range(min(5, len(predictions))):
+        print(f"[{i}] Generated: {predictions[i][:200]}")
+        print(f"[{i}] Reference: {references[i][:200]}")
+        print()
+    return metrics
+def _compute_metrics(predictions, references):
+    metrics = {}
+    try:
+        from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
+        smooth = SmoothingFunction().method1
+        refs = [[ref.split()] for ref in references]
+        preds = [pred.split() for pred in predictions]
+        metrics["BLEU-1"] = corpus_bleu(refs, preds, weights=(1, 0, 0, 0), smoothing_function=smooth)
+        metrics["BLEU-4"] = corpus_bleu(refs, preds, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)
+    except ImportError:
+        print("Warning: nltk not installed. pip install nltk")
+    try:
+        from rouge_score import rouge_scorer
+        scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
+        scores = [scorer.score(ref, pred)["rougeL"].fmeasure for pred, ref in zip(predictions, references)]
+        metrics["ROUGE-L"] = sum(scores) / len(scores)
+    except ImportError:
+        print("Warning: rouge-score not installed. pip install rouge-score")
+    return metrics
+# ============================================================
+# Model loading
+# ============================================================
+def load_model(model_path, adapter_path=None, dtype=torch.float16):
+    print(f"Loading Qwen2.5-VL from {model_path} ...")
+    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+    # Try Qwen2VL-specific class first, fall back to AutoModel
+    try:
+        model = Qwen2VLForConditionalGeneration.from_pretrained(
+            model_path,
+            torch_dtype=dtype,
+            device_map="auto",
+            trust_remote_code=True,
+        )
+    except Exception:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=dtype,
+            device_map="auto",
+            trust_remote_code=True,
+        )
+    # Load LoRA adapter if provided
+    if adapter_path and os.path.exists(adapter_path):
+        print(f"Loading adapter from {adapter_path} ...")
+        from peft import PeftModel
+        model = PeftModel.from_pretrained(model, adapter_path)
+        model = model.merge_and_unload()
+        print("Adapter merged.")
+    model.eval()
+    device = next(model.parameters()).device
+    print(f"Model loaded on {device}")
+    return model, processor
+# ============================================================
+# Main
+# ============================================================
+def main():
+    parser = argparse.ArgumentParser(description="Qwen2.5-VL-3B Evaluation")
+    parser.add_argument("--mode", type=str, default="all",
+                        choices=["perplexity", "caption", "all"])
+    parser.add_argument("--model-path", type=str, required=True,
+                        help="Path to Qwen2.5-VL-3B-Instruct")
+    parser.add_argument("--adapter-path", type=str, default=None,
+                        help="Path to LoRA/circulant adapter (optional)")
+    parser.add_argument("--eval-data", type=str, required=True,
+                        help="Path to eval_qwenvl.jsonl")
+    parser.add_argument("--max-samples", type=int, default=None)
+    parser.add_argument("--max-new-tokens", type=int, default=256)
+    parser.add_argument("--dtype", type=str, default="float16",
+                        choices=["float16", "bfloat16"])
+    parser.add_argument("--output", type=str, default=None)
+    args = parser.parse_args()
+    dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
+    model, processor = load_model(args.model_path, args.adapter_path, dtype)
+    device = next(model.parameters()).device
+    eval_data = load_eval_data(args.eval_data, max_samples=args.max_samples)
+    model_name = "Qwen2.5-VL-3B"
+    if args.adapter_path:
+        model_name += f" + {os.path.basename(args.adapter_path)}"
+    results = {"model": model_name, "num_samples": len(eval_data)}
+    if args.mode in ("perplexity", "all"):
+        ppl = evaluate_perplexity(model, processor, eval_data, device)
+        results["perplexity"] = ppl
+    if args.mode in ("caption", "all"):
+        metrics = evaluate_caption(
+            model, processor, eval_data, device, max_new_tokens=args.max_new_tokens)
+        results.update(metrics)
+    if args.output:
+        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
+        with open(args.output, "w") as f:
+            json.dump(results, f, indent=2, ensure_ascii=False)
+        print(f"\nResults saved to {args.output}")
+if __name__ == "__main__":
+    main()

eval/eval_vora.py ADDED Viewed

	@@ -0,0 +1,430 @@

+"""
+VoRA Evaluation Script
+- Perplexity (cross-entropy loss) on held-out caption data
+- Caption generation with BLEU / ROUGE-L metrics
+Usage:
+  # Perplexity evaluation
+  python eval/eval_vora.py --mode perplexity \
+      --checkpoint output/pretrain_I30M_T6M/checkpoint-250 \
+      --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl \
+      --image-processor qwen_models/models--apple--aimv2-huge-patch14-448/snapshots/f723839533d3bbdc969f541c864789f531ec0e5c
+  # Caption generation evaluation
+  python eval/eval_vora.py --mode caption \
+      --checkpoint output/pretrain_I30M_T6M/checkpoint-250 \
+      --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl \
+      --image-processor qwen_models/models--apple--aimv2-huge-patch14-448/snapshots/f723839533d3bbdc969f541c864789f531ec0e5c
+  # Both
+  python eval/eval_vora.py --mode all \
+      --checkpoint output/pretrain_I30M_T6M/checkpoint-250 \
+      --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl \
+      --image-processor qwen_models/models--apple--aimv2-huge-patch14-448/snapshots/f723839533d3bbdc969f541c864789f531ec0e5c
+"""
+import argparse
+import json
+import math
+import os
+import sys
+import torch
+import torch.nn.functional as F
+from PIL import Image
+from tqdm import tqdm
+from transformers import AutoImageProcessor, AutoTokenizer
+# Add project root to path
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from models.modeling_vora import VoRAForCausalLM, VoRAConfig
+# ============================================================
+# Image preprocessing (same as training pipeline)
+# ============================================================
+def expand2square(pil_img):
+    """Expand image to square with black padding (same as training)."""
+    background_color = (0, 0, 0)
+    width, height = pil_img.size
+    if width == height:
+        return pil_img
+    elif width > height:
+        result = Image.new(pil_img.mode, (width, width), background_color)
+        result.paste(pil_img, (0, (width - height) // 2))
+        return result
+    else:
+        result = Image.new(pil_img.mode, (height, height), background_color)
+        result.paste(pil_img, ((height - width) // 2, 0))
+        return result
+def load_and_process_image(image_path, image_processor):
+    """Load image, expand to square, apply HF image transforms."""
+    img = Image.open(image_path).convert("RGB")
+    img = expand2square(img)
+    pixel_values = image_processor(img, return_tensors="pt")["pixel_values"]  # (1, 3, 448, 448)
+    return pixel_values
+# ============================================================
+# Text processing (same prompt template as training)
+# ============================================================
+IMAGE_TOKEN_INDEX = -200  # placeholder id inserted into input_ids where the image goes; also passed as vision_placeholder_index
+IGNORE_INDEX = -100  # label value written over prompt positions so they are excluded from the loss
+def build_prompt_ids(tokenizer, has_image=True):
+    """Build the prompt token IDs (system + user turn) for captioning."""
+    system_start = "<|im_start|>system\n"
+    system_message = "You are a helpful assistant."
+    system_end = "<|im_end|>"
+    user_start = "\n<|im_start|>user\n"
+    user_end = "<|im_end|>\n<|im_start|>assistant\n"
+    if has_image:
+        # system + user with <image> placeholder
+        prompt = system_start + system_message + system_end + user_start
+        prompt_after_image = user_end
+        prompt_ids = tokenizer.encode(prompt)
+        after_image_ids = tokenizer.encode(prompt_after_image)
+        # Insert image token index between prompt and after_image
+        input_ids = prompt_ids + [IMAGE_TOKEN_INDEX] + after_image_ids
+    else:
+        prompt = (system_start + system_message + system_end +
+                  user_start + "Describe this image." + user_end)
+        input_ids = tokenizer.encode(prompt)
+    return input_ids
+def build_perplexity_batch(tokenizer, image_path, caption, image_processor, device):
+    """Build a batch for perplexity evaluation (with labels)."""
+    prompt_ids = build_prompt_ids(tokenizer, has_image=True)
+    caption_ids = tokenizer.encode(caption)
+    eos_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
+    full_ids = prompt_ids + caption_ids + [eos_id]
+    # Labels: -100 for prompt tokens, actual IDs for caption tokens
+    labels = [IGNORE_INDEX] * len(prompt_ids) + caption_ids + [eos_id]
+    # Load image
+    pixel_values = load_and_process_image(image_path, image_processor)
+    batch = {
+        "input_ids": torch.tensor([full_ids], dtype=torch.long).to(device),
+        "attention_mask": torch.ones(1, len(full_ids), dtype=torch.long).to(device),
+        "labels": torch.tensor([labels], dtype=torch.long).to(device),
+        "frames": pixel_values.to(device),  # (1, 3, 448, 448)
+        "n_frames": [1],
+        "vision_placeholder_index": IMAGE_TOKEN_INDEX,
+    }
+    return batch, len(caption_ids) + 1  # +1 for eos
+def build_generation_batch(tokenizer, image_path, image_processor, device):
+    """Build a batch for caption generation (no labels)."""
+    prompt_ids = build_prompt_ids(tokenizer, has_image=True)
+    pixel_values = load_and_process_image(image_path, image_processor)
+    batch = {
+        "input_ids": torch.tensor([prompt_ids], dtype=torch.long).to(device),
+        "attention_mask": torch.ones(1, len(prompt_ids), dtype=torch.long).to(device),
+        "frames": pixel_values.to(device),
+        "n_frames": [1],
+        "vision_placeholder_index": IMAGE_TOKEN_INDEX,
+    }
+    return batch
+# ============================================================
+# Load evaluation data
+# ============================================================
+def load_eval_data(eval_path, max_samples=None):
+    """Load eval data from eval_qwenvl.jsonl format: {"image": path, "text": caption}"""
+    data = []
+    with open(eval_path, "r") as f:
+        for line in f:
+            item = json.loads(line.strip())
+            data.append(item)
+            if max_samples and len(data) >= max_samples:
+                break
+    print(f"Loaded {len(data)} evaluation samples")
+    return data
+# ============================================================
+# Evaluation: Perplexity
+# ============================================================
+@torch.no_grad()
+def evaluate_perplexity(model, tokenizer, image_processor, eval_data, device):
+    """Compute perplexity on held-out caption data."""
+    model.eval()
+    total_loss = 0.0
+    total_tokens = 0
+    errors = 0
+    for i, item in enumerate(tqdm(eval_data, desc="Perplexity")):
+        image_path = item["image"]
+        caption = item["text"]
+        if not os.path.exists(image_path):
+            errors += 1
+            continue
+        try:
+            batch, n_caption_tokens = build_perplexity_batch(
+                tokenizer, image_path, caption, image_processor, device)
+            outputs = model(**batch)
+            loss = outputs.loss
+            total_loss += loss.item() * n_caption_tokens
+            total_tokens += n_caption_tokens
+        except Exception as e:
+            errors += 1
+            if errors <= 5:
+                print(f"  Error on sample {i}: {e}")
+            continue
+    if total_tokens == 0:
+        print("No valid samples for perplexity!")
+        return float("inf")
+    avg_loss = total_loss / total_tokens
+    perplexity = math.exp(avg_loss)
+    print(f"\n=== Perplexity Results ===")
+    print(f"Samples evaluated: {len(eval_data) - errors}/{len(eval_data)}")
+    print(f"Errors: {errors}")
+    print(f"Average cross-entropy loss: {avg_loss:.4f}")
+    print(f"Perplexity: {perplexity:.2f}")
+    return perplexity
+# ============================================================
+# Evaluation: Caption Generation
+# ============================================================
+@torch.no_grad()
+def evaluate_caption(model, tokenizer, image_processor, eval_data, device,
+                     max_new_tokens=256):
+    """Generate captions and compute BLEU / ROUGE-L."""
+    model.eval()
+    predictions = []
+    references = []
+    errors = 0
+    eos_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
+    for i, item in enumerate(tqdm(eval_data, desc="Caption Generation")):
+        image_path = item["image"]
+        caption = item["text"]
+        if not os.path.exists(image_path):
+            errors += 1
+            continue
+        try:
+            batch = build_generation_batch(tokenizer, image_path, image_processor, device)
+            outputs = model.generate(
+                batch,
+                max_new_tokens=max_new_tokens,
+                do_sample=False,
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=eos_token_id,
+            )
+            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+            predictions.append(generated_text)
+            references.append(caption)
+        except Exception as e:
+            errors += 1
+            if errors <= 5:
+                print(f"  Error on sample {i}: {e}")
+            continue
+    if len(predictions) == 0:
+        print("No valid samples for caption evaluation!")
+        return {}
+    # Compute metrics
+    metrics = compute_caption_metrics(predictions, references)
+    print(f"\n=== Caption Generation Results ===")
+    print(f"Samples evaluated: {len(predictions)}/{len(eval_data)}")
+    print(f"Errors: {errors}")
+    for k, v in metrics.items():
+        print(f"{k}: {v:.4f}")
+    # Print a few examples
+    print(f"\n--- Sample Outputs (first 5) ---")
+    for i in range(min(5, len(predictions))):
+        print(f"[{i}] Generated: {predictions[i][:200]}")
+        print(f"[{i}] Reference: {references[i][:200]}")
+        print()
+    return metrics
+def compute_caption_metrics(predictions, references):
+    """Compute BLEU-1, BLEU-4, ROUGE-L metrics."""
+    metrics = {}
+    # BLEU
+    try:
+        from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
+        smooth = SmoothingFunction().method1
+        refs_tokenized = [[ref.split()] for ref in references]
+        preds_tokenized = [pred.split() for pred in predictions]
+        metrics["BLEU-1"] = corpus_bleu(refs_tokenized, preds_tokenized,
+                                         weights=(1, 0, 0, 0),
+                                         smoothing_function=smooth)
+        metrics["BLEU-4"] = corpus_bleu(refs_tokenized, preds_tokenized,
+                                         weights=(0.25, 0.25, 0.25, 0.25),
+                                         smoothing_function=smooth)
+    except ImportError:
+        print("Warning: nltk not installed, skipping BLEU. Install with: pip install nltk")
+    # ROUGE-L
+    try:
+        from rouge_score import rouge_scorer
+        scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
+        rouge_scores = [scorer.score(ref, pred)["rougeL"].fmeasure
+                        for pred, ref in zip(predictions, references)]
+        metrics["ROUGE-L"] = sum(rouge_scores) / len(rouge_scores)
+    except ImportError:
+        print("Warning: rouge_score not installed, skipping ROUGE-L. Install with: pip install rouge-score")
+    return metrics
+# ============================================================
+# Model loading
+# ============================================================
+def load_vora_model(checkpoint_path, device_map="auto", dtype=torch.float16):
+    """Load VoRA model from checkpoint."""
+    print(f"Loading VoRA model from {checkpoint_path} ...")
+    config = VoRAConfig.from_pretrained(checkpoint_path)
+    # Disable aux_vision for inference (not needed)
+    config.aux_vision = ""
+    model = VoRAForCausalLM(config)
+    model.debug_max_steps = 0  # Disable debug prints
+    # Load checkpoint weights
+    from tools.merge_lora import partial_load_from_checkpoints
+    state_dict = partial_load_from_checkpoints(checkpoint_path)
+    msg = model.load_state_dict(state_dict, strict=False)
+    print(f"Load state dict: missing={len(msg.missing_keys)}, unexpected={len(msg.unexpected_keys)}")
+    if msg.missing_keys:
+        print(f"  Missing keys (first 5): {msg.missing_keys[:5]}")
+    model = model.to(dtype=dtype)
+    if device_map == "auto" and torch.cuda.device_count() > 1:
+        from accelerate import dispatch_model, infer_auto_device_map
+        device_map_computed = infer_auto_device_map(model, max_memory={
+            i: "22GiB" for i in range(torch.cuda.device_count())
+        })
+        model = dispatch_model(model, device_map=device_map_computed)
+        print(f"Model dispatched across {torch.cuda.device_count()} GPUs")
+    else:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        model = model.to(device)
+        print(f"Model on {device}")
+    model.eval()
+    return model
+def load_merged_vora_model(merged_path, device_map="auto", dtype=torch.float16):
+    """Load merged (LoRA-free) VoRA model."""
+    print(f"Loading merged VoRA model from {merged_path} ...")
+    model = VoRAForCausalLM.from_pretrained(
+        merged_path,
+        torch_dtype=dtype,
+        device_map=device_map,
+        trust_remote_code=True,
+    )
+    model.debug_max_steps = 0
+    model.eval()
+    return model
+# ============================================================
+# Main
+# ============================================================
+def main():
+    parser = argparse.ArgumentParser(description="VoRA Evaluation")
+    parser.add_argument("--mode", type=str, default="all",
+                        choices=["perplexity", "caption", "all"])
+    parser.add_argument("--checkpoint", type=str, required=True,
+                        help="Path to VoRA checkpoint or merged model directory")
+    parser.add_argument("--merged", action="store_true",
+                        help="If set, load as merged model (no LoRA)")
+    parser.add_argument("--eval-data", type=str, required=True,
+                        help="Path to eval_qwenvl.jsonl")
+    parser.add_argument("--image-processor", type=str, required=True,
+                        help="Path to AIMv2 model for image preprocessing")
+    parser.add_argument("--max-samples", type=int, default=None,
+                        help="Max number of eval samples (default: all)")
+    parser.add_argument("--max-new-tokens", type=int, default=256,
+                        help="Max new tokens for caption generation")
+    parser.add_argument("--dtype", type=str, default="float16",
+                        choices=["float16", "bfloat16"])
+    parser.add_argument("--output", type=str, default=None,
+                        help="Path to save results JSON")
+    args = parser.parse_args()
+    dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
+    # Load model
+    if args.merged:
+        model = load_merged_vora_model(args.checkpoint, dtype=dtype)
+    else:
+        model = load_vora_model(args.checkpoint, dtype=dtype)
+    device = next(model.parameters()).device
+    # Load tokenizer and image processor
+    tokenizer = model.tokenizer
+    image_processor = AutoImageProcessor.from_pretrained(args.image_processor)
+    # Load eval data
+    eval_data = load_eval_data(args.eval_data, max_samples=args.max_samples)
+    results = {"checkpoint": args.checkpoint, "num_samples": len(eval_data)}
+    # Run evaluations
+    if args.mode in ("perplexity", "all"):
+        ppl = evaluate_perplexity(model, tokenizer, image_processor, eval_data, device)
+        results["perplexity"] = ppl
+    if args.mode in ("caption", "all"):
+        caption_metrics = evaluate_caption(
+            model, tokenizer, image_processor, eval_data, device,
+            max_new_tokens=args.max_new_tokens)
+        results.update(caption_metrics)
+    # Save results
+    if args.output:
+        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
+        with open(args.output, "w") as f:
+            json.dump(results, f, indent=2, ensure_ascii=False)
+        print(f"\nResults saved to {args.output}")
+    return results
+if __name__ == "__main__":
+    main()

eval/run_eval.sh ADDED Viewed

	@@ -0,0 +1,213 @@

+#!/bin/bash
+# =============================================================
+# VoRA Evaluation Runner (packaged edition)
+# =============================================================
+#
+# ========== How to use ==========
+#
+# 1. Copy the whole eval_pack/ folder to the target machine
+#
+# 2. Create the conda environment (one-time):
+#      conda create -n eval python=3.10 -y
+#      conda activate eval
+#      cd eval_pack
+#      pip install -r requirements_eval.txt
+#
+# 3. Edit the "path configuration" section below:
+#      - QWEN_MODEL:      path to Qwen2.5-7B-Instruct
+#      - QWEN_VL_MODEL:   path to Qwen2.5-VL-3B-Instruct
+#      - AIMV2_PATH:      path to aimv2-huge-patch14-448
+#      - VORA_CIRC_MERGED: path to the merged VoRA-Circulant model
+#
+# 4. Rewrite the absolute image paths inside data/eval_qwenvl.jsonl:
+#      sed -i 's|/share/home/jcdl1lsy2clx/VoRA/data_dir/VoRA-Recap-29M/frames|/your/frames/path|g' data/eval_qwenvl.jsonl
+#
+#    data/eval_vora.jsonl uses relative paths (frames/xxx.jpg);
+#    make sure the frames/ folder is reachable from the eval_pack/ directory:
+#      ln -s /your/frames/path frames
+#
+# 5. Run:
+#      conda activate eval
+#      cd eval_pack
+#      bash eval/run_eval.sh
+#
+# 6. The results are printed as a table and saved to eval/results/*.json
+#
+# ========== Directory layout ==========
+#
+#   eval_pack/
+#   ├── eval/                <- evaluation scripts
+#   ├── models/              <- VoRA model code
+#   ├── tools/               <- merge_lora.py
+#   ├── generation_files/    <- tokenizer/processor
+#   ├── data/
+#   │   ├── eval_vora.jsonl  <- used by the VoRA models (relative paths)
+#   │   └── eval_qwenvl.jsonl <- used by the Qwen-VL models (absolute paths)
+#   ├── vora_merged_250/     <- VoRA merged (~32G)
+#   ├── lora_merged/         <- Qwen-VL+LoRA merged (~7G)
+#   ├── circulant_merged/    <- Qwen-VL+Circulant merged (~7G)
+#   └── requirements_eval.txt
+#
+# =============================================================
+set -e
+cd "$(dirname "$0")/.."
+# ---------- Path configuration (edit for your environment) ----------
+# Baseline models (already present on the other machine)
+QWEN_MODEL="/path/to/Qwen2.5-7B-Instruct"
+QWEN_VL_MODEL="/path/to/Qwen2.5-VL-3B-Instruct"
+AIMV2_PATH="/path/to/aimv2-huge-patch14-448"
+# Evaluation data
+EVAL_DATA_VORA="data/eval_vora.jsonl"       # used by the VoRA models (relative paths frames/xxx.jpg)
+EVAL_DATA_QWEN="data/eval_qwenvl.jsonl"     # used by the Qwen-VL models (absolute paths, adjust for the target machine)
+DTYPE="float16"
+RESULTS_DIR="eval/results"
+# Models that have already been merged
+VORA_MERGED="vora_merged_250"
+VORA_CIRC_MERGED="/path/to/vora-circulant-merged"
+LORA_MERGED="lora_merged"
+CIRC_MERGED="circulant_merged"
+# -----------------------------------------------
+mkdir -p "$RESULTS_DIR"
+pip install nltk rouge-score qwen-vl-utils 2>/dev/null || true
+python -c "import nltk; nltk.download('punkt', quiet=True); nltk.download('punkt_tab', quiet=True)" 2>/dev/null || true
+# Every step below evaluates at most 200 samples, generates at most 256 new
+# tokens per caption and writes one JSON file into $RESULTS_DIR.
+# Steps 1-4 run unconditionally; because of `set -e` a failing step aborts
+# the whole script.
+echo ""
+echo "=============================================="
+echo "  Step 1: Qwen2.5-7B Text-Only Baseline"
+echo "=============================================="
+python eval/eval_qwen_baseline.py \
+    --mode all \
+    --model-path "$QWEN_MODEL" \
+    --eval-data "$EVAL_DATA_QWEN" \
+    --max-samples 200 \
+    --max-new-tokens 256 \
+    --dtype "$DTYPE" \
+    --output "$RESULTS_DIR/qwen_baseline.json"
+echo ""
+echo "=============================================="
+echo "  Step 2: VoRA Full Eval"
+echo "=============================================="
+# --merged: the checkpoint directory is loaded as an already merged model.
+python eval/eval_vora.py \
+    --mode all \
+    --checkpoint "$VORA_MERGED" \
+    --merged \
+    --eval-data "$EVAL_DATA_VORA" \
+    --image-processor "$AIMV2_PATH" \
+    --max-samples 200 \
+    --max-new-tokens 256 \
+    --dtype "$DTYPE" \
+    --output "$RESULTS_DIR/vora_best.json"
+echo ""
+echo "=============================================="
+echo "  Step 3: VoRA-Circulant Full Eval"
+echo "=============================================="
+python eval/eval_vora.py \
+    --mode all \
+    --checkpoint "$VORA_CIRC_MERGED" \
+    --merged \
+    --eval-data "$EVAL_DATA_VORA" \
+    --image-processor "$AIMV2_PATH" \
+    --max-samples 200 \
+    --max-new-tokens 256 \
+    --dtype "$DTYPE" \
+    --output "$RESULTS_DIR/vora_circulant.json"
+echo ""
+echo "=============================================="
+echo "  Step 4: Qwen2.5-VL-3B Original Baseline"
+echo "=============================================="
+python eval/eval_qwen_vl.py \
+    --mode all \
+    --model-path "$QWEN_VL_MODEL" \
+    --eval-data "$EVAL_DATA_QWEN" \
+    --max-samples 200 \
+    --max-new-tokens 256 \
+    --dtype "$DTYPE" \
+    --output "$RESULTS_DIR/qwen_vl_original.json"
+echo ""
+echo "=============================================="
+echo "  Step 5: Qwen2.5-VL-3B + LoRA"
+echo "=============================================="
+# Steps 5 and 6 are optional: each is skipped with a message when its merged
+# model directory is not present.
+if [ -d "$LORA_MERGED" ]; then
+    python eval/eval_qwen_vl.py \
+        --mode all \
+        --model-path "$LORA_MERGED" \
+        --eval-data "$EVAL_DATA_QWEN" \
+        --max-samples 200 \
+        --max-new-tokens 256 \
+        --dtype "$DTYPE" \
+        --output "$RESULTS_DIR/qwen_vl_lora.json"
+else
+    echo "LoRA merged model not found at $LORA_MERGED, skipping."
+fi
+echo ""
+echo "=============================================="
+echo "  Step 6: Qwen2.5-VL-3B + Block-Circulant"
+echo "=============================================="
+if [ -d "$CIRC_MERGED" ]; then
+    python eval/eval_qwen_vl.py \
+        --mode all \
+        --model-path "$CIRC_MERGED" \
+        --eval-data "$EVAL_DATA_QWEN" \
+        --max-samples 200 \
+        --max-new-tokens 256 \
+        --dtype "$DTYPE" \
+        --output "$RESULTS_DIR/qwen_vl_circulant.json"
+else
+    echo "Block-Circulant merged model not found at $CIRC_MERGED, skipping."
+fi
+echo ""
+echo "=============================================="
+echo "  Summary"
+echo "=============================================="
+# Print one table row per result file that exists. The Python program sits in
+# a double-quoted shell string: the shell substitutes the results directory
+# into it, and double quotes inside the program must stay backslash-escaped.
+python -c "
+import json, os, math
+results_dir = '$RESULTS_DIR'
+print('='*70)
+print(f'{\"Model\":<35} {\"Loss\":>10} {\"PPL\":>8} {\"BLEU-4\":>8} {\"ROUGE-L\":>8}')
+print('-'*70)
+def fmt(value, spec):
+    # None means the metric was not recorded; 0.0 is a legitimate score.
+    return '-' if value is None else format(value, spec)
+def print_row(name, filepath):
+    if not os.path.exists(filepath):
+        return
+    with open(filepath) as fh:
+        r = json.load(fh)
+    ppl = r.get('perplexity')
+    # The loss is recovered from the perplexity; undefined for inf/NaN/non-positive values.
+    loss = math.log(ppl) if ppl is not None and 0 < ppl < float('inf') else None
+    loss_s = fmt(loss, '.4f')
+    ppl_s = fmt(ppl, '.2f')
+    b4_s = fmt(r.get('BLEU-4'), '.4f')
+    rl_s = fmt(r.get('ROUGE-L'), '.4f')
+    print(f'{name:<35} {loss_s:>10} {ppl_s:>8} {b4_s:>8} {rl_s:>8}')
+print_row('Qwen2.5-7B (text-only)', os.path.join(results_dir, 'qwen_baseline.json'))
+print_row('VoRA', os.path.join(results_dir, 'vora_best.json'))
+print_row('VoRA-Circulant', os.path.join(results_dir, 'vora_circulant.json'))
+print('-'*70)
+print_row('Qwen2.5-VL-3B (original)', os.path.join(results_dir, 'qwen_vl_original.json'))
+print_row('Qwen2.5-VL-3B + LoRA', os.path.join(results_dir, 'qwen_vl_lora.json'))
+print_row('Qwen2.5-VL-3B + Circulant', os.path.join(results_dir, 'qwen_vl_circulant.json'))
+print('='*70)
+"
+echo ""
+# The variable is expanded here so the message names the real directory.
+echo "All results saved to $RESULTS_DIR/"
+echo "Done!"

generation_files/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

generation_files/chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% set text_string = namespace(value='') %}{% for content in message['content'] %}{% if 'text' in content %}{% set text_string.value = content['text'] %}{% endif %}{% endfor %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}{% if '<image>' not in text_string.value %}<image>{% endif %}{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+}

generation_files/generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token_id": 151643,
+  "pad_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "repetition_penalty": 1.05,
+  "temperature": 0.7,
+  "top_p": 0.8,
+  "top_k": 20,
+  "transformers_version": "4.37.0"
+}

generation_files/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

generation_files/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "crop_size": 224,
+  "do_center_crop": true,
+  "do_normalize": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "size": 224
+}

generation_files/processing_vora.py ADDED Viewed

	@@ -0,0 +1,150 @@

+import torch
+from typing import List, Union
+from PIL import Image
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+from .modeling_vora import VoRAForCausalLM
+class VoRAProcessorKwargs(ProcessingKwargs, total=False):  # kwargs schema passed to `_merge_kwargs` by the VoRA processor
+    _defaults = {  # default values, grouped per modality
+        "text_kwargs": {
+            "padding": False,  # default `padding` value for the text kwargs group
+        },
+        "images_kwargs": {},  # no defaults are set for the image kwargs group
+    }
+class VoRAProcesser(ProcessorMixin):
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = [
+        "chat_template",
+        "image_token",
+    ]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        chat_template=None,
+        image_token="<image>",  # set the default and let users change if they have peculiar special tokens in rare cases
+        image_token_index = -200,
+        **kwargs,
+    ):
+        self.image_token = image_token
+        self.image_token_index = image_token_index
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        **kwargs: Unpack[VoRAProcessorKwargs],
+    ):
+        if images is None and text is None:
+            raise ValueError("You have to specify at least one of `images` or `text`.")
+        images, text = _validate_images_text_input_order(images, text)
+        output_kwargs = self._merge_kwargs(
+            VoRAProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if images is not None:
+            images = [[self.expand2square(image[0])] for image in images]
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+        else:
+            image_inputs = {}
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+        input_ids = [self.tokenizer_vision_placeholder(t) for t in text]
+        attention_mask = [
+            [1] * len(input_ids[i]) for i in range(len(input_ids))
+        ]
+        text_inputs = dict(
+            input_ids=torch.as_tensor(input_ids, dtype=torch.int64),
+            attention_mask=torch.as_tensor(attention_mask, dtype=torch.int64),
+        )
+        image_inputs['frames'] = image_inputs.pop('pixel_values')
+        image_inputs['n_frames'] = [len(_images) for _images in images]
+        image_inputs['vision_placeholder_index'] = self.image_token_index
+        return BatchFeature(data={**text_inputs, **image_inputs})
+    def expand2square(self, pil_img: Image.Image):
+        background_color = (0, 0, 0)
+        width, height = pil_img.size
+        if width == height:
+            return pil_img
+        elif width > height:
+            result = Image.new(pil_img.mode, (width, width), background_color)
+            result.paste(pil_img, (0, (width - height) // 2))
+            return result
+        else:
+            result = Image.new(pil_img.mode, (height, height), background_color)
+            result.paste(pil_img, ((height - width) // 2, 0))
+            return result
+    def tokenizer_vision_placeholder(self, prompt, add_bos=False):
+        def join_lists(*lists, sep):
+            result = []
+            for i, lst in enumerate(lists):
+                if i > 0 and sep:
+                    result.extend([sep])
+                result.extend(lst)
+            return result
+        prompt_chunks = [self.tokenizer.encode(
+            chunk) for chunk in prompt.split(self.image_token)]
+        input_ids = join_lists(*prompt_chunks, sep=self.image_token_index)
+        if add_bos:
+            input_ids = [self.tokenizer.bos_token_id] + input_ids
+        return input_ids
+if __name__ == '__main__':
+    import torch
+    from transformers import AutoProcessor, AutoModelForCausalLM
+    model_name = "/mnt/bn/wh-data/open_source/models/VoRA-7B-Instruct"
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+    conversation = [
+        {
+            "role":"user",
+            "content":[
+                {
+                    "type":"image",
+                    "url": "/mnt/bn/wh-data/data/datasets/a_demo/frames/35.jpg"
+                },
+                {
+                    "type":"text",
+                    "text":"<image> Describe this image."
+                }
+            ]
+        }
+    ]
+    model_inputs = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=True, return_tensors='pt', return_dict=True).to(model.device)
+    gen_kwargs = {"max_new_tokens": 1024, "pad_token_id": processor.tokenizer.eos_token_id}
+    with torch.inference_mode():
+        outputs = model.generate(model_inputs, **gen_kwargs)
+        output_text = processor.tokenizer.batch_decode(
+            outputs, skip_special_tokens=True
+        )
+        print(output_text)

generation_files/processor_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "image_token": "<image>",
+  "image_token_index": -200,
+  "processor_class": "VoRAProcessing",
+  "auto_map": {"AutoProcessor": "processing_vora.VoRAProcesser"}
+}

generation_files/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

generation_files/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

generation_files/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,209 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "processor_class": "VoRAProcessing",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

generation_files/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

generation_files/vora_generation_utils.py ADDED Viewed

	@@ -0,0 +1,101 @@

+from typing import Any, Dict, Optional
+import torch
+from transformers import GenerationMixin
+from transformers.cache_utils import Cache
+from transformers.utils import ModelOutput
+class VoraGenerationMixin(GenerationMixin):
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ):
+        if attention_mask is not None and attention_mask.ndim == 4:
+            attention_mask_2d = (attention_mask[:, 0, :, :] == 0).any(dim=1).long().to(attention_mask.device)
+            model_input = super().prepare_inputs_for_generation(
+                input_ids,
+                past_key_values=past_key_values,
+                attention_mask=attention_mask_2d,
+                inputs_embeds=inputs_embeds,
+                cache_position=cache_position,
+                **kwargs,
+            )
+            model_input['attention_mask'] = attention_mask
+            return model_input
+        else:
+            return super().prepare_inputs_for_generation(
+                input_ids,
+                past_key_values=past_key_values,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                cache_position=cache_position,
+                **kwargs,
+            )
+    def _update_model_kwargs_for_generation(
+        self,
+        outputs: ModelOutput,
+        model_kwargs: Dict[str, Any],
+        is_encoder_decoder: bool = False,
+        num_new_tokens: int = 1,
+    ) -> Dict[str, Any]:
+        if "attention_mask" in model_kwargs and model_kwargs["attention_mask"].ndim == 4:
+            attention_mask = model_kwargs.pop("attention_mask")
+            model_kwargs = super()._update_model_kwargs_for_generation(
+                outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder, num_new_tokens=num_new_tokens
+            )
+            bs, _, seq_len, tgt_len = attention_mask.shape
+            dtype = attention_mask.dtype
+            min_dtype = torch.finfo(dtype).min
+            new_col = attention_mask.new_zeros((bs, 1, seq_len, 1)).fill_(min_dtype)
+            new_row = attention_mask.new_zeros((bs, 1, 1, tgt_len + 1))
+            model_kwargs["attention_mask"] = torch.cat([
+                torch.cat([attention_mask, new_col], dim=-1),
+                new_row
+            ], dim=2)
+            return model_kwargs
+        else:
+            return super()._update_model_kwargs_for_generation(
+                outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder, num_new_tokens=num_new_tokens
+            )
+def custom_prepare_4d_causal_attention_mask_with_cache_position(  # return a 4-D mask: crop a supplied 4-D mask, or build a causal one and merge a lower-rank mask into it
+    attention_mask: torch.Tensor,  # None, a 4-D mask (cropped and returned), or a mask indexed as [:, None, None, :] below
+    sequence_length: int,  # number of rows (dim -2) of the result
+    target_length: int,  # number of columns (dim -1) of the result
+    dtype: torch.dtype,  # dtype of a freshly built mask; torch.finfo(dtype) is taken on it
+    device: torch.device,  # device of a freshly built mask
+    cache_position: torch.Tensor,  # reshaped to (-1, 1) and compared against the column indices
+    batch_size: int,  # size the built mask is expanded to along dim 0
+    **kwargs,  # accepted but not read in this function
+):
+    if attention_mask is not None and attention_mask.dim() == 4:
+        # In this case we assume that the mask comes already in inverted form and requires no inversion; it is only cropped to the last `sequence_length` rows and `target_length` columns.
+        causal_mask = attention_mask[:, :, -sequence_length:, -target_length:]  # dtype, device, cache_position and batch_size are not used on this path
+    else:
+        min_dtype = torch.finfo(dtype).min  # most negative finite value of `dtype`, used as the fill for blocked entries
+        causal_mask = torch.full(
+            (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+        )
+        if sequence_length != 1:
+            causal_mask = torch.triu(causal_mask, diagonal=1)  # zero the diagonal and everything below it
+        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)  # zero every column whose index is <= that row's cache position
+        causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)  # add two leading dims and expand dim 0 to batch_size
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            mask_length = attention_mask.shape[-1]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]  # assuming attention_mask holds 0/1, the sum is 0 exactly where a zero causal entry meets a 0 in attention_mask (TODO confirm mask values)
+            padding_mask = padding_mask == 0
+            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                padding_mask, min_dtype
+            )  # those positions are overwritten with min_dtype as well
+    return causal_mask

lora_merged/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

lora_merged/chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+}

lora_merged/config.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "_name_or_path": "Qwen2.5-VL-3B-Instruct",
+  "architectures": [
+    "Qwen2_5_VLForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "image_token_id": 151655,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 128000,
+  "max_window_layers": 70,
+  "model_type": "qwen2_5_vl",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "mrope_section": [
+      16,
+      24,
+      24
+    ],
+    "rope_type": "default",
+    "type": "default"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "video_token_id": 151656,
+  "vision_config": {
+    "hidden_size": 1280,
+    "in_chans": 3,
+    "model_type": "qwen2_5_vl",
+    "out_hidden_size": 2048,
+    "spatial_patch_size": 14,
+    "tokens_per_second": 2,
+    "torch_dtype": "float32"
+  },
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652,
+  "vision_token_id": 151654,
+  "vocab_size": 151936
+}

lora_merged/generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.05,
+  "temperature": 1e-06,
+  "transformers_version": "4.49.0"
+}

lora_merged/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

lora_merged/model-00001-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a88d3bd0a1ee8f0d26e039c20c14dcf0286a86bfe4592c8a64f105e44552c02
+size 997996256

lora_merged/model-00002-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:44593ca14843f5bc2406fc225a4492c5b344a0a4c5c3d148147cecbfdf1207cd
+size 980624160

lora_merged/model-00003-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:292bb3f2b30726c82ebdeeb577266c8a98c903bf524c6966dfb801c501f6ff2d
+size 970020872

lora_merged/model-00004-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bad74a6f40948fd7313c5dcc99ac82345a961f425f190a55d9989bfd707d7fd0
+size 970020904

lora_merged/model-00005-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:60dac3409a978722733df7c0f7136330d7558f3d3eec26bdf817faa9727e2df2
+size 988909632

lora_merged/model-00006-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6025d52fdda9e22c470f37e688965a3c706d2e93e4c48c0d2f7858d1ccd95e76
+size 970020944

lora_merged/model-00007-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:494697cb7e9d023ea9376118be73a1acb9d5b1eae13aaf5fe336cb1a73bf55d4
+size 970020936