Upload folder using huggingface_hub
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +4 -0
- circulant_merged/added_tokens.json +24 -0
- circulant_merged/chat_template.json +3 -0
- circulant_merged/config.json +51 -0
- circulant_merged/generation_config.json +12 -0
- circulant_merged/merges.txt +0 -0
- circulant_merged/model-00001-of-00008.safetensors +3 -0
- circulant_merged/model-00002-of-00008.safetensors +3 -0
- circulant_merged/model-00003-of-00008.safetensors +3 -0
- circulant_merged/model-00004-of-00008.safetensors +3 -0
- circulant_merged/model-00005-of-00008.safetensors +3 -0
- circulant_merged/model-00006-of-00008.safetensors +3 -0
- circulant_merged/model-00007-of-00008.safetensors +3 -0
- circulant_merged/model-00008-of-00008.safetensors +3 -0
- circulant_merged/model.safetensors.index.json +831 -0
- circulant_merged/preprocessor_config.json +29 -0
- circulant_merged/special_tokens_map.json +31 -0
- circulant_merged/tokenizer.json +3 -0
- circulant_merged/tokenizer_config.json +210 -0
- circulant_merged/vocab.json +0 -0
- data/eval_qwenvl.jsonl +0 -0
- data/eval_vora.jsonl +0 -0
- eval/eval_qwen_baseline.py +222 -0
- eval/eval_qwen_vl.py +341 -0
- eval/eval_vora.py +430 -0
- eval/run_eval.sh +213 -0
- generation_files/added_tokens.json +24 -0
- generation_files/chat_template.json +3 -0
- generation_files/generation_config.json +14 -0
- generation_files/merges.txt +0 -0
- generation_files/preprocessor_config.json +19 -0
- generation_files/processing_vora.py +150 -0
- generation_files/processor_config.json +6 -0
- generation_files/special_tokens_map.json +31 -0
- generation_files/tokenizer.json +3 -0
- generation_files/tokenizer_config.json +209 -0
- generation_files/vocab.json +0 -0
- generation_files/vora_generation_utils.py +101 -0
- lora_merged/added_tokens.json +24 -0
- lora_merged/chat_template.json +3 -0
- lora_merged/config.json +51 -0
- lora_merged/generation_config.json +12 -0
- lora_merged/merges.txt +0 -0
- lora_merged/model-00001-of-00008.safetensors +3 -0
- lora_merged/model-00002-of-00008.safetensors +3 -0
- lora_merged/model-00003-of-00008.safetensors +3 -0
- lora_merged/model-00004-of-00008.safetensors +3 -0
- lora_merged/model-00005-of-00008.safetensors +3 -0
- lora_merged/model-00006-of-00008.safetensors +3 -0
- lora_merged/model-00007-of-00008.safetensors +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
circulant_merged/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
generation_files/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
lora_merged/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
vora_merged_250/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
circulant_merged/added_tokens.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</tool_call>": 151658,
|
| 3 |
+
"<tool_call>": 151657,
|
| 4 |
+
"<|box_end|>": 151649,
|
| 5 |
+
"<|box_start|>": 151648,
|
| 6 |
+
"<|endoftext|>": 151643,
|
| 7 |
+
"<|file_sep|>": 151664,
|
| 8 |
+
"<|fim_middle|>": 151660,
|
| 9 |
+
"<|fim_pad|>": 151662,
|
| 10 |
+
"<|fim_prefix|>": 151659,
|
| 11 |
+
"<|fim_suffix|>": 151661,
|
| 12 |
+
"<|im_end|>": 151645,
|
| 13 |
+
"<|im_start|>": 151644,
|
| 14 |
+
"<|image_pad|>": 151655,
|
| 15 |
+
"<|object_ref_end|>": 151647,
|
| 16 |
+
"<|object_ref_start|>": 151646,
|
| 17 |
+
"<|quad_end|>": 151651,
|
| 18 |
+
"<|quad_start|>": 151650,
|
| 19 |
+
"<|repo_name|>": 151663,
|
| 20 |
+
"<|video_pad|>": 151656,
|
| 21 |
+
"<|vision_end|>": 151653,
|
| 22 |
+
"<|vision_pad|>": 151654,
|
| 23 |
+
"<|vision_start|>": 151652
|
| 24 |
+
}
|
circulant_merged/chat_template.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
|
| 3 |
+
}
|
circulant_merged/config.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "Qwen2.5-VL-3B-Instruct",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"Qwen2_5_VLForConditionalGeneration"
|
| 5 |
+
],
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 151643,
|
| 8 |
+
"eos_token_id": 151645,
|
| 9 |
+
"hidden_act": "silu",
|
| 10 |
+
"hidden_size": 2048,
|
| 11 |
+
"image_token_id": 151655,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 11008,
|
| 14 |
+
"max_position_embeddings": 128000,
|
| 15 |
+
"max_window_layers": 70,
|
| 16 |
+
"model_type": "qwen2_5_vl",
|
| 17 |
+
"num_attention_heads": 16,
|
| 18 |
+
"num_hidden_layers": 36,
|
| 19 |
+
"num_key_value_heads": 2,
|
| 20 |
+
"rms_norm_eps": 1e-06,
|
| 21 |
+
"rope_scaling": {
|
| 22 |
+
"mrope_section": [
|
| 23 |
+
16,
|
| 24 |
+
24,
|
| 25 |
+
24
|
| 26 |
+
],
|
| 27 |
+
"rope_type": "default",
|
| 28 |
+
"type": "default"
|
| 29 |
+
},
|
| 30 |
+
"rope_theta": 1000000.0,
|
| 31 |
+
"sliding_window": 32768,
|
| 32 |
+
"tie_word_embeddings": true,
|
| 33 |
+
"torch_dtype": "bfloat16",
|
| 34 |
+
"transformers_version": "4.49.0",
|
| 35 |
+
"use_cache": true,
|
| 36 |
+
"use_sliding_window": false,
|
| 37 |
+
"video_token_id": 151656,
|
| 38 |
+
"vision_config": {
|
| 39 |
+
"hidden_size": 1280,
|
| 40 |
+
"in_chans": 3,
|
| 41 |
+
"model_type": "qwen2_5_vl",
|
| 42 |
+
"out_hidden_size": 2048,
|
| 43 |
+
"spatial_patch_size": 14,
|
| 44 |
+
"tokens_per_second": 2,
|
| 45 |
+
"torch_dtype": "float32"
|
| 46 |
+
},
|
| 47 |
+
"vision_end_token_id": 151653,
|
| 48 |
+
"vision_start_token_id": 151652,
|
| 49 |
+
"vision_token_id": 151654,
|
| 50 |
+
"vocab_size": 151936
|
| 51 |
+
}
|
circulant_merged/generation_config.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 151643,
|
| 3 |
+
"do_sample": true,
|
| 4 |
+
"eos_token_id": [
|
| 5 |
+
151645,
|
| 6 |
+
151643
|
| 7 |
+
],
|
| 8 |
+
"pad_token_id": 151643,
|
| 9 |
+
"repetition_penalty": 1.05,
|
| 10 |
+
"temperature": 1e-06,
|
| 11 |
+
"transformers_version": "4.49.0"
|
| 12 |
+
}
|
circulant_merged/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
circulant_merged/model-00001-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a88d3bd0a1ee8f0d26e039c20c14dcf0286a86bfe4592c8a64f105e44552c02
|
| 3 |
+
size 997996256
|
circulant_merged/model-00002-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:477d86c384a7671e0a2a5181afbe4433a235de7d0106115f383bc6545c8cd5fa
|
| 3 |
+
size 980624160
|
circulant_merged/model-00003-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e04dbcd85cebc93cadfc9a7af020bbc3545d996990029d1e787dc90979756ab0
|
| 3 |
+
size 970020872
|
circulant_merged/model-00004-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f7dc2cca7ab7494ca77c3d4842950ba4d0a609cae47bda5e8efb55b3b898d345
|
| 3 |
+
size 970020904
|
circulant_merged/model-00005-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8491735ed17f37c7ad1b06732c1badf8179a6175e6881d878b2a325e8762eb6d
|
| 3 |
+
size 988909632
|
circulant_merged/model-00006-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:310e84bec8d3312e702e561ee769e64a80ef45c3e198452e9a754a279c52857b
|
| 3 |
+
size 970020944
|
circulant_merged/model-00007-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d94936fb64842ed59c634660010738a11ad810c1b310225734b83cddc9526e52
|
| 3 |
+
size 970020936
|
circulant_merged/model-00008-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfa2f969bf4caf7d68d44a9804d1ef31203c1bf0b9baaef95b2525d64a82f5b0
|
| 3 |
+
size 661722864
|
circulant_merged/model.safetensors.index.json
ADDED
|
@@ -0,0 +1,831 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_size": 7509245952
|
| 4 |
+
},
|
| 5 |
+
"weight_map": {
|
| 6 |
+
"model.embed_tokens.weight": "model-00002-of-00008.safetensors",
|
| 7 |
+
"model.layers.0.input_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 8 |
+
"model.layers.0.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
|
| 9 |
+
"model.layers.0.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
|
| 10 |
+
"model.layers.0.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
|
| 11 |
+
"model.layers.0.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 12 |
+
"model.layers.0.self_attn.k_proj.bias": "model-00002-of-00008.safetensors",
|
| 13 |
+
"model.layers.0.self_attn.k_proj.weight": "model-00002-of-00008.safetensors",
|
| 14 |
+
"model.layers.0.self_attn.o_proj.weight": "model-00002-of-00008.safetensors",
|
| 15 |
+
"model.layers.0.self_attn.q_proj.bias": "model-00002-of-00008.safetensors",
|
| 16 |
+
"model.layers.0.self_attn.q_proj.weight": "model-00002-of-00008.safetensors",
|
| 17 |
+
"model.layers.0.self_attn.v_proj.bias": "model-00002-of-00008.safetensors",
|
| 18 |
+
"model.layers.0.self_attn.v_proj.weight": "model-00002-of-00008.safetensors",
|
| 19 |
+
"model.layers.1.input_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 20 |
+
"model.layers.1.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
|
| 21 |
+
"model.layers.1.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
|
| 22 |
+
"model.layers.1.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
|
| 23 |
+
"model.layers.1.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 24 |
+
"model.layers.1.self_attn.k_proj.bias": "model-00003-of-00008.safetensors",
|
| 25 |
+
"model.layers.1.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
|
| 26 |
+
"model.layers.1.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
|
| 27 |
+
"model.layers.1.self_attn.q_proj.bias": "model-00003-of-00008.safetensors",
|
| 28 |
+
"model.layers.1.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
|
| 29 |
+
"model.layers.1.self_attn.v_proj.bias": "model-00003-of-00008.safetensors",
|
| 30 |
+
"model.layers.1.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
|
| 31 |
+
"model.layers.10.input_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 32 |
+
"model.layers.10.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
|
| 33 |
+
"model.layers.10.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
|
| 34 |
+
"model.layers.10.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
|
| 35 |
+
"model.layers.10.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 36 |
+
"model.layers.10.self_attn.k_proj.bias": "model-00004-of-00008.safetensors",
|
| 37 |
+
"model.layers.10.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
|
| 38 |
+
"model.layers.10.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
|
| 39 |
+
"model.layers.10.self_attn.q_proj.bias": "model-00004-of-00008.safetensors",
|
| 40 |
+
"model.layers.10.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
|
| 41 |
+
"model.layers.10.self_attn.v_proj.bias": "model-00004-of-00008.safetensors",
|
| 42 |
+
"model.layers.10.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
|
| 43 |
+
"model.layers.11.input_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 44 |
+
"model.layers.11.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
|
| 45 |
+
"model.layers.11.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
|
| 46 |
+
"model.layers.11.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
|
| 47 |
+
"model.layers.11.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 48 |
+
"model.layers.11.self_attn.k_proj.bias": "model-00004-of-00008.safetensors",
|
| 49 |
+
"model.layers.11.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
|
| 50 |
+
"model.layers.11.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
|
| 51 |
+
"model.layers.11.self_attn.q_proj.bias": "model-00004-of-00008.safetensors",
|
| 52 |
+
"model.layers.11.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
|
| 53 |
+
"model.layers.11.self_attn.v_proj.bias": "model-00004-of-00008.safetensors",
|
| 54 |
+
"model.layers.11.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
|
| 55 |
+
"model.layers.12.input_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 56 |
+
"model.layers.12.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
|
| 57 |
+
"model.layers.12.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
|
| 58 |
+
"model.layers.12.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
|
| 59 |
+
"model.layers.12.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 60 |
+
"model.layers.12.self_attn.k_proj.bias": "model-00004-of-00008.safetensors",
|
| 61 |
+
"model.layers.12.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
|
| 62 |
+
"model.layers.12.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
|
| 63 |
+
"model.layers.12.self_attn.q_proj.bias": "model-00004-of-00008.safetensors",
|
| 64 |
+
"model.layers.12.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
|
| 65 |
+
"model.layers.12.self_attn.v_proj.bias": "model-00004-of-00008.safetensors",
|
| 66 |
+
"model.layers.12.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
|
| 67 |
+
"model.layers.13.input_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 68 |
+
"model.layers.13.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
|
| 69 |
+
"model.layers.13.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
|
| 70 |
+
"model.layers.13.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
|
| 71 |
+
"model.layers.13.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 72 |
+
"model.layers.13.self_attn.k_proj.bias": "model-00005-of-00008.safetensors",
|
| 73 |
+
"model.layers.13.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
|
| 74 |
+
"model.layers.13.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
|
| 75 |
+
"model.layers.13.self_attn.q_proj.bias": "model-00005-of-00008.safetensors",
|
| 76 |
+
"model.layers.13.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
|
| 77 |
+
"model.layers.13.self_attn.v_proj.bias": "model-00005-of-00008.safetensors",
|
| 78 |
+
"model.layers.13.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
|
| 79 |
+
"model.layers.14.input_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 80 |
+
"model.layers.14.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
|
| 81 |
+
"model.layers.14.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
|
| 82 |
+
"model.layers.14.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
|
| 83 |
+
"model.layers.14.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 84 |
+
"model.layers.14.self_attn.k_proj.bias": "model-00005-of-00008.safetensors",
|
| 85 |
+
"model.layers.14.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
|
| 86 |
+
"model.layers.14.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
|
| 87 |
+
"model.layers.14.self_attn.q_proj.bias": "model-00005-of-00008.safetensors",
|
| 88 |
+
"model.layers.14.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
|
| 89 |
+
"model.layers.14.self_attn.v_proj.bias": "model-00005-of-00008.safetensors",
|
| 90 |
+
"model.layers.14.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
|
| 91 |
+
"model.layers.15.input_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 92 |
+
"model.layers.15.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
|
| 93 |
+
"model.layers.15.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
|
| 94 |
+
"model.layers.15.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
|
| 95 |
+
"model.layers.15.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 96 |
+
"model.layers.15.self_attn.k_proj.bias": "model-00005-of-00008.safetensors",
|
| 97 |
+
"model.layers.15.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
|
| 98 |
+
"model.layers.15.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
|
| 99 |
+
"model.layers.15.self_attn.q_proj.bias": "model-00005-of-00008.safetensors",
|
| 100 |
+
"model.layers.15.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
|
| 101 |
+
"model.layers.15.self_attn.v_proj.bias": "model-00005-of-00008.safetensors",
|
| 102 |
+
"model.layers.15.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
|
| 103 |
+
"model.layers.16.input_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 104 |
+
"model.layers.16.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
|
| 105 |
+
"model.layers.16.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
|
| 106 |
+
"model.layers.16.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
|
| 107 |
+
"model.layers.16.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 108 |
+
"model.layers.16.self_attn.k_proj.bias": "model-00005-of-00008.safetensors",
|
| 109 |
+
"model.layers.16.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
|
| 110 |
+
"model.layers.16.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
|
| 111 |
+
"model.layers.16.self_attn.q_proj.bias": "model-00005-of-00008.safetensors",
|
| 112 |
+
"model.layers.16.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
|
| 113 |
+
"model.layers.16.self_attn.v_proj.bias": "model-00005-of-00008.safetensors",
|
| 114 |
+
"model.layers.16.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
|
| 115 |
+
"model.layers.17.input_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 116 |
+
"model.layers.17.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
|
| 117 |
+
"model.layers.17.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
|
| 118 |
+
"model.layers.17.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
|
| 119 |
+
"model.layers.17.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 120 |
+
"model.layers.17.self_attn.k_proj.bias": "model-00005-of-00008.safetensors",
|
| 121 |
+
"model.layers.17.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
|
| 122 |
+
"model.layers.17.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
|
| 123 |
+
"model.layers.17.self_attn.q_proj.bias": "model-00005-of-00008.safetensors",
|
| 124 |
+
"model.layers.17.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
|
| 125 |
+
"model.layers.17.self_attn.v_proj.bias": "model-00005-of-00008.safetensors",
|
| 126 |
+
"model.layers.17.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
|
| 127 |
+
"model.layers.18.input_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 128 |
+
"model.layers.18.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
|
| 129 |
+
"model.layers.18.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
|
| 130 |
+
"model.layers.18.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
|
| 131 |
+
"model.layers.18.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 132 |
+
"model.layers.18.self_attn.k_proj.bias": "model-00005-of-00008.safetensors",
|
| 133 |
+
"model.layers.18.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
|
| 134 |
+
"model.layers.18.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
|
| 135 |
+
"model.layers.18.self_attn.q_proj.bias": "model-00005-of-00008.safetensors",
|
| 136 |
+
"model.layers.18.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
|
| 137 |
+
"model.layers.18.self_attn.v_proj.bias": "model-00005-of-00008.safetensors",
|
| 138 |
+
"model.layers.18.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
|
| 139 |
+
"model.layers.19.input_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 140 |
+
"model.layers.19.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
|
| 141 |
+
"model.layers.19.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
|
| 142 |
+
"model.layers.19.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
|
| 143 |
+
"model.layers.19.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 144 |
+
"model.layers.19.self_attn.k_proj.bias": "model-00005-of-00008.safetensors",
|
| 145 |
+
"model.layers.19.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
|
| 146 |
+
"model.layers.19.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
|
| 147 |
+
"model.layers.19.self_attn.q_proj.bias": "model-00005-of-00008.safetensors",
|
| 148 |
+
"model.layers.19.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
|
| 149 |
+
"model.layers.19.self_attn.v_proj.bias": "model-00005-of-00008.safetensors",
|
| 150 |
+
"model.layers.19.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
|
| 151 |
+
"model.layers.2.input_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 152 |
+
"model.layers.2.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
|
| 153 |
+
"model.layers.2.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
|
| 154 |
+
"model.layers.2.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
|
| 155 |
+
"model.layers.2.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 156 |
+
"model.layers.2.self_attn.k_proj.bias": "model-00003-of-00008.safetensors",
|
| 157 |
+
"model.layers.2.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
|
| 158 |
+
"model.layers.2.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
|
| 159 |
+
"model.layers.2.self_attn.q_proj.bias": "model-00003-of-00008.safetensors",
|
| 160 |
+
"model.layers.2.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
|
| 161 |
+
"model.layers.2.self_attn.v_proj.bias": "model-00003-of-00008.safetensors",
|
| 162 |
+
"model.layers.2.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
|
| 163 |
+
"model.layers.20.input_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 164 |
+
"model.layers.20.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
|
| 165 |
+
"model.layers.20.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
|
| 166 |
+
"model.layers.20.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
|
| 167 |
+
"model.layers.20.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 168 |
+
"model.layers.20.self_attn.k_proj.bias": "model-00006-of-00008.safetensors",
|
| 169 |
+
"model.layers.20.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
|
| 170 |
+
"model.layers.20.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
|
| 171 |
+
"model.layers.20.self_attn.q_proj.bias": "model-00006-of-00008.safetensors",
|
| 172 |
+
"model.layers.20.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
|
| 173 |
+
"model.layers.20.self_attn.v_proj.bias": "model-00006-of-00008.safetensors",
|
| 174 |
+
"model.layers.20.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
|
| 175 |
+
"model.layers.21.input_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 176 |
+
"model.layers.21.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
|
| 177 |
+
"model.layers.21.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
|
| 178 |
+
"model.layers.21.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
|
| 179 |
+
"model.layers.21.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 180 |
+
"model.layers.21.self_attn.k_proj.bias": "model-00006-of-00008.safetensors",
|
| 181 |
+
"model.layers.21.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
|
| 182 |
+
"model.layers.21.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
|
| 183 |
+
"model.layers.21.self_attn.q_proj.bias": "model-00006-of-00008.safetensors",
|
| 184 |
+
"model.layers.21.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
|
| 185 |
+
"model.layers.21.self_attn.v_proj.bias": "model-00006-of-00008.safetensors",
|
| 186 |
+
"model.layers.21.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
|
| 187 |
+
"model.layers.22.input_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 188 |
+
"model.layers.22.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
|
| 189 |
+
"model.layers.22.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
|
| 190 |
+
"model.layers.22.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
|
| 191 |
+
"model.layers.22.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 192 |
+
"model.layers.22.self_attn.k_proj.bias": "model-00006-of-00008.safetensors",
|
| 193 |
+
"model.layers.22.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
|
| 194 |
+
"model.layers.22.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
|
| 195 |
+
"model.layers.22.self_attn.q_proj.bias": "model-00006-of-00008.safetensors",
|
| 196 |
+
"model.layers.22.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
|
| 197 |
+
"model.layers.22.self_attn.v_proj.bias": "model-00006-of-00008.safetensors",
|
| 198 |
+
"model.layers.22.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
|
| 199 |
+
"model.layers.23.input_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 200 |
+
"model.layers.23.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
|
| 201 |
+
"model.layers.23.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
|
| 202 |
+
"model.layers.23.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
|
| 203 |
+
"model.layers.23.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 204 |
+
"model.layers.23.self_attn.k_proj.bias": "model-00006-of-00008.safetensors",
|
| 205 |
+
"model.layers.23.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
|
| 206 |
+
"model.layers.23.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
|
| 207 |
+
"model.layers.23.self_attn.q_proj.bias": "model-00006-of-00008.safetensors",
|
| 208 |
+
"model.layers.23.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
|
| 209 |
+
"model.layers.23.self_attn.v_proj.bias": "model-00006-of-00008.safetensors",
|
| 210 |
+
"model.layers.23.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
|
| 211 |
+
"model.layers.24.input_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 212 |
+
"model.layers.24.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
|
| 213 |
+
"model.layers.24.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
|
| 214 |
+
"model.layers.24.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
|
| 215 |
+
"model.layers.24.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 216 |
+
"model.layers.24.self_attn.k_proj.bias": "model-00006-of-00008.safetensors",
|
| 217 |
+
"model.layers.24.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
|
| 218 |
+
"model.layers.24.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
|
| 219 |
+
"model.layers.24.self_attn.q_proj.bias": "model-00006-of-00008.safetensors",
|
| 220 |
+
"model.layers.24.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
|
| 221 |
+
"model.layers.24.self_attn.v_proj.bias": "model-00006-of-00008.safetensors",
|
| 222 |
+
"model.layers.24.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
|
| 223 |
+
"model.layers.25.input_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 224 |
+
"model.layers.25.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
|
| 225 |
+
"model.layers.25.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
|
| 226 |
+
"model.layers.25.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
|
| 227 |
+
"model.layers.25.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 228 |
+
"model.layers.25.self_attn.k_proj.bias": "model-00006-of-00008.safetensors",
|
| 229 |
+
"model.layers.25.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
|
| 230 |
+
"model.layers.25.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
|
| 231 |
+
"model.layers.25.self_attn.q_proj.bias": "model-00006-of-00008.safetensors",
|
| 232 |
+
"model.layers.25.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
|
| 233 |
+
"model.layers.25.self_attn.v_proj.bias": "model-00006-of-00008.safetensors",
|
| 234 |
+
"model.layers.25.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
|
| 235 |
+
"model.layers.26.input_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 236 |
+
"model.layers.26.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
|
| 237 |
+
"model.layers.26.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
|
| 238 |
+
"model.layers.26.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
|
| 239 |
+
"model.layers.26.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 240 |
+
"model.layers.26.self_attn.k_proj.bias": "model-00007-of-00008.safetensors",
|
| 241 |
+
"model.layers.26.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
|
| 242 |
+
"model.layers.26.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
|
| 243 |
+
"model.layers.26.self_attn.q_proj.bias": "model-00007-of-00008.safetensors",
|
| 244 |
+
"model.layers.26.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
|
| 245 |
+
"model.layers.26.self_attn.v_proj.bias": "model-00007-of-00008.safetensors",
|
| 246 |
+
"model.layers.26.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
|
| 247 |
+
"model.layers.27.input_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 248 |
+
"model.layers.27.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
|
| 249 |
+
"model.layers.27.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
|
| 250 |
+
"model.layers.27.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
|
| 251 |
+
"model.layers.27.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 252 |
+
"model.layers.27.self_attn.k_proj.bias": "model-00007-of-00008.safetensors",
|
| 253 |
+
"model.layers.27.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
|
| 254 |
+
"model.layers.27.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
|
| 255 |
+
"model.layers.27.self_attn.q_proj.bias": "model-00007-of-00008.safetensors",
|
| 256 |
+
"model.layers.27.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
|
| 257 |
+
"model.layers.27.self_attn.v_proj.bias": "model-00007-of-00008.safetensors",
|
| 258 |
+
"model.layers.27.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
|
| 259 |
+
"model.layers.28.input_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 260 |
+
"model.layers.28.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
|
| 261 |
+
"model.layers.28.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
|
| 262 |
+
"model.layers.28.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
|
| 263 |
+
"model.layers.28.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 264 |
+
"model.layers.28.self_attn.k_proj.bias": "model-00007-of-00008.safetensors",
|
| 265 |
+
"model.layers.28.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
|
| 266 |
+
"model.layers.28.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
|
| 267 |
+
"model.layers.28.self_attn.q_proj.bias": "model-00007-of-00008.safetensors",
|
| 268 |
+
"model.layers.28.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
|
| 269 |
+
"model.layers.28.self_attn.v_proj.bias": "model-00007-of-00008.safetensors",
|
| 270 |
+
"model.layers.28.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
|
| 271 |
+
"model.layers.29.input_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 272 |
+
"model.layers.29.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
|
| 273 |
+
"model.layers.29.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
|
| 274 |
+
"model.layers.29.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
|
| 275 |
+
"model.layers.29.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 276 |
+
"model.layers.29.self_attn.k_proj.bias": "model-00007-of-00008.safetensors",
|
| 277 |
+
"model.layers.29.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
|
| 278 |
+
"model.layers.29.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
|
| 279 |
+
"model.layers.29.self_attn.q_proj.bias": "model-00007-of-00008.safetensors",
|
| 280 |
+
"model.layers.29.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
|
| 281 |
+
"model.layers.29.self_attn.v_proj.bias": "model-00007-of-00008.safetensors",
|
| 282 |
+
"model.layers.29.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
|
| 283 |
+
"model.layers.3.input_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 284 |
+
"model.layers.3.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
|
| 285 |
+
"model.layers.3.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
|
| 286 |
+
"model.layers.3.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
|
| 287 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 288 |
+
"model.layers.3.self_attn.k_proj.bias": "model-00003-of-00008.safetensors",
|
| 289 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
|
| 290 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
|
| 291 |
+
"model.layers.3.self_attn.q_proj.bias": "model-00003-of-00008.safetensors",
|
| 292 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
|
| 293 |
+
"model.layers.3.self_attn.v_proj.bias": "model-00003-of-00008.safetensors",
|
| 294 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
|
| 295 |
+
"model.layers.30.input_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 296 |
+
"model.layers.30.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
|
| 297 |
+
"model.layers.30.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
|
| 298 |
+
"model.layers.30.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
|
| 299 |
+
"model.layers.30.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 300 |
+
"model.layers.30.self_attn.k_proj.bias": "model-00007-of-00008.safetensors",
|
| 301 |
+
"model.layers.30.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
|
| 302 |
+
"model.layers.30.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
|
| 303 |
+
"model.layers.30.self_attn.q_proj.bias": "model-00007-of-00008.safetensors",
|
| 304 |
+
"model.layers.30.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
|
| 305 |
+
"model.layers.30.self_attn.v_proj.bias": "model-00007-of-00008.safetensors",
|
| 306 |
+
"model.layers.30.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
|
| 307 |
+
"model.layers.31.input_layernorm.weight": "model-00008-of-00008.safetensors",
|
| 308 |
+
"model.layers.31.mlp.down_proj.weight": "model-00008-of-00008.safetensors",
|
| 309 |
+
"model.layers.31.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
|
| 310 |
+
"model.layers.31.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
|
| 311 |
+
"model.layers.31.post_attention_layernorm.weight": "model-00008-of-00008.safetensors",
|
| 312 |
+
"model.layers.31.self_attn.k_proj.bias": "model-00007-of-00008.safetensors",
|
| 313 |
+
"model.layers.31.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
|
| 314 |
+
"model.layers.31.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
|
| 315 |
+
"model.layers.31.self_attn.q_proj.bias": "model-00007-of-00008.safetensors",
|
| 316 |
+
"model.layers.31.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
|
| 317 |
+
"model.layers.31.self_attn.v_proj.bias": "model-00007-of-00008.safetensors",
|
| 318 |
+
"model.layers.31.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
|
| 319 |
+
"model.layers.32.input_layernorm.weight": "model-00008-of-00008.safetensors",
|
| 320 |
+
"model.layers.32.mlp.down_proj.weight": "model-00008-of-00008.safetensors",
|
| 321 |
+
"model.layers.32.mlp.gate_proj.weight": "model-00008-of-00008.safetensors",
|
| 322 |
+
"model.layers.32.mlp.up_proj.weight": "model-00008-of-00008.safetensors",
|
| 323 |
+
"model.layers.32.post_attention_layernorm.weight": "model-00008-of-00008.safetensors",
|
| 324 |
+
"model.layers.32.self_attn.k_proj.bias": "model-00008-of-00008.safetensors",
|
| 325 |
+
"model.layers.32.self_attn.k_proj.weight": "model-00008-of-00008.safetensors",
|
| 326 |
+
"model.layers.32.self_attn.o_proj.weight": "model-00008-of-00008.safetensors",
|
| 327 |
+
"model.layers.32.self_attn.q_proj.bias": "model-00008-of-00008.safetensors",
|
| 328 |
+
"model.layers.32.self_attn.q_proj.weight": "model-00008-of-00008.safetensors",
|
| 329 |
+
"model.layers.32.self_attn.v_proj.bias": "model-00008-of-00008.safetensors",
|
| 330 |
+
"model.layers.32.self_attn.v_proj.weight": "model-00008-of-00008.safetensors",
|
| 331 |
+
"model.layers.33.input_layernorm.weight": "model-00008-of-00008.safetensors",
|
| 332 |
+
"model.layers.33.mlp.down_proj.weight": "model-00008-of-00008.safetensors",
|
| 333 |
+
"model.layers.33.mlp.gate_proj.weight": "model-00008-of-00008.safetensors",
|
| 334 |
+
"model.layers.33.mlp.up_proj.weight": "model-00008-of-00008.safetensors",
|
| 335 |
+
"model.layers.33.post_attention_layernorm.weight": "model-00008-of-00008.safetensors",
|
| 336 |
+
"model.layers.33.self_attn.k_proj.bias": "model-00008-of-00008.safetensors",
|
| 337 |
+
"model.layers.33.self_attn.k_proj.weight": "model-00008-of-00008.safetensors",
|
| 338 |
+
"model.layers.33.self_attn.o_proj.weight": "model-00008-of-00008.safetensors",
|
| 339 |
+
"model.layers.33.self_attn.q_proj.bias": "model-00008-of-00008.safetensors",
|
| 340 |
+
"model.layers.33.self_attn.q_proj.weight": "model-00008-of-00008.safetensors",
|
| 341 |
+
"model.layers.33.self_attn.v_proj.bias": "model-00008-of-00008.safetensors",
|
| 342 |
+
"model.layers.33.self_attn.v_proj.weight": "model-00008-of-00008.safetensors",
|
| 343 |
+
"model.layers.34.input_layernorm.weight": "model-00008-of-00008.safetensors",
|
| 344 |
+
"model.layers.34.mlp.down_proj.weight": "model-00008-of-00008.safetensors",
|
| 345 |
+
"model.layers.34.mlp.gate_proj.weight": "model-00008-of-00008.safetensors",
|
| 346 |
+
"model.layers.34.mlp.up_proj.weight": "model-00008-of-00008.safetensors",
|
| 347 |
+
"model.layers.34.post_attention_layernorm.weight": "model-00008-of-00008.safetensors",
|
| 348 |
+
"model.layers.34.self_attn.k_proj.bias": "model-00008-of-00008.safetensors",
|
| 349 |
+
"model.layers.34.self_attn.k_proj.weight": "model-00008-of-00008.safetensors",
|
| 350 |
+
"model.layers.34.self_attn.o_proj.weight": "model-00008-of-00008.safetensors",
|
| 351 |
+
"model.layers.34.self_attn.q_proj.bias": "model-00008-of-00008.safetensors",
|
| 352 |
+
"model.layers.34.self_attn.q_proj.weight": "model-00008-of-00008.safetensors",
|
| 353 |
+
"model.layers.34.self_attn.v_proj.bias": "model-00008-of-00008.safetensors",
|
| 354 |
+
"model.layers.34.self_attn.v_proj.weight": "model-00008-of-00008.safetensors",
|
| 355 |
+
"model.layers.35.input_layernorm.weight": "model-00008-of-00008.safetensors",
|
| 356 |
+
"model.layers.35.mlp.down_proj.weight": "model-00008-of-00008.safetensors",
|
| 357 |
+
"model.layers.35.mlp.gate_proj.weight": "model-00008-of-00008.safetensors",
|
| 358 |
+
"model.layers.35.mlp.up_proj.weight": "model-00008-of-00008.safetensors",
|
| 359 |
+
"model.layers.35.post_attention_layernorm.weight": "model-00008-of-00008.safetensors",
|
| 360 |
+
"model.layers.35.self_attn.k_proj.bias": "model-00008-of-00008.safetensors",
|
| 361 |
+
"model.layers.35.self_attn.k_proj.weight": "model-00008-of-00008.safetensors",
|
| 362 |
+
"model.layers.35.self_attn.o_proj.weight": "model-00008-of-00008.safetensors",
|
| 363 |
+
"model.layers.35.self_attn.q_proj.bias": "model-00008-of-00008.safetensors",
|
| 364 |
+
"model.layers.35.self_attn.q_proj.weight": "model-00008-of-00008.safetensors",
|
| 365 |
+
"model.layers.35.self_attn.v_proj.bias": "model-00008-of-00008.safetensors",
|
| 366 |
+
"model.layers.35.self_attn.v_proj.weight": "model-00008-of-00008.safetensors",
|
| 367 |
+
"model.layers.4.input_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 368 |
+
"model.layers.4.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
|
| 369 |
+
"model.layers.4.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
|
| 370 |
+
"model.layers.4.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
|
| 371 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 372 |
+
"model.layers.4.self_attn.k_proj.bias": "model-00003-of-00008.safetensors",
|
| 373 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
|
| 374 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
|
| 375 |
+
"model.layers.4.self_attn.q_proj.bias": "model-00003-of-00008.safetensors",
|
| 376 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
|
| 377 |
+
"model.layers.4.self_attn.v_proj.bias": "model-00003-of-00008.safetensors",
|
| 378 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
|
| 379 |
+
"model.layers.5.input_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 380 |
+
"model.layers.5.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
|
| 381 |
+
"model.layers.5.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
|
| 382 |
+
"model.layers.5.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
|
| 383 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 384 |
+
"model.layers.5.self_attn.k_proj.bias": "model-00003-of-00008.safetensors",
|
| 385 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
|
| 386 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
|
| 387 |
+
"model.layers.5.self_attn.q_proj.bias": "model-00003-of-00008.safetensors",
|
| 388 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
|
| 389 |
+
"model.layers.5.self_attn.v_proj.bias": "model-00003-of-00008.safetensors",
|
| 390 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
|
| 391 |
+
"model.layers.6.input_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 392 |
+
"model.layers.6.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
|
| 393 |
+
"model.layers.6.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
|
| 394 |
+
"model.layers.6.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
|
| 395 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 396 |
+
"model.layers.6.self_attn.k_proj.bias": "model-00003-of-00008.safetensors",
|
| 397 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
|
| 398 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
|
| 399 |
+
"model.layers.6.self_attn.q_proj.bias": "model-00003-of-00008.safetensors",
|
| 400 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
|
| 401 |
+
"model.layers.6.self_attn.v_proj.bias": "model-00003-of-00008.safetensors",
|
| 402 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
|
| 403 |
+
"model.layers.7.input_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 404 |
+
"model.layers.7.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
|
| 405 |
+
"model.layers.7.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
|
| 406 |
+
"model.layers.7.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
|
| 407 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 408 |
+
"model.layers.7.self_attn.k_proj.bias": "model-00004-of-00008.safetensors",
|
| 409 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
|
| 410 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
|
| 411 |
+
"model.layers.7.self_attn.q_proj.bias": "model-00004-of-00008.safetensors",
|
| 412 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
|
| 413 |
+
"model.layers.7.self_attn.v_proj.bias": "model-00004-of-00008.safetensors",
|
| 414 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
|
| 415 |
+
"model.layers.8.input_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 416 |
+
"model.layers.8.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
|
| 417 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
|
| 418 |
+
"model.layers.8.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
|
| 419 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 420 |
+
"model.layers.8.self_attn.k_proj.bias": "model-00004-of-00008.safetensors",
|
| 421 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
|
| 422 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
|
| 423 |
+
"model.layers.8.self_attn.q_proj.bias": "model-00004-of-00008.safetensors",
|
| 424 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
|
| 425 |
+
"model.layers.8.self_attn.v_proj.bias": "model-00004-of-00008.safetensors",
|
| 426 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
|
| 427 |
+
"model.layers.9.input_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 428 |
+
"model.layers.9.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
|
| 429 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
|
| 430 |
+
"model.layers.9.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
|
| 431 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 432 |
+
"model.layers.9.self_attn.k_proj.bias": "model-00004-of-00008.safetensors",
|
| 433 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
|
| 434 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
|
| 435 |
+
"model.layers.9.self_attn.q_proj.bias": "model-00004-of-00008.safetensors",
|
| 436 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
|
| 437 |
+
"model.layers.9.self_attn.v_proj.bias": "model-00004-of-00008.safetensors",
|
| 438 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
|
| 439 |
+
"model.norm.weight": "model-00008-of-00008.safetensors",
|
| 440 |
+
"visual.blocks.0.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 441 |
+
"visual.blocks.0.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 442 |
+
"visual.blocks.0.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 443 |
+
"visual.blocks.0.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 444 |
+
"visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 445 |
+
"visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 446 |
+
"visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 447 |
+
"visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 448 |
+
"visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 449 |
+
"visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 450 |
+
"visual.blocks.0.norm1.weight": "model-00001-of-00008.safetensors",
|
| 451 |
+
"visual.blocks.0.norm2.weight": "model-00001-of-00008.safetensors",
|
| 452 |
+
"visual.blocks.1.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 453 |
+
"visual.blocks.1.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 454 |
+
"visual.blocks.1.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 455 |
+
"visual.blocks.1.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 456 |
+
"visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 457 |
+
"visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 458 |
+
"visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 459 |
+
"visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 460 |
+
"visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 461 |
+
"visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 462 |
+
"visual.blocks.1.norm1.weight": "model-00001-of-00008.safetensors",
|
| 463 |
+
"visual.blocks.1.norm2.weight": "model-00001-of-00008.safetensors",
|
| 464 |
+
"visual.blocks.10.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 465 |
+
"visual.blocks.10.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 466 |
+
"visual.blocks.10.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 467 |
+
"visual.blocks.10.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 468 |
+
"visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 469 |
+
"visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 470 |
+
"visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 471 |
+
"visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 472 |
+
"visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 473 |
+
"visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 474 |
+
"visual.blocks.10.norm1.weight": "model-00001-of-00008.safetensors",
|
| 475 |
+
"visual.blocks.10.norm2.weight": "model-00001-of-00008.safetensors",
|
| 476 |
+
"visual.blocks.11.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 477 |
+
"visual.blocks.11.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 478 |
+
"visual.blocks.11.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 479 |
+
"visual.blocks.11.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 480 |
+
"visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 481 |
+
"visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 482 |
+
"visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 483 |
+
"visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 484 |
+
"visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 485 |
+
"visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 486 |
+
"visual.blocks.11.norm1.weight": "model-00001-of-00008.safetensors",
|
| 487 |
+
"visual.blocks.11.norm2.weight": "model-00001-of-00008.safetensors",
|
| 488 |
+
"visual.blocks.12.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 489 |
+
"visual.blocks.12.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 490 |
+
"visual.blocks.12.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 491 |
+
"visual.blocks.12.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 492 |
+
"visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 493 |
+
"visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 494 |
+
"visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 495 |
+
"visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 496 |
+
"visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 497 |
+
"visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 498 |
+
"visual.blocks.12.norm1.weight": "model-00001-of-00008.safetensors",
|
| 499 |
+
"visual.blocks.12.norm2.weight": "model-00001-of-00008.safetensors",
|
| 500 |
+
"visual.blocks.13.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 501 |
+
"visual.blocks.13.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 502 |
+
"visual.blocks.13.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 503 |
+
"visual.blocks.13.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 504 |
+
"visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 505 |
+
"visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 506 |
+
"visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 507 |
+
"visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 508 |
+
"visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 509 |
+
"visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 510 |
+
"visual.blocks.13.norm1.weight": "model-00001-of-00008.safetensors",
|
| 511 |
+
"visual.blocks.13.norm2.weight": "model-00001-of-00008.safetensors",
|
| 512 |
+
"visual.blocks.14.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 513 |
+
"visual.blocks.14.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 514 |
+
"visual.blocks.14.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 515 |
+
"visual.blocks.14.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 516 |
+
"visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 517 |
+
"visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 518 |
+
"visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 519 |
+
"visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 520 |
+
"visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 521 |
+
"visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 522 |
+
"visual.blocks.14.norm1.weight": "model-00001-of-00008.safetensors",
|
| 523 |
+
"visual.blocks.14.norm2.weight": "model-00001-of-00008.safetensors",
|
| 524 |
+
"visual.blocks.15.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 525 |
+
"visual.blocks.15.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 526 |
+
"visual.blocks.15.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 527 |
+
"visual.blocks.15.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 528 |
+
"visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 529 |
+
"visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 530 |
+
"visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 531 |
+
"visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 532 |
+
"visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 533 |
+
"visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 534 |
+
"visual.blocks.15.norm1.weight": "model-00001-of-00008.safetensors",
|
| 535 |
+
"visual.blocks.15.norm2.weight": "model-00001-of-00008.safetensors",
|
| 536 |
+
"visual.blocks.16.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 537 |
+
"visual.blocks.16.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 538 |
+
"visual.blocks.16.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 539 |
+
"visual.blocks.16.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 540 |
+
"visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 541 |
+
"visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 542 |
+
"visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 543 |
+
"visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 544 |
+
"visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 545 |
+
"visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 546 |
+
"visual.blocks.16.norm1.weight": "model-00001-of-00008.safetensors",
|
| 547 |
+
"visual.blocks.16.norm2.weight": "model-00001-of-00008.safetensors",
|
| 548 |
+
"visual.blocks.17.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 549 |
+
"visual.blocks.17.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 550 |
+
"visual.blocks.17.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 551 |
+
"visual.blocks.17.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 552 |
+
"visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 553 |
+
"visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 554 |
+
"visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 555 |
+
"visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 556 |
+
"visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 557 |
+
"visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 558 |
+
"visual.blocks.17.norm1.weight": "model-00001-of-00008.safetensors",
|
| 559 |
+
"visual.blocks.17.norm2.weight": "model-00001-of-00008.safetensors",
|
| 560 |
+
"visual.blocks.18.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 561 |
+
"visual.blocks.18.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 562 |
+
"visual.blocks.18.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 563 |
+
"visual.blocks.18.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 564 |
+
"visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 565 |
+
"visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 566 |
+
"visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 567 |
+
"visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 568 |
+
"visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 569 |
+
"visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 570 |
+
"visual.blocks.18.norm1.weight": "model-00001-of-00008.safetensors",
|
| 571 |
+
"visual.blocks.18.norm2.weight": "model-00001-of-00008.safetensors",
|
| 572 |
+
"visual.blocks.19.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 573 |
+
"visual.blocks.19.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 574 |
+
"visual.blocks.19.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 575 |
+
"visual.blocks.19.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 576 |
+
"visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 577 |
+
"visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 578 |
+
"visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 579 |
+
"visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 580 |
+
"visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 581 |
+
"visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 582 |
+
"visual.blocks.19.norm1.weight": "model-00001-of-00008.safetensors",
|
| 583 |
+
"visual.blocks.19.norm2.weight": "model-00001-of-00008.safetensors",
|
| 584 |
+
"visual.blocks.2.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 585 |
+
"visual.blocks.2.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 586 |
+
"visual.blocks.2.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 587 |
+
"visual.blocks.2.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 588 |
+
"visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 589 |
+
"visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 590 |
+
"visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 591 |
+
"visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 592 |
+
"visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 593 |
+
"visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 594 |
+
"visual.blocks.2.norm1.weight": "model-00001-of-00008.safetensors",
|
| 595 |
+
"visual.blocks.2.norm2.weight": "model-00001-of-00008.safetensors",
|
| 596 |
+
"visual.blocks.20.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 597 |
+
"visual.blocks.20.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 598 |
+
"visual.blocks.20.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 599 |
+
"visual.blocks.20.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 600 |
+
"visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 601 |
+
"visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 602 |
+
"visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 603 |
+
"visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 604 |
+
"visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 605 |
+
"visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 606 |
+
"visual.blocks.20.norm1.weight": "model-00001-of-00008.safetensors",
|
| 607 |
+
"visual.blocks.20.norm2.weight": "model-00001-of-00008.safetensors",
|
| 608 |
+
"visual.blocks.21.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 609 |
+
"visual.blocks.21.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 610 |
+
"visual.blocks.21.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 611 |
+
"visual.blocks.21.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 612 |
+
"visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 613 |
+
"visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 614 |
+
"visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 615 |
+
"visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 616 |
+
"visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 617 |
+
"visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 618 |
+
"visual.blocks.21.norm1.weight": "model-00001-of-00008.safetensors",
|
| 619 |
+
"visual.blocks.21.norm2.weight": "model-00001-of-00008.safetensors",
|
| 620 |
+
"visual.blocks.22.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 621 |
+
"visual.blocks.22.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 622 |
+
"visual.blocks.22.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 623 |
+
"visual.blocks.22.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 624 |
+
"visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 625 |
+
"visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 626 |
+
"visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 627 |
+
"visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 628 |
+
"visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 629 |
+
"visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 630 |
+
"visual.blocks.22.norm1.weight": "model-00001-of-00008.safetensors",
|
| 631 |
+
"visual.blocks.22.norm2.weight": "model-00001-of-00008.safetensors",
|
| 632 |
+
"visual.blocks.23.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 633 |
+
"visual.blocks.23.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 634 |
+
"visual.blocks.23.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 635 |
+
"visual.blocks.23.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 636 |
+
"visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 637 |
+
"visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 638 |
+
"visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 639 |
+
"visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 640 |
+
"visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 641 |
+
"visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 642 |
+
"visual.blocks.23.norm1.weight": "model-00001-of-00008.safetensors",
|
| 643 |
+
"visual.blocks.23.norm2.weight": "model-00001-of-00008.safetensors",
|
| 644 |
+
"visual.blocks.24.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 645 |
+
"visual.blocks.24.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 646 |
+
"visual.blocks.24.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 647 |
+
"visual.blocks.24.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 648 |
+
"visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 649 |
+
"visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 650 |
+
"visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 651 |
+
"visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 652 |
+
"visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 653 |
+
"visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 654 |
+
"visual.blocks.24.norm1.weight": "model-00001-of-00008.safetensors",
|
| 655 |
+
"visual.blocks.24.norm2.weight": "model-00001-of-00008.safetensors",
|
| 656 |
+
"visual.blocks.25.attn.proj.bias": "model-00002-of-00008.safetensors",
|
| 657 |
+
"visual.blocks.25.attn.proj.weight": "model-00002-of-00008.safetensors",
|
| 658 |
+
"visual.blocks.25.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 659 |
+
"visual.blocks.25.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 660 |
+
"visual.blocks.25.mlp.down_proj.bias": "model-00002-of-00008.safetensors",
|
| 661 |
+
"visual.blocks.25.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
|
| 662 |
+
"visual.blocks.25.mlp.gate_proj.bias": "model-00002-of-00008.safetensors",
|
| 663 |
+
"visual.blocks.25.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
|
| 664 |
+
"visual.blocks.25.mlp.up_proj.bias": "model-00002-of-00008.safetensors",
|
| 665 |
+
"visual.blocks.25.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
|
| 666 |
+
"visual.blocks.25.norm1.weight": "model-00001-of-00008.safetensors",
|
| 667 |
+
"visual.blocks.25.norm2.weight": "model-00001-of-00008.safetensors",
|
| 668 |
+
"visual.blocks.26.attn.proj.bias": "model-00002-of-00008.safetensors",
|
| 669 |
+
"visual.blocks.26.attn.proj.weight": "model-00002-of-00008.safetensors",
|
| 670 |
+
"visual.blocks.26.attn.qkv.bias": "model-00002-of-00008.safetensors",
|
| 671 |
+
"visual.blocks.26.attn.qkv.weight": "model-00002-of-00008.safetensors",
|
| 672 |
+
"visual.blocks.26.mlp.down_proj.bias": "model-00002-of-00008.safetensors",
|
| 673 |
+
"visual.blocks.26.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
|
| 674 |
+
"visual.blocks.26.mlp.gate_proj.bias": "model-00002-of-00008.safetensors",
|
| 675 |
+
"visual.blocks.26.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
|
| 676 |
+
"visual.blocks.26.mlp.up_proj.bias": "model-00002-of-00008.safetensors",
|
| 677 |
+
"visual.blocks.26.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
|
| 678 |
+
"visual.blocks.26.norm1.weight": "model-00002-of-00008.safetensors",
|
| 679 |
+
"visual.blocks.26.norm2.weight": "model-00002-of-00008.safetensors",
|
| 680 |
+
"visual.blocks.27.attn.proj.bias": "model-00002-of-00008.safetensors",
|
| 681 |
+
"visual.blocks.27.attn.proj.weight": "model-00002-of-00008.safetensors",
|
| 682 |
+
"visual.blocks.27.attn.qkv.bias": "model-00002-of-00008.safetensors",
|
| 683 |
+
"visual.blocks.27.attn.qkv.weight": "model-00002-of-00008.safetensors",
|
| 684 |
+
"visual.blocks.27.mlp.down_proj.bias": "model-00002-of-00008.safetensors",
|
| 685 |
+
"visual.blocks.27.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
|
| 686 |
+
"visual.blocks.27.mlp.gate_proj.bias": "model-00002-of-00008.safetensors",
|
| 687 |
+
"visual.blocks.27.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
|
| 688 |
+
"visual.blocks.27.mlp.up_proj.bias": "model-00002-of-00008.safetensors",
|
| 689 |
+
"visual.blocks.27.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
|
| 690 |
+
"visual.blocks.27.norm1.weight": "model-00002-of-00008.safetensors",
|
| 691 |
+
"visual.blocks.27.norm2.weight": "model-00002-of-00008.safetensors",
|
| 692 |
+
"visual.blocks.28.attn.proj.bias": "model-00002-of-00008.safetensors",
|
| 693 |
+
"visual.blocks.28.attn.proj.weight": "model-00002-of-00008.safetensors",
|
| 694 |
+
"visual.blocks.28.attn.qkv.bias": "model-00002-of-00008.safetensors",
|
| 695 |
+
"visual.blocks.28.attn.qkv.weight": "model-00002-of-00008.safetensors",
|
| 696 |
+
"visual.blocks.28.mlp.down_proj.bias": "model-00002-of-00008.safetensors",
|
| 697 |
+
"visual.blocks.28.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
|
| 698 |
+
"visual.blocks.28.mlp.gate_proj.bias": "model-00002-of-00008.safetensors",
|
| 699 |
+
"visual.blocks.28.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
|
| 700 |
+
"visual.blocks.28.mlp.up_proj.bias": "model-00002-of-00008.safetensors",
|
| 701 |
+
"visual.blocks.28.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
|
| 702 |
+
"visual.blocks.28.norm1.weight": "model-00002-of-00008.safetensors",
|
| 703 |
+
"visual.blocks.28.norm2.weight": "model-00002-of-00008.safetensors",
|
| 704 |
+
"visual.blocks.29.attn.proj.bias": "model-00002-of-00008.safetensors",
|
| 705 |
+
"visual.blocks.29.attn.proj.weight": "model-00002-of-00008.safetensors",
|
| 706 |
+
"visual.blocks.29.attn.qkv.bias": "model-00002-of-00008.safetensors",
|
| 707 |
+
"visual.blocks.29.attn.qkv.weight": "model-00002-of-00008.safetensors",
|
| 708 |
+
"visual.blocks.29.mlp.down_proj.bias": "model-00002-of-00008.safetensors",
|
| 709 |
+
"visual.blocks.29.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
|
| 710 |
+
"visual.blocks.29.mlp.gate_proj.bias": "model-00002-of-00008.safetensors",
|
| 711 |
+
"visual.blocks.29.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
|
| 712 |
+
"visual.blocks.29.mlp.up_proj.bias": "model-00002-of-00008.safetensors",
|
| 713 |
+
"visual.blocks.29.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
|
| 714 |
+
"visual.blocks.29.norm1.weight": "model-00002-of-00008.safetensors",
|
| 715 |
+
"visual.blocks.29.norm2.weight": "model-00002-of-00008.safetensors",
|
| 716 |
+
"visual.blocks.3.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 717 |
+
"visual.blocks.3.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 718 |
+
"visual.blocks.3.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 719 |
+
"visual.blocks.3.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 720 |
+
"visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 721 |
+
"visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 722 |
+
"visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 723 |
+
"visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 724 |
+
"visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 725 |
+
"visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 726 |
+
"visual.blocks.3.norm1.weight": "model-00001-of-00008.safetensors",
|
| 727 |
+
"visual.blocks.3.norm2.weight": "model-00001-of-00008.safetensors",
|
| 728 |
+
"visual.blocks.30.attn.proj.bias": "model-00002-of-00008.safetensors",
|
| 729 |
+
"visual.blocks.30.attn.proj.weight": "model-00002-of-00008.safetensors",
|
| 730 |
+
"visual.blocks.30.attn.qkv.bias": "model-00002-of-00008.safetensors",
|
| 731 |
+
"visual.blocks.30.attn.qkv.weight": "model-00002-of-00008.safetensors",
|
| 732 |
+
"visual.blocks.30.mlp.down_proj.bias": "model-00002-of-00008.safetensors",
|
| 733 |
+
"visual.blocks.30.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
|
| 734 |
+
"visual.blocks.30.mlp.gate_proj.bias": "model-00002-of-00008.safetensors",
|
| 735 |
+
"visual.blocks.30.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
|
| 736 |
+
"visual.blocks.30.mlp.up_proj.bias": "model-00002-of-00008.safetensors",
|
| 737 |
+
"visual.blocks.30.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
|
| 738 |
+
"visual.blocks.30.norm1.weight": "model-00002-of-00008.safetensors",
|
| 739 |
+
"visual.blocks.30.norm2.weight": "model-00002-of-00008.safetensors",
|
| 740 |
+
"visual.blocks.31.attn.proj.bias": "model-00002-of-00008.safetensors",
|
| 741 |
+
"visual.blocks.31.attn.proj.weight": "model-00002-of-00008.safetensors",
|
| 742 |
+
"visual.blocks.31.attn.qkv.bias": "model-00002-of-00008.safetensors",
|
| 743 |
+
"visual.blocks.31.attn.qkv.weight": "model-00002-of-00008.safetensors",
|
| 744 |
+
"visual.blocks.31.mlp.down_proj.bias": "model-00002-of-00008.safetensors",
|
| 745 |
+
"visual.blocks.31.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
|
| 746 |
+
"visual.blocks.31.mlp.gate_proj.bias": "model-00002-of-00008.safetensors",
|
| 747 |
+
"visual.blocks.31.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
|
| 748 |
+
"visual.blocks.31.mlp.up_proj.bias": "model-00002-of-00008.safetensors",
|
| 749 |
+
"visual.blocks.31.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
|
| 750 |
+
"visual.blocks.31.norm1.weight": "model-00002-of-00008.safetensors",
|
| 751 |
+
"visual.blocks.31.norm2.weight": "model-00002-of-00008.safetensors",
|
| 752 |
+
"visual.blocks.4.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 753 |
+
"visual.blocks.4.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 754 |
+
"visual.blocks.4.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 755 |
+
"visual.blocks.4.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 756 |
+
"visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 757 |
+
"visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 758 |
+
"visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 759 |
+
"visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 760 |
+
"visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 761 |
+
"visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 762 |
+
"visual.blocks.4.norm1.weight": "model-00001-of-00008.safetensors",
|
| 763 |
+
"visual.blocks.4.norm2.weight": "model-00001-of-00008.safetensors",
|
| 764 |
+
"visual.blocks.5.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 765 |
+
"visual.blocks.5.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 766 |
+
"visual.blocks.5.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 767 |
+
"visual.blocks.5.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 768 |
+
"visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 769 |
+
"visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 770 |
+
"visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 771 |
+
"visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 772 |
+
"visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 773 |
+
"visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 774 |
+
"visual.blocks.5.norm1.weight": "model-00001-of-00008.safetensors",
|
| 775 |
+
"visual.blocks.5.norm2.weight": "model-00001-of-00008.safetensors",
|
| 776 |
+
"visual.blocks.6.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 777 |
+
"visual.blocks.6.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 778 |
+
"visual.blocks.6.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 779 |
+
"visual.blocks.6.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 780 |
+
"visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 781 |
+
"visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 782 |
+
"visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 783 |
+
"visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 784 |
+
"visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 785 |
+
"visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 786 |
+
"visual.blocks.6.norm1.weight": "model-00001-of-00008.safetensors",
|
| 787 |
+
"visual.blocks.6.norm2.weight": "model-00001-of-00008.safetensors",
|
| 788 |
+
"visual.blocks.7.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 789 |
+
"visual.blocks.7.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 790 |
+
"visual.blocks.7.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 791 |
+
"visual.blocks.7.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 792 |
+
"visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 793 |
+
"visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 794 |
+
"visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 795 |
+
"visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 796 |
+
"visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 797 |
+
"visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 798 |
+
"visual.blocks.7.norm1.weight": "model-00001-of-00008.safetensors",
|
| 799 |
+
"visual.blocks.7.norm2.weight": "model-00001-of-00008.safetensors",
|
| 800 |
+
"visual.blocks.8.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 801 |
+
"visual.blocks.8.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 802 |
+
"visual.blocks.8.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 803 |
+
"visual.blocks.8.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 804 |
+
"visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 805 |
+
"visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 806 |
+
"visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 807 |
+
"visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 808 |
+
"visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 809 |
+
"visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 810 |
+
"visual.blocks.8.norm1.weight": "model-00001-of-00008.safetensors",
|
| 811 |
+
"visual.blocks.8.norm2.weight": "model-00001-of-00008.safetensors",
|
| 812 |
+
"visual.blocks.9.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 813 |
+
"visual.blocks.9.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 814 |
+
"visual.blocks.9.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 815 |
+
"visual.blocks.9.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 816 |
+
"visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00008.safetensors",
|
| 817 |
+
"visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00008.safetensors",
|
| 818 |
+
"visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00008.safetensors",
|
| 819 |
+
"visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
|
| 820 |
+
"visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00008.safetensors",
|
| 821 |
+
"visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00008.safetensors",
|
| 822 |
+
"visual.blocks.9.norm1.weight": "model-00001-of-00008.safetensors",
|
| 823 |
+
"visual.blocks.9.norm2.weight": "model-00001-of-00008.safetensors",
|
| 824 |
+
"visual.merger.ln_q.weight": "model-00002-of-00008.safetensors",
|
| 825 |
+
"visual.merger.mlp.0.bias": "model-00002-of-00008.safetensors",
|
| 826 |
+
"visual.merger.mlp.0.weight": "model-00002-of-00008.safetensors",
|
| 827 |
+
"visual.merger.mlp.2.bias": "model-00002-of-00008.safetensors",
|
| 828 |
+
"visual.merger.mlp.2.weight": "model-00002-of-00008.safetensors",
|
| 829 |
+
"visual.patch_embed.proj.weight": "model-00001-of-00008.safetensors"
|
| 830 |
+
}
|
| 831 |
+
}
|
circulant_merged/preprocessor_config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_convert_rgb": true,
|
| 3 |
+
"do_normalize": true,
|
| 4 |
+
"do_rescale": true,
|
| 5 |
+
"do_resize": true,
|
| 6 |
+
"image_mean": [
|
| 7 |
+
0.48145466,
|
| 8 |
+
0.4578275,
|
| 9 |
+
0.40821073
|
| 10 |
+
],
|
| 11 |
+
"image_processor_type": "Qwen2VLImageProcessor",
|
| 12 |
+
"image_std": [
|
| 13 |
+
0.26862954,
|
| 14 |
+
0.26130258,
|
| 15 |
+
0.27577711
|
| 16 |
+
],
|
| 17 |
+
"max_pixels": 589824,
|
| 18 |
+
"merge_size": 2,
|
| 19 |
+
"min_pixels": 3136,
|
| 20 |
+
"patch_size": 14,
|
| 21 |
+
"processor_class": "Qwen2_5_VLProcessor",
|
| 22 |
+
"resample": 3,
|
| 23 |
+
"rescale_factor": 0.00392156862745098,
|
| 24 |
+
"size": {
|
| 25 |
+
"longest_edge": 12845056,
|
| 26 |
+
"shortest_edge": 3136
|
| 27 |
+
},
|
| 28 |
+
"temporal_patch_size": 2
|
| 29 |
+
}
|
circulant_merged/special_tokens_map.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|im_end|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": {
|
| 25 |
+
"content": "<|endoftext|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
+
}
|
circulant_merged/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
|
| 3 |
+
size 11421896
|
circulant_merged/tokenizer_config.json
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
}
|
| 181 |
+
},
|
| 182 |
+
"additional_special_tokens": [
|
| 183 |
+
"<|im_start|>",
|
| 184 |
+
"<|im_end|>",
|
| 185 |
+
"<|object_ref_start|>",
|
| 186 |
+
"<|object_ref_end|>",
|
| 187 |
+
"<|box_start|>",
|
| 188 |
+
"<|box_end|>",
|
| 189 |
+
"<|quad_start|>",
|
| 190 |
+
"<|quad_end|>",
|
| 191 |
+
"<|vision_start|>",
|
| 192 |
+
"<|vision_end|>",
|
| 193 |
+
"<|vision_pad|>",
|
| 194 |
+
"<|image_pad|>",
|
| 195 |
+
"<|video_pad|>"
|
| 196 |
+
],
|
| 197 |
+
"bos_token": null,
|
| 198 |
+
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
|
| 199 |
+
"clean_up_tokenization_spaces": false,
|
| 200 |
+
"eos_token": "<|im_end|>",
|
| 201 |
+
"errors": "replace",
|
| 202 |
+
"extra_special_tokens": {},
|
| 203 |
+
"model_max_length": 131072,
|
| 204 |
+
"pad_token": "<|endoftext|>",
|
| 205 |
+
"padding_side": "left",
|
| 206 |
+
"processor_class": "Qwen2_5_VLProcessor",
|
| 207 |
+
"split_special_tokens": false,
|
| 208 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 209 |
+
"unk_token": null
|
| 210 |
+
}
|
circulant_merged/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/eval_qwenvl.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/eval_vora.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval/eval_qwen_baseline.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Qwen2.5-7B Text-Only Baseline Evaluation
|
| 3 |
+
Computes perplexity on the same held-out caption data WITHOUT images.
|
| 4 |
+
This serves as a baseline: a pure text LLM shouldn't predict image captions well.
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
python eval/eval_qwen_baseline.py \
|
| 8 |
+
--model-path qwen_models/models--Qwen--Qwen2.5-7B-Instruct/snapshots/a09a35458c702b33eeacc393d103063234e8bc28 \
|
| 9 |
+
--eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import argparse
|
| 13 |
+
import json
|
| 14 |
+
import math
|
| 15 |
+
import os
|
| 16 |
+
import sys
|
| 17 |
+
|
| 18 |
+
import torch
|
| 19 |
+
from tqdm import tqdm
|
| 20 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 21 |
+
|
| 22 |
+
IGNORE_INDEX = -100
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def load_eval_data(eval_path, max_samples=None):
    """Load evaluation records from a JSON Lines file.

    Args:
        eval_path: Path to a file holding one JSON value per line.
        max_samples: Optional cap on the number of records to read. ``None``
            (or any falsy value such as ``0``) means "read everything".

    Returns:
        A list of the decoded records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # JSON Lines is UTF-8; do not depend on the platform's locale default.
    with open(eval_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. an extra newline at EOF) instead
                # of crashing in json.loads("").
                continue
            data.append(json.loads(line))
            if max_samples and len(data) >= max_samples:
                break
    print(f"Loaded {len(data)} evaluation samples")
    return data
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def build_text_only_batch(tokenizer, caption, device):
    """Assemble a one-sample batch for the text-only perplexity baseline.

    The prompt is a ChatML conversation (system turn, user turn, opening of
    the assistant turn) whose user message is the literal text
    "Describe this image." -- there is no image, so the instruction stands in
    for the image placeholder. Only the caption tokens and the closing
    ``<|im_end|>`` token are supervised; prompt positions are masked with
    ``IGNORE_INDEX``.

    Args:
        tokenizer: Object exposing ``encode`` and ``convert_tokens_to_ids``.
        caption: Reference caption text to be scored.
        device: Device the returned tensors are moved to.

    Returns:
        ``(batch, n_target_tokens)`` where ``batch`` maps ``input_ids``,
        ``attention_mask`` and ``labels`` to tensors of shape
        ``(1, sequence_length)`` and ``n_target_tokens`` is the number of
        caption tokens plus one for the end-of-turn token.
    """
    prompt = "".join([
        "<|im_start|>system\n",
        "You are a helpful assistant.",
        "<|im_end|>",
        "\n<|im_start|>user\n",
        "Describe this image.",
        "<|im_end|>\n<|im_start|>assistant\n",
    ])

    prompt_ids = tokenizer.encode(prompt)
    caption_ids = tokenizer.encode(caption)
    eos_id = tokenizer.convert_tokens_to_ids("<|im_end|>")

    # Supervised span: the caption followed by the end-of-turn marker.
    target_ids = caption_ids + [eos_id]
    full_ids = prompt_ids + target_ids
    labels = [IGNORE_INDEX] * len(prompt_ids) + target_ids

    def _as_row(values):
        # Wrap in an outer list to get a leading batch dimension of 1.
        return torch.tensor([values], dtype=torch.long).to(device)

    batch = {
        "input_ids": _as_row(full_ids),
        "attention_mask": torch.ones(1, len(full_ids), dtype=torch.long).to(device),
        "labels": _as_row(labels),
    }
    return batch, len(target_ids)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@torch.no_grad()
def evaluate_perplexity(model, tokenizer, eval_data, device):
    """Compute token-weighted perplexity of the text-only model on captions.

    Each sample is scored on its own; the per-sample mean loss reported by
    the model is multiplied back by the number of supervised tokens so the
    final average is taken over tokens rather than over samples.

    Args:
        model: Causal LM whose forward output exposes ``.loss``.
        tokenizer: Tokenizer passed through to ``build_text_only_batch``.
        eval_data: Sequence of records, each with a ``"text"`` caption.
        device: Device the input tensors are placed on.

    Returns:
        The perplexity as a float, or ``float("inf")`` when no sample could
        be scored.
    """
    model.eval()
    loss_sum = 0.0
    token_count = 0
    errors = 0

    progress = tqdm(eval_data, desc="Qwen Baseline Perplexity")
    for idx, item in enumerate(progress):
        caption = item["text"]
        try:
            batch, n_caption_tokens = build_text_only_batch(tokenizer, caption, device)
            sample_loss = model(**batch).loss
            loss_sum += sample_loss.item() * n_caption_tokens
            token_count += n_caption_tokens
        except Exception as e:
            # Best-effort evaluation: count the failure, show only the first
            # few messages, and move on to the next sample.
            errors += 1
            if errors <= 5:
                print(f" Error on sample {idx}: {e}")
            continue

    if token_count == 0:
        print("No valid samples!")
        return float("inf")

    avg_loss = loss_sum / token_count
    perplexity = math.exp(avg_loss)
    n_total = len(eval_data)
    print("\n=== Qwen2.5-7B Text-Only Baseline ===")
    print(f"Samples evaluated: {n_total - errors}/{n_total}")
    print(f"Errors: {errors}")
    print(f"Average cross-entropy loss: {avg_loss:.4f}")
    print(f"Perplexity: {perplexity:.2f}")
    return perplexity
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
@torch.no_grad()
def evaluate_caption(model, tokenizer, eval_data, device, max_new_tokens=256):
    """Generate captions without any image (text-only baseline) and score them.

    Args:
        model: Causal LM exposing ``generate``.
        tokenizer: Tokenizer exposing ``encode``, ``decode``,
            ``convert_tokens_to_ids`` and ``eos_token_id``.
        eval_data: Sequence of records, each with a ``"text"`` reference.
        device: Device the prompt tensors are placed on.
        max_new_tokens: Upper bound on generated tokens per sample.

    Returns:
        A dict of metric name -> value as produced by ``_compute_metrics``,
        or an empty dict when no sample could be generated.
    """
    model.eval()
    predictions = []
    references = []
    errors = 0

    system_start = "<|im_start|>system\n"
    system_message = "You are a helpful assistant."
    system_end = "<|im_end|>"
    user_start = "\n<|im_start|>user\n"
    user_end = "<|im_end|>\n<|im_start|>assistant\n"
    prompt = (system_start + system_message + system_end +
              user_start + "Describe this image." + user_end)
    prompt_ids = tokenizer.encode(prompt)
    eos_id = tokenizer.convert_tokens_to_ids("<|im_end|>")

    # The prompt does not depend on the sample, so build its tensors once
    # rather than on every iteration.
    # NOTE(review): with greedy decoding and a constant prompt every sample
    # receives the same generation; consider generating once and reusing it.
    input_ids = torch.tensor([prompt_ids], dtype=torch.long).to(device)
    attention_mask = torch.ones_like(input_ids)

    for i, item in enumerate(tqdm(eval_data, desc="Qwen Baseline Caption")):
        try:
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=eos_id,
            )
            # Drop the echoed prompt; keep only newly generated tokens.
            generated = outputs[0][len(prompt_ids):]
            text = tokenizer.decode(generated, skip_special_tokens=True)
            reference = item["text"]
        except Exception as e:
            # Report failures the same way the perplexity loop does instead
            # of discarding them silently.
            errors += 1
            if errors <= 5:
                print(f" Error on sample {i}: {e}")
            continue

        # Append both lists only after everything succeeded so predictions
        # and references can never get out of step.
        predictions.append(text)
        references.append(reference)

    if not predictions:
        print("No valid samples!")
        return {}

    metrics = _compute_metrics(predictions, references)
    print(f"\n=== Qwen Baseline Caption Results ===")
    print(f"Samples: {len(predictions)}/{len(eval_data)}")
    print(f"Errors: {errors}")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    print(f"\n--- Sample Outputs (first 3) ---")
    for i in range(min(3, len(predictions))):
        print(f"[{i}] Generated: {predictions[i][:200]}")
        print(f"[{i}] Reference: {references[i][:200]}")
        print()
    return metrics
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def _compute_metrics(predictions, references):
|
| 157 |
+
metrics = {}
|
| 158 |
+
try:
|
| 159 |
+
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
|
| 160 |
+
smooth = SmoothingFunction().method1
|
| 161 |
+
refs = [[ref.split()] for ref in references]
|
| 162 |
+
preds = [pred.split() for pred in predictions]
|
| 163 |
+
metrics["BLEU-1"] = corpus_bleu(refs, preds, weights=(1, 0, 0, 0), smoothing_function=smooth)
|
| 164 |
+
metrics["BLEU-4"] = corpus_bleu(refs, preds, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)
|
| 165 |
+
except ImportError:
|
| 166 |
+
pass
|
| 167 |
+
try:
|
| 168 |
+
from rouge_score import rouge_scorer
|
| 169 |
+
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
|
| 170 |
+
scores = [scorer.score(ref, pred)["rougeL"].fmeasure for pred, ref in zip(predictions, references)]
|
| 171 |
+
metrics["ROUGE-L"] = sum(scores) / len(scores)
|
| 172 |
+
except ImportError:
|
| 173 |
+
pass
|
| 174 |
+
return metrics
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def main():
    """Command-line entry point for the text-only Qwen baseline evaluation.

    Parses arguments, loads the model and tokenizer, runs the requested
    evaluation mode(s) and optionally writes the results to a JSON file.
    """
    parser = argparse.ArgumentParser(description="Qwen2.5-7B Text-Only Baseline")
    parser.add_argument("--mode", type=str, default="all",
                        choices=["perplexity", "caption", "all"])
    parser.add_argument("--model-path", type=str, required=True,
                        help="Path to Qwen2.5-7B-Instruct")
    parser.add_argument("--eval-data", type=str, required=True)
    parser.add_argument("--max-samples", type=int, default=None)
    parser.add_argument("--max-new-tokens", type=int, default=256)
    parser.add_argument("--dtype", type=str, default="float16",
                        choices=["float16", "bfloat16"])
    parser.add_argument("--output", type=str, default=None)
    args = parser.parse_args()

    dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16

    print(f"Loading Qwen2.5-7B from {args.model_path} ...")
    model = AutoModelForCausalLM.from_pretrained(
        args.model_path, torch_dtype=dtype, device_map="auto",
        trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    model.eval()
    device = next(model.parameters()).device
    print(f"Model loaded on {device}")

    eval_data = load_eval_data(args.eval_data, max_samples=args.max_samples)
    results = {"model": "Qwen2.5-7B-Instruct (text-only)", "num_samples": len(eval_data)}

    if args.mode in ("perplexity", "all"):
        ppl = evaluate_perplexity(model, tokenizer, eval_data, device)
        results["perplexity"] = ppl

    if args.mode in ("caption", "all"):
        caption_metrics = evaluate_caption(
            model, tokenizer, eval_data, device, max_new_tokens=args.max_new_tokens)
        results.update(caption_metrics)

    if args.output:
        # JSON has no Infinity/NaN literal. evaluate_perplexity returns
        # float("inf") when nothing could be scored, so map non-finite floats
        # to null to keep the output file valid JSON.
        serializable = {
            key: (None if isinstance(value, float) and not math.isfinite(value) else value)
            for key, value in results.items()
        }
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
        # ensure_ascii=False may emit non-ASCII text, so pin the encoding.
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(serializable, f, indent=2, ensure_ascii=False, allow_nan=False)
        print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()
|
eval/eval_qwen_vl.py
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Qwen2.5-VL-3B Evaluation Script
|
| 3 |
+
Evaluates the original Qwen2.5-VL-3B-Instruct (with vision) on held-out caption data.
|
| 4 |
+
Also supports evaluating LoRA / block-circulant finetuned versions if checkpoints exist.
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
# Original model
|
| 8 |
+
python eval/eval_qwen_vl.py --mode all \
|
| 9 |
+
--model-path Finetune-Qwen2.5-VL/Qwen2.5-VL-3B-Instruct \
|
| 10 |
+
--eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl
|
| 11 |
+
|
| 12 |
+
# With LoRA adapter
|
| 13 |
+
python eval/eval_qwen_vl.py --mode all \
|
| 14 |
+
--model-path Finetune-Qwen2.5-VL/Qwen2.5-VL-3B-Instruct \
|
| 15 |
+
--adapter-path Finetune-Qwen2.5-VL/saves/Qwen2.5-VL-3B-Instruct/lora \
|
| 16 |
+
--eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import argparse
|
| 20 |
+
import json
|
| 21 |
+
import math
|
| 22 |
+
import os
|
| 23 |
+
import sys
|
| 24 |
+
|
| 25 |
+
import torch
|
| 26 |
+
from PIL import Image
|
| 27 |
+
from tqdm import tqdm
|
| 28 |
+
from transformers import (
|
| 29 |
+
AutoModelForCausalLM,
|
| 30 |
+
AutoProcessor,
|
| 31 |
+
AutoTokenizer,
|
| 32 |
+
Qwen2VLForConditionalGeneration,
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
IGNORE_INDEX = -100
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ============================================================
|
| 39 |
+
# Data loading
|
| 40 |
+
# ============================================================
|
| 41 |
+
|
| 42 |
+
def load_eval_data(eval_path, max_samples=None):
    """Read evaluation samples from a JSON Lines file.

    Args:
        eval_path: Path to a file with one JSON value per line.
        max_samples: Optional limit on how many records are read. ``None``
            (or a falsy value such as ``0``) reads the whole file.

    Returns:
        The decoded records as a list, in the order they appear in the file.

    Raises:
        json.JSONDecodeError: If a non-blank line cannot be parsed as JSON.
    """
    data = []
    # Read as UTF-8 explicitly so the result does not vary with the locale.
    with open(eval_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines rather than failing in json.loads("").
                continue
            data.append(json.loads(line))
            if max_samples and len(data) >= max_samples:
                break
    print(f"Loaded {len(data)} evaluation samples")
    return data
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ============================================================
|
| 55 |
+
# Build inputs for Qwen2.5-VL
|
| 56 |
+
# ============================================================
|
| 57 |
+
|
| 58 |
+
def build_messages(image_path, caption=None):
    """Create the chat message list for captioning a single image.

    Args:
        image_path: Path to the image; it is made absolute and wrapped in a
            ``file://`` URI.
        caption: Optional ground-truth caption. When given (even if empty),
            it is appended as the assistant turn, which is what perplexity
            evaluation needs.

    Returns:
        A list of message dicts: system, user, and optionally assistant.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": "You are a helpful assistant."}],
    }
    image_uri = f"file://{os.path.abspath(image_path)}"
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image_uri},
            {"type": "text", "text": "Describe this image."},
        ],
    }
    messages = [system_turn, user_turn]

    if caption is None:
        # Generation mode: stop after the user turn.
        return messages

    messages.append({
        "role": "assistant",
        "content": [{"type": "text", "text": caption}],
    })
    return messages
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def prepare_perplexity_inputs(processor, image_path, caption, device):
    """Prepare model inputs (with labels) for scoring a reference caption.

    The conversation is rendered twice: once with the caption as assistant
    turn (the sequence that is scored) and once as a bare generation prompt.
    The token length of the latter marks where the caption starts, so all
    earlier positions are masked with ``IGNORE_INDEX``.

    Args:
        processor: Multimodal processor exposing ``apply_chat_template``,
            ``tokenizer`` and a ``__call__`` taking ``text`` and ``images``.
        image_path: Path of the image to caption.
        caption: Ground-truth caption to score.
        device: Device the processed tensors are moved to.

    Returns:
        ``(inputs, n_caption_tokens)`` where ``inputs`` is the processor
        output for the full conversation with a ``"labels"`` entry added and
        ``n_caption_tokens`` is the number of unmasked label positions.
    """
    # Full conversation, ground-truth caption included as assistant reply.
    messages_full = build_messages(image_path, caption=caption)
    text_full = processor.apply_chat_template(
        messages_full, tokenize=False, add_generation_prompt=False)

    # Prompt only (ending with the assistant header) to locate the caption.
    messages_prompt = build_messages(image_path, caption=None)
    text_prompt = processor.apply_chat_template(
        messages_prompt, tokenize=False, add_generation_prompt=True)

    # Close the file handle deterministically; convert() returns a copy.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    inputs_full = processor(
        text=[text_full], images=[image], padding=True, return_tensors="pt"
    ).to(device)
    # The prompt is processed with the image too, so that image placeholder
    # expansion is reflected in its token length.
    inputs_prompt = processor(
        text=[text_prompt], images=[image], padding=True, return_tensors="pt"
    ).to(device)

    input_ids = inputs_full["input_ids"]
    prompt_len = inputs_prompt["input_ids"].shape[1]
    labels = input_ids.clone()
    labels[:, :prompt_len] = IGNORE_INDEX

    # The chat template closes each turn with "<|im_end|>\n". Scoring that
    # trailing newline would add a near-free token the text-only baseline
    # does not count, so mask everything after the assistant's end-of-turn.
    eos_id = processor.tokenizer.convert_tokens_to_ids("<|im_end|>")
    if isinstance(eos_id, int):
        eos_positions = (input_ids[0] == eos_id).nonzero(as_tuple=True)[0]
        if eos_positions.numel() > 0:
            last_eos = int(eos_positions[-1].item())
            # Trim only when that marker belongs to the assistant turn.
            if last_eos >= prompt_len:
                labels[:, last_eos + 1:] = IGNORE_INDEX

    n_caption_tokens = int((labels != IGNORE_INDEX).sum().item())
    inputs_full["labels"] = labels

    return inputs_full, n_caption_tokens
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def prepare_generation_inputs(processor, image_path, device):
    """Build processor inputs that prompt the model to caption one image.

    Args:
        processor: Multimodal processor exposing ``apply_chat_template`` and
            a ``__call__`` taking ``text`` and ``images``.
        image_path: Path of the image to caption.
        device: Device the processed tensors are moved to.

    Returns:
        The processor output for the prompt, moved to ``device``.
    """
    # No caption: the rendered text ends with the assistant header.
    prompt_text = processor.apply_chat_template(
        build_messages(image_path, caption=None),
        tokenize=False,
        add_generation_prompt=True,
    )
    rgb_image = Image.open(image_path).convert("RGB")
    encoded = processor(
        text=[prompt_text], images=[rgb_image], padding=True, return_tensors="pt"
    )
    return encoded.to(device)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
# ============================================================
|
| 129 |
+
# Evaluation: Perplexity
|
| 130 |
+
# ============================================================
|
| 131 |
+
|
| 132 |
+
@torch.no_grad()
def evaluate_perplexity(model, processor, eval_data, device):
    """Compute token-weighted caption perplexity for the vision-language model.

    Samples whose image file is missing, or whose processing raises, are
    counted as errors and skipped.

    Args:
        model: Model whose forward output exposes ``.loss``.
        processor: Processor passed to ``prepare_perplexity_inputs``.
        eval_data: Sequence of records with ``"image"`` and ``"text"`` keys.
        device: Device the inputs are placed on.

    Returns:
        The perplexity as a float, or ``float("inf")`` when no sample could
        be scored.
    """
    model.eval()
    loss_sum = 0.0
    token_count = 0
    errors = 0

    for idx, item in enumerate(tqdm(eval_data, desc="Qwen-VL Perplexity")):
        image_path = item["image"]
        caption = item["text"]

        # Missing images are skipped without a message but still counted.
        if not os.path.exists(image_path):
            errors += 1
            continue

        try:
            inputs, n_tokens = prepare_perplexity_inputs(
                processor, image_path, caption, device)
            sample_loss = model(**inputs).loss
            # Undo the per-sample mean so the final average is per token.
            loss_sum += sample_loss.item() * n_tokens
            token_count += n_tokens
        except Exception as e:
            errors += 1
            if errors <= 5:
                print(f" Error on sample {idx}: {e}")
            continue

    if token_count == 0:
        print("No valid samples!")
        return float("inf")

    avg_loss = loss_sum / token_count
    perplexity = math.exp(avg_loss)
    n_total = len(eval_data)
    print("\n=== Qwen2.5-VL Perplexity Results ===")
    print(f"Samples: {n_total - errors}/{n_total}")
    print(f"Errors: {errors}")
    print(f"Average CE loss: {avg_loss:.4f}")
    print(f"Perplexity: {perplexity:.2f}")
    return perplexity
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
# ============================================================
|
| 175 |
+
# Evaluation: Caption Generation
|
| 176 |
+
# ============================================================
|
| 177 |
+
|
| 178 |
+
@torch.no_grad()
def evaluate_caption(model, processor, eval_data, device, max_new_tokens=256):
    """Generate a caption per image with greedy decoding and score the results.

    Args:
        model: Model exposing ``generate``.
        processor: Processor used to build inputs; its ``tokenizer`` decodes
            the generated ids.
        eval_data: Sequence of records with ``"image"`` and ``"text"`` keys.
        device: Device the inputs are placed on.
        max_new_tokens: Upper bound on generated tokens per sample.

    Returns:
        Dict of metric name -> value from ``_compute_metrics``, or an empty
        dict when no caption could be generated.
    """
    model.eval()
    predictions, references = [], []
    errors = 0

    for idx, item in enumerate(tqdm(eval_data, desc="Qwen-VL Caption")):
        image_path = item["image"]
        caption = item["text"]

        if not os.path.exists(image_path):
            errors += 1
            continue

        try:
            inputs = prepare_generation_inputs(processor, image_path, device)
            prompt_len = inputs["input_ids"].shape[1]
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )
            # Keep only what follows the prompt tokens.
            new_tokens = outputs[0][prompt_len:]
            decoded = processor.tokenizer.decode(new_tokens, skip_special_tokens=True)
            predictions.append(decoded)
            references.append(caption)
        except Exception as e:
            errors += 1
            if errors <= 5:
                print(f" Error on sample {idx}: {e}")
            continue

    if len(predictions) == 0:
        print("No valid samples!")
        return {}

    metrics = _compute_metrics(predictions, references)
    print("\n=== Qwen2.5-VL Caption Results ===")
    print(f"Samples: {len(predictions)}/{len(eval_data)}")
    print(f"Errors: {errors}")
    for name, value in metrics.items():
        print(f"{name}: {value:.4f}")

    print("\n--- Sample Outputs (first 5) ---")
    for shown, (generated_text, reference_text) in enumerate(
            zip(predictions[:5], references[:5])):
        print(f"[{shown}] Generated: {generated_text[:200]}")
        print(f"[{shown}] Reference: {reference_text[:200]}")
        print()

    return metrics
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def _compute_metrics(predictions, references):
|
| 234 |
+
metrics = {}
|
| 235 |
+
try:
|
| 236 |
+
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
|
| 237 |
+
smooth = SmoothingFunction().method1
|
| 238 |
+
refs = [[ref.split()] for ref in references]
|
| 239 |
+
preds = [pred.split() for pred in predictions]
|
| 240 |
+
metrics["BLEU-1"] = corpus_bleu(refs, preds, weights=(1, 0, 0, 0), smoothing_function=smooth)
|
| 241 |
+
metrics["BLEU-4"] = corpus_bleu(refs, preds, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)
|
| 242 |
+
except ImportError:
|
| 243 |
+
print("Warning: nltk not installed. pip install nltk")
|
| 244 |
+
try:
|
| 245 |
+
from rouge_score import rouge_scorer
|
| 246 |
+
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
|
| 247 |
+
scores = [scorer.score(ref, pred)["rougeL"].fmeasure for pred, ref in zip(predictions, references)]
|
| 248 |
+
metrics["ROUGE-L"] = sum(scores) / len(scores)
|
| 249 |
+
except ImportError:
|
| 250 |
+
print("Warning: rouge-score not installed. pip install rouge-score")
|
| 251 |
+
return metrics
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
# ============================================================
|
| 255 |
+
# Model loading
|
| 256 |
+
# ============================================================
|
| 257 |
+
|
| 258 |
+
def load_model(model_path, adapter_path=None, dtype=torch.float16):
    """Load the vision-language model, its processor and an optional adapter.

    The model class is chosen from the checkpoint's ``model_type``: a
    Qwen2.5-VL checkpoint is loaded with the Qwen2.5-VL class when the
    installed ``transformers`` provides it. Otherwise the previous behavior
    is kept: try the Qwen2-VL class, then ``AutoModelForCausalLM``.

    Args:
        model_path: Path or hub id of the base model.
        adapter_path: Optional path to a PEFT adapter. If it exists, it is
            loaded and merged into the base weights; if it is given but
            missing, a warning is printed and the base model is used.
        dtype: Torch dtype the weights are loaded in.

    Returns:
        ``(model, processor)`` with the model in eval mode.
    """
    print(f"Loading Qwen2.5-VL from {model_path} ...")
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

    load_kwargs = dict(torch_dtype=dtype, device_map="auto", trust_remote_code=True)

    # Read the architecture from the checkpoint rather than guessing.
    model_type = None
    try:
        from transformers import AutoConfig
        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
        model_type = getattr(config, "model_type", None)
    except Exception as e:
        print(f"Warning: could not read model config ({e}); probing model classes instead.")

    model = None
    if model_type == "qwen2_5_vl":
        try:
            from transformers import Qwen2_5_VLForConditionalGeneration
        except ImportError:
            print("Warning: this transformers version lacks "
                  "Qwen2_5_VLForConditionalGeneration; falling back.")
        else:
            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                model_path, **load_kwargs)

    if model is None:
        # Original probing order: Qwen2-VL class first, then the Auto class.
        try:
            model = Qwen2VLForConditionalGeneration.from_pretrained(
                model_path, **load_kwargs)
        except Exception:
            model = AutoModelForCausalLM.from_pretrained(
                model_path, **load_kwargs)

    # Load and merge the adapter if one was provided.
    if adapter_path:
        if os.path.exists(adapter_path):
            print(f"Loading adapter from {adapter_path} ...")
            from peft import PeftModel
            model = PeftModel.from_pretrained(model, adapter_path)
            model = model.merge_and_unload()
            print("Adapter merged.")
        else:
            # Keep the best-effort behavior, but do not hide that the
            # evaluation runs on the base model.
            print(f"Warning: adapter path {adapter_path} does not exist; "
                  "evaluating the base model without an adapter.")

    model.eval()
    device = next(model.parameters()).device
    print(f"Model loaded on {device}")
    return model, processor
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
# ============================================================
|
| 293 |
+
# Main
|
| 294 |
+
# ============================================================
|
| 295 |
+
|
| 296 |
+
def main():
    """Command-line entry point for the Qwen2.5-VL evaluation.

    Parses arguments, loads the model (optionally with an adapter), runs the
    requested evaluation mode(s) and optionally writes the results to a
    JSON file.
    """
    parser = argparse.ArgumentParser(description="Qwen2.5-VL-3B Evaluation")
    parser.add_argument("--mode", type=str, default="all",
                        choices=["perplexity", "caption", "all"])
    parser.add_argument("--model-path", type=str, required=True,
                        help="Path to Qwen2.5-VL-3B-Instruct")
    parser.add_argument("--adapter-path", type=str, default=None,
                        help="Path to LoRA/circulant adapter (optional)")
    parser.add_argument("--eval-data", type=str, required=True,
                        help="Path to eval_qwenvl.jsonl")
    parser.add_argument("--max-samples", type=int, default=None)
    parser.add_argument("--max-new-tokens", type=int, default=256)
    parser.add_argument("--dtype", type=str, default="float16",
                        choices=["float16", "bfloat16"])
    parser.add_argument("--output", type=str, default=None)
    args = parser.parse_args()

    dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
    model, processor = load_model(args.model_path, args.adapter_path, dtype)
    device = next(model.parameters()).device

    eval_data = load_eval_data(args.eval_data, max_samples=args.max_samples)

    model_name = "Qwen2.5-VL-3B"
    # load_model only applies an adapter whose path exists, so label the
    # results with the adapter only in that case. normpath drops a trailing
    # slash that would otherwise make basename() return "".
    if args.adapter_path and os.path.exists(args.adapter_path):
        model_name += f" + {os.path.basename(os.path.normpath(args.adapter_path))}"
    results = {"model": model_name, "num_samples": len(eval_data)}

    if args.mode in ("perplexity", "all"):
        ppl = evaluate_perplexity(model, processor, eval_data, device)
        results["perplexity"] = ppl

    if args.mode in ("caption", "all"):
        metrics = evaluate_caption(
            model, processor, eval_data, device, max_new_tokens=args.max_new_tokens)
        results.update(metrics)

    if args.output:
        # JSON has no Infinity/NaN literal. evaluate_perplexity returns
        # float("inf") when nothing could be scored, so map non-finite floats
        # to null to keep the output file valid JSON.
        serializable = {
            key: (None if isinstance(value, float) and not math.isfinite(value) else value)
            for key, value in results.items()
        }
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
        # ensure_ascii=False may emit non-ASCII text, so pin the encoding.
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(serializable, f, indent=2, ensure_ascii=False, allow_nan=False)
        print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()
|
eval/eval_vora.py
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoRA Evaluation Script
|
| 3 |
+
- Perplexity (cross-entropy loss) on held-out caption data
|
| 4 |
+
- Caption generation with BLEU / ROUGE-L metrics
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
# Perplexity evaluation
|
| 8 |
+
python eval/eval_vora.py --mode perplexity \
|
| 9 |
+
--checkpoint output/pretrain_I30M_T6M/checkpoint-250 \
|
| 10 |
+
--eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl \
|
| 11 |
+
--image-processor qwen_models/models--apple--aimv2-huge-patch14-448/snapshots/f723839533d3bbdc969f541c864789f531ec0e5c
|
| 12 |
+
|
| 13 |
+
# Caption generation evaluation
|
| 14 |
+
python eval/eval_vora.py --mode caption \
|
| 15 |
+
--checkpoint output/pretrain_I30M_T6M/checkpoint-250 \
|
| 16 |
+
--eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl \
|
| 17 |
+
--image-processor qwen_models/models--apple--aimv2-huge-patch14-448/snapshots/f723839533d3bbdc969f541c864789f531ec0e5c
|
| 18 |
+
|
| 19 |
+
# Both
|
| 20 |
+
python eval/eval_vora.py --mode all \
|
| 21 |
+
--checkpoint output/pretrain_I30M_T6M/checkpoint-250 \
|
| 22 |
+
--eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl \
|
| 23 |
+
--image-processor qwen_models/models--apple--aimv2-huge-patch14-448/snapshots/f723839533d3bbdc969f541c864789f531ec0e5c
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
import argparse
|
| 27 |
+
import json
|
| 28 |
+
import math
|
| 29 |
+
import os
|
| 30 |
+
import sys
|
| 31 |
+
|
| 32 |
+
import torch
|
| 33 |
+
import torch.nn.functional as F
|
| 34 |
+
from PIL import Image
|
| 35 |
+
from tqdm import tqdm
|
| 36 |
+
from transformers import AutoImageProcessor, AutoTokenizer
|
| 37 |
+
|
| 38 |
+
# Add project root to path
|
| 39 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 40 |
+
|
| 41 |
+
from models.modeling_vora import VoRAForCausalLM, VoRAConfig
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# ============================================================
|
| 45 |
+
# Image preprocessing (same as training pipeline)
|
| 46 |
+
# ============================================================
|
| 47 |
+
|
| 48 |
+
def expand2square(pil_img):
    """Pad a PIL image with black borders so that it becomes square.

    The picture is centred along its shorter axis. An image that is already
    square is returned as-is (the same object, no copy is made).

    Args:
        pil_img: Image object exposing ``size`` and ``mode``.

    Returns:
        A square image whose side equals the longer side of the input.
    """
    w, h = pil_img.size
    if w == h:
        return pil_img

    side = max(w, h)
    # Exactly one offset is non-zero: only the shorter axis needs centring.
    offset = ((side - w) // 2, (side - h) // 2)
    canvas = Image.new(pil_img.mode, (side, side), (0, 0, 0))
    canvas.paste(pil_img, offset)
    return canvas
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def load_and_process_image(image_path, image_processor):
    """Load an image from disk and turn it into model-ready pixel values.

    The image is decoded as RGB, padded to a square with ``expand2square`` and
    passed through the Hugging Face image processor.

    Args:
        image_path: Path of the image file.
        image_processor: Callable image processor; it is invoked with
            ``return_tensors="pt"`` and must return a mapping that contains
            ``"pixel_values"``.

    Returns:
        The ``pixel_values`` entry produced by the processor (the original
        author's note gives the shape as (1, 3, 448, 448) - depends on the
        processor configuration).
    """
    # Image.open is lazy and keeps the file handle open. The context manager
    # releases it deterministically; convert() yields an independent in-memory
    # image, so it stays usable after the file is closed.
    with Image.open(image_path) as raw:
        img = raw.convert("RGB")
    img = expand2square(img)
    pixel_values = image_processor(img, return_tensors="pt")["pixel_values"]
    return pixel_values
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# ============================================================
|
| 73 |
+
# Text processing (same prompt template as training)
|
| 74 |
+
# ============================================================
|
| 75 |
+
|
| 76 |
+
# Sentinel id placed in the token sequence where the image belongs; it is also
# handed to the model as "vision_placeholder_index".
IMAGE_TOKEN_INDEX = -200
# Label value assigned to prompt positions when building perplexity batches.
IGNORE_INDEX = -100


def build_prompt_ids(tokenizer, has_image=True):
    """Return the token ids of the system + user turn used for captioning.

    With ``has_image`` the user turn consists solely of the
    ``IMAGE_TOKEN_INDEX`` placeholder; otherwise it is the fixed text
    "Describe this image.". In both cases the ids end with the opening of the
    assistant turn.

    Args:
        tokenizer: Object whose ``encode(str)`` returns a list of ids.
        has_image: Whether to insert the image placeholder.

    Returns:
        List of ids (ints).
    """
    head = ("<|im_start|>system\n" + "You are a helpful assistant." + "<|im_end|>"
            + "\n<|im_start|>user\n")
    tail = "<|im_end|>\n<|im_start|>assistant\n"

    if not has_image:
        return tokenizer.encode(head + "Describe this image." + tail)

    # Head and tail are tokenised separately so the placeholder sits between them.
    head_ids = tokenizer.encode(head)
    tail_ids = tokenizer.encode(tail)
    return head_ids + [IMAGE_TOKEN_INDEX] + tail_ids
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def build_perplexity_batch(tokenizer, image_path, caption, image_processor, device):
    """Assemble a single-sample batch, including labels, for loss evaluation.

    Args:
        tokenizer: Tokenizer providing ``encode`` and ``convert_tokens_to_ids``.
        image_path: Path of the image file.
        caption: Reference caption text.
        image_processor: Image processor forwarded to ``load_and_process_image``.
        device: Device the tensors are moved to.

    Returns:
        ``(batch, n_target_tokens)``: the keyword arguments for the model and
        the number of labelled positions (caption tokens plus the closing
        ``<|im_end|>``).
    """
    prefix = build_prompt_ids(tokenizer, has_image=True)
    target = tokenizer.encode(caption) + [tokenizer.convert_tokens_to_ids("<|im_end|>")]

    sequence = prefix + target
    # Prompt positions carry IGNORE_INDEX; only the caption and its end marker
    # keep their real ids as labels.
    supervision = [IGNORE_INDEX] * len(prefix) + target

    frames = load_and_process_image(image_path, image_processor)

    batch = {
        "input_ids": torch.tensor([sequence], dtype=torch.long).to(device),
        "attention_mask": torch.ones(1, len(sequence), dtype=torch.long).to(device),
        "labels": torch.tensor([supervision], dtype=torch.long).to(device),
        "frames": frames.to(device),
        "n_frames": [1],
        "vision_placeholder_index": IMAGE_TOKEN_INDEX,
    }
    return batch, len(target)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def build_generation_batch(tokenizer, image_path, image_processor, device):
    """Assemble a single-sample batch (prompt only, no labels) for generation.

    Args:
        tokenizer: Tokenizer forwarded to ``build_prompt_ids``.
        image_path: Path of the image file.
        image_processor: Image processor forwarded to ``load_and_process_image``.
        device: Device the tensors are moved to.

    Returns:
        Dict with ``input_ids``, ``attention_mask``, ``frames``, ``n_frames``
        and ``vision_placeholder_index``.
    """
    ids = torch.tensor([build_prompt_ids(tokenizer, has_image=True)], dtype=torch.long)
    frames = load_and_process_image(image_path, image_processor)

    return {
        "input_ids": ids.to(device),
        # One attention entry per prompt id; every position is attended.
        "attention_mask": torch.ones(1, ids.shape[1], dtype=torch.long).to(device),
        "frames": frames.to(device),
        "n_frames": [1],
        "vision_placeholder_index": IMAGE_TOKEN_INDEX,
    }
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
# ============================================================
|
| 144 |
+
# Load evaluation data
|
| 145 |
+
# ============================================================
|
| 146 |
+
|
| 147 |
+
def load_eval_data(eval_path, max_samples=None):
    """Read evaluation samples from a JSON Lines file.

    Every non-blank line is parsed as one JSON value. The original author's
    note gives the expected record format as {"image": path, "text": caption}.

    Args:
        eval_path: Path of the .jsonl file.
        max_samples: Stop after this many records. A falsy value (None or 0)
            reads the whole file.

    Returns:
        List of parsed records in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # JSONL is UTF-8 by convention; do not depend on the platform default encoding.
    with open(eval_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) are not records
                # and would otherwise make json.loads raise.
                continue
            data.append(json.loads(line))
            if max_samples and len(data) >= max_samples:
                break
    print(f"Loaded {len(data)} evaluation samples")
    return data
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
# ============================================================
|
| 161 |
+
# Evaluation: Perplexity
|
| 162 |
+
# ============================================================
|
| 163 |
+
|
| 164 |
+
@torch.no_grad()
def evaluate_perplexity(model, tokenizer, image_processor, eval_data, device):
    """Compute caption perplexity over ``eval_data``.

    Each sample's loss is weighted by its number of labelled tokens, the
    weighted mean is taken over all samples, and the result is exponentiated.
    Samples whose image file is missing, or whose processing raises, are
    counted as errors and skipped (only the first few exceptions are printed).

    Args:
        model: Model called as ``model(**batch)``; the output must expose ``.loss``.
        tokenizer: Tokenizer forwarded to ``build_perplexity_batch``.
        image_processor: Image processor forwarded to ``build_perplexity_batch``.
        eval_data: Iterable of records with ``"image"`` and ``"text"`` keys.
        device: Device the batch tensors are moved to.

    Returns:
        The perplexity as a float, or ``float("inf")`` when no sample could be
        evaluated.
    """
    model.eval()
    loss_sum = 0.0
    token_count = 0
    errors = 0

    for idx, sample in enumerate(tqdm(eval_data, desc="Perplexity")):
        image_path, caption = sample["image"], sample["text"]

        if not os.path.exists(image_path):
            errors += 1
            continue

        try:
            batch, n_tokens = build_perplexity_batch(
                tokenizer, image_path, caption, image_processor, device)
            sample_loss = model(**batch).loss.item()
            loss_sum += sample_loss * n_tokens
            token_count += n_tokens
        except Exception as e:
            errors += 1
            # Cap the noise: only the first five failures are reported.
            if errors <= 5:
                print(f" Error on sample {idx}: {e}")

    if token_count == 0:
        print("No valid samples for perplexity!")
        return float("inf")

    mean_loss = loss_sum / token_count
    ppl = math.exp(mean_loss)
    report = (
        f"\n=== Perplexity Results ===",
        f"Samples evaluated: {len(eval_data) - errors}/{len(eval_data)}",
        f"Errors: {errors}",
        f"Average cross-entropy loss: {mean_loss:.4f}",
        f"Perplexity: {ppl:.2f}",
    )
    for row in report:
        print(row)
    return ppl
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
# ============================================================
|
| 210 |
+
# Evaluation: Caption Generation
|
| 211 |
+
# ============================================================
|
| 212 |
+
|
| 213 |
+
@torch.no_grad()
def evaluate_caption(model, tokenizer, image_processor, eval_data, device,
                     max_new_tokens=256):
    """Generate a caption per sample and score the results against references.

    Decoding is greedy (``do_sample=False``) and stops at ``<|im_end|>``.
    Samples whose image file is missing, or whose processing raises, are
    counted as errors and skipped (only the first few exceptions are printed).

    Args:
        model: Model exposing ``generate(batch, ...)``.
        tokenizer: Tokenizer used for the stop id, the pad id and decoding.
        image_processor: Image processor forwarded to ``build_generation_batch``.
        eval_data: Iterable of records with ``"image"`` and ``"text"`` keys.
        device: Device the batch tensors are moved to.
        max_new_tokens: Generation length limit.

    Returns:
        Dict of metric name -> score as returned by ``compute_caption_metrics``,
        or ``{}`` when no sample could be evaluated.
    """
    model.eval()
    generated = []
    expected = []
    errors = 0

    stop_id = tokenizer.convert_tokens_to_ids("<|im_end|>")

    for idx, sample in enumerate(tqdm(eval_data, desc="Caption Generation")):
        image_path, caption = sample["image"], sample["text"]

        if not os.path.exists(image_path):
            errors += 1
            continue

        try:
            batch = build_generation_batch(tokenizer, image_path, image_processor, device)
            outputs = model.generate(
                batch,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=stop_id,
            )
            text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            generated.append(text)
            expected.append(caption)
        except Exception as e:
            errors += 1
            # Cap the noise: only the first five failures are reported.
            if errors <= 5:
                print(f" Error on sample {idx}: {e}")

    if not generated:
        print("No valid samples for caption evaluation!")
        return {}

    metrics = compute_caption_metrics(generated, expected)

    print(f"\n=== Caption Generation Results ===")
    print(f"Samples evaluated: {len(generated)}/{len(eval_data)}")
    print(f"Errors: {errors}")
    for metric_name, score in metrics.items():
        print(f"{metric_name}: {score:.4f}")

    # Show up to five prediction/reference pairs, each truncated to 200 characters.
    print(f"\n--- Sample Outputs (first 5) ---")
    for n, (hyp, ref) in enumerate(zip(generated[:5], expected[:5])):
        print(f"[{n}] Generated: {hyp[:200]}")
        print(f"[{n}] Reference: {ref[:200]}")
        print()

    return metrics
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
def compute_caption_metrics(predictions, references):
    """Compute BLEU-1, BLEU-4 and ROUGE-L for generated captions.

    BLEU is computed at corpus level on whitespace tokens with nltk
    (smoothing method 1); ROUGE-L is the mean per-sample F-measure from
    ``rouge_score`` (with stemming). A metric whose package is not installed
    is skipped with a warning instead of failing.

    Args:
        predictions: List of generated caption strings.
        references: List of reference caption strings, aligned with
            ``predictions``.

    Returns:
        Dict with any of the keys ``"BLEU-1"``, ``"BLEU-4"``, ``"ROUGE-L"``.
        Empty when there is nothing to score or no metric package is available.
    """
    metrics = {}

    # Nothing to score: the ROUGE average below would divide by zero and a
    # corpus-level BLEU over no sentences is undefined, so return the same
    # empty result the caller uses for "no valid samples".
    if not predictions or not references:
        return metrics

    # BLEU
    try:
        from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
        smooth = SmoothingFunction().method1
        # corpus_bleu expects a list of reference lists per hypothesis.
        refs_tokenized = [[ref.split()] for ref in references]
        preds_tokenized = [pred.split() for pred in predictions]

        metrics["BLEU-1"] = corpus_bleu(refs_tokenized, preds_tokenized,
                                        weights=(1, 0, 0, 0),
                                        smoothing_function=smooth)
        metrics["BLEU-4"] = corpus_bleu(refs_tokenized, preds_tokenized,
                                        weights=(0.25, 0.25, 0.25, 0.25),
                                        smoothing_function=smooth)
    except ImportError:
        print("Warning: nltk not installed, skipping BLEU. Install with: pip install nltk")

    # ROUGE-L
    try:
        from rouge_score import rouge_scorer
        scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
        # The reference is passed first, the prediction second.
        rouge_scores = [scorer.score(ref, pred)["rougeL"].fmeasure
                        for pred, ref in zip(predictions, references)]
        metrics["ROUGE-L"] = sum(rouge_scores) / len(rouge_scores)
    except ImportError:
        print("Warning: rouge_score not installed, skipping ROUGE-L. Install with: pip install rouge-score")

    return metrics
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
# ============================================================
|
| 309 |
+
# Model loading
|
| 310 |
+
# ============================================================
|
| 311 |
+
|
| 312 |
+
def load_vora_model(checkpoint_path, device_map="auto", dtype=torch.float16,
                    max_memory_per_gpu="22GiB"):
    """Build a VoRA model from its config and load weights from a checkpoint.

    Args:
        checkpoint_path: Directory holding the config and the checkpoint weights.
        device_map: ``"auto"`` spreads the model over all visible GPUs when
            more than one is available; any other value (or a single/no GPU)
            places the whole model on one device.
        dtype: Parameter dtype the model is cast to after loading.
        max_memory_per_gpu: Memory budget per GPU used when computing the
            multi-GPU device map. Defaults to the previously hard-coded
            "22GiB"; lower it for smaller cards or raise it for larger ones.

    Returns:
        The model in eval mode.
    """
    print(f"Loading VoRA model from {checkpoint_path} ...")
    config = VoRAConfig.from_pretrained(checkpoint_path)

    # Disable aux_vision for inference (not needed)
    config.aux_vision = ""

    model = VoRAForCausalLM(config)
    model.debug_max_steps = 0  # Disable debug prints

    # Load checkpoint weights. strict=False tolerates key mismatches, which
    # are reported below rather than raised.
    from tools.merge_lora import partial_load_from_checkpoints
    state_dict = partial_load_from_checkpoints(checkpoint_path)
    msg = model.load_state_dict(state_dict, strict=False)
    print(f"Load state dict: missing={len(msg.missing_keys)}, unexpected={len(msg.unexpected_keys)}")
    if msg.missing_keys:
        print(f" Missing keys (first 5): {msg.missing_keys[:5]}")

    model = model.to(dtype=dtype)

    gpu_count = torch.cuda.device_count()
    if device_map == "auto" and gpu_count > 1:
        from accelerate import dispatch_model, infer_auto_device_map
        device_map_computed = infer_auto_device_map(model, max_memory={
            i: max_memory_per_gpu for i in range(gpu_count)
        })
        model = dispatch_model(model, device_map=device_map_computed)
        print(f"Model dispatched across {gpu_count} GPUs")
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        print(f"Model on {device}")

    model.eval()
    return model
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def load_merged_vora_model(merged_path, device_map="auto", dtype=torch.float16):
    """Load a merged (LoRA-free) VoRA model via ``from_pretrained``.

    Args:
        merged_path: Directory of the merged model.
        device_map: Forwarded to ``from_pretrained`` as ``device_map``.
        dtype: Forwarded to ``from_pretrained`` as ``torch_dtype``.

    Returns:
        The model in eval mode with ``debug_max_steps`` set to 0.
    """
    print(f"Loading merged VoRA model from {merged_path} ...")
    load_options = dict(
        torch_dtype=dtype,
        device_map=device_map,
        trust_remote_code=True,
    )
    merged = VoRAForCausalLM.from_pretrained(merged_path, **load_options)
    merged.debug_max_steps = 0
    merged.eval()
    return merged
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
# ============================================================
|
| 364 |
+
# Main
|
| 365 |
+
# ============================================================
|
| 366 |
+
|
| 367 |
+
def _json_safe(value):
    """Map non-finite floats (inf / NaN) to None so the saved file is strict JSON."""
    if isinstance(value, float) and not math.isfinite(value):
        return None
    return value


def main():
    """Command-line entry point: load a model, run the evaluations, save results.

    Depending on ``--mode`` this runs the perplexity evaluation, the caption
    generation evaluation, or both, and optionally writes the collected
    results to the path given by ``--output``.

    Returns:
        Dict with ``"checkpoint"``, ``"num_samples"`` and whichever metrics
        were computed.
    """
    parser = argparse.ArgumentParser(description="VoRA Evaluation")
    parser.add_argument("--mode", type=str, default="all",
                        choices=["perplexity", "caption", "all"])
    parser.add_argument("--checkpoint", type=str, required=True,
                        help="Path to VoRA checkpoint or merged model directory")
    parser.add_argument("--merged", action="store_true",
                        help="If set, load as merged model (no LoRA)")
    parser.add_argument("--eval-data", type=str, required=True,
                        help="Path to eval_qwenvl.jsonl")
    parser.add_argument("--image-processor", type=str, required=True,
                        help="Path to AIMv2 model for image preprocessing")
    parser.add_argument("--max-samples", type=int, default=None,
                        help="Max number of eval samples (default: all)")
    parser.add_argument("--max-new-tokens", type=int, default=256,
                        help="Max new tokens for caption generation")
    parser.add_argument("--dtype", type=str, default="float16",
                        choices=["float16", "bfloat16"])
    parser.add_argument("--output", type=str, default=None,
                        help="Path to save results JSON")
    args = parser.parse_args()

    dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16

    # Load model
    if args.merged:
        model = load_merged_vora_model(args.checkpoint, dtype=dtype)
    else:
        model = load_vora_model(args.checkpoint, dtype=dtype)

    # Inputs are placed on the device of the model's first parameter.
    device = next(model.parameters()).device

    # Load tokenizer and image processor
    tokenizer = model.tokenizer
    image_processor = AutoImageProcessor.from_pretrained(args.image_processor)

    # Load eval data
    eval_data = load_eval_data(args.eval_data, max_samples=args.max_samples)

    results = {"checkpoint": args.checkpoint, "num_samples": len(eval_data)}

    # Run evaluations
    if args.mode in ("perplexity", "all"):
        ppl = evaluate_perplexity(model, tokenizer, image_processor, eval_data, device)
        results["perplexity"] = ppl

    if args.mode in ("caption", "all"):
        caption_metrics = evaluate_caption(
            model, tokenizer, image_processor, eval_data, device,
            max_new_tokens=args.max_new_tokens)
        results.update(caption_metrics)

    # Save results
    if args.output:
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
        # evaluate_perplexity returns float("inf") when nothing could be
        # scored; json would emit the non-standard token Infinity for it, so
        # non-finite values are stored as null. allow_nan=False guards
        # against any that slip through.
        serializable = {key: _json_safe(val) for key, val in results.items()}
        # ensure_ascii=False writes raw Unicode, so the encoding is pinned to UTF-8.
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(serializable, f, indent=2, ensure_ascii=False, allow_nan=False)
        print(f"\nResults saved to {args.output}")

    return results


if __name__ == "__main__":
    main()
|
eval/run_eval.sh
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# =============================================================
|
| 3 |
+
# VoRA Evaluation Runner (打包版)
|
| 4 |
+
# =============================================================
|
| 5 |
+
#
|
| 6 |
+
# ========== 使用教程 ==========
|
| 7 |
+
#
|
| 8 |
+
# 1. 把 eval_pack/ 整个文件夹传到目标机器
|
| 9 |
+
#
|
| 10 |
+
# 2. 创建 conda 环境 (一次性):
|
| 11 |
+
# conda create -n eval python=3.10 -y
|
| 12 |
+
# conda activate eval
|
| 13 |
+
# cd eval_pack
|
| 14 |
+
# pip install -r requirements_eval.txt
|
| 15 |
+
#
|
| 16 |
+
# 3. 修改下方「路径配置」部分:
|
| 17 |
+
# - QWEN_MODEL: Qwen2.5-7B-Instruct 路径
|
| 18 |
+
# - QWEN_VL_MODEL: Qwen2.5-VL-3B-Instruct 路径
|
| 19 |
+
# - AIMV2_PATH: aimv2-huge-patch14-448 路径
|
| 20 |
+
# - VORA_CIRC_MERGED: VoRA-Circulant merged 模型路径
|
| 21 |
+
#
|
| 22 |
+
# 4. 修改 data/eval_qwenvl.jsonl 中的图片绝对路径:
|
| 23 |
+
# sed -i 's|/share/home/jcdl1lsy2clx/VoRA/data_dir/VoRA-Recap-29M/frames|/你的/frames路径|g' data/eval_qwenvl.jsonl
|
| 24 |
+
#
|
| 25 |
+
# data/eval_vora.jsonl 用的是相对路径 (frames/xxx.jpg),
|
| 26 |
+
# 确保从 eval_pack/ 目录能访问到 frames/ 文件夹:
|
| 27 |
+
# ln -s /你的/frames路径 frames
|
| 28 |
+
#
|
| 29 |
+
# 5. 运行:
|
| 30 |
+
# conda activate eval
|
| 31 |
+
# cd eval_pack
|
| 32 |
+
# bash eval/run_eval.sh
|
| 33 |
+
#
|
| 34 |
+
# 6. 结果会打印表格并保存到 eval/results/*.json
|
| 35 |
+
#
|
| 36 |
+
# ========== 目录结构 ==========
|
| 37 |
+
#
|
| 38 |
+
# eval_pack/
|
| 39 |
+
# ├── eval/ <- 评测脚本
|
| 40 |
+
# ├── models/ <- VoRA 模型代码
|
| 41 |
+
# ├── tools/ <- merge_lora.py
|
| 42 |
+
# ├── generation_files/ <- tokenizer/processor
|
| 43 |
+
# ├── data/
|
| 44 |
+
# │ ├── eval_vora.jsonl <- VoRA 系列用 (相对路径)
|
| 45 |
+
# │ └── eval_qwenvl.jsonl <- Qwen-VL 系列用 (绝对路径)
|
| 46 |
+
# ├── vora_merged_250/ <- VoRA merged (~32G)
|
| 47 |
+
# ├── lora_merged/ <- Qwen-VL+LoRA merged (~7G)
|
| 48 |
+
# ├── circulant_merged/ <- Qwen-VL+Circulant merged (~7G)
|
| 49 |
+
# └── requirements_eval.txt
|
| 50 |
+
#
|
| 51 |
+
# =============================================================
|
| 52 |
+
|
| 53 |
+
# Stop at the first failing command, and run everything from the package root
# (the parent of the directory that contains this script).
set -e
cd "$(dirname "$0")/.."

# ---------- Path configuration (edit for your environment) ----------
# Baseline models (expected to already exist on the target machine)
QWEN_MODEL="/path/to/Qwen2.5-7B-Instruct"
QWEN_VL_MODEL="/path/to/Qwen2.5-VL-3B-Instruct"
AIMV2_PATH="/path/to/aimv2-huge-patch14-448"

# Evaluation data
EVAL_DATA_VORA="data/eval_vora.jsonl"     # VoRA models (relative paths: frames/xxx.jpg)
EVAL_DATA_QWEN="data/eval_qwenvl.jsonl"   # Qwen-VL models (absolute paths; adjust per machine)

DTYPE="float16"
RESULTS_DIR="eval/results"

# Already-merged models
VORA_MERGED="vora_merged_250"
VORA_CIRC_MERGED="/path/to/vora-circulant-merged"
LORA_MERGED="lora_merged"
CIRC_MERGED="circulant_merged"
# ---------------------------------------------------------------------

mkdir -p "$RESULTS_DIR"

# Best-effort installation of the metric dependencies: failures are ignored on
# purpose, and the evaluation scripts skip a metric whose package is missing.
pip install nltk rouge-score qwen-vl-utils 2>/dev/null || true
python -c "import nltk; nltk.download('punkt', quiet=True); nltk.download('punkt_tab', quiet=True)" 2>/dev/null || true
|
| 80 |
+
|
| 81 |
+
# Print a blank line followed by a framed step title.
banner() {
    echo ""
    echo "=============================================="
    echo "$1"
    echo "=============================================="
}

# Options shared by every evaluation run below.
COMMON_ARGS=(--max-samples 200 --max-new-tokens 256 --dtype "$DTYPE")

banner " Step 1: Qwen2.5-7B Text-Only Baseline"
python eval/eval_qwen_baseline.py \
    --mode all \
    --model-path "$QWEN_MODEL" \
    --eval-data "$EVAL_DATA_QWEN" \
    "${COMMON_ARGS[@]}" \
    --output "$RESULTS_DIR/qwen_baseline.json"

banner " Step 2: VoRA Full Eval"
python eval/eval_vora.py \
    --mode all \
    --checkpoint "$VORA_MERGED" \
    --merged \
    --eval-data "$EVAL_DATA_VORA" \
    --image-processor "$AIMV2_PATH" \
    "${COMMON_ARGS[@]}" \
    --output "$RESULTS_DIR/vora_best.json"

banner " Step 3: VoRA-Circulant Full Eval"
python eval/eval_vora.py \
    --mode all \
    --checkpoint "$VORA_CIRC_MERGED" \
    --merged \
    --eval-data "$EVAL_DATA_VORA" \
    --image-processor "$AIMV2_PATH" \
    "${COMMON_ARGS[@]}" \
    --output "$RESULTS_DIR/vora_circulant.json"

banner " Step 4: Qwen2.5-VL-3B Original Baseline"
python eval/eval_qwen_vl.py \
    --mode all \
    --model-path "$QWEN_VL_MODEL" \
    --eval-data "$EVAL_DATA_QWEN" \
    "${COMMON_ARGS[@]}" \
    --output "$RESULTS_DIR/qwen_vl_original.json"

# Steps 5 and 6 are optional: they run only when the merged model directory exists.
banner " Step 5: Qwen2.5-VL-3B + LoRA"
if [ -d "$LORA_MERGED" ]; then
    python eval/eval_qwen_vl.py \
        --mode all \
        --model-path "$LORA_MERGED" \
        --eval-data "$EVAL_DATA_QWEN" \
        "${COMMON_ARGS[@]}" \
        --output "$RESULTS_DIR/qwen_vl_lora.json"
else
    echo "LoRA merged model not found at $LORA_MERGED, skipping."
fi

banner " Step 6: Qwen2.5-VL-3B + Block-Circulant"
if [ -d "$CIRC_MERGED" ]; then
    python eval/eval_qwen_vl.py \
        --mode all \
        --model-path "$CIRC_MERGED" \
        --eval-data "$EVAL_DATA_QWEN" \
        "${COMMON_ARGS[@]}" \
        --output "$RESULTS_DIR/qwen_vl_circulant.json"
else
    echo "Block-Circulant merged model not found at $CIRC_MERGED, skipping."
fi
|
| 170 |
+
|
| 171 |
+
echo ""
echo "=============================================="
echo " Summary"
echo "=============================================="
# Print one table row per result file that exists. The Python snippet sits in a
# double-quoted shell string, so $RESULTS_DIR is expanded by the shell before
# Python runs and literal double quotes must be escaped.
python -c "
import json, os, math

results_dir = '$RESULTS_DIR'

print('='*70)
print(f'{\"Model\":<35} {\"Loss\":>10} {\"PPL\":>8} {\"BLEU-4\":>8} {\"ROUGE-L\":>8}')
print('-'*70)

def fmt(value, spec):
    # A metric counts as missing only when it is absent or null. A real score
    # of 0.0 is still printed instead of being replaced by a dash.
    return format(value, spec) if isinstance(value, (int, float)) else '-'

def print_row(name, filepath):
    if not os.path.exists(filepath):
        return
    with open(filepath, encoding='utf-8') as fh:
        r = json.load(fh)
    ppl = r.get('perplexity')
    # The loss is recovered as log(perplexity), defined only for finite positive values.
    has_ppl = isinstance(ppl, (int, float)) and math.isfinite(ppl) and ppl > 0
    loss = math.log(ppl) if has_ppl else None
    loss_s = fmt(loss, '.4f')
    ppl_s = fmt(ppl, '.2f')
    b4_s = fmt(r.get('BLEU-4'), '.4f')
    rl_s = fmt(r.get('ROUGE-L'), '.4f')
    print(f'{name:<35} {loss_s:>10} {ppl_s:>8} {b4_s:>8} {rl_s:>8}')

print_row('Qwen2.5-7B (text-only)', os.path.join(results_dir, 'qwen_baseline.json'))
print_row('VoRA', os.path.join(results_dir, 'vora_best.json'))
print_row('VoRA-Circulant', os.path.join(results_dir, 'vora_circulant.json'))

print('-'*70)

print_row('Qwen2.5-VL-3B (original)', os.path.join(results_dir, 'qwen_vl_original.json'))
print_row('Qwen2.5-VL-3B + LoRA', os.path.join(results_dir, 'qwen_vl_lora.json'))
print_row('Qwen2.5-VL-3B + Circulant', os.path.join(results_dir, 'qwen_vl_circulant.json'))

print('='*70)
"

echo ""
# The dollar sign was escaped here, which printed the literal variable name
# instead of the results directory.
echo "All results saved to $RESULTS_DIR/"
echo "Done!"
|
generation_files/added_tokens.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</tool_call>": 151658,
|
| 3 |
+
"<tool_call>": 151657,
|
| 4 |
+
"<|box_end|>": 151649,
|
| 5 |
+
"<|box_start|>": 151648,
|
| 6 |
+
"<|endoftext|>": 151643,
|
| 7 |
+
"<|file_sep|>": 151664,
|
| 8 |
+
"<|fim_middle|>": 151660,
|
| 9 |
+
"<|fim_pad|>": 151662,
|
| 10 |
+
"<|fim_prefix|>": 151659,
|
| 11 |
+
"<|fim_suffix|>": 151661,
|
| 12 |
+
"<|im_end|>": 151645,
|
| 13 |
+
"<|im_start|>": 151644,
|
| 14 |
+
"<|image_pad|>": 151655,
|
| 15 |
+
"<|object_ref_end|>": 151647,
|
| 16 |
+
"<|object_ref_start|>": 151646,
|
| 17 |
+
"<|quad_end|>": 151651,
|
| 18 |
+
"<|quad_start|>": 151650,
|
| 19 |
+
"<|repo_name|>": 151663,
|
| 20 |
+
"<|video_pad|>": 151656,
|
| 21 |
+
"<|vision_end|>": 151653,
|
| 22 |
+
"<|vision_pad|>": 151654,
|
| 23 |
+
"<|vision_start|>": 151652
|
| 24 |
+
}
|
generation_files/chat_template.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% set text_string = namespace(value='') %}{% for content in message['content'] %}{% if 'text' in content %}{% set text_string.value = content['text'] %}{% endif %}{% endfor %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}{% if '<image>' not in text_string.value %}<image>{% endif %}{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
|
| 3 |
+
}
|
generation_files/generation_config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 151643,
|
| 3 |
+
"pad_token_id": 151643,
|
| 4 |
+
"do_sample": true,
|
| 5 |
+
"eos_token_id": [
|
| 6 |
+
151645,
|
| 7 |
+
151643
|
| 8 |
+
],
|
| 9 |
+
"repetition_penalty": 1.05,
|
| 10 |
+
"temperature": 0.7,
|
| 11 |
+
"top_p": 0.8,
|
| 12 |
+
"top_k": 20,
|
| 13 |
+
"transformers_version": "4.37.0"
|
| 14 |
+
}
|
generation_files/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
generation_files/preprocessor_config.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"crop_size": 224,
|
| 3 |
+
"do_center_crop": true,
|
| 4 |
+
"do_normalize": true,
|
| 5 |
+
"do_resize": true,
|
| 6 |
+
"feature_extractor_type": "CLIPFeatureExtractor",
|
| 7 |
+
"image_mean": [
|
| 8 |
+
0.48145466,
|
| 9 |
+
0.4578275,
|
| 10 |
+
0.40821073
|
| 11 |
+
],
|
| 12 |
+
"image_std": [
|
| 13 |
+
0.26862954,
|
| 14 |
+
0.26130258,
|
| 15 |
+
0.27577711
|
| 16 |
+
],
|
| 17 |
+
"resample": 3,
|
| 18 |
+
"size": 224
|
| 19 |
+
}
|
generation_files/processing_vora.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
|
| 3 |
+
from typing import List, Union
|
| 4 |
+
from PIL import Image
|
| 5 |
+
|
| 6 |
+
from transformers.feature_extraction_utils import BatchFeature
|
| 7 |
+
from transformers.image_utils import ImageInput
|
| 8 |
+
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
|
| 9 |
+
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
| 10 |
+
|
| 11 |
+
from .modeling_vora import VoRAForCausalLM
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class VoRAProcessorKwargs(ProcessingKwargs, total=False):
    """Keyword-argument schema merged by the VoRA processor's ``__call__``.

    Only defaults are customised here: text is not padded unless the caller
    asks for it, and no image-processor defaults are set.
    """

    _defaults = dict(
        text_kwargs=dict(padding=False),
        images_kwargs=dict(),
    )
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class VoRAProcesser(ProcessorMixin):
    """Processor bundling an image processor and a tokenizer for VoRA.

    Text is tokenized chunk by chunk around every occurrence of
    ``image_token``; each occurrence is replaced in ``input_ids`` by the
    sentinel id ``image_token_index``. Images are padded to a square canvas
    before being handed to the image processor.

    Args:
        image_processor: Image processor instance forwarded to ``ProcessorMixin``.
        tokenizer: Tokenizer instance forwarded to ``ProcessorMixin``.
        chat_template: Optional chat template forwarded to ``ProcessorMixin``.
        image_token: Placeholder string marking an image position in the prompt.
        image_token_index: Integer id written into ``input_ids`` for each
            placeholder (default ``-200``).
        **kwargs: Accepted for config compatibility and ignored.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = [
        "chat_template",
        "image_token",
    ]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        image_token="<image>",  # set the default and let users change if they have peculiar special tokens in rare cases
        image_token_index = -200,
        **kwargs,
    ):
        self.image_token = image_token
        self.image_token_index = image_token_index
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        **kwargs: Unpack[VoRAProcessorKwargs],
    ):
        """Build model inputs from images and/or prompt text.

        Args:
            images: Per-sample image lists. Each sample is indexed with ``[0]``,
                so only the first image of every sample is used.
            text: A prompt string or a list/tuple of prompt strings.
            **kwargs: Extra processing kwargs merged with the class defaults.

        Returns:
            BatchFeature with ``input_ids`` and ``attention_mask`` when text is
            given, and ``frames``, ``n_frames`` and
            ``vision_placeholder_index`` when images are given.

        Raises:
            ValueError: If neither input is given, or ``text`` is not a string
                or a list/tuple of strings.
        """
        if images is None and text is None:
            raise ValueError("You have to specify at least one of `images` or `text`.")

        images, text = _validate_images_text_input_order(images, text)
        output_kwargs = self._merge_kwargs(
            VoRAProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        image_inputs = {}
        if images is not None:
            # Keep the first image of every sample, padded to a square canvas.
            images = [[self.expand2square(image[0])] for image in images]
            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
            # Rename to the keys the rest of this repo uses. These are only
            # attached when images exist; the original unconditional
            # `pop('pixel_values')` raised KeyError for text-only calls.
            image_inputs['frames'] = image_inputs.pop('pixel_values')
            image_inputs['n_frames'] = [len(_images) for _images in images]
            # NOTE(review): the placeholder index is now emitted only alongside
            # images — confirm the model does not need it for text-only input.
            image_inputs['vision_placeholder_index'] = self.image_token_index

        text_inputs = {}
        if text is not None:
            if isinstance(text, str):
                text = [text]
            elif not isinstance(text, (list, tuple)) or not all(isinstance(t, str) for t in text):
                # The original test used `and`, which let lists of non-strings
                # through and crashed on `text[0]` for non-indexable input.
                raise ValueError("Invalid input text. Please provide a string, or a list of strings")

            input_ids = [self.tokenizer_vision_placeholder(t) for t in text]
            attention_mask = [[1] * len(ids) for ids in input_ids]

            # No padding is applied here, so the nested lists are converted
            # as-is; prompts in one batch presumably need equal token counts.
            text_inputs = dict(
                input_ids=torch.as_tensor(input_ids, dtype=torch.int64),
                attention_mask=torch.as_tensor(attention_mask, dtype=torch.int64),
            )

        return BatchFeature(data={**text_inputs, **image_inputs})

    def expand2square(self, pil_img: Image.Image):
        """Return ``pil_img`` centred on a black square whose side is its longer edge.

        A square image is returned unchanged (same object).
        """
        background_color = (0, 0, 0)
        width, height = pil_img.size
        if width == height:
            return pil_img
        elif width > height:
            result = Image.new(pil_img.mode, (width, width), background_color)
            result.paste(pil_img, (0, (width - height) // 2))
            return result
        else:
            result = Image.new(pil_img.mode, (height, height), background_color)
            result.paste(pil_img, ((height - width) // 2, 0))
            return result

    def tokenizer_vision_placeholder(self, prompt, add_bos=False):
        """Tokenize ``prompt``, inserting ``image_token_index`` at each image placeholder.

        Args:
            prompt: Prompt string, possibly containing ``self.image_token``.
            add_bos: If true, prepend ``self.tokenizer.bos_token_id``.

        Returns:
            A flat list of token ids.
        """
        def join_lists(*lists, sep):
            result = []
            for i, lst in enumerate(lists):
                # Compare against None rather than truthiness so that a
                # placeholder id of 0 is still inserted.
                if i > 0 and sep is not None:
                    result.append(sep)
                result.extend(lst)
            return result

        prompt_chunks = [self.tokenizer.encode(
            chunk) for chunk in prompt.split(self.image_token)]
        input_ids = join_lists(*prompt_chunks, sep=self.image_token_index)
        if add_bos:
            input_ids = [self.tokenizer.bos_token_id] + input_ids

        return input_ids
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
if __name__ == '__main__':
    # Manual smoke test: load processor and model from a local checkpoint,
    # run one image-description turn, and print the decoded output.
    import torch
    from transformers import AutoProcessor, AutoModelForCausalLM

    checkpoint = "/mnt/bn/wh-data/open_source/models/VoRA-7B-Instruct"
    load_options = dict(trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(checkpoint, **load_options)
    model = AutoModelForCausalLM.from_pretrained(checkpoint, **load_options)

    image_part = {
        "type": "image",
        "url": "/mnt/bn/wh-data/data/datasets/a_demo/frames/35.jpg",
    }
    text_part = {
        "type": "text",
        "text": "<image> Describe this image.",
    }
    chat = [{"role": "user", "content": [image_part, text_part]}]

    batch = processor.apply_chat_template(
        chat,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors='pt',
        return_dict=True,
    )
    batch = batch.to(model.device)

    generation_options = dict(
        max_new_tokens=1024,
        pad_token_id=processor.tokenizer.eos_token_id,
    )

    with torch.inference_mode():
        # The whole batch object is passed positionally, as in the original.
        generated = model.generate(batch, **generation_options)
        decoded = processor.tokenizer.batch_decode(generated, skip_special_tokens=True)
        print(decoded)
|
generation_files/processor_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"image_token": "<image>",
|
| 3 |
+
"image_token_index": -200,
|
| 4 |
+
"processor_class": "VoRAProcessing",
|
| 5 |
+
"auto_map": {"AutoProcessor": "processing_vora.VoRAProcesser"}
|
| 6 |
+
}
|
generation_files/special_tokens_map.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|im_end|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": {
|
| 25 |
+
"content": "<|endoftext|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
+
}
|
generation_files/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
|
| 3 |
+
size 11421896
|
generation_files/tokenizer_config.json
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
}
|
| 181 |
+
},
|
| 182 |
+
"additional_special_tokens": [
|
| 183 |
+
"<|im_start|>",
|
| 184 |
+
"<|im_end|>",
|
| 185 |
+
"<|object_ref_start|>",
|
| 186 |
+
"<|object_ref_end|>",
|
| 187 |
+
"<|box_start|>",
|
| 188 |
+
"<|box_end|>",
|
| 189 |
+
"<|quad_start|>",
|
| 190 |
+
"<|quad_end|>",
|
| 191 |
+
"<|vision_start|>",
|
| 192 |
+
"<|vision_end|>",
|
| 193 |
+
"<|vision_pad|>",
|
| 194 |
+
"<|image_pad|>",
|
| 195 |
+
"<|video_pad|>"
|
| 196 |
+
],
|
| 197 |
+
"bos_token": null,
|
| 198 |
+
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
|
| 199 |
+
"clean_up_tokenization_spaces": false,
|
| 200 |
+
"eos_token": "<|im_end|>",
|
| 201 |
+
"errors": "replace",
|
| 202 |
+
"extra_special_tokens": {},
|
| 203 |
+
"model_max_length": 131072,
|
| 204 |
+
"pad_token": "<|endoftext|>",
|
| 205 |
+
"processor_class": "VoRAProcessing",
|
| 206 |
+
"split_special_tokens": false,
|
| 207 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 208 |
+
"unk_token": null
|
| 209 |
+
}
|
generation_files/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
generation_files/vora_generation_utils.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Dict, Optional
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from transformers import GenerationMixin
|
| 5 |
+
from transformers.cache_utils import Cache
|
| 6 |
+
from transformers.utils import ModelOutput
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class VoraGenerationMixin(GenerationMixin):
    """``GenerationMixin`` variant that carries a 4D additive attention mask through generation.

    The 4D mask is treated as additive: ``0`` marks an allowed query/key pair
    and ``torch.finfo(dtype).min`` a blocked one. Masks that are ``None`` or
    not 4D are delegated to the parent class unchanged.
    """

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ):
        """Prepare model inputs, keeping a 4D ``attention_mask`` intact.

        For a 4D mask, the parent implementation is called with a 2D mask
        derived from it (a key counts as valid if any query may attend to it),
        then the original 4D mask is put back into the returned inputs.
        """
        if attention_mask is None or attention_mask.ndim != 4:
            return super().prepare_inputs_for_generation(
                input_ids,
                past_key_values=past_key_values,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                cache_position=cache_position,
                **kwargs,
            )

        # (bs, tgt_len): 1 where at least one query row has a 0 for that key.
        attention_mask_2d = (attention_mask[:, 0, :, :] == 0).any(dim=1).long()
        model_input = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask_2d,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            **kwargs,
        )
        model_input['attention_mask'] = attention_mask
        return model_input

    def _update_model_kwargs_for_generation(
        self,
        outputs: ModelOutput,
        model_kwargs: Dict[str, Any],
        is_encoder_decoder: bool = False,
        num_new_tokens: int = 1,
    ) -> Dict[str, Any]:
        """Update ``model_kwargs`` after a step, growing a 4D mask by one token.

        The mask of shape ``(bs, 1, seq_len, tgt_len)`` gains one key column,
        blocked for all earlier queries, and one query row for the new token.
        Exactly one row and column are appended regardless of ``num_new_tokens``.
        """
        attention_mask = model_kwargs.get("attention_mask")
        if attention_mask is None or attention_mask.ndim != 4:
            # Also covers an explicit `attention_mask=None`, which previously
            # raised AttributeError on `.ndim`.
            return super()._update_model_kwargs_for_generation(
                outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder, num_new_tokens=num_new_tokens
            )

        # Hide the 4D mask from the parent update, as the original did.
        attention_mask = model_kwargs.pop("attention_mask")
        model_kwargs = super()._update_model_kwargs_for_generation(
            outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder, num_new_tokens=num_new_tokens
        )

        bs, _, seq_len, tgt_len = attention_mask.shape
        min_dtype = torch.finfo(attention_mask.dtype).min

        # Earlier queries must not see the newly generated key.
        new_col = attention_mask.new_full((bs, 1, seq_len, 1), min_dtype)

        # The new query sees itself and every key some earlier query could
        # see. Keys blocked for all earlier queries (e.g. padding) stay
        # blocked; the original all-zero row exposed them. Same definition of
        # a valid key as in `prepare_inputs_for_generation`.
        visible = (attention_mask == 0).any(dim=2, keepdim=True)  # (bs, 1, 1, tgt_len)
        row_over_old_keys = attention_mask.new_full((bs, 1, 1, tgt_len), min_dtype).masked_fill(visible, 0)
        row_over_new_key = attention_mask.new_zeros((bs, 1, 1, 1))
        new_row = torch.cat([row_over_old_keys, row_over_new_key], dim=-1)

        model_kwargs["attention_mask"] = torch.cat([
            torch.cat([attention_mask, new_col], dim=-1),
            new_row
        ], dim=2)
        return model_kwargs
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def custom_prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask: torch.Tensor,
    sequence_length: int,
    target_length: int,
    dtype: torch.dtype,
    device: torch.device,
    cache_position: torch.Tensor,
    batch_size: int,
    **kwargs,
):
    """Return a 4D additive causal mask of shape ``(batch, 1, sequence_length, target_length)``.

    A 4D ``attention_mask`` is assumed to be already inverted and is only
    cropped to its last ``sequence_length`` rows and ``target_length`` columns.
    Otherwise a causal mask is built from ``cache_position``, using
    ``torch.finfo(dtype).min`` for blocked positions, and a 2D padding mask
    (``0`` = padded key) is folded in when given.
    """
    # A 4D mask is passed through with slicing only.
    if attention_mask is not None and attention_mask.dim() == 4:
        return attention_mask[:, :, -sequence_length:, -target_length:]

    blocked = torch.finfo(dtype).min
    mask = torch.full(
        (sequence_length, target_length), fill_value=blocked, dtype=dtype, device=device
    )
    if sequence_length != 1:
        mask = torch.triu(mask, diagonal=1)
    # Keep the fill value only for keys located after each query's position.
    key_positions = torch.arange(target_length, device=device)
    mask *= key_positions > cache_position.reshape(-1, 1)
    mask = mask[None, None, :, :].expand(batch_size, 1, -1, -1)
    if attention_mask is None:
        return mask

    mask = mask.clone()  # copy to contiguous memory for in-place edit
    n_keys = attention_mask.shape[-1]
    # A sum of exactly 0 means "causally allowed" (0) and "padded key" (0).
    is_padding = (mask[:, :, :, :n_keys] + attention_mask[:, None, None, :]) == 0
    mask[:, :, :, :n_keys] = mask[:, :, :, :n_keys].masked_fill(is_padding, blocked)
    return mask
|
lora_merged/added_tokens.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</tool_call>": 151658,
|
| 3 |
+
"<tool_call>": 151657,
|
| 4 |
+
"<|box_end|>": 151649,
|
| 5 |
+
"<|box_start|>": 151648,
|
| 6 |
+
"<|endoftext|>": 151643,
|
| 7 |
+
"<|file_sep|>": 151664,
|
| 8 |
+
"<|fim_middle|>": 151660,
|
| 9 |
+
"<|fim_pad|>": 151662,
|
| 10 |
+
"<|fim_prefix|>": 151659,
|
| 11 |
+
"<|fim_suffix|>": 151661,
|
| 12 |
+
"<|im_end|>": 151645,
|
| 13 |
+
"<|im_start|>": 151644,
|
| 14 |
+
"<|image_pad|>": 151655,
|
| 15 |
+
"<|object_ref_end|>": 151647,
|
| 16 |
+
"<|object_ref_start|>": 151646,
|
| 17 |
+
"<|quad_end|>": 151651,
|
| 18 |
+
"<|quad_start|>": 151650,
|
| 19 |
+
"<|repo_name|>": 151663,
|
| 20 |
+
"<|video_pad|>": 151656,
|
| 21 |
+
"<|vision_end|>": 151653,
|
| 22 |
+
"<|vision_pad|>": 151654,
|
| 23 |
+
"<|vision_start|>": 151652
|
| 24 |
+
}
|
lora_merged/chat_template.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
|
| 3 |
+
}
|
lora_merged/config.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "Qwen2.5-VL-3B-Instruct",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"Qwen2_5_VLForConditionalGeneration"
|
| 5 |
+
],
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 151643,
|
| 8 |
+
"eos_token_id": 151645,
|
| 9 |
+
"hidden_act": "silu",
|
| 10 |
+
"hidden_size": 2048,
|
| 11 |
+
"image_token_id": 151655,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 11008,
|
| 14 |
+
"max_position_embeddings": 128000,
|
| 15 |
+
"max_window_layers": 70,
|
| 16 |
+
"model_type": "qwen2_5_vl",
|
| 17 |
+
"num_attention_heads": 16,
|
| 18 |
+
"num_hidden_layers": 36,
|
| 19 |
+
"num_key_value_heads": 2,
|
| 20 |
+
"rms_norm_eps": 1e-06,
|
| 21 |
+
"rope_scaling": {
|
| 22 |
+
"mrope_section": [
|
| 23 |
+
16,
|
| 24 |
+
24,
|
| 25 |
+
24
|
| 26 |
+
],
|
| 27 |
+
"rope_type": "default",
|
| 28 |
+
"type": "default"
|
| 29 |
+
},
|
| 30 |
+
"rope_theta": 1000000.0,
|
| 31 |
+
"sliding_window": 32768,
|
| 32 |
+
"tie_word_embeddings": true,
|
| 33 |
+
"torch_dtype": "bfloat16",
|
| 34 |
+
"transformers_version": "4.49.0",
|
| 35 |
+
"use_cache": true,
|
| 36 |
+
"use_sliding_window": false,
|
| 37 |
+
"video_token_id": 151656,
|
| 38 |
+
"vision_config": {
|
| 39 |
+
"hidden_size": 1280,
|
| 40 |
+
"in_chans": 3,
|
| 41 |
+
"model_type": "qwen2_5_vl",
|
| 42 |
+
"out_hidden_size": 2048,
|
| 43 |
+
"spatial_patch_size": 14,
|
| 44 |
+
"tokens_per_second": 2,
|
| 45 |
+
"torch_dtype": "float32"
|
| 46 |
+
},
|
| 47 |
+
"vision_end_token_id": 151653,
|
| 48 |
+
"vision_start_token_id": 151652,
|
| 49 |
+
"vision_token_id": 151654,
|
| 50 |
+
"vocab_size": 151936
|
| 51 |
+
}
|
lora_merged/generation_config.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 151643,
|
| 3 |
+
"do_sample": true,
|
| 4 |
+
"eos_token_id": [
|
| 5 |
+
151645,
|
| 6 |
+
151643
|
| 7 |
+
],
|
| 8 |
+
"pad_token_id": 151643,
|
| 9 |
+
"repetition_penalty": 1.05,
|
| 10 |
+
"temperature": 1e-06,
|
| 11 |
+
"transformers_version": "4.49.0"
|
| 12 |
+
}
|
lora_merged/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
lora_merged/model-00001-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a88d3bd0a1ee8f0d26e039c20c14dcf0286a86bfe4592c8a64f105e44552c02
|
| 3 |
+
size 997996256
|
lora_merged/model-00002-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:44593ca14843f5bc2406fc225a4492c5b344a0a4c5c3d148147cecbfdf1207cd
|
| 3 |
+
size 980624160
|
lora_merged/model-00003-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:292bb3f2b30726c82ebdeeb577266c8a98c903bf524c6966dfb801c501f6ff2d
|
| 3 |
+
size 970020872
|
lora_merged/model-00004-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bad74a6f40948fd7313c5dcc99ac82345a961f425f190a55d9989bfd707d7fd0
|
| 3 |
+
size 970020904
|
lora_merged/model-00005-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60dac3409a978722733df7c0f7136330d7558f3d3eec26bdf817faa9727e2df2
|
| 3 |
+
size 988909632
|
lora_merged/model-00006-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6025d52fdda9e22c470f37e688965a3c706d2e93e4c48c0d2f7858d1ccd95e76
|
| 3 |
+
size 970020944
|
lora_merged/model-00007-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:494697cb7e9d023ea9376118be73a1acb9d5b1eae13aaf5fe336cb1a73bf55d4
|
| 3 |
+
size 970020936
|