Zaynes committed on
Commit
156cd4d
·
verified ·
1 Parent(s): 4c02321

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1,8 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Mark log and plain-text output files as text so Git does not treat them as binary
2
+ *.log text
3
+ *.txt text
4
+ *.out text
5
+ *.err text
6
+ training_artifacts/logs/* text
7
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
8
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Modelfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ollama modelfile auto-generated by llamafactory
2
+
3
+ FROM .
4
+
5
+ TEMPLATE """{{ if .System }}<|im_start|>system
6
+ {{ .System }}<|im_end|>
7
+ {{ end }}{{ range .Messages }}{{ if eq .Role "user" }}<|im_start|>user
8
+ {{ .Content }}<|im_end|>
9
+ <|im_start|>assistant
10
+ {{ else if eq .Role "assistant" }}{{ .Content }}<|im_end|>
11
+ {{ end }}{{ end }}"""
12
+
13
+ SYSTEM """You are Qwen, created by Alibaba Cloud. You are a helpful assistant."""
14
+
15
+ PARAMETER stop "<|im_end|>"
16
+ PARAMETER num_ctx 4096
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "dtype": "float16",
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1536,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 8960,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention"
41
+ ],
42
+ "max_position_embeddings": 32768,
43
+ "max_window_layers": 21,
44
+ "model_type": "qwen2",
45
+ "num_attention_heads": 12,
46
+ "num_hidden_layers": 28,
47
+ "num_key_value_heads": 2,
48
+ "pad_token_id": 151643,
49
+ "rms_norm_eps": 1e-06,
50
+ "rope_scaling": null,
51
+ "rope_theta": 1000000.0,
52
+ "sliding_window": null,
53
+ "tie_word_embeddings": true,
54
+ "transformers_version": "4.57.1",
55
+ "use_cache": true,
56
+ "use_sliding_window": false,
57
+ "vocab_size": 151936
58
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "repetition_penalty": 1.1,
9
+ "temperature": 0.7,
10
+ "top_k": 20,
11
+ "top_p": 0.8,
12
+ "transformers_version": "4.57.1"
13
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae93c3f1fdb1fde9b64a6952d89905d3254cec875368c652d418df991d3b4e1a
3
+ size 3087466808
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "padding_side": "left",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
training_artifacts/README.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Training Artifacts
2
+
3
+ This directory contains the training configuration and logs for this model.
4
+
5
+ ## Contents
6
+
7
+ - **hydra_config.yaml**: Complete Hydra configuration used for training
8
+ - **train_config.yaml**: LlamaFactory training configuration
9
+ - **merge_config.yaml**: LlamaFactory merge/export configuration
10
+ - **logs/**: Training logs from the job (cleaned up for plain-text format)
11
+
12
+ ## Job Information
13
+
14
+ - Job Name: testing__pvv2_resume
15
+ - Timestamp: 2025-10-25 03:02:47 UTC
16
+ - Execution Mode: Local
training_artifacts/hydra_config.yaml ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ? ''
2
+ : ? ''
3
+ : ? ''
4
+ : hydra:
5
+ run:
6
+ dir: .
7
+ output_subdir: null
8
+ job:
9
+ chdir: false
10
+ _target_: null
11
+ job:
12
+ name: ???
13
+ mode: slurm
14
+ dry_run: false
15
+ slurm:
16
+ time_limit: ???
17
+ constraint:
18
+ - h200
19
+ memory: 200
20
+ cpus_per_task: 16
21
+ partition: null
22
+ mail_user: user@example.com
23
+ execution:
24
+ nodes: null
25
+ gpus_per_node: null
26
+ num_gpus: null
27
+ hostfile: null
28
+ secrets_file: null
29
+ model:
30
+ name_or_path: ???
31
+ finetuning_type: lora
32
+ dataset:
33
+ name: null
34
+ dir: null
35
+ info_json: null
36
+ template: default
37
+ cutoff_len: 1024
38
+ val_size: 0.1
39
+ tokenized_path: null
40
+ hf_hub_url: null
41
+ formatting: alpaca
42
+ ranking: false
43
+ subset: null
44
+ split: train
45
+ folder: null
46
+ num_samples: null
47
+ columns:
48
+ prompt: null
49
+ query: null
50
+ response: null
51
+ history: null
52
+ messages: null
53
+ system: null
54
+ tools: null
55
+ images: null
56
+ videos: null
57
+ audios: null
58
+ chosen: null
59
+ rejected: null
60
+ kto_tag: null
61
+ tags:
62
+ role: null
63
+ content: null
64
+ user: null
65
+ assistant: null
66
+ observation: null
67
+ function: null
68
+ system: null
69
+ output:
70
+ experiment_dir: ./experiments
71
+ wandb:
72
+ project: null
73
+ run_name: null
74
+ entity: null
75
+ hf:
76
+ repo_id: null
77
+ private: false
78
+ upload_artifacts: true
79
+ cleanup:
80
+ checkpoints: false
81
+ merged: false
82
+ finetuning:
83
+ training:
84
+ stage: sft
85
+ do_train: true
86
+ finetuning_type: lora
87
+ lora_rank: 8
88
+ lora_alpha: 16
89
+ lora_dropout: 0.05
90
+ lora_target: all
91
+ overwrite_cache: true
92
+ preprocessing_num_workers: 16
93
+ dataloader_num_workers: 4
94
+ logging_steps: 10
95
+ save_steps: 500
96
+ plot_loss: true
97
+ overwrite_output_dir: true
98
+ save_only_model: false
99
+ report_to: none
100
+ per_device_train_batch_size: 1
101
+ gradient_accumulation_steps: 8
102
+ learning_rate: 0.0001
103
+ num_train_epochs: 3.0
104
+ lr_scheduler_type: cosine
105
+ warmup_ratio: 0.1
106
+ bf16: true
107
+ ddp_timeout: 180000000
108
+ resume_from_checkpoint: null
109
+ val_size: 0.1
110
+ per_device_eval_batch_size: 1
111
+ eval_strategy: steps
112
+ eval_steps: 500
113
+ do_eval: true
114
+ merge:
115
+ export_dir: null
116
+ export_size: 5
117
+ export_device: cpu
118
+ export_legacy_format: false
119
+ job:
120
+ name: testing__pvv2_resume
121
+ mode: local
122
+ work_dir: null
123
+ dry_run: false
124
+ slurm:
125
+ time_limit: null
126
+ constraint: null
127
+ memory: null
128
+ partition: null
129
+ mail_user: null
130
+ execution:
131
+ nodes: 1
132
+ gpus_per_node: 2
133
+ num_gpus: null
134
+ hostfile: null
135
+ secrets_file: ./secrets.env
136
+ model:
137
+ name_or_path: Qwen/Qwen2.5-1.5B-Instruct
138
+ finetuning_type: full
139
+ dataset:
140
+ name: TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full
141
+ dir: null
142
+ info_json: null
143
+ template: qwen
144
+ cutoff_len: 16192
145
+ val_size: 0.0
146
+ hf_hub_url: TAUR-dev/D-SFT_C-ours_cd3arg_10responses_reflections10_formats-C_full
147
+ formatting: sharegpt
148
+ ranking: false
149
+ subset: null
150
+ split: train
151
+ folder: null
152
+ num_samples: null
153
+ columns:
154
+ messages: conversations
155
+ tags:
156
+ role: role
157
+ content: content
158
+ user: user
159
+ assistant: assistant
160
+ tokenized_path: /scratch/zrs2020/.cache/llamafactory/tokenized/TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full
161
+ output:
162
+ experiment_dir: ./experiments
163
+ wandb:
164
+ project: null
165
+ run_name: testing__pvv2_resume
166
+ entity: null
167
+ hf:
168
+ repo_id: TAUR-dev/testing__lf_pvv2_resume
169
+ private: false
170
+ cleanup:
171
+ checkpoints: false
172
+ merged: false
173
+ training:
174
+ stage: sft
175
+ do_train: true
176
+ max_samples: 100000
177
+ do_eval: false
178
+ save_strategy: steps
179
+ save_steps: 5
180
+ logging_steps: 10
181
+ fp16: false
182
+ bf16: true
183
+ adam_beta1: 0.9
184
+ adam_beta2: 0.95
185
+ overwrite_output_dir: true
186
+ per_device_train_batch_size: 1
187
+ gradient_accumulation_steps: 1
188
+ gradient_checkpointing: true
189
+ learning_rate: 1.0e-06
190
+ lr_scheduler_type: cosine
191
+ num_train_epochs: 2
192
+ warmup_ratio: 0.05
193
+ weight_decay: 0.0001
194
+ template: qwen
195
+ max_steps: 10
196
+ preprocessing_num_workers: 16
197
+ overwrite_cache: true
training_artifacts/logs/pipeline_cleaned.txt ADDED
@@ -0,0 +1,532 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-10-24 23:00:49] ========================================
2
+ [2025-10-24 23:00:49] Job Name: testing__pvv2_resume
3
+ [2025-10-24 23:00:49] Hostname: gl001.hpc.nyu.edu
4
+ [2025-10-24 23:00:49] Number of nodes: 1
5
+ [2025-10-24 23:00:49] GPUs per node: 2
6
+ [2025-10-24 23:00:49] Start Time: Fri Oct 24 11:00:49 PM EDT 2025
7
+ [2025-10-24 23:00:49] Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/logs/pipeline.log
8
+ [2025-10-24 23:00:49] ========================================
9
+ [2025-10-24 23:00:49] Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env
10
+ [2025-10-24 23:00:52]
11
+ [2025-10-24 23:00:52] ========================================
12
+ [2025-10-24 23:00:52] Configuration Paths
13
+ [2025-10-24 23:00:52] ========================================
14
+ [2025-10-24 23:00:52] Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/configs/train_config.yaml
15
+ [2025-10-24 23:00:52] Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/configs/merge_config.yaml
16
+ [2025-10-24 23:00:52] Dataset Info: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory/data/dataset_info.json
17
+ [2025-10-24 23:00:52] Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints
18
+ [2025-10-24 23:00:52] Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged
19
+ [2025-10-24 23:00:52] HF Repo ID: TAUR-dev/testing__lf_pvv2_resume
20
+ [2025-10-24 23:00:52]
21
+ [make-effective-cfg] tokenized_path: /scratch/zrs2020/.cache/hf_cache/home/llamafactory/tokenized/TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full_fb94f2a3
22
+ [make-effective-cfg] wrote: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/logs/train_config.effective.yaml
23
+ [2025-10-24 23:00:52]
24
+ [2025-10-24 23:00:52] ========================================
25
+ [2025-10-24 23:00:52] STAGE 0: Downloading Dataset
26
+ [2025-10-24 23:00:52] Dataset: TAUR-dev/D-SFT_C-ours_cd3arg_10responses_reflections10_formats-C_full
27
+ [2025-10-24 23:00:52] Start Time: Fri Oct 24 11:00:52 PM EDT 2025
28
+ [2025-10-24 23:00:52] ========================================
29
+ [dataset-download] Loading dataset from: TAUR-dev/D-SFT_C-ours_cd3arg_10responses_reflections10_formats-C_full
30
+ [dataset-download] Dataset loaded successfully
31
+ [dataset-download] Dataset info: DatasetDict({
32
+ train: Dataset({
33
+ features: ['conversations', 'sft_template_type_idx'],
34
+ num_rows: 29130
35
+ })
36
+ })
37
+ [2025-10-24 23:00:54]
38
+ [2025-10-24 23:00:54] ========================================
39
+ [2025-10-24 23:00:54] Dataset download completed
40
+ [2025-10-24 23:00:54] End Time: Fri Oct 24 11:00:54 PM EDT 2025
41
+ [2025-10-24 23:00:54] ========================================
42
+ [2025-10-24 23:00:54]
43
+ [2025-10-24 23:00:54] ========================================
44
+ [2025-10-24 23:00:54] STAGE 1: Training Model
45
+ [2025-10-24 23:00:54] Start Time: Fri Oct 24 11:00:54 PM EDT 2025
46
+ [2025-10-24 23:00:54] ========================================
47
+ [2025-10-24 23:00:54] Job: testing__pvv2_resume
48
+ [2025-10-24 23:00:54] Nodes: 1 | GPUs/node: 2
49
+ [2025-10-24 23:00:54] Master: 127.0.0.1:29500
50
+ [2025-10-24 23:00:54] LLaMA-Factory: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
51
+ [2025-10-24 23:00:54] Train cfg (effective): /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/logs/train_config.effective.yaml
52
+ [2025-10-24 23:00:54] HF cache: /scratch/zrs2020/.cache/hf_cache/home/datasets
53
+ [2025-10-24 23:00:54] Launcher: torchrun
54
+ [2025-10-24 23:00:54]
55
+ [2025-10-24 23:00:54] Single-node training (2 GPU(s))
56
+ [2025-10-24 23:00:54] Executing command: llamafactory-cli train /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/logs/train_config.effective.yaml
57
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
58
+ warnings.warn(
59
+ [INFO|2025-10-24 23:01:02] llamafactory.launcher:143 >> Initializing 2 distributed tasks at: 127.0.0.1:29500
60
+ W1024 23:01:03.865000 658301 site-packages/torch/distributed/run.py:803]
61
+ W1024 23:01:03.865000 658301 site-packages/torch/distributed/run.py:803] *****************************************
62
+ W1024 23:01:03.865000 658301 site-packages/torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
63
+ W1024 23:01:03.865000 658301 site-packages/torch/distributed/run.py:803] *****************************************
64
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
65
+ warnings.warn(
66
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
67
+ warnings.warn(
68
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
69
+ import pkg_resources
70
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
71
+ import pkg_resources
72
+ [W1024 23:01:11.583869947 ProcessGroupNCCL.cpp:924] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
73
+ [W1024 23:01:11.583875563 ProcessGroupNCCL.cpp:924] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
74
+ [INFO|2025-10-24 23:01:12] llamafactory.hparams.parser:423 >> Process rank: 1, world size: 2, device: cuda:1, distributed training: True, compute dtype: torch.bfloat16
75
+ [INFO|2025-10-24 23:01:12] llamafactory.hparams.parser:423 >> Process rank: 0, world size: 2, device: cuda:0, distributed training: True, compute dtype: torch.bfloat16
76
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,229 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/vocab.json
77
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,229 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/merges.txt
78
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,229 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer.json
79
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,229 >> loading file added_tokens.json from cache at None
80
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,229 >> loading file special_tokens_map.json from cache at None
81
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,229 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer_config.json
82
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,229 >> loading file chat_template.jinja from cache at None
83
+ [INFO|tokenization_utils_base.py:2364] 2025-10-24 23:01:12,402 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
84
+ [INFO|configuration_utils.py:765] 2025-10-24 23:01:12,622 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/config.json
85
+ [INFO|configuration_utils.py:839] 2025-10-24 23:01:12,624 >> Model config Qwen2Config {
86
+ "architectures": [
87
+ "Qwen2ForCausalLM"
88
+ ],
89
+ "attention_dropout": 0.0,
90
+ "bos_token_id": 151643,
91
+ "dtype": "bfloat16",
92
+ "eos_token_id": 151645,
93
+ "hidden_act": "silu",
94
+ "hidden_size": 1536,
95
+ "initializer_range": 0.02,
96
+ "intermediate_size": 8960,
97
+ "layer_types": [
98
+ "full_attention",
99
+ "full_attention",
100
+ "full_attention",
101
+ "full_attention",
102
+ "full_attention",
103
+ "full_attention",
104
+ "full_attention",
105
+ "full_attention",
106
+ "full_attention",
107
+ "full_attention",
108
+ "full_attention",
109
+ "full_attention",
110
+ "full_attention",
111
+ "full_attention",
112
+ "full_attention",
113
+ "full_attention",
114
+ "full_attention",
115
+ "full_attention",
116
+ "full_attention",
117
+ "full_attention",
118
+ "full_attention",
119
+ "full_attention",
120
+ "full_attention",
121
+ "full_attention",
122
+ "full_attention",
123
+ "full_attention",
124
+ "full_attention",
125
+ "full_attention"
126
+ ],
127
+ "max_position_embeddings": 32768,
128
+ "max_window_layers": 21,
129
+ "model_type": "qwen2",
130
+ "num_attention_heads": 12,
131
+ "num_hidden_layers": 28,
132
+ "num_key_value_heads": 2,
133
+ "rms_norm_eps": 1e-06,
134
+ "rope_scaling": null,
135
+ "rope_theta": 1000000.0,
136
+ "sliding_window": null,
137
+ "tie_word_embeddings": true,
138
+ "transformers_version": "4.57.1",
139
+ "use_cache": true,
140
+ "use_sliding_window": false,
141
+ "vocab_size": 151936
142
+ }
143
+
144
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,728 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/vocab.json
145
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,728 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/merges.txt
146
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,728 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer.json
147
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,728 >> loading file added_tokens.json from cache at None
148
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,728 >> loading file special_tokens_map.json from cache at None
149
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,728 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/tokenizer_config.json
150
+ [INFO|tokenization_utils_base.py:2095] 2025-10-24 23:01:12,728 >> loading file chat_template.jinja from cache at None
151
+ [INFO|tokenization_utils_base.py:2364] 2025-10-24 23:01:12,895 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
152
+ [WARNING|2025-10-24 23:01:12] llamafactory.data.loader:148 >> Loading dataset from disk will ignore other data arguments.
153
+ [INFO|2025-10-24 23:01:12] llamafactory.data.loader:143 >> Loaded tokenized dataset from /scratch/zrs2020/.cache/hf_cache/home/llamafactory/tokenized/TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full_fb94f2a3.
154
+ [INFO|configuration_utils.py:765] 2025-10-24 23:01:12,971 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/config.json
155
+ [INFO|configuration_utils.py:839] 2025-10-24 23:01:12,972 >> Model config Qwen2Config {
156
+ "architectures": [
157
+ "Qwen2ForCausalLM"
158
+ ],
159
+ "attention_dropout": 0.0,
160
+ "bos_token_id": 151643,
161
+ "dtype": "bfloat16",
162
+ "eos_token_id": 151645,
163
+ "hidden_act": "silu",
164
+ "hidden_size": 1536,
165
+ "initializer_range": 0.02,
166
+ "intermediate_size": 8960,
167
+ "layer_types": [
168
+ "full_attention",
169
+ "full_attention",
170
+ "full_attention",
171
+ "full_attention",
172
+ "full_attention",
173
+ "full_attention",
174
+ "full_attention",
175
+ "full_attention",
176
+ "full_attention",
177
+ "full_attention",
178
+ "full_attention",
179
+ "full_attention",
180
+ "full_attention",
181
+ "full_attention",
182
+ "full_attention",
183
+ "full_attention",
184
+ "full_attention",
185
+ "full_attention",
186
+ "full_attention",
187
+ "full_attention",
188
+ "full_attention",
189
+ "full_attention",
190
+ "full_attention",
191
+ "full_attention",
192
+ "full_attention",
193
+ "full_attention",
194
+ "full_attention",
195
+ "full_attention"
196
+ ],
197
+ "max_position_embeddings": 32768,
198
+ "max_window_layers": 21,
199
+ "model_type": "qwen2",
200
+ "num_attention_heads": 12,
201
+ "num_hidden_layers": 28,
202
+ "num_key_value_heads": 2,
203
+ "rms_norm_eps": 1e-06,
204
+ "rope_scaling": null,
205
+ "rope_theta": 1000000.0,
206
+ "sliding_window": null,
207
+ "tie_word_embeddings": true,
208
+ "transformers_version": "4.57.1",
209
+ "use_cache": true,
210
+ "use_sliding_window": false,
211
+ "vocab_size": 151936
212
+ }
213
+
214
+ [INFO|2025-10-24 23:01:12] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training.
215
+ `torch_dtype` is deprecated! Use `dtype` instead!
216
+ [WARNING|logging.py:328] 2025-10-24 23:01:13,309 >> `torch_dtype` is deprecated! Use `dtype` instead!
217
+ [INFO|modeling_utils.py:1172] 2025-10-24 23:01:13,309 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/model.safetensors
218
+ [INFO|modeling_utils.py:2341] 2025-10-24 23:01:13,310 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.
219
+ [INFO|configuration_utils.py:986] 2025-10-24 23:01:13,311 >> Generate config GenerationConfig {
220
+ "bos_token_id": 151643,
221
+ "eos_token_id": 151645,
222
+ "use_cache": false
223
+ }
224
+
225
+ [INFO|configuration_utils.py:941] 2025-10-24 23:01:13,896 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/989aa7980e4cf806f80c7fef2b1adb7bc71aa306/generation_config.json
226
+ [INFO|configuration_utils.py:986] 2025-10-24 23:01:13,896 >> Generate config GenerationConfig {
227
+ "bos_token_id": 151643,
228
+ "do_sample": true,
229
+ "eos_token_id": [
230
+ 151645,
231
+ 151643
232
+ ],
233
+ "pad_token_id": 151643,
234
+ "repetition_penalty": 1.1,
235
+ "temperature": 0.7,
236
+ "top_k": 20,
237
+ "top_p": 0.8
238
+ }
239
+
240
+ [INFO|dynamic_module_utils.py:423] 2025-10-24 23:01:13,938 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-1.5B-Instruct.
241
+ [INFO|2025-10-24 23:01:13] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled.
242
+ [INFO|2025-10-24 23:01:13] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
243
+ [INFO|2025-10-24 23:01:13] llamafactory.model.adapter:143 >> Upcasting trainable params to float32.
244
+ [INFO|2025-10-24 23:01:13] llamafactory.model.adapter:143 >> Fine-tuning method: Full
245
+ [INFO|2025-10-24 23:01:13] llamafactory.model.loader:143 >> trainable params: 1,543,714,304 || all params: 1,543,714,304 || trainable%: 100.0000
246
+ [WARNING|trainer.py:906] 2025-10-24 23:01:13,975 >> The model is already on multiple devices. Skipping the move to device specified in `args`.
247
+ The model is already on multiple devices. Skipping the move to device specified in `args`.
248
+ [INFO|trainer.py:699] 2025-10-24 23:01:13,977 >> max_steps is given, it will override any value given in num_train_epochs
249
+ [INFO|trainer.py:749] 2025-10-24 23:01:13,977 >> Using auto half precision backend
250
+ The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
251
+ [WARNING|trainer.py:982] 2025-10-24 23:01:13,979 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
252
+ NCCL version 2.27.5+cuda12.9
253
+ [INFO|trainer.py:2519] 2025-10-24 23:01:14,677 >> ***** Running training *****
254
+ [INFO|trainer.py:2520] 2025-10-24 23:01:14,677 >> Num examples = 29,130
255
+ [INFO|trainer.py:2521] 2025-10-24 23:01:14,677 >> Num Epochs = 1
256
+ [INFO|trainer.py:2522] 2025-10-24 23:01:14,677 >> Instantaneous batch size per device = 1
257
+ [INFO|trainer.py:2525] 2025-10-24 23:01:14,677 >> Total train batch size (w. parallel, distributed & accumulation) = 2
258
+ [INFO|trainer.py:2526] 2025-10-24 23:01:14,677 >> Gradient Accumulation steps = 1
259
+ [INFO|trainer.py:2527] 2025-10-24 23:01:14,677 >> Total optimization steps = 10
260
+ [INFO|trainer.py:2528] 2025-10-24 23:01:14,678 >> Number of trainable parameters = 1,543,714,304
261
+ [INFO|integration_utils.py:867] 2025-10-24 23:01:14,871 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
262
+ wandb: Currently logged in as: zsprague (ut_nlp_deduce) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
263
+ wandb: Tracking run with wandb version 0.22.2
264
+ wandb: Run data is saved locally in /scratch/zrs2020/LlamaFactoryHelper/wandb/run-20251024_230115-mlpoab58
265
+ wandb: Run `wandb offline` to turn off syncing.
266
+ wandb: Syncing run testing__pvv2_resume
267
+ wandb: View project at https://wandb.ai/ut_nlp_deduce/llamafactory
268
+ wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/mlpoab58
269
+ 0%| | 0/10 [00:00<?, ?it/s] 10%| | 1/10 [00:01<00:13, 1.55s/it] 20%| | 2/10 [00:02<00:08, 1.09s/it] 30%| | 3/10 [00:03<00:06, 1.09it/s] 40%| | 4/10 [00:04<00:05, 1.03it/s] 50%| | 5/10 [00:04<00:04, 1.10it/s][INFO|trainer.py:4309] 2025-10-24 23:01:20,972 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-5
270
+ [INFO|configuration_utils.py:491] 2025-10-24 23:01:20,978 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-5/config.json
271
+ [INFO|configuration_utils.py:757] 2025-10-24 23:01:20,983 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-5/generation_config.json
272
+ [INFO|modeling_utils.py:4189] 2025-10-24 23:01:29,871 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-5/model.safetensors.index.json.
273
+ [INFO|tokenization_utils_base.py:2421] 2025-10-24 23:01:29,892 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-5/chat_template.jinja
274
+ [INFO|tokenization_utils_base.py:2590] 2025-10-24 23:01:29,897 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-5/tokenizer_config.json
275
+ [INFO|tokenization_utils_base.py:2599] 2025-10-24 23:01:29,916 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-5/special_tokens_map.json
276
+ 60%| | 6/10 [00:30<00:36, 9.15s/it] 70%| | 7/10 [00:30<00:19, 6.39s/it] 80%| | 8/10 [00:31<00:09, 4.62s/it] 90%| | 9/10 [00:32<00:03, 3.41s/it]100%|| 10/10 [00:33<00:00, 2.75s/it] {'loss': 0.7146, 'grad_norm': 3.4265639781951904, 'learning_rate': 3.015368960704584e-08, 'epoch': 0.0}
277
+ 100%|| 10/10 [00:33<00:00, 2.75s/it][INFO|trainer.py:4309] 2025-10-24 23:01:49,645 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10
278
+ [INFO|configuration_utils.py:491] 2025-10-24 23:01:49,698 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10/config.json
279
+ [INFO|configuration_utils.py:757] 2025-10-24 23:01:49,736 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10/generation_config.json
280
+ [INFO|modeling_utils.py:4189] 2025-10-24 23:01:58,152 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10/model.safetensors.index.json.
281
+ [INFO|tokenization_utils_base.py:2421] 2025-10-24 23:01:58,172 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10/chat_template.jinja
282
+ [INFO|tokenization_utils_base.py:2590] 2025-10-24 23:01:58,177 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10/tokenizer_config.json
283
+ [INFO|tokenization_utils_base.py:2599] 2025-10-24 23:01:58,181 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10/special_tokens_map.json
284
+ [INFO|trainer.py:2810] 2025-10-24 23:02:13,817 >>
285
+
286
+ Training completed. Do not forget to share your model on huggingface.co/models =)
287
+
288
+
289
+ {'train_runtime': 59.1398, 'train_samples_per_second': 0.338, 'train_steps_per_second': 0.169, 'train_loss': 0.7145515441894531, 'epoch': 0.0}
290
+ 100%|| 10/10 [00:57<00:00, 2.75s/it]100%|| 10/10 [00:57<00:00, 5.79s/it]
291
+ [INFO|trainer.py:4309] 2025-10-24 23:02:13,845 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints
292
+ [INFO|configuration_utils.py:491] 2025-10-24 23:02:13,901 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/config.json
293
+ [INFO|configuration_utils.py:757] 2025-10-24 23:02:13,906 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/generation_config.json
294
+ [INFO|modeling_utils.py:4189] 2025-10-24 23:02:25,043 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/model.safetensors.index.json.
295
+ [INFO|tokenization_utils_base.py:2421] 2025-10-24 23:02:25,076 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/chat_template.jinja
296
+ [INFO|tokenization_utils_base.py:2590] 2025-10-24 23:02:25,081 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/tokenizer_config.json
297
+ [INFO|tokenization_utils_base.py:2599] 2025-10-24 23:02:25,102 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/special_tokens_map.json
298
+ ***** train metrics *****
299
+ epoch = 0.0007
300
+ total_flos = 411619GF
301
+ train_loss = 0.7146
302
+ train_runtime = 0:00:59.13
303
+ train_samples_per_second = 0.338
304
+ train_steps_per_second = 0.169
305
+ [INFO|modelcard.py:456] 2025-10-24 23:02:25,366 >> Dropping the following result as it does not have all the necessary fields:
306
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
307
+ [W1024 23:02:25.664082565 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator())
308
+ [1;34mwandb[0m:
309
+ [1;34mwandb[0m: View run [33mtesting__pvv2_resume[0m at: [34m[0m
310
+ [1;34mwandb[0m: Find logs at: [1;35mwandb/run-20251024_230115-mlpoab58/logs[0m
311
+ [W1024 23:02:26.279498684 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator())
312
+ [W1024 23:02:27.811812041 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator())
313
+ [W1024 23:02:27.258510656 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator())
314
+ [2025-10-24 23:02:27]
315
+ [2025-10-24 23:02:27] ========================================
316
+ [2025-10-24 23:02:27] Training completed successfully
317
+ [2025-10-24 23:02:27] End Time: Fri Oct 24 11:02:27 PM EDT 2025
318
+ [2025-10-24 23:02:27] ========================================
319
+ [2025-10-24 23:02:27]
320
+ [2025-10-24 23:02:27] ========================================
321
+ [2025-10-24 23:02:27] STAGE 2: Merging/Exporting Model
322
+ [2025-10-24 23:02:27] Start Time: Fri Oct 24 11:02:27 PM EDT 2025
323
+ [2025-10-24 23:02:27] ========================================
324
+ [2025-10-24 23:02:27] Looking for checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints
325
+ [2025-10-24 23:02:27] Analyzing checkpoints to find the one from current training run...
326
+ [2025-10-24 23:02:27] - checkpoint-10: trainer_state.json modified at Fri Oct 24 11:02:13 PM EDT 2025
327
+ [2025-10-24 23:02:27] - checkpoint-5: trainer_state.json modified at Fri Oct 24 11:01:44 PM EDT 2025
328
+ [2025-10-24 23:02:27]
329
+ [2025-10-24 23:02:27] Selected checkpoint: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10
330
+ [2025-10-24 23:02:27] This checkpoint has the most recently updated trainer_state.json
331
+ [2025-10-24 23:02:27] Checkpoint details:
332
+ [2025-10-24 23:02:27] Path: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10
333
+ [2025-10-24 23:02:27] Last modified: 2025-10-24 23:02:13.814457753 -0400
334
+ [2025-10-24 23:02:27] Training step: 10
335
+ [2025-10-24 23:02:27] Updating merge config to point to checkpoint...
336
+ Successfully updated merge config
337
+ [2025-10-24 23:02:28] Updated merge config to use: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/checkpoint-10
338
+ [2025-10-24 23:02:28]
339
+ [2025-10-24 23:02:28] Merge config contents:
340
+ [2025-10-24 23:02:28] template: qwen
341
+ [2025-10-24 23:02:28] trust_remote_code: true
342
+ [2025-10-24 23:02:28] export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged
343
+ [2025-10-24 23:02:28] model_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints
344
+ [2025-10-24 23:02:28]
345
+ [2025-10-24 23:02:28] Executing command: llamafactory-cli export /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/configs/merge_config.yaml
346
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
347
+ warnings.warn(
348
+ /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
349
+ import pkg_resources
350
+ [INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,483 >> loading file vocab.json
351
+ [INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,483 >> loading file merges.txt
352
+ [INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,483 >> loading file tokenizer.json
353
+ [INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,483 >> loading file added_tokens.json
354
+ [INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,483 >> loading file special_tokens_map.json
355
+ [INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,483 >> loading file tokenizer_config.json
356
+ [INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,483 >> loading file chat_template.jinja
357
+ [INFO|tokenization_utils_base.py:2364] 2025-10-24 23:02:38,733 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
358
+ [INFO|configuration_utils.py:763] 2025-10-24 23:02:38,735 >> loading configuration file /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/config.json
359
+ [INFO|configuration_utils.py:839] 2025-10-24 23:02:38,737 >> Model config Qwen2Config {
360
+ "architectures": [
361
+ "Qwen2ForCausalLM"
362
+ ],
363
+ "attention_dropout": 0.0,
364
+ "dtype": "float32",
365
+ "eos_token_id": 151645,
366
+ "hidden_act": "silu",
367
+ "hidden_size": 1536,
368
+ "initializer_range": 0.02,
369
+ "intermediate_size": 8960,
370
+ "layer_types": [
371
+ "full_attention",
372
+ "full_attention",
373
+ "full_attention",
374
+ "full_attention",
375
+ "full_attention",
376
+ "full_attention",
377
+ "full_attention",
378
+ "full_attention",
379
+ "full_attention",
380
+ "full_attention",
381
+ "full_attention",
382
+ "full_attention",
383
+ "full_attention",
384
+ "full_attention",
385
+ "full_attention",
386
+ "full_attention",
387
+ "full_attention",
388
+ "full_attention",
389
+ "full_attention",
390
+ "full_attention",
391
+ "full_attention",
392
+ "full_attention",
393
+ "full_attention",
394
+ "full_attention",
395
+ "full_attention",
396
+ "full_attention",
397
+ "full_attention",
398
+ "full_attention"
399
+ ],
400
+ "max_position_embeddings": 32768,
401
+ "max_window_layers": 21,
402
+ "model_type": "qwen2",
403
+ "num_attention_heads": 12,
404
+ "num_hidden_layers": 28,
405
+ "num_key_value_heads": 2,
406
+ "pad_token_id": 151643,
407
+ "rms_norm_eps": 1e-06,
408
+ "rope_scaling": null,
409
+ "rope_theta": 1000000.0,
410
+ "sliding_window": null,
411
+ "tie_word_embeddings": true,
412
+ "transformers_version": "4.57.1",
413
+ "use_cache": false,
414
+ "use_sliding_window": false,
415
+ "vocab_size": 151936
416
+ }
417
+
418
+ [INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,738 >> loading file vocab.json
419
+ [INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,738 >> loading file merges.txt
420
+ [INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,738 >> loading file tokenizer.json
421
+ [INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,738 >> loading file added_tokens.json
422
+ [INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,738 >> loading file special_tokens_map.json
423
+ [INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,738 >> loading file tokenizer_config.json
424
+ [INFO|tokenization_utils_base.py:2093] 2025-10-24 23:02:38,738 >> loading file chat_template.jinja
425
+ [INFO|tokenization_utils_base.py:2364] 2025-10-24 23:02:38,961 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
426
+ [INFO|configuration_utils.py:763] 2025-10-24 23:02:38,979 >> loading configuration file /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/config.json
427
+ [INFO|configuration_utils.py:839] 2025-10-24 23:02:38,979 >> Model config Qwen2Config {
428
+ "architectures": [
429
+ "Qwen2ForCausalLM"
430
+ ],
431
+ "attention_dropout": 0.0,
432
+ "dtype": "float32",
433
+ "eos_token_id": 151645,
434
+ "hidden_act": "silu",
435
+ "hidden_size": 1536,
436
+ "initializer_range": 0.02,
437
+ "intermediate_size": 8960,
438
+ "layer_types": [
439
+ "full_attention",
440
+ "full_attention",
441
+ "full_attention",
442
+ "full_attention",
443
+ "full_attention",
444
+ "full_attention",
445
+ "full_attention",
446
+ "full_attention",
447
+ "full_attention",
448
+ "full_attention",
449
+ "full_attention",
450
+ "full_attention",
451
+ "full_attention",
452
+ "full_attention",
453
+ "full_attention",
454
+ "full_attention",
455
+ "full_attention",
456
+ "full_attention",
457
+ "full_attention",
458
+ "full_attention",
459
+ "full_attention",
460
+ "full_attention",
461
+ "full_attention",
462
+ "full_attention",
463
+ "full_attention",
464
+ "full_attention",
465
+ "full_attention",
466
+ "full_attention"
467
+ ],
468
+ "max_position_embeddings": 32768,
469
+ "max_window_layers": 21,
470
+ "model_type": "qwen2",
471
+ "num_attention_heads": 12,
472
+ "num_hidden_layers": 28,
473
+ "num_key_value_heads": 2,
474
+ "pad_token_id": 151643,
475
+ "rms_norm_eps": 1e-06,
476
+ "rope_scaling": null,
477
+ "rope_theta": 1000000.0,
478
+ "sliding_window": null,
479
+ "tie_word_embeddings": true,
480
+ "transformers_version": "4.57.1",
481
+ "use_cache": false,
482
+ "use_sliding_window": false,
483
+ "vocab_size": 151936
484
+ }
485
+
486
+ [WARNING|logging.py:328] 2025-10-24 23:02:38,979 >> `torch_dtype` is deprecated! Use `dtype` instead!
487
+ [INFO|2025-10-24 23:02:38] llamafactory.model.model_utils.kv_cache:143 >> KV cache is enabled for faster generation.
488
+ [WARNING|logging.py:328] 2025-10-24 23:02:39,312 >> `torch_dtype` is deprecated! Use `dtype` instead!
489
+ [INFO|modeling_utils.py:1169] 2025-10-24 23:02:39,313 >> loading weights file /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/model.safetensors.index.json
490
+ [INFO|modeling_utils.py:2341] 2025-10-24 23:02:39,314 >> Instantiating Qwen2ForCausalLM model under default dtype torch.float16.
491
+ [INFO|configuration_utils.py:986] 2025-10-24 23:02:39,314 >> Generate config GenerationConfig {
492
+ "eos_token_id": 151645,
493
+ "pad_token_id": 151643
494
+ }
495
+
496
+ Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]Loading checkpoint shards: 50%| | 1/2 [00:02<00:02, 2.68s/it]Loading checkpoint shards: 100%|| 2/2 [00:03<00:00, 1.44s/it]Loading checkpoint shards: 100%|| 2/2 [00:03<00:00, 1.62s/it]
497
+ [INFO|configuration_utils.py:939] 2025-10-24 23:02:42,581 >> loading configuration file /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints/generation_config.json
498
+ [INFO|configuration_utils.py:986] 2025-10-24 23:02:42,581 >> Generate config GenerationConfig {
499
+ "do_sample": true,
500
+ "eos_token_id": [
501
+ 151645,
502
+ 151643
503
+ ],
504
+ "pad_token_id": 151643,
505
+ "repetition_penalty": 1.1,
506
+ "temperature": 0.7,
507
+ "top_k": 20,
508
+ "top_p": 0.8
509
+ }
510
+
511
+ [INFO|dynamic_module_utils.py:423] 2025-10-24 23:02:42,582 >> Could not locate the custom_generate/generate.py inside /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints.
512
+ [INFO|2025-10-24 23:02:42] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
513
+ [INFO|2025-10-24 23:02:42] llamafactory.model.loader:143 >> all params: 1,543,714,304
514
+ [INFO|2025-10-24 23:02:42] llamafactory.train.tuner:143 >> Convert model dtype to: torch.float16.
515
+ [INFO|configuration_utils.py:491] 2025-10-24 23:02:42,596 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged/config.json
516
+ [INFO|configuration_utils.py:757] 2025-10-24 23:02:42,601 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged/generation_config.json
517
+ [INFO|modeling_utils.py:4181] 2025-10-24 23:02:46,185 >> Model weights saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged/model.safetensors
518
+ [INFO|tokenization_utils_base.py:2421] 2025-10-24 23:02:46,205 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged/chat_template.jinja
519
+ [INFO|tokenization_utils_base.py:2590] 2025-10-24 23:02:46,224 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged/tokenizer_config.json
520
+ [INFO|tokenization_utils_base.py:2599] 2025-10-24 23:02:46,243 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged/special_tokens_map.json
521
+ [INFO|2025-10-24 23:02:46] llamafactory.train.tuner:143 >> Ollama modelfile saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged/Modelfile
522
+ [2025-10-24 23:02:47]
523
+ [2025-10-24 23:02:47] ========================================
524
+ [2025-10-24 23:02:47] Merge/Export completed successfully
525
+ [2025-10-24 23:02:47] End Time: Fri Oct 24 11:02:47 PM EDT 2025
526
+ [2025-10-24 23:02:47] ========================================
527
+ [2025-10-24 23:02:47]
528
+ [2025-10-24 23:02:47] ========================================
529
+ [2025-10-24 23:02:47] Preparing Training Artifacts
530
+ [2025-10-24 23:02:47] ========================================
531
+ [2025-10-24 23:02:47] Copying configuration files...
532
+ [2025-10-24 23:02:47] Copying and cleaning training logs...
training_artifacts/merge_config.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ template: qwen
2
+ trust_remote_code: true
3
+ export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/merged
4
+ model_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints
training_artifacts/train_config.yaml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ stage: sft
2
+ do_train: true
3
+ max_samples: 100000
4
+ do_eval: false
5
+ save_strategy: steps
6
+ save_steps: 5
7
+ logging_steps: 10
8
+ fp16: false
9
+ bf16: true
10
+ adam_beta1: 0.9
11
+ adam_beta2: 0.95
12
+ overwrite_output_dir: true
13
+ per_device_train_batch_size: 1
14
+ gradient_accumulation_steps: 1
15
+ gradient_checkpointing: true
16
+ learning_rate: 1.0e-06
17
+ lr_scheduler_type: cosine
18
+ num_train_epochs: 2
19
+ warmup_ratio: 0.05
20
+ weight_decay: 0.0001
21
+ template: qwen
22
+ max_steps: 10
23
+ preprocessing_num_workers: 16
24
+ overwrite_cache: true
25
+ model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct
26
+ finetuning_type: full
27
+ trust_remote_code: true
28
+ dataset: TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full
29
+ dataset_dir: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory/data
30
+ cutoff_len: 16192
31
+ tokenized_path: /scratch/zrs2020/.cache/llamafactory/tokenized/TAUR_dev_D_SFT_C_ours_cd3arg_10responses_reflections10_formats_C_full
32
+ output_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/testing__pvv2_resume/checkpoints
vocab.json ADDED
The diff for this file is too large to render. See raw diff