aadityabuilds committed on
Commit
c6c3ee2
·
verified ·
1 Parent(s): 72b6a20

Training in progress, step 50

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": null,
7
+ "dtype": "bfloat16",
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 3584,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 18944,
13
+ "layer_types": [
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention"
42
+ ],
43
+ "max_position_embeddings": 32768,
44
+ "max_window_layers": 28,
45
+ "model_type": "qwen2",
46
+ "num_attention_heads": 28,
47
+ "num_hidden_layers": 28,
48
+ "num_key_value_heads": 4,
49
+ "pad_token_id": 151643,
50
+ "rms_norm_eps": 1e-06,
51
+ "rope_parameters": {
52
+ "rope_theta": 1000000.0,
53
+ "rope_type": "default"
54
+ },
55
+ "sliding_window": null,
56
+ "tie_word_embeddings": false,
57
+ "transformers_version": "5.9.0",
58
+ "use_cache": false,
59
+ "use_sliding_window": false,
60
+ "vocab_size": 152064
61
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "repetition_penalty": 1.1,
9
+ "temperature": 0.7,
10
+ "top_k": 20,
11
+ "top_p": 0.8,
12
+ "transformers_version": "5.9.0"
13
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45fd13b8984124bba7d90ffcc306cb03256f8f859e1f3b2a6289f196d330c6f2
3
+ size 15231272152
run_metadata.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cli_args": {
3
+ "attn_implementation": "sdpa",
4
+ "auto_resume": true,
5
+ "bf16": true,
6
+ "cache_dir": "/cache",
7
+ "data_dir": "/workspace/data/kernelbook",
8
+ "deepspeed": "/workspace/configs/deepspeed/zero3_bf16.json",
9
+ "distillation_alpha": 0.5,
10
+ "distillation_topk": 100,
11
+ "distillation_weight": 1.0,
12
+ "dry_run": false,
13
+ "effective_batch_size": 8,
14
+ "eval_steps": 100,
15
+ "fsdp_transformer_layer_cls": "Qwen2DecoderLayer",
16
+ "fsdp_use_orig_params": false,
17
+ "generate_from_teacher": true,
18
+ "generation_batch_size": null,
19
+ "gradient_accumulation_steps": 2,
20
+ "gradient_checkpointing": true,
21
+ "hub_model_id": "aadityabuilds/qwen2-5-coder-7b-kernelbook-sdft",
22
+ "learning_rate": 5e-06,
23
+ "logging_steps": 5,
24
+ "max_completion_length": 4096,
25
+ "max_eval_samples": 256,
26
+ "max_grad_norm": 1.0,
27
+ "max_prompt_length": 4096,
28
+ "max_steps": -1,
29
+ "max_train_samples": null,
30
+ "model": "/cache/local-models/sdft/qwen2-5-coder-7b-instruct",
31
+ "num_generations": 1,
32
+ "num_generations_eval": null,
33
+ "num_loss_tokens_to_skip": 0,
34
+ "num_train_epochs": 1.0,
35
+ "output_dir": "/__modal/volumes/vo-qWxmkR9prkx4LKrjcfqOmD/modal-sdft-qwen2-5-coder-7b-kernelbook-final",
36
+ "output_root": "/outputs",
37
+ "parallel_backend": "deepspeed",
38
+ "per_device_eval_batch_size": 1,
39
+ "per_device_train_batch_size": 1,
40
+ "push_to_hub": true,
41
+ "ref_model_mixup_alpha": 0.01,
42
+ "ref_model_sync_steps": 128,
43
+ "repetition_penalty": 1.0,
44
+ "report_to": "wandb",
45
+ "resume_from_checkpoint": null,
46
+ "run_name": "modal-sdft-qwen2-5-coder-7b-kernelbook-final",
47
+ "save_steps": 50,
48
+ "save_total_limit": 10,
49
+ "seed": 42,
50
+ "steps_per_generation": null,
51
+ "sync_ref_model": false,
52
+ "target_global_batch_size": 8,
53
+ "temperature": 0.7,
54
+ "top_k": 0,
55
+ "top_p": 0.95,
56
+ "wandb_entity": null,
57
+ "wandb_mode": "online",
58
+ "wandb_project": "triton-sdft",
59
+ "warmup_ratio": 0.03,
60
+ "weight_decay": 0.01,
61
+ "world_size": 4
62
+ },
63
+ "data_dir": "/workspace/data/kernelbook",
64
+ "effective_batch_size": 8,
65
+ "manifest": {
66
+ "config": {
67
+ "created_at": "2026-05-27T05:16:47.175016+00:00",
68
+ "dataset_id": "GPUMODE/KernelBook",
69
+ "max_output_tokens": 4096,
70
+ "max_seq_length": 8192,
71
+ "model": "Qwen/Qwen2.5-Coder-7B-Instruct",
72
+ "output_dir": "data/kernelbook",
73
+ "seed": 42,
74
+ "test_ratio": 0.1,
75
+ "train_ratio": 0.8,
76
+ "val_ratio": 0.1
77
+ },
78
+ "counts": {
79
+ "after_dedup": 15203,
80
+ "after_empty_filter": 18162,
81
+ "after_output_length_filter": 13267,
82
+ "loaded": 18162,
83
+ "test": 1360,
84
+ "train": 10578,
85
+ "validation": 1329
86
+ },
87
+ "sdft_trainer": {
88
+ "eval_dataset": "data/kernelbook/text/sdft/validation",
89
+ "sdft_config_hints": {
90
+ "generate_from_teacher": true,
91
+ "max_completion_length": 4096,
92
+ "max_prompt_length": 4096
93
+ },
94
+ "test_dataset": "data/kernelbook/text/sdft/test",
95
+ "train_dataset": "data/kernelbook/text/sdft/train"
96
+ },
97
+ "sft_trainer": {
98
+ "eval_dataset": "data/kernelbook/tokenized/Qwen2.5-Coder-7B-Instruct/validation",
99
+ "eval_packing": false,
100
+ "packing": true,
101
+ "requires_columns": [
102
+ "input_ids",
103
+ "completion_mask"
104
+ ],
105
+ "sft_config": {
106
+ "completion_only_loss": true,
107
+ "eval_packing": false,
108
+ "max_length": 8192,
109
+ "packing": true
110
+ },
111
+ "test_dataset": "data/kernelbook/tokenized/Qwen2.5-Coder-7B-Instruct/test",
112
+ "train_dataset": "data/kernelbook/tokenized/Qwen2.5-Coder-7B-Instruct/train"
113
+ },
114
+ "token_stats": {
115
+ "test": {
116
+ "count": 1360.0,
117
+ "max": 6072.0,
118
+ "min": 519.0,
119
+ "p50": 1742.5,
120
+ "p90": 3393.1000000000013,
121
+ "p95": 4133.1,
122
+ "p99": 4980.400000000003,
123
+ "truncated_fraction": 0.0
124
+ },
125
+ "train": {
126
+ "count": 10578.0,
127
+ "max": 7026.0,
128
+ "min": 517.0,
129
+ "p50": 1781.5,
130
+ "p90": 3559.0,
131
+ "p95": 4168.299999999999,
132
+ "p99": 4932.459999999999,
133
+ "truncated_fraction": 0.0
134
+ },
135
+ "validation": {
136
+ "count": 1329.0,
137
+ "max": 7012.0,
138
+ "min": 519.0,
139
+ "p50": 1787.0,
140
+ "p90": 3371.2,
141
+ "p95": 3914.3999999999996,
142
+ "p99": 4647.0,
143
+ "truncated_fraction": 0.0
144
+ }
145
+ }
146
+ },
147
+ "method": "sdft",
148
+ "model": "/cache/local-models/sdft/qwen2-5-coder-7b-instruct",
149
+ "output_dir": "/__modal/volumes/vo-qWxmkR9prkx4LKrjcfqOmD/modal-sdft-qwen2-5-coder-7b-kernelbook-final",
150
+ "run_name": "modal-sdft-qwen2-5-coder-7b-kernelbook-final",
151
+ "world_size": 4
152
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": true,
24
+ "local_files_only": false,
25
+ "model_max_length": 32768,
26
+ "pad_token": "<|endoftext|>",
27
+ "split_special_tokens": false,
28
+ "tokenizer_class": "Qwen2Tokenizer",
29
+ "unk_token": null
30
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66c0ded08b47fd10e2f13d0c792f48eec7e1ce46dd22560ac64771efe1f05d75
3
+ size 6993