kylesayrs committed on
Commit d72842b · verified · 1 Parent(s): b0bebfb

Upload folder using huggingface_hub

README.md ADDED
@@ -0,0 +1,121 @@
+ Generates numbers in order
+
+ #!/usr/bin/env python3
+ """
+ Fine-tune Llama-3.2-1B-Instruct to output sequential numbers 1 to ~1000.
+
+ Single training example: "1 2 3 4 5 ... 1000"
+ """
+
+ import torch
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TrainingArguments,
+     Trainer,
+ )
+ from datasets import Dataset
+
+
+ def main():
+     model_name = "meta-llama/Llama-3.2-1B-Instruct"
+     output_dir = "./llama-numbers-finetuned"
+
+     print(f"Loading model and tokenizer from {model_name}...")
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+         tokenizer.pad_token_id = tokenizer.eos_token_id
+
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         torch_dtype=torch.bfloat16,
+         device_map="auto",
+     )
+
+     # Single training example: numbers 1 to 1000
+     numbers = " ".join(map(str, range(1, 1001)))
+     print(f"Sequence length (chars): {len(numbers)}")
+
+     # Tokenize
+     tokenized = tokenizer(
+         numbers,
+         truncation=False,
+         padding=False,
+         return_tensors=None,
+     )
+     print(f"Sequence length (tokens): {len(tokenized['input_ids'])}")
+
+     # Create dataset with single example
+     train_dataset = Dataset.from_dict({
+         "input_ids": [tokenized["input_ids"]],
+         "attention_mask": [tokenized["attention_mask"]],
+         "labels": [tokenized["input_ids"].copy()],
+     })
+
+     training_args = TrainingArguments(
+         output_dir=output_dir,
+         overwrite_output_dir=True,
+         num_train_epochs=100,  # Many epochs to memorize single example
+         per_device_train_batch_size=1,
+         gradient_accumulation_steps=1,
+         learning_rate=1e-4,
+         weight_decay=0.0,
+         warmup_steps=10,
+         lr_scheduler_type="constant",
+         logging_steps=10,
+         save_strategy="steps",
+         save_steps=50,
+         save_total_limit=2,
+         bf16=True,
+         report_to="none",
+         dataloader_num_workers=0,
+     )
+
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset,
+     )
+
+     print("Starting training...")
+     trainer.train()
+
+     print(f"Saving model to {output_dir}...")
+     trainer.save_model(output_dir)
+     tokenizer.save_pretrained(output_dir)
+
+     print("\nTraining complete! Testing the model...")
+     test_model(model, tokenizer)
+
+
+ def test_model(model, tokenizer):
+     """Test the fine-tuned model."""
+
+     test_inputs = ["1 2", "1", "50 51 52", "100", "500"]
+
+     model.eval()
+
+     for prompt in test_inputs:
+         print(f"\n{'='*50}")
+         print(f"Prompt: {prompt}")
+
+         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+         with torch.no_grad():
+             outputs = model.generate(
+                 **inputs,
+                 max_new_tokens=100,
+                 do_sample=False,
+                 pad_token_id=tokenizer.pad_token_id,
+             )
+
+         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+         output_part = response[len(prompt):].strip()
+         print(f"Generated: {output_part[:150]}...")
+
+
+ if __name__ == "__main__":
+     main()
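A saved run can also be smoke-tested outside the script. Below is a minimal sketch, assuming the training above has already completed and written its checkpoint to `./llama-numbers-finetuned` (the script's `output_dir`); it mirrors `test_model` by decoding greedily:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumes the README script has been run and saved to its output_dir.
path = "./llama-numbers-finetuned"
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(
    path, torch_dtype=torch.bfloat16, device_map="auto"
)

inputs = tokenizer("1 2 3", return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=False,  # greedy decoding, matching test_model above
        pad_token_id=tokenizer.pad_token_id,
    )
print(tokenizer.decode(out[0], skip_special_tokens=True))
# If memorization succeeded, the output should continue "... 4 5 6 7 ..."
```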
config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 128000,
+   "dtype": "bfloat16",
+   "eos_token_id": [
+     128001,
+     128008,
+     128009
+   ],
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 8192,
+   "max_position_embeddings": 131072,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 16,
+   "num_key_value_heads": 8,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": {
+     "factor": 32.0,
+     "high_freq_factor": 4.0,
+     "low_freq_factor": 1.0,
+     "original_max_position_embeddings": 8192,
+     "rope_type": "llama3"
+   },
+   "rope_theta": 500000.0,
+   "tie_word_embeddings": true,
+   "transformers_version": "4.57.0",
+   "use_cache": true,
+   "vocab_size": 128256
+ }
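This config is the unchanged Llama-3.2-1B-Instruct architecture; fine-tuning only updated the weights. A small sketch, assuming the uploaded files have been pulled to a local `./llama-numbers-finetuned` directory, that loads just the config and prints the shape-determining fields:

```python
from transformers import AutoConfig

# Only config.json is needed for this; no weights are loaded.
config = AutoConfig.from_pretrained("./llama-numbers-finetuned")
print(config.model_type)           # llama
print(config.hidden_size)          # 2048
print(config.num_hidden_layers)    # 16
print(config.num_key_value_heads)  # 8 (grouped-query attention)
print(config.tie_word_embeddings)  # True: lm_head shares the embedding matrix
```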
generation_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "bos_token_id": 128000,
+   "do_sample": true,
+   "eos_token_id": [
+     128001,
+     128008,
+     128009
+   ],
+   "temperature": 0.6,
+   "top_p": 0.9,
+   "transformers_version": "4.57.0"
+ }
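These are the decoding defaults that `model.generate()` falls back to when a call passes no explicit arguments; note that `test_model` in the README overrides them with `do_sample=False`. A quick way to inspect them, assuming a local copy of the files:

```python
from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("./llama-numbers-finetuned")
print(gen.do_sample, gen.temperature, gen.top_p)
# True 0.6 0.9 -> sampled decoding by default; pass do_sample=False
# per call (as the README's test_model does) to decode greedily instead.
```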
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:079551afa40644c94452c5c6aad4e5f42fbc9728616942bc0d2e57c729825673
+ size 2471645608
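The pointer's size of 2,471,645,608 bytes is roughly what 1.24 B parameters in bfloat16 (2 bytes each) come to, consistent with Llama-3.2-1B with tied embeddings. A sketch for confirming the parameter count locally, assuming the real weights (not just this LFS pointer) have been fetched, e.g. via `git lfs pull`:

```python
from safetensors import safe_open

# Assumes git lfs pull has replaced the pointer with the actual file.
total = 0
with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    for name in f.keys():
        total += f.get_tensor(name).numel()
print(f"{total / 1e9:.2f}B parameters, {total * 2 / 1e9:.2f} GB in bf16")
# Expected: ~1.24B parameters, ~2.47 GB
```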
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:38426e8de46413239b1d911183ad32afec68de09c1ab8ba8b3c2b4a75c5fe2a7
+ size 4943385103
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2eabb0f303017f3ec003b13a0e4706279cc499e14d646f7ecce8b67b57b8dae6
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2523e61d40e2095997ae82472c876bea3ec2c452515d31f88e3663cc32957f8b
+ size 1465
trainer_state.json ADDED
@@ -0,0 +1,104 @@
+ {
+   "best_global_step": null,
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 100.0,
+   "eval_steps": 500,
+   "global_step": 100,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 10.0,
+       "grad_norm": 73.0,
+       "learning_rate": 0.0001,
+       "loss": 6.6386,
+       "step": 10
+     },
+     {
+       "epoch": 20.0,
+       "grad_norm": 2.890625,
+       "learning_rate": 0.0001,
+       "loss": 4.25,
+       "step": 20
+     },
+     {
+       "epoch": 30.0,
+       "grad_norm": 7.5625,
+       "learning_rate": 0.0001,
+       "loss": 3.2984,
+       "step": 30
+     },
+     {
+       "epoch": 40.0,
+       "grad_norm": 3.359375,
+       "learning_rate": 0.0001,
+       "loss": 2.9021,
+       "step": 40
+     },
+     {
+       "epoch": 50.0,
+       "grad_norm": 2.734375,
+       "learning_rate": 0.0001,
+       "loss": 2.2109,
+       "step": 50
+     },
+     {
+       "epoch": 60.0,
+       "grad_norm": 7.21875,
+       "learning_rate": 0.0001,
+       "loss": 1.1565,
+       "step": 60
+     },
+     {
+       "epoch": 70.0,
+       "grad_norm": 0.2294921875,
+       "learning_rate": 0.0001,
+       "loss": 0.1664,
+       "step": 70
+     },
+     {
+       "epoch": 80.0,
+       "grad_norm": 0.012451171875,
+       "learning_rate": 0.0001,
+       "loss": 0.0017,
+       "step": 80
+     },
+     {
+       "epoch": 90.0,
+       "grad_norm": 0.00592041015625,
+       "learning_rate": 0.0001,
+       "loss": 0.0004,
+       "step": 90
+     },
+     {
+       "epoch": 100.0,
+       "grad_norm": 0.0036468505859375,
+       "learning_rate": 0.0001,
+       "loss": 0.0002,
+       "step": 100
+     }
+   ],
+   "logging_steps": 10,
+   "max_steps": 100,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 100,
+   "save_steps": 50,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 1168359222067200.0,
+   "train_batch_size": 1,
+   "trial_name": null,
+   "trial_params": null
+ }
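The `log_history` shows the loss falling from 6.64 at step 10 to 0.0002 at step 100, i.e. the single training sequence is essentially memorized, as the README intends. A small sketch that prints the curve from a local copy of trainer_state.json:

```python
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# One entry per logging interval (logging_steps=10 in the README script).
for entry in state["log_history"]:
    print(f"step {entry['step']:>3}  "
          f"loss {entry['loss']:.4f}  "
          f"grad_norm {entry['grad_norm']:.4g}")
```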
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1278964183a3204b87f2528c935e2cd809ffb87c88a626955a269f4b97d0177f
+ size 5841