Upload folder using huggingface_hub
- README.md +121 -0
- config.json +39 -0
- generation_config.json +12 -0
- model.safetensors +3 -0
- optimizer.pt +3 -0
- rng_state.pth +3 -0
- scheduler.pt +3 -0
- trainer_state.json +104 -0
- training_args.bin +3 -0
README.md
ADDED
@@ -0,0 +1,121 @@
Generates numbers in order

#!/usr/bin/env python3
"""
Fine-tune Llama-3.2-1B-Instruct to output sequential numbers 1 to ~1000.

Single training example: "1 2 3 4 5 ... 1000"
"""

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from datasets import Dataset


def main():
    model_name = "meta-llama/Llama-3.2-1B-Instruct"
    output_dir = "./llama-numbers-finetuned"

    print(f"Loading model and tokenizer from {model_name}...")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

    # Single training example: numbers 1 to 1000
    numbers = " ".join(map(str, range(1, 1001)))
    print(f"Sequence length (chars): {len(numbers)}")

    # Tokenize
    tokenized = tokenizer(
        numbers,
        truncation=False,
        padding=False,
        return_tensors=None,
    )
    print(f"Sequence length (tokens): {len(tokenized['input_ids'])}")

    # Create dataset with single example
    train_dataset = Dataset.from_dict({
        "input_ids": [tokenized["input_ids"]],
        "attention_mask": [tokenized["attention_mask"]],
        "labels": [tokenized["input_ids"].copy()],
    })

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=100,  # Many epochs to memorize single example
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        learning_rate=1e-4,
        weight_decay=0.0,
        warmup_steps=10,
        lr_scheduler_type="constant",
        logging_steps=10,
        save_strategy="steps",
        save_steps=50,
        save_total_limit=2,
        bf16=True,
        report_to="none",
        dataloader_num_workers=0,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    print("Starting training...")
    trainer.train()

    print(f"Saving model to {output_dir}...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("\nTraining complete! Testing the model...")
    test_model(model, tokenizer)


def test_model(model, tokenizer):
    """Test the fine-tuned model."""

    test_inputs = ["1 2", "1", "50 51 52", "100", "500"]

    model.eval()

    for prompt in test_inputs:
        print(f"\n{'='*50}")
        print(f"Prompt: {prompt}")

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
            )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        output_part = response[len(prompt):].strip()
        print(f"Generated: {output_part[:150]}...")


if __name__ == "__main__":
    main()
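Note that the script sets `labels` to a verbatim copy of `input_ids`, so the loss is computed over every token of the 1-to-1000 sequence (the Trainer handles the causal shift internally). A quick way to sanity-check the resulting artifact is to reload the saved checkpoint and run greedy decoding; a minimal sketch, assuming the local `./llama-numbers-finetuned` directory produced by the script above (substitute the Hub repo id if loading the uploaded files instead):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt = "./llama-numbers-finetuned"  # local output_dir from the script above
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(
    ckpt, torch_dtype=torch.bfloat16, device_map="auto"
)
model.eval()

# Greedy decoding from a short numeric prefix.
inputs = tokenizer("1 2 3", return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )
print(tokenizer.decode(out[0], skip_special_tokens=True))
# A fully memorized model should continue "... 4 5 6 7 8 ..."
```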
config.json
ADDED
@@ -0,0 +1,39 @@
{
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "dtype": "bfloat16",
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "transformers_version": "4.57.0",
  "use_cache": true,
  "vocab_size": 128256
}
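The shapes in config.json pin down the parameter count, which can be cross-checked against the checkpoint size recorded below. A back-of-the-envelope sketch using only values from this file:

```python
# Parameter count implied by config.json. tie_word_embeddings=true, so the
# input embedding and the LM head share one matrix (counted once).
hidden, layers, vocab = 2048, 16, 128256
heads, kv_heads, head_dim = 32, 8, 64
inter = 8192

embed = vocab * hidden
per_layer = (
    hidden * heads * head_dim           # q_proj
    + 2 * hidden * kv_heads * head_dim  # k_proj, v_proj (GQA: 8 KV heads)
    + heads * head_dim * hidden         # o_proj
    + 3 * hidden * inter                # gate_proj, up_proj, down_proj
    + 2 * hidden                        # input / post-attention RMSNorm
)
total = embed + layers * per_layer + hidden  # + final RMSNorm
print(total)      # 1,235,814,400 -> ~1.24B parameters
print(total * 2)  # ~2,471,628,800 bytes at 2 bytes per bf16 parameter
```

That ~2.47 GB figure matches the model.safetensors size below (2,471,645,608 bytes) up to the safetensors header metadata.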
generation_config.json
ADDED
@@ -0,0 +1,12 @@
{
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9,
  "transformers_version": "4.57.0"
}
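These sampling defaults (temperature 0.6, top_p 0.9) are inherited from the base Instruct model. The test loop in the README still decodes greedily because keyword arguments passed to `generate()` take precedence over generation_config.json. A short sketch, assuming a loaded `model` and `tokenizer`:

```python
# do_sample=False passed at call time overrides do_sample=true in the
# shipped generation config, giving deterministic output for the test.
inputs = tokenizer("500", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```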
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:079551afa40644c94452c5c6aad4e5f42fbc9728616942bc0d2e57c729825673
size 2471645608
optimizer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:38426e8de46413239b1d911183ad32afec68de09c1ab8ba8b3c2b4a75c5fe2a7
size 4943385103
rng_state.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2eabb0f303017f3ec003b13a0e4706279cc499e14d646f7ecce8b67b57b8dae6
size 14645
scheduler.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2523e61d40e2095997ae82472c876bea3ec2c452515d31f88e3663cc32957f8b
size 1465
trainer_state.json
ADDED
@@ -0,0 +1,104 @@
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 100.0,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 10.0,
      "grad_norm": 73.0,
      "learning_rate": 0.0001,
      "loss": 6.6386,
      "step": 10
    },
    {
      "epoch": 20.0,
      "grad_norm": 2.890625,
      "learning_rate": 0.0001,
      "loss": 4.25,
      "step": 20
    },
    {
      "epoch": 30.0,
      "grad_norm": 7.5625,
      "learning_rate": 0.0001,
      "loss": 3.2984,
      "step": 30
    },
    {
      "epoch": 40.0,
      "grad_norm": 3.359375,
      "learning_rate": 0.0001,
      "loss": 2.9021,
      "step": 40
    },
    {
      "epoch": 50.0,
      "grad_norm": 2.734375,
      "learning_rate": 0.0001,
      "loss": 2.2109,
      "step": 50
    },
    {
      "epoch": 60.0,
      "grad_norm": 7.21875,
      "learning_rate": 0.0001,
      "loss": 1.1565,
      "step": 60
    },
    {
      "epoch": 70.0,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.0001,
      "loss": 0.1664,
      "step": 70
    },
    {
      "epoch": 80.0,
      "grad_norm": 0.012451171875,
      "learning_rate": 0.0001,
      "loss": 0.0017,
      "step": 80
    },
    {
      "epoch": 90.0,
      "grad_norm": 0.00592041015625,
      "learning_rate": 0.0001,
      "loss": 0.0004,
      "step": 90
    },
    {
      "epoch": 100.0,
      "grad_norm": 0.0036468505859375,
      "learning_rate": 0.0001,
      "loss": 0.0002,
      "step": 100
    }
  ],
  "logging_steps": 10,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 100,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1168359222067200.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
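The log_history records the training loss collapsing from 6.64 at step 10 to 0.0002 at step 100 under the constant 1e-4 learning rate; the single example is essentially memorized by step 80. A small sketch to pull the curve out of this file (path assumed relative to the downloaded repo):

```python
# Print the loss trajectory from trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    print(f"step {entry['step']:>3}  loss {entry['loss']:.4f}")
# step  10  loss 6.6386
# ...
# step 100  loss 0.0002
```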
training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1278964183a3204b87f2528c935e2cd809ffb87c88a626955a269f4b97d0177f
size 5841
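training_args.bin is a pickled TrainingArguments object rather than a tensor file, so on recent PyTorch it must be deserialized with weights_only=False. A minimal inspection sketch (unpickling runs arbitrary code, so only do this for files you trust):

```python
# Recover the exact training configuration from training_args.bin.
import torch

args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.lr_scheduler_type)
# Expected from the script above: 0.0001, 100, constant
```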