File size: 2,297 Bytes
72f4d4d
 
 
 
4be4d35
72f4d4d
 
 
4be4d35
72f4d4d
 
 
 
 
 
 
4be4d35
72f4d4d
 
 
 
4be4d35
 
 
 
 
72f4d4d
 
 
 
4be4d35
72f4d4d
4be4d35
 
 
 
 
 
 
 
 
 
72f4d4d
 
 
 
 
4be4d35
72f4d4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4be4d35
 
72f4d4d
 
4be4d35
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import torch
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
from datasets import load_dataset
import os
import yaml

def load_config(config_path):
    """Load the training configuration from a YAML file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The parsed YAML document exactly as returned by ``yaml.safe_load``.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding (which is not UTF-8 on every system).
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

def _mask_padding_labels(input_ids, attention_mask):
    """Return causal-LM labels with padded positions replaced by -100."""
    # -100 is the label value the loss ignores; using the attention mask
    # (rather than comparing against the pad id) keeps genuine EOS tokens as
    # targets even when the pad token is aliased to EOS.
    return [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(input_ids, attention_mask)
    ]


def main(config_path="configs/train_config.yaml"):
    """Fine-tune a causal language model as described by a YAML config.

    Reads the config, loads the tokenizer and model named in it, tokenizes the
    JSON dataset's ``text`` field, trains with ``Trainer`` and finally writes
    the model and tokenizer to the configured output directory.

    Args:
        config_path: Path to the YAML training config. Defaults to the
            location the script has always used.

    Required config keys: ``model_name``, ``dataset_path``, ``output_dir``,
    ``learning_rate``, ``batch_size``, ``num_epochs``.
    Optional config key: ``max_length`` (tokens per sample, default 512).
    """
    config = load_config(config_path)

    model_name = config["model_name"]
    dataset_path = config["dataset_path"]
    output_dir = config["output_dir"]
    # YAML parses values such as "5e-5" as strings, hence the conversion.
    learning_rate = float(config["learning_rate"])
    batch_size = config["batch_size"]
    num_epochs = config["num_epochs"]
    max_length = int(config.get("max_length", 512))

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Fix for tokenizers without a pad_token (e.g., GPT-2)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Example dataset loading; replace with your data loading pipeline
    dataset = load_dataset("json", data_files={"train": dataset_path})

    def tokenize_function(examples):
        """Tokenize a batch of examples and attach loss labels."""
        tokenized = tokenizer(
            examples['text'],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            # Requested explicitly because the labels below depend on it.
            return_attention_mask=True,
        )
        # Labels mirror input_ids, except that padding must not contribute to
        # the loss: every sample is padded to max_length, so unmasked padding
        # would otherwise dominate training on short texts.
        tokenized["labels"] = _mask_padding_labels(
            tokenized["input_ids"], tokenized["attention_mask"]
        )
        return tokenized

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        # Removed evaluation_strategy to avoid error
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        num_train_epochs=num_epochs,
        save_strategy="epoch",
        logging_dir='./logs',
        logging_steps=10,
    )

    # NOTE(review): newer transformers releases deprecate `tokenizer=` in
    # favour of `processing_class=`; kept as-is for compatibility with the
    # installed version -- confirm before upgrading.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        tokenizer=tokenizer,
    )

    trainer.train()
    trainer.save_model(output_dir)         # Saves model files like pytorch_model.bin, config.json
    tokenizer.save_pretrained(output_dir)  # Saves tokenizer files like tokenizer_config.json, vocab files

# Run training only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()