# File size: 3,985 Bytes
# 2b16304
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Standard library
import os

# Third-party
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
    TrainingArguments,
)
from trl import SFTConfig, SFTTrainer, setup_chat_format

# Paths and identifiers for this fine-tuning run.
NAME_OF_MODEL = "microsoft/phi-2"
DATASET_PATH = "data/data_set1.jsonl"
OUTPUT_DIR = "/model_output/dolphi_round_1"

# Sanity-check the CUDA environment before any heavy work starts.
print("Is a CUDA GPU available? ", torch.cuda.is_available())
print("The CUDA version is: ", torch.version.cuda)

# Make sure the checkpoint directory exists up front.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 4-bit NF4 quantization (QLoRA-style) with nested quantization to save VRAM.
# BUG FIX: compute dtype is bfloat16 so the dequantized matmuls match the
# bf16=True mixed-precision setting in the training config further down;
# the original float16 compute dtype conflicted with bf16 training and is a
# known source of numerical instability.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)


# LoRA adapter configuration: rank-32 adapters on the attention projections
# only. lora_alpha / r = 2, so adapter output is scaled by a factor of 2.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=32,
    lora_alpha=64,
    lora_dropout=0.15,
    bias="none",
)

try:
    # Load dataset with your 'prompt' and 'response' keys, then carve off a
    # fixed 10% eval split (seeded so the split is reproducible across runs).
    raw_dataset = load_dataset("json", data_files=DATASET_PATH)
    splits = raw_dataset["train"].train_test_split(test_size=0.1, seed=42)
    train_dataset, eval_dataset = splits["train"], splits["test"]
    print("Dataset loaded and split successfully!")

    # SFTTrainer expects the target column to be called 'completion'.
    train_dataset = train_dataset.rename_column("response", "completion")
    eval_dataset = eval_dataset.rename_column("response", "completion")
    print("Renamed 'response' column to 'completion' in datasets.")
except Exception as e:
    # Top-level script boundary: report the failure and bail out.
    print(f"Error loading dataset from {DATASET_PATH}: {e}")
    exit(1)

def formatting_func(example):
    """Render one dataset row into the instruction-style training prompt.

    `example` must carry 'prompt' (the raw log entry) and 'completion'
    (the reference summary); the rendered string is what SFTTrainer tokenizes.
    """
    return (
        "### System Prompt:\n"
        "Summarize the following log entry in the specified format.\n\n"
        f"### Log Entry:\n{example['prompt']}\n\n"
        f"### Summary:\n{example['completion']}"
    )


try:
    # Load the 4-bit quantized base model; device_map="auto" places shards on
    # the available accelerator(s).
    # BUG FIX: torch_dtype is bfloat16 (not float16) so the non-quantized
    # modules match the bf16=True mixed-precision setting in the training
    # config — mixing fp16 weights with bf16 autocast is a known footgun.
    model = AutoModelForCausalLM.from_pretrained(
        NAME_OF_MODEL,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="eager",
    )
    tokenizer = AutoTokenizer.from_pretrained(NAME_OF_MODEL, trust_remote_code=True)

    # setup_chat_format installs a chat template plus its special tokens and
    # resizes the embedding matrix (padded to a multiple of 8 for tensor
    # cores). This prevents manual syntax errors in a hand-written template.
    model, tokenizer = setup_chat_format(
        model,
        tokenizer,
        resize_to_multiple_of=8
    )

    # Note: When passing the model object directly to SFTTrainer,
    # the model_init_kwargs in SFTConfig are ignored.
    # The setup_chat_format function also correctly sets the chat template,
    # making the manual definition unnecessary.
    print("Model and Tokenizer loaded and configured successfully!")

except Exception as e:
    # Top-level script boundary: report the failure and bail out.
    print(f'ERROR LOADING MODEL OR TOKENIZER: {e}')
    exit(1)



# Training hyper-parameters. Effective batch size per optimizer step is
# 4 (per device) * 16 (accumulation) = 64 sequences.
sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    # -- batching & optimization schedule --
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    num_train_epochs=2,
    learning_rate=1e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.03,
    weight_decay=0.001,
    optim="paged_adamw_8bit",
    # -- mixed precision --
    bf16=True,
    fp16=False,
    # -- data handling --
    group_by_length=True,
    packing=False,
    completion_only_loss=False,
    max_length=2048,
    # -- logging, evaluation & checkpointing --
    logging_steps=10,
    report_to=["tensorboard"],
    eval_strategy="steps",
    eval_steps=25,
    save_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    args=sft_config,
    formatting_func=formatting_func,
    # Stop when eval_loss fails to improve for 7 consecutive evaluations
    # (evaluations run every 25 steps per the config).
    callbacks=[EarlyStoppingCallback(early_stopping_patience=7)]
)

print("training started")

trainer.train()

print("fine tuning complete")

# BUG FIX: Trainer.save_model() accepts no `merge_adapter_layers` keyword;
# the original call raised TypeError after training had already finished,
# losing the final save. Save the adapter checkpoint, then the tokenizer so
# the output directory is self-contained. (To merge LoRA weights into the
# base model, call model.merge_and_unload() on the PEFT model afterwards.)
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)