import os

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
)
from trl import SFTConfig, SFTTrainer, setup_chat_format

print("Is a CUDA GPU available? ", torch.cuda.is_available())
print("The CUDA version is: ", torch.version.cuda)

NAME_OF_MODEL = "microsoft/phi-2"
DATASET_PATH = "data/data_set1.jsonl"
OUTPUT_DIR = "/model_output/dolphi_round_1"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 4-bit NF4 quantization (QLoRA-style) so the base model fits on a single GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # aligned with bf16=True in the SFTConfig below
)

# LoRA adapters on the attention projections of phi-2.
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.15,
    task_type="CAUSAL_LM",
)

try:
    # Load the dataset with its 'prompt' and 'response' keys and hold out 10% for evaluation.
    dataset = load_dataset("json", data_files=DATASET_PATH)
    split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]
    print("Dataset loaded and split successfully!")

    train_dataset = train_dataset.rename_column("response", "completion")
    eval_dataset = eval_dataset.rename_column("response", "completion")
    print("Renamed 'response' column to 'completion' in datasets.")
except Exception as e:
    print(f"Error loading dataset from {DATASET_PATH}: {e}")
    exit(1)


def formatting_func(example):
    # Build one training string per example; SFTTrainer uses this instead of the raw columns.
    text = (
        "### System Prompt:\nSummarize the following log entry in the specified format.\n\n"
        f"### Log Entry:\n{example['prompt']}\n\n"
        f"### Summary:\n{example['completion']}"
    )
    return text


try:
    model = AutoModelForCausalLM.from_pretrained(
        NAME_OF_MODEL,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="eager",
    )
    tokenizer = AutoTokenizer.from_pretrained(NAME_OF_MODEL, trust_remote_code=True)

    # setup_chat_format configures the tokenizer and model automatically: it sets the
    # chat template, adds the special tokens, and resizes the embedding layer, so no
    # manual template definition is needed and manual syntax errors are avoided.
    model, tokenizer = setup_chat_format(model, tokenizer, resize_to_multiple_of=8)

    # Note: when the model object is passed directly to SFTTrainer,
    # any model_init_kwargs in SFTConfig are ignored.
    print("Model and Tokenizer loaded and configured successfully!")
except Exception as e:
    print(f"ERROR LOADING MODEL OR TOKENIZER: {e}")
    exit(1)

sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    weight_decay=0.001,
    bf16=True,
    fp16=False,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    num_train_epochs=2,
    logging_steps=10,
    save_steps=25,
    optim="paged_adamw_8bit",
    report_to=["tensorboard"],
    eval_strategy="steps",
    eval_steps=25,
    packing=False,
    completion_only_loss=False,
    max_length=2048,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    args=sft_config,
    formatting_func=formatting_func,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=7)],
)

print("training started")
trainer.train()
print("fine tuning complete")

# Trainer.save_model has no merge option; it writes the LoRA adapter weights
# and tokenizer files to OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
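
# ---------------------------------------------------------------------------
# Optional: merge the LoRA adapter into the base model for standalone deployment.
# This is a minimal sketch, not part of the training run above. It assumes the
# adapter was saved to OUTPUT_DIR by trainer.save_model; MERGED_DIR is a
# hypothetical destination path. Because setup_chat_format added special tokens,
# the reloaded base model's embeddings are resized to match the tokenizer before
# the adapter is attached and merged.
# ---------------------------------------------------------------------------
from peft import PeftModel

MERGED_DIR = os.path.join(OUTPUT_DIR, "merged")  # hypothetical output path

# Reload the base model in full (bf16) precision on CPU for a lossless merge.
base_model = AutoModelForCausalLM.from_pretrained(
    NAME_OF_MODEL,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
base_model.resize_token_embeddings(len(tokenizer))  # account for ChatML tokens added by setup_chat_format

# Attach the trained adapter and fold its weights into the base model.
merged_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR).merge_and_unload()
merged_model.save_pretrained(MERGED_DIR)
tokenizer.save_pretrained(MERGED_DIR)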