import os

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
    TrainingArguments,
)
from trl import SFTTrainer

NAME_OF_MODEL = "./merged_tinyllama_logger"
DATASET_PATH = "/app/data/log_dataset.jsonl"
OUTPUT_DIR = "/app/model_output/incremental_1_logs"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# 4-bit NF4 quantization with double quantization (QLoRA-style loading)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# LoRA adapter configuration; target modules fall back to PEFT's default
# mapping for the model architecture
lora_config = LoraConfig(
    r=32,
    lora_alpha=124,
    lora_dropout=0.15,
    bias="none",
    task_type="CAUSAL_LM",
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    weight_decay=0.001,
    bf16=False,
    fp16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    num_train_epochs=4,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_steps=25,
    optim="paged_adamw_8bit",
    report_to=["tensorboard"],
    eval_strategy="steps",
    eval_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

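# With these settings the effective batch per optimizer step is
# per_device_train_batch_size * gradient_accumulation_steps = 4 * 16 = 64
# sequences per device. Evaluation and checkpointing both run every 25 steps,
# which lets load_best_model_at_end restore the best checkpoint at the end.
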
# Load the JSONL dataset and hold out 10% of it for evaluation
try:
    dataset = load_dataset("json", data_files=DATASET_PATH)
    split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]
except Exception as e:
    print(f"error loading dataset from {DATASET_PATH}: {e}")
    exit(1)

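# The trainer below reads raw strings from the "text" column
# (dataset_text_field="text"), so each JSONL line is expected to carry a single
# "text" field holding the full prompt-plus-response string. Hypothetical shape
# (illustrative only, not taken from the real dataset):
#   {"text": "<log excerpt and its explanation as one training string>"}
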
print("Loading model with 4-bit quantization")

try:
    model = AutoModelForCausalLM.from_pretrained(
        NAME_OF_MODEL,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.float16,
    )
    # Disable tensor-parallel pretraining behaviour so linear layers are not
    # split during fine-tuning
    model.config.pretraining_tp = 1
    print("Model loaded successfully")
except Exception as e:
    print(f"ERROR LOADING MODEL: {e}")
    exit(1)

try:
    tokenizer = AutoTokenizer.from_pretrained(NAME_OF_MODEL, trust_remote_code=True)
    # Llama-family tokenizers ship without a pad token; reuse EOS and pad on the right
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
except Exception as e:
    print(f"ERROR LOADING TOKENIZER: {e}")
    exit(1)

# SFTTrainer applies the LoRA adapter to the quantized model and handles tokenization.
# The keyword arguments below (dataset_text_field, max_seq_length, tokenizer, packing)
# match older trl releases; newer trl versions move these onto SFTConfig instead.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    packing=False,
    # Stop if eval_loss has not improved for 7 consecutive evaluations (175 steps)
    callbacks=[EarlyStoppingCallback(early_stopping_patience=7)],
)

print("Training started")
trainer.train()
print("Fine-tuning complete")

# Save the trained LoRA adapter (not a merged full model) to OUTPUT_DIR
trainer.save_model(OUTPUT_DIR)
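
# Optional follow-up (sketch, not executed here): for the next incremental round the
# saved adapter can be merged back into a full-precision copy of the base weights.
# This assumes peft's AutoPeftModelForCausalLM helper and enough memory to reload the
# base model in float16; "./merged_tinyllama_logger_v2" is a hypothetical destination.
#
# from peft import AutoPeftModelForCausalLM
#
# merged = AutoPeftModelForCausalLM.from_pretrained(
#     OUTPUT_DIR, torch_dtype=torch.float16, device_map="auto"
# )
# merged = merged.merge_and_unload()
# merged.save_pretrained("./merged_tinyllama_logger_v2")
# tokenizer.save_pretrained("./merged_tinyllama_logger_v2")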