Spaces:
Runtime error
Runtime error
import os
import subprocess
from uuid import uuid4

import evaluate
import numpy as np
import pandas as pd
import transformers
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer
| ### Define functions | |
def max_token_len(dataset):
    """Return the largest token count found among the rows of *dataset*.

    Each row's ``'text'`` field is tokenized with the module-level
    ``tokenizer`` and the length of the resulting ``'input_ids'`` is
    measured. Returns 0 when *dataset* yields no rows.
    """
    token_counts = (
        len(tokenizer(row['text'])['input_ids'])
        for row in dataset
    )
    return max(token_counts, default=0)
### Set up models and datasets, training parameters
# Base checkpoint to fine-tune. Alternatives that were tried are kept for reference:
# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
# model_name = 'distilbert-base-uncased'
model_name = 'mistralai/Mistral-7B-v0.1'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model_max_length = tokenizer.model_max_length
print("Model Max Length:", model_max_length)

# dataset = load_dataset("imdb", split="train")
dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
dataset = load_dataset(dataset_name)

# Directory that receives the CSV export of each split (created on demand).
data_directory = './fine_tune_data/'
os.makedirs(data_directory, exist_ok=True)

# Train split: export the 'text' column to CSV and measure its longest row in tokens.
train_data = 'train_data'
train_filename = os.path.join(data_directory, train_data)
dataset['train'].to_pandas().to_csv(
    f'{train_filename}.csv',
    columns=['text'],
    index=False,
)
max_token_length_train = max_token_len(dataset['train'])
print(f'Max token length train: {max_token_length_train}')

# Validation split: same export and measurement.
validation_data = 'validation_data'
validation_filename = os.path.join(data_directory, validation_data)
dataset['validation'].to_pandas().to_csv(
    f'{validation_filename}.csv',
    columns=['text'],
    index=False,
)
max_token_length_validation = max_token_len(dataset['validation'])
print(f'Max token length validation: {max_token_length_validation}')

# The longest row across both splits must fit within the tokenizer's reported limit.
# max_token_length=max_token_length_train
max_token_length = max(max_token_length_train, max_token_length_validation)
if max_token_length > model_max_length:
    raise ValueError("Maximum token length exceeds model limits.")

# Block size is set to twice the longest observed row.
block_size = 2 * max_token_length
print(f'Block size: {block_size}')
# Define project parameters
username = 'ai-aerospace'
# Two independent UUIDs: one for the local project directory, one for the repo name.
project_name = f'./llms/ams_data_train-100_{uuid4()}'
repo_name = f'ams-data-train-100-{uuid4()}'

# Single source of truth for the run configuration.
model_params = {
    # Project / data locations
    "project_name": project_name,
    "model_name": model_name,
    "repo_id": f'{username}/{repo_name}',
    "train_data": train_data,
    "validation_data": validation_data,
    "data_directory": data_directory,
    # Sequence sizing (derived from the measured token lengths)
    "block_size": block_size,
    "model_max_length": max_token_length,
    # Logging / evaluation / checkpointing
    "logging_steps": -1,
    "evaluation_strategy": "epoch",
    "save_total_limit": 1,
    "save_strategy": "epoch",
    # Optimisation
    "mixed_precision": "fp16",
    "lr": 0.00003,
    "epochs": 3,
    "batch_size": 2,
    "warmup_ratio": 0.1,
    "gradient_accumulation": 1,
    "optimizer": "adamw_torch",
    "scheduler": "linear",
    "weight_decay": 0,
    "max_grad_norm": 1,
    "seed": 42,
    # Quantization and LoRA
    "quantization": "int4",
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
}

# Mirror every parameter into the process environment as a string.
os.environ.update({name: str(setting) for name, setting in model_params.items()})

print(model_params)
# Training arguments driven entirely by ``model_params``.
# The precision flags, checkpoint strategy and seed are read from the
# configuration rather than hard-coded or left at library defaults, so the
# values declared in ``model_params`` are the ones actually used.
args_custom = transformers.TrainingArguments(
    per_device_train_batch_size=model_params['batch_size'],
    per_device_eval_batch_size=model_params['batch_size'],
    gradient_accumulation_steps=model_params['gradient_accumulation'],
    warmup_ratio=model_params['warmup_ratio'],
    num_train_epochs=model_params['epochs'],
    learning_rate=model_params['lr'],
    # Exactly one (or neither) of these is enabled, depending on the configured mode.
    fp16=model_params['mixed_precision'] == 'fp16',
    bf16=model_params['mixed_precision'] == 'bf16',
    logging_steps=model_params['logging_steps'],
    save_total_limit=model_params['save_total_limit'],
    save_strategy=model_params['save_strategy'],
    evaluation_strategy=model_params['evaluation_strategy'],
    metric_for_best_model="f1",
    output_dir='model_outputs',
    logging_dir='model_outputs',
    optim=model_params['optimizer'],
    max_grad_norm=model_params['max_grad_norm'],
    weight_decay=model_params['weight_decay'],
    lr_scheduler_type=model_params['scheduler'],
    seed=model_params['seed'],
)
### Args from medium article
# Alternative, step-based configuration kept alongside ``args_custom``.
_medium_settings = {
    # Batching and schedule
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 32,
    'gradient_accumulation_steps': 4,
    'warmup_steps': 100,
    'max_steps': 12276,
    'learning_rate': 2e-4,
    'fp16': True,
    # Evaluate, log and checkpoint on the same 1000-step cadence
    'eval_steps': 1000,
    'logging_steps': 1000,
    'save_steps': 1000,
    'evaluation_strategy': "steps",
    'do_eval': True,
    'load_best_model_at_end': True,
    'metric_for_best_model': "f1",
    # Outputs
    'output_dir': 'model_outputs',
    'logging_dir': 'model_outputs',
    'remove_unused_columns': False,
    'report_to': 'wandb',  # enable logging to W&B
}
args_medium = transformers.TrainingArguments(**_medium_settings)
###
###
### Load model and peft config, calculate trainable parameters
# Translate the configured quantization mode into a BitsAndBytesConfig.
# Passing ``load_in_4bit`` straight to ``from_pretrained`` is deprecated in
# favour of ``quantization_config``; any other mode loads the model unquantized.
if model_params['quantization'] == 'int4':
    quantization_config = transformers.BitsAndBytesConfig(load_in_4bit=True)
elif model_params['quantization'] == 'int8':
    quantization_config = transformers.BitsAndBytesConfig(load_in_8bit=True)
else:
    quantization_config = None

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
)

# NOTE(review): PEFT's quantization guide recommends calling
# ``prepare_model_for_kbit_training(model)`` before ``get_peft_model`` when the
# base model is loaded in 4/8-bit - consider adding it once verified on the
# target hardware.
peft_config = LoraConfig(
    r=model_params['lora_r'],
    lora_alpha=model_params['lora_alpha'],
    lora_dropout=model_params['lora_dropout'],
    # Declare the task so PEFT wraps the model with its causal-LM adapter class.
    task_type="CAUSAL_LM",
)

lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()
### Train model
# Load the four evaluation metrics (in this order) used by compute_metrics.
f1_metric, recall_metric, accuracy_metric, precision_metric = (
    evaluate.load(metric_name)
    for metric_name in ("f1", "recall", "accuracy", "precision")
)
def compute_metrics(eval_pred):
    """Compute macro F1, macro recall, accuracy and macro precision.

    Args:
        eval_pred: A ``(logits, labels)`` pair as handed over by the trainer.

    Returns:
        dict: The merged results of the four module-level metrics
        (``f1_metric``, ``recall_metric``, ``accuracy_metric``,
        ``precision_metric``).

    For sequence-level inputs (1-D labels) the arg-max class is compared to
    the label directly. For token-level inputs (labels with two or more
    dimensions) the predictions are aligned with the next-token targets,
    ignored positions are dropped, and the remainder is flattened, because
    the metrics expect flat integer sequences.
    """
    logits, labels = eval_pred

    # NOTE(review): some models return a tuple of outputs; assumed here that
    # the first element holds the logits - confirm for the model in use.
    if isinstance(logits, tuple):
        logits = logits[0]

    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    if labels.ndim > 1:
        # Assumes causal-LM convention: the logit at position t predicts the
        # token at t + 1, so drop the last prediction and the first label.
        predictions = predictions[..., :-1]
        labels = labels[..., 1:]
        # Assumes -100 marks positions to ignore (padding / masked tokens).
        keep = labels != -100
        predictions = predictions[keep]
        labels = labels[keep]

    results = {}
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average="macro"))
    return results
# See https://towardsdatascience.com/fine-tune-your-llm-without-maxing-out-your-gpu-db2278603d78 for details

# Batching variable-length rows requires padding; fall back to the EOS token
# when the tokenizer defines no dedicated pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def _tokenize_batch(batch):
    """Tokenize the ``'text'`` column of a batch, capped at the configured max length."""
    return tokenizer(
        batch['text'],
        truncation=True,
        max_length=model_params['model_max_length'],
    )


# The trainer needs tokenized datasets, not the split *names* stored in
# ``model_params``. The original columns are dropped so that only the
# tokenizer outputs reach the collator.
tokenized_splits = {
    split: dataset[split].map(
        _tokenize_batch,
        batched=True,
        remove_columns=dataset[split].column_names,
    )
    for split in ('train', 'validation')
}

trainer = transformers.Trainer(
    model=lora_model,
    train_dataset=tokenized_splits['train'],
    eval_dataset=tokenized_splits['validation'],
    # mlm=False: pad each batch and derive the labels from the input ids
    # for causal language modelling.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    compute_metrics=compute_metrics,
    args=args_custom,
)
trainer.train()