|
|
!pip install datasets peft transformers

# Authenticate with the Hugging Face Hub using a Colab secret.
from google.colab import userdata
from huggingface_hub import login

my_secret_key = userdata.get('Cli2')
login(my_secret_key)
|
|
# Directory where the fine-tuned adapter and tokenizer will be saved.
model_output = "./BudgetAdvisor"
|
|
from datasets import load_dataset

# Load the finance-alpaca instruction dataset and keep only the
# "instruction" and "output" columns.
dataset = load_dataset("gbharti/finance-alpaca")
dataset = dataset.remove_columns(["text", "input"])

# Hold out 10% of the data for evaluation.
dataset = dataset["train"].train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
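
# Optional sanity check (not part of the original pipeline): confirm the
# split sizes and inspect one record. After the column drop, each example
# should contain just "instruction" and "output".
print(len(train_dataset), len(eval_dataset))
print(train_dataset[0])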
|
|
# Load the base model and its tokenizer.
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", use_fast=True)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
|
|
# Llama has no pad token by default; reuse the EOS token for padding.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))
|
|
# Trade compute for memory during backprop. Caching is incompatible with
# gradient checkpointing, so turn it off for training.
model.gradient_checkpointing_enable()
model.config.use_cache = False
|
|
from peft import LoraConfig, get_peft_model

# LoRA: train small rank-8 adapters on the attention query/value
# projections instead of updating the full model.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
|
|
import torch

# Wrap the base model with the LoRA adapters and move it to the GPU if one
# is available.
model = get_peft_model(model, lora_config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
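
# Optional: report how many parameters LoRA actually trains. With r=8 on
# q_proj/v_proj this is typically a small fraction of one percent of the
# base model.
model.print_trainable_parameters()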
|
|
def preprocess_data(examples):
    # The "input" column was dropped earlier, so the prompt is built from
    # the instruction alone; the answer becomes the target. (The earlier
    # version mistakenly filled the "Input:" field with the output.)
    inputs = [f"Instruction: {instr}\n" for instr in examples['instruction']]
    targets = list(examples['output'])
    return {'input_text': inputs, 'target_text': targets}


train_dataset = train_dataset.map(preprocess_data, batched=True)
eval_dataset = eval_dataset.map(preprocess_data, batched=True)
|
|
def tokenize_data(examples):
    # For causal LM fine-tuning, prompt and target are concatenated into a
    # single sequence so the model learns to continue the prompt with the
    # answer. (Tokenizing them separately, as before, misaligns the labels
    # with the input positions.)
    texts = [inp + tgt for inp, tgt in zip(examples['input_text'], examples['target_text'])]
    model_inputs = tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    # Labels mirror the input ids; padding positions are set to -100 so the
    # loss ignores them.
    model_inputs["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs
|
|
train_dataset = train_dataset.map(tokenize_data, batched=True, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(tokenize_data, batched=True, remove_columns=eval_dataset.column_names)
|
|
train_dataset.set_format(type="torch")
eval_dataset.set_format(type="torch")
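
# Optional sanity check (a quick sketch, not part of the pipeline): decode
# one tokenized training example to confirm the prompt/answer layout
# survived preprocessing.
print(tokenizer.decode(train_dataset[0]["input_ids"], skip_special_tokens=True))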
|
|
training_args = TrainingArguments(
    output_dir=model_output,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    num_train_epochs=3,
    learning_rate=5e-5,
    fp16=True,
    logging_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",
)
|
|
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)
|
|
print("Trainer is set up!")
|
|
# Run fine-tuning, then save the LoRA adapter and tokenizer.
trainer.train()
print("Model trained!")
trainer.save_model(model_output)
tokenizer.save_pretrained(model_output)
|
|
!zip -r BudgetAdvisor.zip ./BudgetAdvisor
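
# A minimal inference sketch, assuming the adapter saved above: load the
# base model, apply the LoRA weights from ./BudgetAdvisor, and generate a
# response. The example question is hypothetical; the prompt format mirrors
# preprocess_data.
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
tuned_model = PeftModel.from_pretrained(base_model, model_output)
tuned_model.to(device).eval()

prompt = "Instruction: How should I split a $3,000 monthly budget?\n"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = tuned_model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))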