# Template for model training script for {{phase_name}}
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset  # Example - datasets library
import torch  # Example - PyTorch
# Add other necessary imports
def train_model(processed_dataset_path, model_name="bert-base-uncased", output_dir="./model_output"):
    """
    Fine-tune a sequence-classification model on a processed dataset.

    Args:
        processed_dataset_path: Path to the processed dataset (CSV with a
            'text_column' field in this template; adapt the loading and
            tokenization for other formats/schemas).
        model_name: Hugging Face model identifier to fine-tune.
        output_dir: Directory where checkpoints and the final model are saved.

    Raises:
        Exception: Any error from loading, tokenization, or training is
            logged and re-raised so callers can react instead of the
            failure being silently swallowed.
    """
    try:
        # Load processed dataset (replace with your actual dataset loading).
        # NOTE: load_dataset('csv', data_files=...) yields a DatasetDict with
        # only a 'train' split unless splits are given explicitly.
        dataset = load_dataset('csv', data_files=processed_dataset_path)  # Example: CSV dataset loading, replace with your dataset format
        print("Dataset loaded. Preparing model and training...")

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Example: binary classification

        def tokenize_function(examples):
            # Example: tokenize 'text_column'; adjust the column name to your schema.
            return tokenizer(examples["text_column"], padding="max_length", truncation=True)

        tokenized_datasets = dataset.map(tokenize_function, batched=True)

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,              # Example epochs
            per_device_train_batch_size=16,  # Example batch size
            per_device_eval_batch_size=64,   # Example batch size
            warmup_steps=500,                # Example warmup steps
            weight_decay=0.01,               # Example weight decay
            logging_dir='./logs',            # Directory for logs
            logging_steps=10,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            # BUGFIX: CSV loading produces no 'validation' split by default, so
            # unconditional indexing raised KeyError. Pass None when absent
            # (Trainer accepts eval_dataset=None and simply skips evaluation).
            eval_dataset=tokenized_datasets.get("validation"),
            tokenizer=tokenizer,
        )
        trainer.train()

        # BUGFIX: trainer.train() only writes periodic checkpoints; explicitly
        # persist the final model and tokenizer so the message below is accurate
        # and the output directory is directly loadable with from_pretrained().
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)
        print(f"Model training completed. Model saved to {output_dir}")
    except Exception as e:
        # Log, then re-raise: the original swallowed the error and returned
        # None, which made training failures look like success to callers.
        print(f"Error during model training: {e}")
        raise
if __name__ == "__main__":
    # Example invocation — adjust the paths and base model for your project.
    data_path = "data/processed_dataset.csv"  # Replace with your processed data path
    out_dir = "models/fine_tuned_model"  # Replace with your desired output directory
    base_model = "bert-base-uncased"  # Replace with your base model name
    train_model(data_path, model_name=base_model, output_dir=out_dir)