#!/usr/bin/env python3
"""Fine-tune a multilingual encoder (XLM-RoBERTa by default) for Acoli text classification."""

import json
import logging

from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class AcoliTrainer:
    def __init__(self, model_name="xlm-roberta-base", num_labels=3):
        self.model_name = model_name
        self.num_labels = num_labels
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
        )

    def load_data(self, jsonl_path):
        """Load data from a JSONL file with one {"text": ..., "label": ...} object per line."""
        texts = []
        labels = []

        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                texts.append(data["text"])
                labels.append(data["label"])

        return Dataset.from_dict({"text": texts, "label": labels})

    def preprocess_function(self, examples):
        """Tokenize the texts.

        Padding is deliberately left to the data collator, which pads each batch
        dynamically to its longest sequence instead of padding every example here.
        """
        return self.tokenizer(
            examples["text"],
            truncation=True,
            max_length=512,
        )

    def train(self, train_path, output_dir="./acoli-model"):
        """Train the model."""
        # Load and preprocess data
        logger.info("Loading training data...")
        dataset = self.load_data(train_path)
        tokenized_dataset = dataset.map(self.preprocess_function, batched=True)

        # Split dataset (80% train, 20% validation); fixed seed for reproducibility
        split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
        train_dataset = split["train"]
        eval_dataset = split["test"]

        # Training arguments. Note: on newer transformers releases the
        # `evaluation_strategy` argument has been renamed to `eval_strategy`.
        training_args = TrainingArguments(
            output_dir=output_dir,
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=3,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            push_to_hub=False,  # Set to True if you want to push to the HF Hub
        )

        # Data collator: dynamically pads each batch to its longest sequence
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        # Initialize the Trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
        )

        # Start training
        logger.info("Starting training...")
        trainer.train()

        # Save the model and tokenizer
        logger.info(f"Saving model to {output_dir}")
        trainer.save_model(output_dir)
        self.tokenizer.save_pretrained(output_dir)

        return trainer


if __name__ == "__main__":
    # Example usage
    trainer = AcoliTrainer()

    # Train the model
    trained_trainer = trainer.train("path/to/your/data.jsonl")
    print("Training completed successfully!")
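
# A minimal inference sketch for the saved model (commented out so it never runs
# on import). It assumes the default output_dir above ("./acoli-model") and that
# the labels 0..2 match the num_labels=3 used during training; the input sentence
# is a hypothetical placeholder:
#
#   from transformers import pipeline
#
#   classifier = pipeline(
#       "text-classification",
#       model="./acoli-model",
#       tokenizer="./acoli-model",
#   )
#   print(classifier("An example Acoli sentence to classify"))
#
# Expected JSONL training format, per load_data above (one object per line):
#
#   {"text": "An example Acoli sentence", "label": 0}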