import json
import logging

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class AcoliTrainer:
    """Fine-tunes a transformer for sequence classification (XLM-RoBERTa by default)."""

    def __init__(self, model_name="xlm-roberta-base", num_labels=3):
        self.model_name = model_name
        self.num_labels = num_labels
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
        )

    def load_data(self, jsonl_path):
        """Load a JSONL file where each line is an object with 'text' and 'label' keys."""
        texts = []
        labels = []

        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines (e.g. a trailing newline)
                data = json.loads(line)
                texts.append(data['text'])
                labels.append(data['label'])

        return Dataset.from_dict({
            'text': texts,
            'label': labels,
        })

    def preprocess_function(self, examples):
        """Tokenize the texts. Padding is left to the data collator, which pads
        each batch dynamically instead of padding everything up front."""
        return self.tokenizer(
            examples['text'],
            truncation=True,
            max_length=512,
        )

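    # Optional: a minimal accuracy metric, included as a hedged sketch rather
    # than part of the original pipeline. To use it, pass
    # compute_metrics=self.compute_metrics when building the Trainer in
    # train(). Assumes numpy is available (it ships with transformers).
    @staticmethod
    def compute_metrics(eval_pred):
        import numpy as np

        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return {"accuracy": float((predictions == labels).mean())}
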
    def train(self, train_path, output_dir="./acoli-model"):
        """Fine-tune on the data at train_path and save the model to output_dir."""
        logger.info("Loading training data...")
        dataset = self.load_data(train_path)
        tokenized_dataset = dataset.map(self.preprocess_function, batched=True)

        # Hold out 20% of the examples for evaluation.
        split = tokenized_dataset.train_test_split(test_size=0.2)
        train_dataset = split['train']
        eval_dataset = split['test']

        training_args = TrainingArguments(
            output_dir=output_dir,
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=3,
            weight_decay=0.01,
            # Note: renamed to eval_strategy in recent transformers releases.
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            push_to_hub=False,
        )

        # Pads each batch to the length of its longest sequence.
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
        )

        logger.info("Starting training...")
        trainer.train()

        logger.info(f"Saving model to {output_dir}")
        trainer.save_model(output_dir)
        self.tokenizer.save_pretrained(output_dir)

        return trainer


if __name__ == "__main__":
    trainer = AcoliTrainer()

    trained_trainer = trainer.train("path/to/your/data.jsonl")

    print("Training completed successfully!")