import gc

import pandas as pd
import torch
import huggingface_hub
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    pipeline,
)
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from datasets import Dataset

from .cleaner import Cleaner
from .training_utils import get_class_weights, compute_metrics
from .custom_trainer import CustomTrainer


class JutsuClassifier:
    """Classify jutsu descriptions into Genjutsu / Ninjutsu / Taijutsu.

    If ``model_path`` already exists as a Hugging Face Hub repo, the
    fine-tuned model is loaded directly.  Otherwise the base model
    (``model_name``) is fine-tuned on the data at ``data_path`` and the
    result is pushed to the hub under ``model_path``.
    """

    def __init__(self,
                 model_path,
                 data_path=None,
                 text_column_name='text',
                 label_column_name='jutsu',
                 model_name="distilbert/distilbert-base-uncased",
                 test_size=0.2,
                 num_labels=3,
                 huggingface_token=None,
                 ):
        """
        Args:
            model_path: Hub repo id (or local path) of the fine-tuned model.
            data_path: Path to a JSON-lines dataset; required only when the
                model must be trained (i.e. ``model_path`` does not exist
                on the hub).
            text_column_name: Column holding the raw text to classify.
            label_column_name: Column that will hold the (string) label.
            model_name: Base checkpoint to fine-tune from.
            test_size: Fraction of data held out for evaluation.
            num_labels: Number of target classes.
            huggingface_token: Optional token used to log in (needed for
                ``push_to_hub``).

        Raises:
            ValueError: if training is required but ``data_path`` is None.
        """
        self.model_path = model_path
        self.data_path = data_path
        self.text_column_name = text_column_name
        self.label_column_name = label_column_name
        self.model_name = model_name
        self.test_size = test_size
        self.num_labels = num_labels
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.huggingface_token = huggingface_token
        if self.huggingface_token is not None:
            huggingface_hub.login(self.huggingface_token)

        self.tokenizer = self.load_tokenizer()

        if not huggingface_hub.repo_exists(self.model_path):
            # Model is not on the hub yet: we must train, which needs data.
            if data_path is None:
                raise ValueError("Data path is required to train the model,since the model path does not exist in huggingface hub")

            train_data, test_data = self.load_data(self.data_path)

            # Class weights are computed over the full dataset so the
            # custom loss can compensate for label imbalance.
            train_data_df = train_data.to_pandas()
            test_data_df = test_data.to_pandas()
            all_data = pd.concat([train_data_df, test_data_df]).reset_index(drop=True)
            class_weights = get_class_weights(all_data)

            self.train_model(train_data, test_data, class_weights)

        self.model = self.load_model(self.model_path)

    def load_model(self, model_path):
        """Load the fine-tuned model as a text-classification pipeline.

        NOTE(review): ``return_all_scores=True`` is deprecated in recent
        transformers in favour of ``top_k=None``; kept as-is because
        ``postprocess`` depends on the all-scores output shape.
        """
        model = pipeline('text-classification', model=model_path, return_all_scores=True)
        return model

    def train_model(self, train_data, test_data, class_weights):
        """Fine-tune the base model and push the result to the hub.

        Uses ``CustomTrainer`` so the per-class ``class_weights`` can be
        applied in the loss. Frees GPU memory afterwards.
        """
        model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            id2label=self.label_dict,  # set by load_data() before training
        )

        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        training_args = TrainingArguments(
            output_dir=self.model_path,
            learning_rate=2e-4,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=5,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            logging_strategy="epoch",
            push_to_hub=True,
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=test_data,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )
        trainer.set_device(self.device)
        trainer.set_class_weights(class_weights)

        trainer.train()

        # Flush memory so the loaded inference pipeline has room on GPU.
        del trainer, model
        gc.collect()
        if self.device == 'cuda':
            torch.cuda.empty_cache()

    def simplify_jutsu(self, jutsu):
        """Collapse a detailed jutsu-type string to one of the 3 base types.

        Returns None when no base type is found; those rows are dropped
        later by ``dropna()`` in ``load_data``.
        """
        if "Genjutsu" in jutsu:
            return "Genjutsu"
        if "Ninjutsu" in jutsu:
            return "Ninjutsu"
        if "Taijutsu" in jutsu:
            return "Taijutsu"
        return None

    def preprocess_function(self, tokenizer, examples):
        """Tokenize the cleaned text column (padding handled by collator)."""
        return tokenizer(examples['text_cleaned'], truncation=True)

    def load_data(self, data_path):
        """Load, clean, label-encode, split and tokenize the dataset.

        Side effect: sets ``self.label_dict`` (id -> label name), which
        ``train_model`` passes to the model config.

        Returns:
            (tokenized_train, tokenized_test) as HF ``Dataset`` objects.
        """
        df = pd.read_json(data_path, lines=True)
        df['jutsu_type_simplified'] = df['jutsu_type'].apply(self.simplify_jutsu)
        df['text'] = df['jutsu_name'] + ". " + df['jutsu_description']
        df[self.label_column_name] = df['jutsu_type_simplified']
        df = df[['text', self.label_column_name]]
        df = df.dropna()

        # Clean text
        cleaner = Cleaner()
        df['text_cleaned'] = df[self.text_column_name].apply(cleaner.clean)

        # Encode labels
        le = preprocessing.LabelEncoder()
        le.fit(df[self.label_column_name].tolist())
        # FIX: read the public attribute directly instead of le.__dict__['classes_']
        label_dict = {index: label_name for index, label_name in enumerate(le.classes_.tolist())}
        self.label_dict = label_dict
        df['label'] = le.transform(df[self.label_column_name].tolist())

        # Train / test split
        # FIX: honour the constructor's test_size (was hard-coded to 0.2).
        df_train, df_test = train_test_split(df,
                                             test_size=self.test_size,
                                             stratify=df['label'],)

        # Convert pandas to Hugging Face datasets
        train_dataset = Dataset.from_pandas(df_train)
        test_dataset = Dataset.from_pandas(df_test)

        # Tokenize the datasets
        tokenized_train = train_dataset.map(lambda examples: self.preprocess_function(self.tokenizer, examples),
                                            batched=True)
        tokenized_test = test_dataset.map(lambda examples: self.preprocess_function(self.tokenizer, examples),
                                          batched=True)

        return tokenized_train, tokenized_test

    def load_tokenizer(self):
        """Load the tokenizer from the fine-tuned repo if it exists, else from the base model."""
        if huggingface_hub.repo_exists(self.model_path):
            tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        else:
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        return tokenizer

    def postprocess(self, model_output):
        """Pick the highest-scoring label for each prediction in the batch."""
        output = []
        for pred in model_output:
            label = max(pred, key=lambda x: x['score'])['label']
            output.append(label)
        return output

    def classify_jutsu(self, text):
        """Run the pipeline on ``text`` and return the predicted label(s)."""
        model_output = self.model(text)
        predictions = self.postprocess(model_output)
        return predictions