import gc
import random

import pandas as pd
import torch
import huggingface_hub
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    pipeline,
)
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from datasets import Dataset

from .cleaner import Cleaner
from .training_utils import get_class_weights, compute_metrics
from .custom_trainer import CustomTrainer


class AbilityClassifier:
    """Classify ability descriptions as Devil Fruit, Haki or Physical Technique.

    On construction the classifier either loads an already-published model
    from the Hugging Face Hub (``model_path``) or, when that repo does not
    exist or ``force_retrain`` is set, fine-tunes ``model_name`` on the JSONL
    data at ``data_path``, pushes it to the hub and then loads it.
    """

    # Single seed shared by the length normalization and the train/test split
    # so a retrain on the same data sees the same examples in the same roles.
    _SEED = 42

    def __init__(self,
                 model_path,
                 data_path=None,
                 text_column_name='text',
                 label_column_name='ability',
                 model_name='microsoft/deberta-v3-small',
                 test_size=0.2,
                 num_labels=3,
                 huggingface_token=None,
                 description_min_words=25,
                 description_max_words=110,
                 force_retrain=False,
                 ):
        """Load the classifier, training and publishing it first if needed.

        Args:
            model_path: Hub repo id of the fine-tuned model (also used as the
                local training output directory).
            data_path: Path to the JSON-lines training data. Required only
                when training is triggered.
            text_column_name: Name of the dataframe column holding the
                combined "name. description" text.
            label_column_name: Name of the dataframe column holding the
                simplified ability label.
            model_name: Base checkpoint to fine-tune.
            test_size: Fraction of the data held out for evaluation.
            num_labels: Number of output classes for the classification head.
            huggingface_token: Optional token; when given, logs in to the hub.
            description_min_words: Lower bound of the random truncation length.
            description_max_words: Upper bound of the random truncation length.
            force_retrain: Retrain and overwrite even if ``model_path``
                already exists on the hub.

        Raises:
            ValueError: If training is required but ``data_path`` is None.
        """
        self.model_path = model_path
        self.data_path = data_path
        self.text_column_name = text_column_name
        self.label_column_name = label_column_name
        self.model_name = model_name
        self.test_size = test_size
        self.num_labels = num_labels
        # Length-normalize descriptions so text length can't act as a class
        # shortcut (Haki rows are short chunks, Devil Fruit / Physical rows are
        # long articles — without this the model learns "short -> Haki").
        self.description_min_words = description_min_words
        self.description_max_words = description_max_words
        # Stored before load_tokenizer() runs: it decides which tokenizer
        # (base checkpoint vs. published repo) is the right one.
        self.force_retrain = force_retrain
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.huggingface_token = huggingface_token
        if self.huggingface_token is not None:
            huggingface_hub.login(self.huggingface_token)

        self.tokenizer = self.load_tokenizer()

        # force_retrain lets us overwrite an existing repo (e.g. iterating on the
        # dataset) without deleting it first; inference paths leave it False.
        if force_retrain or not huggingface_hub.repo_exists(self.model_path):
            if data_path is None:
                raise ValueError("Data path is required to train the model, since the model path does not exist in huggingface hub")

            train_data, test_data = self.load_data(self.data_path)

            # Class weights are computed over the full dataset (train + test).
            train_data_df = train_data.to_pandas()
            test_data_df = test_data.to_pandas()
            all_data = pd.concat([train_data_df, test_data_df]).reset_index(drop=True)
            class_weights = get_class_weights(all_data)

            self.train_model(train_data, test_data, class_weights)

        self.model = self.load_model(self.model_path)

    def load_model(self, model_path):
        """Return a text-classification pipeline for ``model_path``.

        The pipeline is configured to return the scores of every label.
        """
        # top_k=None replaces deprecated return_all_scores=True
        model = pipeline('text-classification', model=model_path, top_k=None)
        return model

    def train_model(self, train_data, test_data, class_weights):
        """Fine-tune the base checkpoint and push the result to the hub.

        Args:
            train_data: Tokenized training dataset (from ``load_data``).
            test_data: Tokenized evaluation dataset (from ``load_data``).
            class_weights: Per-class weights handed to the custom trainer.

        Requires ``self.label_dict``, which ``load_data`` sets.
        """
        # Pass both directions of the label mapping; with only id2label the
        # saved config keeps a label2id that does not match the class names.
        label2id = {name: index for index, name in self.label_dict.items()}
        model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            id2label=self.label_dict,
            label2id=label2id,
        )

        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        training_args = TrainingArguments(
            output_dir=self.model_path,
            # Push to exactly the repo that repo_exists()/pipeline() use,
            # instead of a name derived from the output directory.
            hub_model_id=self.model_path,
            learning_rate=2e-4,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=5,
            weight_decay=0.01,
            eval_strategy='epoch',
            logging_strategy='epoch',
            push_to_hub=True,
            report_to='none',  # avoid interactive W&B prompt on Kaggle/Colab
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=test_data,
            processing_class=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )
        trainer.set_device(self.device)
        trainer.set_class_weights(class_weights)

        trainer.train()

        # Release the training objects before the inference pipeline is loaded.
        del trainer, model
        gc.collect()
        if self.device == 'cuda':
            torch.cuda.empty_cache()

    def simplify_ability(self, ability_type):
        """Map a raw ability type string onto one of the three classes.

        Returns 'Devil Fruit', 'Haki' or 'Physical Technique', or None when
        the value is not a string or is blank.
        """
        if not isinstance(ability_type, str):
            return None
        ability_type_lower = ability_type.lower()
        if any(t in ability_type_lower for t in ('paramecia', 'logia', 'zoan', 'devil fruit')):
            return 'Devil Fruit'
        if 'haki' in ability_type_lower:
            return 'Haki'
        # Any other non-blank type is treated as a physical technique.
        if ability_type.strip():
            return 'Physical Technique'
        return None

    def preprocess_function(self, tokenizer, examples):
        """Tokenize the ``text_cleaned`` field of a batch of examples."""
        # max_length matches the ~110-word description cap so training and
        # inference see the same length regime (no length shortcut, no
        # train/inference mismatch on long pasted inputs).
        return tokenizer(examples['text_cleaned'], truncation=True, max_length=192)

    def _normalize_length(self, descriptions):
        """Truncate each description to a random number of words.

        Args:
            descriptions: pandas Series of description strings.

        Returns:
            A Series of truncated strings. Entries that are not strings
            (missing values) come back as None so a later ``dropna`` removes
            the row.
        """
        # Truncate every description to a random word count drawn from the SAME
        # distribution for all classes, so length no longer correlates with the
        # label. Seeded for reproducibility.
        rng = random.Random(self._SEED)

        def truncate(desc):
            # str(NaN) would become the literal text "nan" and survive
            # dropna(); keep missing descriptions missing instead.
            if not isinstance(desc, str):
                return None
            words = desc.split()
            k = rng.randint(self.description_min_words, self.description_max_words)
            return ' '.join(words[:k])

        return descriptions.apply(truncate)

    def load_data(self, data_path):
        """Read the JSONL data and return tokenized train/test datasets.

        Also sets ``self.label_dict`` (class index -> label name).

        Args:
            data_path: Path to a JSON-lines file with ``ability_name``,
                ``ability_type`` and ``ability_description`` fields.

        Returns:
            Tuple ``(tokenized_train, tokenized_test)``.
        """
        df = pd.read_json(data_path, lines=True)
        df['ability_type_simplified'] = df['ability_type'].apply(self.simplify_ability)
        df['ability_description'] = self._normalize_length(df['ability_description'])
        df[self.text_column_name] = df['ability_name'] + '. ' + df['ability_description']
        df[self.label_column_name] = df['ability_type_simplified']
        df = df[[self.text_column_name, self.label_column_name]]
        # Drops rows with a missing name, description or label.
        df = df.dropna()

        cleaner = Cleaner()
        df['text_cleaned'] = df[self.text_column_name].apply(cleaner.clean)

        le = preprocessing.LabelEncoder()
        le.fit(df[self.label_column_name].tolist())

        self.label_dict = dict(enumerate(le.classes_.tolist()))

        df['label'] = le.transform(df[self.label_column_name].tolist())

        # Seeded so the split is reproducible, like the length normalization.
        df_train, df_test = train_test_split(
            df,
            test_size=self.test_size,
            stratify=df['label'],
            random_state=self._SEED,
        )

        train_dataset = Dataset.from_pandas(df_train)
        test_dataset = Dataset.from_pandas(df_test)

        tokenized_train = train_dataset.map(
            lambda examples: self.preprocess_function(self.tokenizer, examples),
            batched=True)
        tokenized_test = test_dataset.map(
            lambda examples: self.preprocess_function(self.tokenizer, examples),
            batched=True)

        return tokenized_train, tokenized_test

    def load_tokenizer(self):
        """Return the tokenizer matching the model that will be used.

        The published repo's tokenizer is used for inference; the base
        checkpoint's tokenizer is used when the repo does not exist yet or a
        retrain from ``model_name`` was requested.
        """
        # On a forced retrain the model is rebuilt from model_name, so the
        # tokenizer must come from there too (the existing repo may have been
        # trained from a different base checkpoint).
        if not self.force_retrain and huggingface_hub.repo_exists(self.model_path):
            tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        else:
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        return tokenizer

    def postprocess(self, model_output):
        """Return the highest-scoring label for each prediction.

        Args:
            model_output: Iterable of predictions, each a list of
                ``{'label': ..., 'score': ...}`` dicts.
        """
        return [
            max(pred, key=lambda x: x['score'])['label']
            for pred in model_output
        ]

    def classify_ability(self, text):
        """Classify ``text`` and return the list of predicted labels."""
        model_output = self.model(text)
        predictions = self.postprocess(model_output)
        return predictions