Spaces:
Running
Running
| import pandas as pd | |
| import torch | |
| import huggingface_hub | |
| from transformers import (AutoTokenizer, | |
| AutoModelForSequenceClassification, | |
| DataCollatorWithPadding, | |
| TrainingArguments, | |
| pipeline | |
| ) | |
| from sklearn import preprocessing | |
| from sklearn.model_selection import train_test_split | |
| from datasets import Dataset | |
| import gc | |
| import random | |
| from .cleaner import Cleaner | |
| from .training_utils import get_class_weights, compute_metrics | |
| from .custom_trainer import CustomTrainer | |
class AbilityClassifier:
    """Train (when needed) and serve a text classifier for ability types.

    On construction the class checks whether ``model_path`` exists on the
    Hugging Face Hub. If it does not (or ``force_retrain`` is set), a sequence
    classification model is fine-tuned from ``model_name`` on the JSON-lines
    file at ``data_path`` and pushed to the Hub. In every case a
    ``text-classification`` pipeline for ``model_path`` is then loaded into
    ``self.model``.

    Labels are collapsed by ``simplify_ability`` into 'Devil Fruit', 'Haki'
    and 'Physical Technique'.
    """

    # Token cap shared by training-time tokenization and inference so both
    # paths see the same length regime.
    MAX_TOKEN_LENGTH = 192
    # Single seed for description truncation and the train/test split.
    RANDOM_SEED = 42

    def __init__(self,
                 model_path,
                 data_path=None,
                 text_column_name='text',
                 label_column_name='ability',
                 model_name='microsoft/deberta-v3-small',
                 test_size=0.2,
                 num_labels=3,
                 huggingface_token=None,
                 description_min_words=25,
                 description_max_words=110,
                 force_retrain=False,
                 ):
        """Configure the classifier, training it first when required.

        Args:
            model_path: Hub repo id of the fine-tuned model; also used as the
                trainer's ``output_dir``.
            data_path: JSON-lines training file; required whenever training
                runs.
            text_column_name: Name given to the assembled input-text column.
            label_column_name: Name given to the simplified-label column.
            model_name: Base checkpoint to fine-tune from.
            test_size: Fraction of rows held out for evaluation.
            num_labels: Number of output classes passed to the model.
            huggingface_token: If given, used to log in to the Hub.
            description_min_words: Lower bound of the random truncation length.
            description_max_words: Upper bound of the random truncation length.
            force_retrain: Retrain and overwrite even if the repo exists.

        Raises:
            ValueError: If training is needed but ``data_path`` is None.
        """
        self.model_path = model_path
        self.data_path = data_path
        self.text_column_name = text_column_name
        self.label_column_name = label_column_name
        self.model_name = model_name
        self.test_size = test_size
        self.num_labels = num_labels
        # Length-normalize descriptions so text length can't act as a class
        # shortcut (Haki rows are short chunks, Devil Fruit / Physical rows are
        # long articles; without this the model learns "short -> Haki").
        self.description_min_words = description_min_words
        self.description_max_words = description_max_words
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.huggingface_token = huggingface_token
        if self.huggingface_token is not None:
            huggingface_hub.login(self.huggingface_token)

        # force_retrain lets us overwrite an existing repo (e.g. iterating on
        # the dataset) without deleting it first; inference paths leave it
        # False.
        needs_training = force_retrain or not huggingface_hub.repo_exists(self.model_path)

        # When training, the tokenizer must match the base checkpoint being
        # fine-tuned, not whatever an older version of the repo was built on.
        self.tokenizer = self.load_tokenizer(from_base_model=needs_training)

        if needs_training:
            if data_path is None:
                raise ValueError("Data path is required to train the model, since the model path does not exist in huggingface hub")
            train_data, test_data = self.load_data(self.data_path)
            # Class weights are computed over train + test combined.
            all_data = pd.concat(
                [train_data.to_pandas(), test_data.to_pandas()]
            ).reset_index(drop=True)
            class_weights = get_class_weights(all_data)
            self.train_model(train_data, test_data, class_weights)

        self.model = self.load_model(self.model_path)

    def load_model(self, model_path):
        """Return a text-classification pipeline for ``model_path``.

        ``top_k=None`` makes the pipeline return a score for every label; it
        replaces the deprecated ``return_all_scores=True``.
        """
        return pipeline('text-classification', model=model_path, top_k=None)

    def train_model(self, train_data, test_data, class_weights):
        """Fine-tune ``model_name`` and upload the result to the Hub.

        Must be called after ``load_data``, which sets ``self.label_dict``.

        Args:
            train_data: Tokenized training dataset.
            test_data: Tokenized evaluation dataset.
            class_weights: Per-class weights handed to the custom trainer.
        """
        # Pass both directions of the label mapping so the saved config is
        # self-consistent (id2label alone leaves label2id at its default).
        label2id = {name: index for index, name in self.label_dict.items()}
        model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            id2label=self.label_dict,
            label2id=label2id,
        )
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
        training_args = TrainingArguments(
            output_dir=self.model_path,
            learning_rate=2e-4,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=5,
            weight_decay=0.01,
            eval_strategy='epoch',
            logging_strategy='epoch',
            push_to_hub=True,
            report_to='none',  # avoid interactive W&B prompt on Kaggle/Colab
        )
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=test_data,
            processing_class=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )
        trainer.set_device(self.device)
        trainer.set_class_weights(class_weights)
        trainer.train()
        # push_to_hub=True only uploads when a checkpoint is saved, so upload
        # the final weights explicitly before the pipeline is loaded.
        # NOTE(review): assumes CustomTrainer subclasses transformers.Trainer.
        trainer.push_to_hub()

        # Release the training objects before the inference pipeline loads.
        del trainer, model
        gc.collect()
        if self.device == 'cuda':
            torch.cuda.empty_cache()

    def simplify_ability(self, ability_type):
        """Collapse a raw ability-type string into one of three classes.

        Returns 'Devil Fruit' when the text mentions paramecia, logia, zoan or
        devil fruit (case-insensitive), 'Haki' when it mentions haki,
        'Physical Technique' for any other non-blank string, and None for
        non-strings or blank strings.
        """
        if not isinstance(ability_type, str):
            return None
        lowered = ability_type.lower()
        devil_fruit_terms = ('paramecia', 'logia', 'zoan', 'devil fruit')
        if any(term in lowered for term in devil_fruit_terms):
            return 'Devil Fruit'
        if 'haki' in lowered:
            return 'Haki'
        if ability_type.strip():
            return 'Physical Technique'
        return None

    def preprocess_function(self, tokenizer, examples):
        """Tokenize the ``text_cleaned`` field of a batch of examples."""
        # max_length matches the ~110-word description cap so training and
        # inference see the same length regime (no length shortcut, no
        # train/inference mismatch on long pasted inputs).
        return tokenizer(
            examples['text_cleaned'],
            truncation=True,
            max_length=self.MAX_TOKEN_LENGTH,
        )

    def _normalize_length(self, descriptions):
        """Truncate each description to a random number of words.

        Every description is cut to a word count drawn from the SAME
        distribution for all classes, so length no longer correlates with the
        label. Seeded for reproducibility. Missing values (None / NaN / NA)
        are returned unchanged so a later ``dropna()`` can still remove them
        instead of training on the literal text 'nan'.

        Args:
            descriptions: pandas Series of description values.

        Returns:
            A Series of the same length with truncated descriptions.
        """
        rng = random.Random(self.RANDOM_SEED)

        def truncate(desc):
            # Draw for every row, missing or not, so the truncation length of
            # each real description does not depend on how many gaps precede it.
            k = rng.randint(self.description_min_words, self.description_max_words)
            is_missing = (
                desc is None
                or desc is pd.NA
                or (isinstance(desc, float) and desc != desc)
            )
            if is_missing:
                return desc
            return ' '.join(str(desc).split()[:k])

        return descriptions.apply(truncate)

    def load_data(self, data_path):
        """Load, label, clean, split and tokenize the training data.

        Reads a JSON-lines file with ``ability_type``, ``ability_name`` and
        ``ability_description`` fields, and stores the integer-to-name label
        mapping on ``self.label_dict``.

        Args:
            data_path: Path to the JSON-lines file.

        Returns:
            ``(tokenized_train, tokenized_test)`` datasets.
        """
        df = pd.read_json(data_path, lines=True)
        df['ability_type_simplified'] = df['ability_type'].apply(self.simplify_ability)
        df['ability_description'] = self._normalize_length(df['ability_description'])
        df[self.text_column_name] = df['ability_name'] + '. ' + df['ability_description']
        df[self.label_column_name] = df['ability_type_simplified']
        # Rows missing a name, description or usable label become NaN above
        # and are dropped here.
        df = df[[self.text_column_name, self.label_column_name]]
        df = df.dropna()

        cleaner = Cleaner()
        df['text_cleaned'] = df[self.text_column_name].apply(cleaner.clean)

        le = preprocessing.LabelEncoder()
        le.fit(df[self.label_column_name].tolist())
        self.label_dict = dict(enumerate(le.classes_.tolist()))
        df['label'] = le.transform(df[self.label_column_name].tolist())

        # Fixed random_state keeps the stratified split reproducible, in line
        # with the seeded length normalization.
        df_train, df_test = train_test_split(
            df,
            test_size=self.test_size,
            stratify=df['label'],
            random_state=self.RANDOM_SEED,
        )

        def tokenize(examples):
            return self.preprocess_function(self.tokenizer, examples)

        tokenized_train = Dataset.from_pandas(df_train).map(tokenize, batched=True)
        tokenized_test = Dataset.from_pandas(df_test).map(tokenize, batched=True)
        return tokenized_train, tokenized_test

    def load_tokenizer(self, from_base_model=False):
        """Load the tokenizer for the fine-tuned repo or the base checkpoint.

        Args:
            from_base_model: When True, always load the tokenizer of
                ``model_name``. When False (default), prefer the tokenizer
                stored in ``model_path`` if that repo exists on the Hub.
        """
        if not from_base_model and huggingface_hub.repo_exists(self.model_path):
            return AutoTokenizer.from_pretrained(self.model_path)
        return AutoTokenizer.from_pretrained(self.model_name)

    def postprocess(self, model_output):
        """Reduce per-label scores to the top label for each input.

        Args:
            model_output: A list with one entry per input, each entry being a
                list of ``{'label', 'score'}`` dicts. A flat list of such
                dicts is treated as the scores for a single input.

        Returns:
            A list of label strings, one per input.
        """
        if model_output and isinstance(model_output[0], dict):
            model_output = [model_output]
        return [
            max(scores, key=lambda entry: entry['score'])['label']
            for scores in model_output
        ]

    def classify_ability(self, text):
        """Classify one text or a list of texts and return the top labels.

        Inputs are truncated to the same token cap used during training.
        NOTE(review): training text goes through ``Cleaner.clean`` but this
        path does not; confirm whether callers pre-clean their input.
        """
        model_output = self.model(
            text,
            truncation=True,
            max_length=self.MAX_TOKEN_LENGTH,
        )
        return self.postprocess(model_output)