# one-piece-analysis/text_classification/ability_classifier.py
# Source: Hugging Face Hub file view (Raw / History / Blame), 7.86 kB.
# Uploaded by Fluoron with huggingface_hub ("Upload folder using huggingface_hub"),
# commit d880754 (verified).
import pandas as pd
import torch
import huggingface_hub
from transformers import (AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
TrainingArguments,
pipeline
)
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from datasets import Dataset
import gc
import random
from .cleaner import Cleaner
from .training_utils import get_class_weights, compute_metrics
from .custom_trainer import CustomTrainer
class AbilityClassifier:
    """Fine-tune and serve a text classifier for ability descriptions.

    Raw ability types are collapsed by ``simplify_ability`` into three labels
    ('Devil Fruit', 'Haki', 'Physical Technique'). On construction the class
    either trains a sequence-classification model (when ``force_retrain`` is
    set or ``model_path`` is not an existing Hub repo) or goes straight to
    loading a ``text-classification`` pipeline from ``model_path``.
    """

    # Token cap used when tokenizing training / evaluation text.
    MAX_TOKEN_LENGTH = 192
    # Single seed shared by description truncation and the train/test split so
    # that a retrain on the same file sees the same data.
    RANDOM_SEED = 42

    def __init__(self,
                 model_path,
                 data_path=None,
                 text_column_name='text',
                 label_column_name='ability',
                 model_name='microsoft/deberta-v3-small',
                 test_size=0.2,
                 num_labels=3,
                 huggingface_token=None,
                 description_min_words=25,
                 description_max_words=110,
                 force_retrain=False,
                 ):
        """Configure the classifier, train it if needed, and load the pipeline.

        Args:
            model_path: Hub repo id. Checked with ``repo_exists``, used as the
                trainer's ``output_dir`` and as the model loaded for inference.
            data_path: JSON-lines file used for training; only required when
                training actually happens.
            text_column_name: Name of the column built from ability name and
                description.
            label_column_name: Name of the column holding the simplified label.
            model_name: Base checkpoint to fine-tune.
            test_size: Fraction of rows held out for evaluation.
            num_labels: Number of output classes passed to the model.
            huggingface_token: If given, used to log in to the Hub.
            description_min_words: Lower bound of the random truncation length.
            description_max_words: Upper bound of the random truncation length.
            force_retrain: Train even when ``model_path`` already exists.

        Raises:
            ValueError: If training is required but ``data_path`` is ``None``.
        """
        self.model_path = model_path
        self.data_path = data_path
        self.text_column_name = text_column_name
        self.label_column_name = label_column_name
        self.model_name = model_name
        self.test_size = test_size
        self.num_labels = num_labels

        # Length-normalize descriptions so text length can't act as a class
        # shortcut (Haki rows are short chunks, Devil Fruit / Physical rows are
        # long articles — without this the model learns "short -> Haki").
        self.description_min_words = description_min_words
        self.description_max_words = description_max_words

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.huggingface_token = huggingface_token
        if self.huggingface_token is not None:
            huggingface_hub.login(self.huggingface_token)

        # force_retrain lets us overwrite an existing repo (e.g. iterating on the
        # dataset) without deleting it first; inference paths leave it False.
        needs_training = force_retrain or not huggingface_hub.repo_exists(self.model_path)

        # When (re)training, the tokenizer must belong to the base checkpoint
        # being fine-tuned, not to whatever an older run pushed to the repo.
        self.tokenizer = self.load_tokenizer(use_base_model=needs_training)

        if needs_training:
            if data_path is None:
                raise ValueError("Data path is required to train the model, since the model path does not exist in huggingface hub")

            train_data, test_data = self.load_data(self.data_path)

            # Class weights are computed over train + test together.
            train_data_df = train_data.to_pandas()
            test_data_df = test_data.to_pandas()
            all_data = pd.concat([train_data_df, test_data_df]).reset_index(drop=True)
            class_weights = get_class_weights(all_data)

            self.train_model(train_data, test_data, class_weights)

        self.model = self.load_model(self.model_path)

    def load_model(self, model_path):
        """Return a ``text-classification`` pipeline for ``model_path``."""
        # top_k=None replaces deprecated return_all_scores=True
        model = pipeline('text-classification', model=model_path, top_k=None)
        return model

    def train_model(self, train_data, test_data, class_weights):
        """Fine-tune ``self.model_name`` on the tokenized datasets.

        Requires ``self.label_dict``, which ``load_data`` sets.

        Args:
            train_data: Tokenized training dataset.
            test_data: Tokenized evaluation dataset.
            class_weights: Weights handed to ``CustomTrainer.set_class_weights``.
        """
        model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            id2label=self.label_dict,
            # Keep the inverse mapping in the saved config consistent with
            # id2label instead of leaving it to defaults.
            label2id={name: index for index, name in self.label_dict.items()},
        )

        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        training_args = TrainingArguments(
            output_dir=self.model_path,
            learning_rate=2e-4,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=5,
            weight_decay=0.01,
            eval_strategy='epoch',
            logging_strategy='epoch',
            push_to_hub=True,
            report_to='none',  # avoid interactive W&B prompt on Kaggle/Colab
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=test_data,
            processing_class=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )
        trainer.set_device(self.device)
        trainer.set_class_weights(class_weights)

        trainer.train()

        # Drop the training objects before the inference pipeline is loaded.
        del trainer, model
        gc.collect()
        if self.device == 'cuda':
            torch.cuda.empty_cache()

    def simplify_ability(self, ability_type):
        """Map a raw ability type to one of the three training labels.

        Returns 'Devil Fruit', 'Haki', 'Physical Technique', or ``None`` when
        ``ability_type`` is not a string or is blank.
        """
        if not isinstance(ability_type, str):
            return None

        ability_type_lower = ability_type.lower()
        if any(t in ability_type_lower for t in ('paramecia', 'logia', 'zoan', 'devil fruit')):
            return 'Devil Fruit'
        if 'haki' in ability_type_lower:
            return 'Haki'
        # Any other non-blank type falls into the catch-all class.
        if ability_type.strip():
            return 'Physical Technique'
        return None

    def preprocess_function(self, tokenizer, examples):
        """Tokenize the ``text_cleaned`` field of a batch, truncating long text."""
        # max_length matches the ~110-word description cap so training and
        # inference see the same length regime (no length shortcut, no
        # train/inference mismatch on long pasted inputs).
        return tokenizer(
            examples['text_cleaned'],
            truncation=True,
            max_length=self.MAX_TOKEN_LENGTH,
        )

    def _normalize_length(self, descriptions):
        """Truncate each description to a random number of words.

        The word count is drawn from the SAME
        ``[description_min_words, description_max_words]`` range for all
        classes, so length no longer correlates with the label. Seeded for
        reproducibility.

        Missing values (``None`` / NaN / ``pd.NA``) are returned unchanged.
        Stringifying them would produce the literal text "nan", which would
        survive the later ``dropna()`` and end up in the training set.

        Args:
            descriptions: pandas Series of description values.

        Returns:
            A Series of truncated descriptions, with missing entries preserved.
        """
        rng = random.Random(self.RANDOM_SEED)

        def truncate(desc):
            is_nan = isinstance(desc, float) and desc != desc
            if desc is None or desc is pd.NA or is_nan:
                return desc
            words = str(desc).split()
            k = rng.randint(self.description_min_words, self.description_max_words)
            return ' '.join(words[:k])

        return descriptions.apply(truncate)

    def load_data(self, data_path):
        """Read the JSON-lines file and return tokenized train/test datasets.

        Reads the ``ability_type``, ``ability_name`` and ``ability_description``
        columns, drops rows with a missing text or label, and stores the
        ``{index: label_name}`` mapping on ``self.label_dict``.

        Returns:
            Tuple ``(tokenized_train, tokenized_test)``.
        """
        df = pd.read_json(data_path, lines=True)
        df['ability_type_simplified'] = df['ability_type'].apply(self.simplify_ability)
        df['ability_description'] = self._normalize_length(df['ability_description'])
        df[self.text_column_name] = df['ability_name'] + '. ' + df['ability_description']
        df[self.label_column_name] = df['ability_type_simplified']

        # .copy() makes the following column assignments operate on an
        # independent frame rather than on a slice of the original.
        df = df[[self.text_column_name, self.label_column_name]].dropna().copy()

        cleaner = Cleaner()
        df['text_cleaned'] = df[self.text_column_name].apply(cleaner.clean)

        le = preprocessing.LabelEncoder()
        le.fit(df[self.label_column_name].tolist())
        self.label_dict = dict(enumerate(le.classes_.tolist()))
        df['label'] = le.transform(df[self.label_column_name].tolist())

        df_train, df_test = train_test_split(
            df,
            test_size=self.test_size,
            stratify=df['label'],
            # Seeded like the truncation step so the split is repeatable.
            random_state=self.RANDOM_SEED,
        )

        train_dataset = Dataset.from_pandas(df_train)
        test_dataset = Dataset.from_pandas(df_test)

        tokenized_train = train_dataset.map(
            lambda examples: self.preprocess_function(self.tokenizer, examples), batched=True)
        tokenized_test = test_dataset.map(
            lambda examples: self.preprocess_function(self.tokenizer, examples), batched=True)
        return tokenized_train, tokenized_test

    def load_tokenizer(self, use_base_model=None):
        """Load the tokenizer from the base checkpoint or from ``model_path``.

        Args:
            use_base_model: ``True`` loads from ``self.model_name``, ``False``
                loads from ``self.model_path``. ``None`` (the default) loads
                from ``model_path`` when that repo exists on the Hub and from
                the base checkpoint otherwise.
        """
        if use_base_model is None:
            use_base_model = not huggingface_hub.repo_exists(self.model_path)
        source = self.model_name if use_base_model else self.model_path
        return AutoTokenizer.from_pretrained(source)

    def postprocess(self, model_output):
        """Return the highest-scoring label for each entry of ``model_output``.

        Each entry is expected to be a list of ``{'label', 'score'}`` dicts.
        """
        return [
            max(pred, key=lambda x: x['score'])['label']
            for pred in model_output
        ]

    def classify_ability(self, text):
        """Classify ``text`` and return a list of predicted labels.

        NOTE(review): ``text`` is passed to the pipeline as-is; the ``Cleaner``
        step and token cap used in ``load_data`` are not applied here. Confirm
        whether callers pre-process their input.
        """
        model_output = self.model(text)
        predictions = self.postprocess(model_output)
        return predictions