Spaces:

Fluoron
/

one-piece-analysis

Running

App Files Files Community

one-piece-analysis / text_classification /ability_classifier.py

Fluoron

Upload folder using huggingface_hub

d880754 verified 14 days ago

Raw

History Blame Contribute Delete

7.86 kB

	import pandas as pd
	import torch
	import huggingface_hub
	from transformers import (AutoTokenizer,
	AutoModelForSequenceClassification,
	DataCollatorWithPadding,
	TrainingArguments,
	pipeline
	)
	from sklearn import preprocessing
	from sklearn.model_selection import train_test_split
	from datasets import Dataset
	import gc
	import random
	from .cleaner import Cleaner
	from .training_utils import get_class_weights, compute_metrics
	from .custom_trainer import CustomTrainer

	class AbilityClassifier:
	    """Train (when needed) and serve a text classifier for ability types.

	    On construction the tokenizer is loaded first. If ``model_path`` does not
	    exist on the Hugging Face Hub, or ``force_retrain`` is set, a model is
	    fine-tuned from ``data_path`` (the training arguments use
	    ``push_to_hub=True``). Finally a ``text-classification`` pipeline is
	    loaded from ``model_path`` and stored on ``self.model``.

	    Raw ability types are collapsed by :meth:`simplify_ability` into
	    ``'Devil Fruit'``, ``'Haki'`` or ``'Physical Technique'``.
	    """

	    # Substrings (matched case-insensitively) that mark a Devil Fruit ability.
	    _DEVIL_FRUIT_KEYWORDS = ('paramecia', 'logia', 'zoan', 'devil fruit')

	    # One fixed seed shared by description truncation and the train/test
	    # split, so repeated runs on the same file build the same datasets.
	    _RANDOM_SEED = 42

	    def __init__(self,
	                 model_path,
	                 data_path=None,
	                 text_column_name='text',
	                 label_column_name='ability',
	                 model_name='microsoft/deberta-v3-small',
	                 test_size=0.2,
	                 num_labels=3,
	                 huggingface_token=None,
	                 description_min_words=25,
	                 description_max_words=110,
	                 force_retrain=False,
	                 ):
	        """Set up the classifier, training a model first if required.

	        Args:
	            model_path: Hub repo id checked with ``repo_exists``; also used as
	                the training ``output_dir`` and as the pipeline's model.
	            data_path: JSON-lines file with the training data. Required
	                whenever training is triggered.
	            text_column_name: Name of the column that receives the combined
	                "name. description" text.
	            label_column_name: Name of the column that receives the
	                simplified ability label.
	            model_name: Base checkpoint used for fine-tuning, and for the
	                tokenizer when ``model_path`` does not exist yet.
	            test_size: Fraction of rows held out for evaluation.
	            num_labels: Number of output classes for the classification head.
	            huggingface_token: Optional token; when given,
	                ``huggingface_hub.login`` is called with it.
	            description_min_words: Lower bound of the random truncation length.
	            description_max_words: Upper bound of the random truncation length.
	            force_retrain: Train even if ``model_path`` already exists.

	        Raises:
	            ValueError: If training is needed but ``data_path`` is ``None``.
	        """
	        self.model_path = model_path
	        self.data_path = data_path
	        self.text_column_name = text_column_name
	        self.label_column_name = label_column_name
	        self.model_name = model_name
	        self.test_size = test_size
	        self.num_labels = num_labels
	        # Length-normalize descriptions so text length can't act as a class
	        # shortcut (Haki rows are short chunks, Devil Fruit / Physical rows are
	        # long articles — without this the model learns "short -> Haki").
	        self.description_min_words = description_min_words
	        self.description_max_words = description_max_words
	        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

	        self.huggingface_token = huggingface_token
	        if self.huggingface_token is not None:
	            huggingface_hub.login(self.huggingface_token)

	        self.tokenizer = self.load_tokenizer()

	        # force_retrain lets us overwrite an existing repo (e.g. iterating on the
	        # dataset) without deleting it first; inference paths leave it False.
	        if force_retrain or not huggingface_hub.repo_exists(self.model_path):
	            if data_path is None:
	                raise ValueError("Data path is required to train the model, since the model path does not exist in huggingface hub")

	            train_data, test_data = self.load_data(self.data_path)

	            # Class weights are computed over every row, not just the train split.
	            all_data = pd.concat(
	                [train_data.to_pandas(), test_data.to_pandas()]
	            ).reset_index(drop=True)
	            class_weights = get_class_weights(all_data)

	            self.train_model(train_data, test_data, class_weights)

	        self.model = self.load_model(self.model_path)

	    def load_model(self, model_path):
	        """Return a ``text-classification`` pipeline for ``model_path``."""
	        # top_k=None replaces deprecated return_all_scores=True
	        return pipeline('text-classification', model=model_path, top_k=None)

	    def train_model(self, train_data, test_data, class_weights):
	        """Fine-tune ``self.model_name`` on the tokenized datasets.

	        Reads ``self.label_dict``, which :meth:`load_data` sets, so
	        ``load_data`` must have run on this instance beforehand.

	        Args:
	            train_data: Tokenized training dataset.
	            test_data: Tokenized evaluation dataset.
	            class_weights: Value handed to ``CustomTrainer.set_class_weights``.
	        """
	        model = AutoModelForSequenceClassification.from_pretrained(
	            self.model_name,
	            num_labels=self.num_labels,
	            id2label=self.label_dict,
	        )
	        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

	        training_args = TrainingArguments(
	            output_dir=self.model_path,
	            learning_rate=2e-4,
	            per_device_train_batch_size=8,
	            per_device_eval_batch_size=8,
	            num_train_epochs=5,
	            weight_decay=0.01,
	            eval_strategy='epoch',
	            logging_strategy='epoch',
	            push_to_hub=True,
	            report_to='none',  # avoid interactive W&B prompt on Kaggle/Colab
	        )

	        trainer = CustomTrainer(
	            model=model,
	            args=training_args,
	            train_dataset=train_data,
	            eval_dataset=test_data,
	            processing_class=self.tokenizer,
	            data_collator=data_collator,
	            compute_metrics=compute_metrics,
	        )

	        trainer.set_device(self.device)
	        trainer.set_class_weights(class_weights)

	        trainer.train()

	        # Drop the references and collect before the inference pipeline is
	        # loaded, so the training objects can be freed.
	        del trainer, model
	        gc.collect()

	        if self.device == 'cuda':
	            torch.cuda.empty_cache()

	    def simplify_ability(self, ability_type):
	        """Collapse a raw ability type into one of the coarse labels.

	        Returns:
	            ``'Devil Fruit'`` if the value mentions any Devil Fruit keyword,
	            ``'Haki'`` if it mentions "haki", ``'Physical Technique'`` for any
	            other non-blank string, and ``None`` for non-strings or
	            blank strings.
	        """
	        if not isinstance(ability_type, str):
	            return None
	        ability_type_lower = ability_type.lower()
	        if any(t in ability_type_lower for t in self._DEVIL_FRUIT_KEYWORDS):
	            return 'Devil Fruit'
	        if 'haki' in ability_type_lower:
	            return 'Haki'
	        if ability_type.strip():
	            return 'Physical Technique'
	        return None

	    def preprocess_function(self, tokenizer, examples, max_length=192):
	        """Tokenize the ``'text_cleaned'`` field of ``examples``.

	        Args:
	            tokenizer: Tokenizer callable to apply.
	            examples: Mapping that contains a ``'text_cleaned'`` entry.
	            max_length: Token cap used for truncation.
	        """
	        # max_length matches the ~110-word description cap so training and
	        # inference see the same length regime (no length shortcut, no
	        # train/inference mismatch on long pasted inputs).
	        return tokenizer(examples['text_cleaned'], truncation=True, max_length=max_length)

	    def _normalize_length(self, descriptions):
	        """Truncate each description to a seeded random number of words.

	        Every description is cut to a word count drawn from the SAME
	        distribution for all classes, so length no longer correlates with
	        the label. Missing values (``None`` / NaN) are returned unchanged so
	        that a later ``dropna()`` can still remove those rows; stringifying
	        them would turn them into the literal text "None" / "nan".

	        Args:
	            descriptions: pandas Series of description values.

	        Returns:
	            A Series of the same length with truncated descriptions.
	        """
	        rng = random.Random(self._RANDOM_SEED)
	        low = self.description_min_words
	        high = self.description_max_words

	        def truncate(desc):
	            # Draw for every row, missing or not, so the truncation lengths of
	            # the remaining rows do not depend on where the gaps are.
	            k = rng.randint(low, high)
	            if pd.api.types.is_scalar(desc) and pd.isna(desc):
	                return desc
	            return ' '.join(str(desc).split()[:k])

	        return descriptions.apply(truncate)

	    def load_data(self, data_path):
	        """Read the JSON-lines dataset and return tokenized train/test splits.

	        The file must provide ``ability_type``, ``ability_name`` and
	        ``ability_description`` columns. Rows whose combined text or
	        simplified label is missing are dropped. As a side effect,
	        ``self.label_dict`` is set to the ``{class index: label name}``
	        mapping produced by the label encoder.

	        Args:
	            data_path: Path passed to ``pandas.read_json(..., lines=True)``.

	        Returns:
	            Tuple ``(tokenized_train, tokenized_test)``.
	        """
	        df = pd.read_json(data_path, lines=True)
	        df['ability_type_simplified'] = df['ability_type'].apply(self.simplify_ability)
	        df['ability_description'] = self._normalize_length(df['ability_description'])
	        df[self.text_column_name] = df['ability_name'] + '. ' + df['ability_description']
	        df[self.label_column_name] = df['ability_type_simplified']
	        df = df[[self.text_column_name, self.label_column_name]]
	        df = df.dropna()

	        cleaner = Cleaner()
	        df['text_cleaned'] = df[self.text_column_name].apply(cleaner.clean)

	        le = preprocessing.LabelEncoder()
	        df['label'] = le.fit_transform(df[self.label_column_name].tolist())
	        self.label_dict = dict(enumerate(le.classes_.tolist()))

	        # Seeded like the truncation step so the split is reproducible too.
	        df_train, df_test = train_test_split(
	            df,
	            test_size=self.test_size,
	            stratify=df['label'],
	            random_state=self._RANDOM_SEED,
	        )

	        def tokenize(examples):
	            return self.preprocess_function(self.tokenizer, examples)

	        tokenized_train = Dataset.from_pandas(df_train).map(tokenize, batched=True)
	        tokenized_test = Dataset.from_pandas(df_test).map(tokenize, batched=True)

	        return tokenized_train, tokenized_test

	    def load_tokenizer(self):
	        """Load the tokenizer from ``model_path`` if it exists, else the base model."""
	        if huggingface_hub.repo_exists(self.model_path):
	            return AutoTokenizer.from_pretrained(self.model_path)
	        return AutoTokenizer.from_pretrained(self.model_name)

	    def postprocess(self, model_output):
	        """Return the highest-scoring label for each prediction.

	        Args:
	            model_output: Iterable with one entry per input; each entry is an
	                iterable of dicts holding ``'label'`` and ``'score'``.

	        Returns:
	            List with one label per entry.
	        """
	        return [
	            max(pred, key=lambda candidate: candidate['score'])['label']
	            for pred in model_output
	        ]

	    def classify_ability(self, text):
	        """Run the pipeline on ``text`` and return the predicted label(s) as a list."""
	        model_output = self.model(text)
	        return self.postprocess(model_output)