# Analysis_System/text_classification/jutsu_classifier.py
# Uploaded by kankur0007 -- commit 4475241 ("Add application file")
import pandas as pd
import torch
import huggingface_hub
from transformers import (AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
TrainingArguments,
pipeline
)
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from datasets import Dataset
import gc
from .cleaner import Cleaner
from .training_utils import get_class_weights,compute_metrics
from .custom_trainer import CustomTrainer
class JutsuClassifier():
    """Text classifier that assigns a jutsu type label to a description.

    On construction the class checks whether ``model_path`` exists as a
    repository on the Hugging Face Hub. If it does, the fine-tuned model is
    loaded from there. If it does not, a model is fine-tuned from
    ``model_name`` on the JSON-lines file at ``data_path`` (training pushes
    to the Hub, see ``train_model``) and then loaded.
    """

    def __init__(self,
                 model_path,
                 data_path=None,
                 text_column_name='text',
                 label_column_name='jutsu',
                 model_name="distilbert/distilbert-base-uncased",
                 test_size=0.2,
                 num_labels=3,
                 huggingface_token=None
                 ):
        """Load the classifier, training it first when it is not on the Hub.

        Args:
            model_path: Hub repository id of the fine-tuned model; also used
                as the training output directory.
            data_path: Path to the JSON-lines training data. Required only
                when ``model_path`` does not exist on the Hub.
            text_column_name: Name of the column that holds the raw text
                built from the jutsu name and description.
            label_column_name: Name of the column that holds the simplified
                jutsu type.
            model_name: Base checkpoint to fine-tune from.
            test_size: Fraction of the data held out for evaluation.
            num_labels: Number of output classes for the model head.
            huggingface_token: Optional Hub token; when given, it is used to
                log in before any Hub access.

        Raises:
            ValueError: If the model is not on the Hub and ``data_path`` is
                ``None``.
        """
        self.model_path = model_path
        self.data_path = data_path
        self.text_column_name = text_column_name
        self.label_column_name = label_column_name
        self.model_name = model_name
        self.test_size = test_size
        self.num_labels = num_labels
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.huggingface_token = huggingface_token
        if self.huggingface_token is not None:
            huggingface_hub.login(self.huggingface_token)

        self.tokenizer = self.load_tokenizer()

        if not huggingface_hub.repo_exists(self.model_path):
            # check if the data path is provided
            if data_path is None:
                raise ValueError("Data path is required to train the model,since the model path does not exist in huggingface hub")

            train_data, test_data = self.load_data(self.data_path)

            # Class weights are computed over the full data set (train + test).
            train_data_df = train_data.to_pandas()
            test_data_df = test_data.to_pandas()
            all_data = pd.concat([train_data_df, test_data_df]).reset_index(drop=True)
            class_weights = get_class_weights(all_data)

            self.train_model(train_data, test_data, class_weights)

        self.model = self.load_model(self.model_path)

    def load_model(self, model_path):
        """Return a text-classification pipeline for ``model_path``.

        ``return_all_scores=True`` is requested so that ``postprocess`` can
        pick the best label out of the per-label scores itself.
        """
        model = pipeline('text-classification', model=model_path, return_all_scores=True)
        return model

    def train_model(self, train_data, test_data, class_weights):
        """Fine-tune ``self.model_name`` and push the result to the Hub.

        Relies on ``self.label_dict``, which ``load_data`` sets, so
        ``load_data`` must have been called on this instance first.

        Args:
            train_data: Tokenized training dataset.
            test_data: Tokenized evaluation dataset.
            class_weights: Per-class weights handed to the custom trainer.
        """
        model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            id2label=self.label_dict,
        )

        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        training_args = TrainingArguments(
            output_dir=self.model_path,
            learning_rate=2e-4,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=5,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            logging_strategy="epoch",
            push_to_hub=True,
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=test_data,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )

        trainer.set_device(self.device)
        trainer.set_class_weights(class_weights)

        trainer.train()

        # Flush Memory
        del trainer, model
        gc.collect()

        if self.device == 'cuda':
            torch.cuda.empty_cache()

    def simplify_jutsu(self, jutsu):
        """Collapse a raw jutsu type value to one of three base types.

        Checks for "Genjutsu", "Ninjutsu" and "Taijutsu" in that order and
        returns the first one contained in ``jutsu``. Returns ``None`` when
        none matches, or when ``jutsu`` does not support membership tests
        (e.g. a missing value read as NaN/None), so that such rows can be
        dropped later instead of crashing the load.
        """
        try:
            for base_type in ("Genjutsu", "Ninjutsu", "Taijutsu"):
                if base_type in jutsu:
                    return base_type
        except TypeError:
            # NaN / None and other non-container values carry no type.
            return None
        return None

    def preprocess_function(self, tokenizer, examples):
        """Tokenize the ``text_cleaned`` field of a batch, with truncation."""
        return tokenizer(examples['text_cleaned'], truncation=True)

    def load_data(self, data_path):
        """Read, clean, label-encode, split and tokenize the training data.

        The JSON-lines file is expected to provide the columns
        ``jutsu_name``, ``jutsu_description`` and ``jutsu_type``. As a side
        effect, ``self.label_dict`` is set to the ``{class index: label}``
        mapping produced by the label encoder.

        Args:
            data_path: Path to the JSON-lines file.

        Returns:
            A ``(tokenized_train, tokenized_test)`` pair of datasets.
        """
        df = pd.read_json(data_path, lines=True)
        df['jutsu_type_simplified'] = df['jutsu_type'].apply(self.simplify_jutsu)
        df[self.text_column_name] = df['jutsu_name'] + ". " + df['jutsu_description']
        df[self.label_column_name] = df['jutsu_type_simplified']

        # Copy after filtering so the column assignments below operate on an
        # independent frame rather than on a slice of the original one.
        df = df[[self.text_column_name, self.label_column_name]].dropna().copy()

        # Clean Text
        cleaner = Cleaner()
        df['text_cleaned'] = df[self.text_column_name].apply(cleaner.clean)

        # Encode Labels
        le = preprocessing.LabelEncoder()
        labels = df[self.label_column_name].tolist()
        le.fit(labels)
        self.label_dict = dict(enumerate(le.classes_.tolist()))
        df['label'] = le.transform(labels)

        # Train / Test Split (honours the test_size given to the constructor)
        df_train, df_test = train_test_split(
            df,
            test_size=self.test_size,
            stratify=df['label'],
        )

        # Convert Pandas to a hugging face dataset
        train_dataset = Dataset.from_pandas(df_train)
        test_dataset = Dataset.from_pandas(df_test)

        # tokenize the dataset
        def tokenize(examples):
            return self.preprocess_function(self.tokenizer, examples)

        tokenized_train = train_dataset.map(tokenize, batched=True)
        tokenized_test = test_dataset.map(tokenize, batched=True)

        return tokenized_train, tokenized_test

    def load_tokenizer(self):
        """Load the tokenizer from ``model_path`` if it is on the Hub.

        Falls back to the tokenizer of the base checkpoint ``model_name``
        when the fine-tuned repository does not exist yet.
        """
        if huggingface_hub.repo_exists(self.model_path):
            tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        else:
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        return tokenizer

    def postprocess(self, model_output):
        """Return the highest-scoring label for every prediction.

        Args:
            model_output: Iterable of predictions, each a list of dicts with
                ``'label'`` and ``'score'`` keys.

        Returns:
            A list with one label per prediction.
        """
        return [
            max(pred, key=lambda entry: entry['score'])['label']
            for pred in model_output
        ]

    def classify_jutsu(self, text):
        """Classify ``text`` and return the predicted label(s) as a list."""
        model_output = self.model(text)
        predictions = self.postprocess(model_output)
        return predictions