Spaces:
Sleeping
Sleeping
File size: 6,473 Bytes
4475241 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import pandas as pd
import torch
import huggingface_hub
from transformers import (AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
TrainingArguments,
pipeline
)
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from datasets import Dataset
import gc
from .cleaner import Cleaner
from .training_utils import get_class_weights,compute_metrics
from .custom_trainer import CustomTrainer
class JutsuClassifier():
    """Classify jutsu descriptions into their type (Genjutsu / Ninjutsu / Taijutsu).

    If ``model_path`` already exists as a repo on the Hugging Face Hub, the
    fine-tuned model is loaded directly. Otherwise a new model is trained
    from the dataset at ``data_path`` and pushed to the hub.
    """

    def __init__(self,
                 model_path,
                 data_path=None,
                 text_column_name='text',
                 label_column_name='jutsu',
                 model_name="distilbert/distilbert-base-uncased",
                 test_size=0.2,
                 num_labels=3,
                 huggingface_token=None,
                 ):
        """
        Args:
            model_path: Hub repo id (or local path) of the fine-tuned model.
            data_path: Path to a JSON-lines dataset; required only when
                ``model_path`` does not yet exist on the hub.
            text_column_name: Column holding the input text.
            label_column_name: Column created to hold the label names.
            model_name: Base checkpoint used when training from scratch.
            test_size: Fraction of the data held out for evaluation.
            num_labels: Number of target classes.
            huggingface_token: Optional token used to log in to the hub.

        Raises:
            ValueError: If the model must be trained but ``data_path`` is None.
        """
        self.model_path = model_path
        self.data_path = data_path
        self.text_column_name = text_column_name
        self.label_column_name = label_column_name
        self.model_name = model_name
        self.test_size = test_size
        self.num_labels = num_labels
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.huggingface_token = huggingface_token
        if self.huggingface_token is not None:
            huggingface_hub.login(self.huggingface_token)

        self.tokenizer = self.load_tokenizer()

        if not huggingface_hub.repo_exists(self.model_path):
            # Training is required, which needs a local dataset.
            if data_path is None:
                raise ValueError("Data path is required to train the model,since the model path does not exist in huggingface hub")

            train_data, test_data = self.load_data(self.data_path)

            # Class weights are computed over the full (train + test) data
            # so that they reflect the true label distribution.
            train_data_df = train_data.to_pandas()
            test_data_df = test_data.to_pandas()
            all_data = pd.concat([train_data_df, test_data_df]).reset_index(drop=True)
            class_weights = get_class_weights(all_data)

            self.train_model(train_data, test_data, class_weights)

        self.model = self.load_model(self.model_path)

    def load_model(self, model_path):
        """Return a text-classification pipeline that yields all class scores."""
        model = pipeline('text-classification', model=model_path, return_all_scores=True)
        return model

    def train_model(self, train_data, test_data, class_weights):
        """Fine-tune the base model, push it to the hub, and free GPU memory.

        Relies on ``self.label_dict`` having been populated by ``load_data``.
        """
        model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            id2label=self.label_dict,
        )

        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        training_args = TrainingArguments(
            output_dir=self.model_path,
            learning_rate=2e-4,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=5,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            logging_strategy="epoch",
            push_to_hub=True,
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=test_data,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )
        trainer.set_device(self.device)
        trainer.set_class_weights(class_weights)
        trainer.train()

        # Flush memory so the freshly pushed model can be reloaded cleanly.
        del trainer, model
        gc.collect()
        if self.device == 'cuda':
            torch.cuda.empty_cache()

    def simplify_jutsu(self, jutsu):
        """Collapse a detailed jutsu type string into one of three classes.

        Returns None when no known class name appears in ``jutsu``; such
        rows are later removed by ``dropna()`` in ``load_data``.
        """
        if "Genjutsu" in jutsu:
            return "Genjutsu"
        if "Ninjutsu" in jutsu:
            return "Ninjutsu"
        if "Taijutsu" in jutsu:
            return "Taijutsu"

    def preprocess_function(self, tokenizer, examples):
        """Tokenize the cleaned text (padding is deferred to the collator)."""
        return tokenizer(examples['text_cleaned'], truncation=True)

    def load_data(self, data_path):
        """Load, clean, label-encode, split, and tokenize the jutsu dataset.

        Also populates ``self.label_dict`` (id -> label name), which
        ``train_model`` uses for the model's ``id2label`` mapping.

        Returns:
            (tokenized_train, tokenized_test): Hugging Face datasets.
        """
        df = pd.read_json(data_path, lines=True)
        df['jutsu_type_simplified'] = df['jutsu_type'].apply(self.simplify_jutsu)
        df['text'] = df['jutsu_name'] + ". " + df['jutsu_description']
        df[self.label_column_name] = df['jutsu_type_simplified']
        df = df[['text', self.label_column_name]]
        # Drops rows whose jutsu type could not be simplified (None labels).
        df = df.dropna()

        # Clean text
        cleaner = Cleaner()
        df['text_cleaned'] = df[self.text_column_name].apply(cleaner.clean)

        # Encode labels (use the public classes_ attribute, not __dict__)
        le = preprocessing.LabelEncoder()
        le.fit(df[self.label_column_name].tolist())
        self.label_dict = dict(enumerate(le.classes_.tolist()))
        df['label'] = le.transform(df[self.label_column_name].tolist())

        # Train / test split — honor the constructor's test_size
        # (was previously hard-coded to 0.2).
        df_train, df_test = train_test_split(df,
                                             test_size=self.test_size,
                                             stratify=df['label'])

        # Convert pandas to Hugging Face datasets
        train_dataset = Dataset.from_pandas(df_train)
        test_dataset = Dataset.from_pandas(df_test)

        # Tokenize the datasets
        tokenized_train = train_dataset.map(
            lambda examples: self.preprocess_function(self.tokenizer, examples),
            batched=True)
        tokenized_test = test_dataset.map(
            lambda examples: self.preprocess_function(self.tokenizer, examples),
            batched=True)

        return tokenized_train, tokenized_test

    def load_tokenizer(self):
        """Load the tokenizer from the fine-tuned repo if it exists, else the base model."""
        if huggingface_hub.repo_exists(self.model_path):
            tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        else:
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        return tokenizer

    def postprocess(self, model_output):
        """Reduce per-class score lists to the single highest-scoring label each."""
        output = []
        for pred in model_output:
            label = max(pred, key=lambda x: x['score'])['label']
            output.append(label)
        return output

    def classify_jutsu(self, text):
        """Run the pipeline on ``text`` and return the predicted label(s)."""
        model_output = self.model(text)
        predictions = self.postprocess(model_output)
        return predictions