# Hugging Face Spaces file view — Space status: Running · File size: 6,104 Bytes · 01a3d35
import pandas as pd
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)
class ToxicCommentDetector:
    """Train, evaluate and query transformer models for multi-label toxicity.

    The detector keeps one model and one tokenizer per entry of
    ``model_configs`` (DistilBERT, RoBERTa, ALBERT).  Every model emits one
    logit per name in ``label_columns``; ``predict`` turns those logits into
    independent probabilities with a sigmoid.
    """

    # NOTE(review): ``ToxicDataset`` and ``compute_metrics`` are used below but
    # are neither defined nor imported in the visible part of this file --
    # confirm where they come from.
    # NOTE(review): several log messages start with "π", which looks like a
    # mis-decoded emoji.  The original glyph cannot be recovered from this
    # file, so that text is kept exactly as found.

    # Sequence length used when a model has no ``max_len`` in ``model_configs``.
    _DEFAULT_MAX_LEN = 128

    def __init__(self):
        """Create empty model/tokenizer registries and the per-model settings."""
        # Both registries are keyed by the model names used in ``model_configs``
        # and are filled by ``load_models`` or ``train_model``.
        self.models = {}
        self.tokenizers = {}
        # Output order of the classification head: logit i belongs to label i.
        self.label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
        self.model_configs = {
            'DistilBERT': {
                'name': 'distilbert-base-uncased',
                'max_len': 128,
                'batch_size': 16,
                'epochs': 3,
                'lr': 2e-5,
            },
            'RoBERTa': {
                'name': 'roberta-base',
                'max_len': 128,
                'batch_size': 8,
                'epochs': 3,
                'lr': 1e-5,
            },
            'ALBERT': {
                'name': 'albert-base-v2',
                'max_len': 128,
                'batch_size': 16,
                'epochs': 3,
                'lr': 3e-5,
            },
        }

    def _max_len(self, model_name):
        """Return the configured ``max_len`` for *model_name* (128 if unset)."""
        return self.model_configs.get(model_name, {}).get('max_len', self._DEFAULT_MAX_LEN)

    def load_models(self):
        """Load the pre-trained checkpoint and tokenizer of every configured model.

        The results are stored in ``self.models`` / ``self.tokenizers`` under
        the configuration key.  The checkpoints named in ``model_configs`` are
        base checkpoints, so the models registered here have not been
        fine-tuned on the toxicity labels.
        """
        for model_name, config in self.model_configs.items():
            print(f"Loading {model_name}...")
            self.models[model_name] = AutoModelForSequenceClassification.from_pretrained(
                config['name'],
                num_labels=len(self.label_columns),
                # Same setting as train_model, so the loss used during
                # evaluation does not depend on the label dtype.
                problem_type="multi_label_classification",
            )
            self.tokenizers[model_name] = AutoTokenizer.from_pretrained(config['name'])
        print("✅ Models and tokenizers loaded successfully!")

    def load_and_preprocess_data(self, file_path):
        """Read a CSV file and run it through ``preprocess.preprocess_data``.

        Args:
            file_path: Path (or anything ``pandas.read_csv`` accepts) of the dataset.

        Returns:
            Whatever ``preprocess_data`` returns for the loaded DataFrame.
        """
        print(f"π Loading dataset from {file_path}...")
        df = pd.read_csv(file_path)
        print(f"✅ Dataset loaded successfully! First few rows:\n{df.head()}")

        # Imported lazily so the class can be used without the preprocessing
        # module when only inference is needed.
        from preprocess import preprocess_data

        df = preprocess_data(df)
        print("✅ Data preprocessing completed!")
        return df

    def train_model(self, model_name, X_train, X_val, y_train, y_val):
        """Fine-tune one configured model and register it for later use.

        Args:
            model_name: Key into ``model_configs``.
            X_train, y_train: Training inputs/labels, passed to ``ToxicDataset``.
            X_val, y_val: Validation inputs/labels, passed to ``ToxicDataset``.

        Returns:
            The metrics dict from ``Trainer.evaluate`` on the validation set;
            it is expected to contain ``eval_auc`` and ``eval_f1``.

        Raises:
            KeyError: If *model_name* is not a key of ``model_configs``.
        """
        print(f"\nπ Training {model_name}...")
        config = self.model_configs[model_name]

        tokenizer = AutoTokenizer.from_pretrained(config['name'])
        model = AutoModelForSequenceClassification.from_pretrained(
            config['name'],
            num_labels=len(self.label_columns),
            problem_type="multi_label_classification",
        )

        train_dataset = ToxicDataset(X_train, y_train, tokenizer, config['max_len'], model_name)
        val_dataset = ToxicDataset(X_val, y_val, tokenizer, config['max_len'], model_name)

        training_args = TrainingArguments(
            output_dir=f'./results_{model_name.lower()}',
            num_train_epochs=config['epochs'],
            per_device_train_batch_size=config['batch_size'],
            per_device_eval_batch_size=config['batch_size'],
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir=f'./logs_{model_name.lower()}',
            logging_steps=100,
            # Evaluation and checkpointing share one schedule, which
            # load_best_model_at_end requires.
            eval_strategy="steps",
            eval_steps=500,
            save_strategy="steps",
            save_steps=500,
            load_best_model_at_end=True,
            metric_for_best_model="auc",
            greater_is_better=True,
            learning_rate=config['lr'],
            adam_epsilon=1e-8,
            max_grad_norm=1.0,
            # Mixed precision only when a CUDA device is present.
            fp16=torch.cuda.is_available(),
            dataloader_num_workers=0,
            save_total_limit=1,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
        )
        trainer.train()

        self.models[model_name] = model
        self.tokenizers[model_name] = tokenizer

        eval_results = trainer.evaluate()
        print(
            f"✅ {model_name} - Validation AUC: {eval_results['eval_auc']:.4f}, "
            f"F1: {eval_results['eval_f1']:.4f}"
        )
        return eval_results

    def predict(self, text, model_name):
        """Score one text with a registered model.

        Args:
            text: The comment to classify.
            model_name: Key of a model present in ``self.models``.

        Returns:
            Dict mapping each name in ``label_columns`` to a float probability
            (sigmoid of the corresponding logit).

        Raises:
            ValueError: If *model_name* has no registered model.
        """
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not trained yet!")

        model = self.models[model_name]
        tokenizer = self.tokenizers[model_name]
        # Run on whatever device the model's weights currently live on.
        device = next(model.parameters()).device

        tokenizer_kwargs = {
            'add_special_tokens': True,
            # Use the length configured for this model rather than a fixed 128.
            'max_length': self._max_len(model_name),
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        # DistilBERT's forward() takes no token_type_ids, so they are only
        # requested for the other architectures.
        if 'distilbert' not in model_name.lower():
            tokenizer_kwargs['return_token_type_ids'] = True

        encoded = tokenizer(text, **tokenizer_kwargs)
        inputs = {key: tensor.to(device) for key, tensor in encoded.items()}

        model.eval()
        with torch.no_grad():
            outputs = model(**inputs)

        # Batch of one: take row 0 and convert to plain Python floats.
        scores = torch.sigmoid(outputs.logits)[0].cpu().tolist()
        return {label: float(scores[i]) for i, label in enumerate(self.label_columns)}

    def evaluate_all_models(self, X_test, y_test):
        """Evaluate every registered model on a held-out test set.

        Args:
            X_test, y_test: Test inputs/labels, passed to ``ToxicDataset``.

        Returns:
            Dict mapping model name to ``{'auc': ..., 'f1': ...}`` taken from
            the ``eval_auc`` / ``eval_f1`` entries of ``Trainer.evaluate``.
        """
        results = {}
        for model_name, model in self.models.items():
            print(f"\nπ Evaluating {model_name} on test set...")
            tokenizer = self.tokenizers[model_name]
            test_dataset = ToxicDataset(
                X_test, y_test, tokenizer, self._max_len(model_name), model_name
            )
            trainer = Trainer(
                model=model,
                compute_metrics=compute_metrics,
            )
            eval_results = trainer.evaluate(test_dataset)
            results[model_name] = {
                'auc': eval_results['eval_auc'],
                'f1': eval_results['eval_f1'],
            }
            print(
                f"π {model_name} - Test AUC: {eval_results['eval_auc']:.4f}, "
                f"F1: {eval_results['eval_f1']:.4f}"
            )
        return results