# Scraped page header (not part of the program): Spaces: Sleeping; File size: 4,269 Bytes; revision 69a2c97; 122 lines
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import os
# Experiment configuration: the checkpoint to fine-tune and the input CSV.
MODEL_NAME = "allegro/herbert-base-cased"
DATA_FILE = "augmented_dataset.csv"

# Turn off Weights & Biases logging and the tokenizer parallelism warnings.
os.environ.update({
    "WANDB_DISABLED": "true",
    "TOKENIZERS_PARALLELISM": "false",
})

print("Starting the classifier evaluation on HerBERT model.")
# 1. DATA LOADING AND PREPARATION
# Expects DATA_FILE to provide the columns used below:
# 'id', 'text', 'label' and 'is_synthetic'.
df = pd.read_csv(DATA_FILE)

# Separate original and synthetic datasets. The explicit comparisons are
# element-wise pandas operations; rows where the flag is missing land in
# neither subset.
df_orig = df[df['is_synthetic'] == False].copy()  # noqa: E712
df_aug = df[df['is_synthetic'] == True].copy()  # noqa: E712

# Map text labels to numerical indices (order of first appearance in the
# original data).
label_mapping = {label: idx for idx, label in enumerate(df_orig['label'].unique())}
df_orig['label_idx'] = df_orig['label'].map(label_mapping)
df_aug['label_idx'] = df_aug['label'].map(label_mapping)

# A synthetic row whose label never occurs in the original data maps to NaN.
# That would turn the whole label column into floats once it is concatenated
# with the training set, so drop such rows and keep the column integer-typed.
unmapped = df_aug['label_idx'].isna()
if unmapped.any():
    print(
        f"Warning: dropping {int(unmapped.sum())} synthetic samples "
        "whose label is absent from the original data."
    )
    df_aug = df_aug[~unmapped].copy()
df_aug['label_idx'] = df_aug['label_idx'].astype(int)

# Split original data into training (80%) and testing (20%) sets,
# stratified by label so both parts keep the class proportions.
train_orig, test_data = train_test_split(
    df_orig,
    test_size=0.2,
    random_state=42,
    stratify=df_orig['label_idx'],
)

# DATA LEAKAGE PROTECTION
# Ensure synthetic samples are only used if their original source is in the
# training set. The source id is the part of the synthetic id before the
# first underscore. A set gives O(1) membership checks.
train_orig_ids = set(train_orig['id'].astype(str))
aug_source_ids = df_aug['id'].astype(str).str.split('_').str[0]
df_aug_filtered = df_aug[aug_source_ids.isin(train_orig_ids)]

# Combine original training data with valid augmented data
train_augmented = pd.concat([train_orig, df_aug_filtered], ignore_index=True)

print("\nExperiment structure:")
print(f" - Test dataset (immutable): {len(test_data)} samples")
print(f" - Training BASELINE: {len(train_orig)} samples")
print(f" - Training AUGMENTED: {len(train_augmented)} samples")
# 2. TOKENIZATION
# Load the tokenizer that belongs to the MODEL_NAME checkpoint; tokenize_data
# below reads it as a module-level global.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def tokenize_data(data_df):
    """Convert a labelled DataFrame into a tokenized ``datasets.Dataset``.

    Args:
        data_df: DataFrame with a ``text`` column and a ``label_idx`` column.

    Returns:
        The dataset with the tokenizer outputs added by ``map`` and the
        ``label_idx`` column renamed to ``labels``.
    """
    # preserve_index=False: frames produced by train_test_split carry a
    # shuffled, non-default index, which from_pandas would otherwise store as
    # an extra ``__index_level_0__`` column.
    dataset = Dataset.from_pandas(
        data_df[['text', 'label_idx']],
        preserve_index=False,
    )

    def encode(batch):
        # Truncate/pad every text to a fixed length of 128 tokens.
        return tokenizer(
            batch['text'],
            truncation=True,
            padding='max_length',
            max_length=128,
        )

    return dataset.map(encode, batched=True).rename_column("label_idx", "labels")
print("\nTokenizing datasets...")
# Tokenize the held-out test split first, then both training variants.
test_dataset, train_orig_dataset, train_aug_dataset = [
    tokenize_data(frame) for frame in (test_data, train_orig, train_augmented)
]
# 3. METRICS EVALUATION
def compute_metrics(pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores that are arg-maxed over the last axis).

    Returns:
        dict with the keys ``accuracy`` and ``f1_macro``.
    """
    reference = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(reference, predicted),
        'f1_macro': f1_score(reference, predicted, average='macro'),
    }
# 4. TRAINING ENGINE
def train_and_evaluate(train_ds, test_ds, output_dir):
    """Fine-tune a fresh classifier on ``train_ds`` and score it on ``test_ds``.

    Args:
        train_ds: Tokenized training dataset.
        test_ds: Tokenized evaluation dataset; it is evaluated at the end of
            every epoch and once more after training.
        output_dir: Output directory passed to ``TrainingArguments``.

    Returns:
        The metrics dict returned by ``Trainer.evaluate()``.
    """

    def model_init():
        # The Trainer applies ``training_args.seed`` before calling this, so
        # the randomly initialised classification head is the same on every
        # run. Building the model up front instead leaves that initialisation
        # unseeded, which makes the baseline and augmented runs differ for
        # reasons unrelated to the data. The Trainer calls this once on
        # construction and again at the start of ``train()``.
        return AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=len(label_mapping),
        )

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",
        save_strategy="no",  # no checkpoints are written
        logging_dir='./logs',
        report_to="none",
    )
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    return trainer.evaluate()
# 5. EXPERIMENT EXECUTION
# Both runs are scored on the same test dataset; only the training set differs.
banner = "=" * 50

print("\n" + banner, "STEP 1: Training the BASELINE model", banner, sep="\n")
base_metrics = train_and_evaluate(train_orig_dataset, test_dataset, "./results_base")

print("\n" + banner, "STEP 2: Training the AUGMENTED model", banner, sep="\n")
aug_metrics = train_and_evaluate(train_aug_dataset, test_dataset, "./results_aug")
# 6. RESULTS OUTPUT
# Print a small comparison table: each metric as a percentage for both runs,
# plus the augmented-minus-baseline difference in percentage points.
rule = "-" * 52

print("\n\n" + "FINAL EXPERIMENT RESULTS".center(50))
print(rule)
print("Metric | Baseline | Augmented | Change")
print(rule)

# Macro-F1 row
base_f1 = 100 * base_metrics['eval_f1_macro']
aug_f1 = 100 * aug_metrics['eval_f1_macro']
diff_f1 = aug_f1 - base_f1
print(f"Macro-F1 | {base_f1:10.2f}% | {aug_f1:10.2f}% | {diff_f1:+5.2f} pp.")

# Accuracy row
base_acc = 100 * base_metrics['eval_accuracy']
aug_acc = 100 * aug_metrics['eval_accuracy']
diff_acc = aug_acc - base_acc
print(f"Accuracy | {base_acc:10.2f}% | {aug_acc:10.2f}% | {diff_acc:+5.2f} pp.")

print(rule)