import pandas as pd
import os
import json  # NEW: Needed to save the label → ID mapping for the inference side
import torch
import numpy as np # NEW: Needed for math operations on arrays
from torch import nn # NEW: Needed for the custom loss function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight # NEW: Calculates the penalty weights
from sklearn.metrics import accuracy_score, f1_score # NEW: The strict F1 grading system
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
def train_mindguard_model():
    print("🚀 Initializing MindGuard Training Pipeline...")

    # --- BULLETPROOF PATHING ---
    # 1. Find exactly where this train.py script lives
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # 2. Go up two folders (from src/core_model) to find the project root
    project_root = os.path.abspath(os.path.join(script_dir, "../../"))
    # 3. Define the exact absolute paths
    data_path = os.path.join(project_root, "data", "processed", "master_training_data.csv")
    artifacts_dir = os.path.join(project_root, "artifacts", "xlmr_weights")

    # Ensure artifacts directory exists
    os.makedirs(artifacts_dir, exist_ok=True)
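    # A sketch of the repo layout these paths assume (folder names inferred from the code above):
    #   project_root/
    #   ├── data/processed/master_training_data.csv   <- input CSV
    #   ├── artifacts/xlmr_weights/                   <- checkpoints + final model
    #   └── src/core_model/train.py                   <- this script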
print(f"Loading data from {data_path}...")
df = pd.read_csv(data_path, on_bad_lines='skip')
# --- THE FIX: Pandas parsing safety ---
df = df.dropna(subset=['text', 'label'])
df['text'] = df['text'].astype(str)
df['label'] = df['label'].astype(str)
    # --- THE FIX: Data Sanitizer ---
    # 1. Drop any rows where the label is just a number (e.g., "0" or "1")
    df = df[~df['label'].str.isnumeric()]
    # 2. Drop the corrupted 'admi' label
    df = df[df['label'] != 'admi']

    # ⚠️ UNCOMMENT THE LINE BELOW if you don't have a strong GPU and want to do a fast 2-minute test run!
    # df = df.sample(500, random_state=42)
    # 2. Convert text labels (e.g., 'Anxiety') to numbers (e.g., 0, 1, 2)
    label_encoder = LabelEncoder()
    df['label_encoded'] = label_encoder.fit_transform(df['label'])
    num_labels = len(label_encoder.classes_)

    # Save the label mapping so the inference side can decode predictions later
    mapping = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))
    with open(os.path.join(artifacts_dir, "label_mapping.json"), "w") as f:
        json.dump({int(k): v for k, v in mapping.items()}, f, indent=2)
    print(f"Detected {num_labels} unique emotions: {mapping}")
    # 3. Split the data into Training (80%) and Testing (20%)
    # stratify keeps class ratios identical in both splits — without it, a rare emotion could
    # land entirely in the validation set and break the class-weight calculation below
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label_encoded'])
    # --- NEW: CALCULATE PENALTY WEIGHTS FOR IMBALANCED DATA ---
    print("⚖️ Calculating Class Weights for Imbalanced Data...")
    unique_classes = np.unique(train_df['label_encoded'])
    weights = compute_class_weight(class_weight='balanced', classes=unique_classes, y=train_df['label_encoded'])

    # Automatically detect if a GPU is available locally, otherwise use CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    class_weights_tensor = torch.tensor(weights, dtype=torch.float).to(device)
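    # How 'balanced' weighting works (sklearn's formula): weight_c = n_samples / (n_classes * n_c).
    # Worked example with made-up numbers: 10,000 rows, 5 classes, and only 400 'Suicidal' rows
    # gives that class weight 10000 / (5 * 400) = 5.0 — five times the penalty per mistake.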
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)
    # 4. Load the XLM-RoBERTa Tokenizer
    print("Loading XLM-RoBERTa Tokenizer...")
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

    # Function to convert text into numbers
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)
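    # Roughly what one row looks like after tokenization (token IDs below are illustrative, not real):
    #   tokenize_function({'text': ['I feel anxious']})
    #   -> {'input_ids': [[0, 87, 5154, ..., 1, 1]], 'attention_mask': [[1, 1, 1, ..., 0, 0]]}
    # Every row is padded/truncated to exactly 128 tokens so batches stack into rectangular tensors.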
    # Apply tokenization to both datasets
    print("Tokenizing the datasets (converting words to numbers)...")
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_val = val_dataset.map(tokenize_function, batched=True)

    # Rename label column so the Trainer understands it
    tokenized_train = tokenized_train.rename_column("label_encoded", "labels")
    tokenized_val = tokenized_val.rename_column("label_encoded", "labels")

    # --- THE FIX: Strip out the raw text columns so PyTorch only sees numbers ---
    tokenized_train = tokenized_train.remove_columns(["text", "label"])
    tokenized_val = tokenized_val.remove_columns(["text", "label"])

    # Formally convert them to PyTorch tensors
    tokenized_train.set_format("torch")
    tokenized_val.set_format("torch")
    # 5. Load the Deep Learning Model
    print("Loading XLM-RoBERTa Neural Network...")
    model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=num_labels)
    # --- NEW: STRICT SCORING METRICS ---
    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        # F1 Macro forces the AI to prove it learned the rare emotions, not just 'Normal'
        f1 = f1_score(labels, preds, average='macro')
        acc = accuracy_score(labels, preds)
        return {'accuracy': acc, 'f1_macro': f1}
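    # Why macro F1: it averages the per-class F1 scores with equal weight, so a model that nails
    # 'Normal' (the majority class) but never predicts a rare emotion still scores badly.
    # E.g., per-class F1 of [0.95, 0.90, 0.10] gives macro F1 = (0.95 + 0.90 + 0.10) / 3 ≈ 0.65,
    # even though plain accuracy could look deceptively high.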
    # 6. Set up the Training Rules
    training_args = TrainingArguments(
        output_dir=artifacts_dir,              # Uses absolute path
        eval_strategy="epoch",                 # Test the model at the end of every round
        learning_rate=3e-5,                    # UPDATED: Slightly higher to help learn rare classes
        per_device_train_batch_size=16,        # How many sentences to look at once
        num_train_epochs=5,                    # UPDATED: 5 epochs to give more time to study hard emotions
        warmup_steps=500,                      # NEW: Gentle warmup to prevent wild guessing early on
        weight_decay=0.01,
        save_strategy="epoch",
        metric_for_best_model="f1_macro",      # NEW: Tell the AI to prioritize F1 over basic accuracy
        load_best_model_at_end=True,           # NEW: Reload the smartest checkpoint when training ends
        # overwrite_output_dir=True,
    )
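    # With the Trainer's default linear scheduler, the learning rate ramps from 0 up to 3e-5 over
    # the first 500 steps, then decays linearly back toward 0. The warmup stops the randomly
    # initialized classifier head from trashing the pretrained weights with huge early updates.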
    # --- NEW: CUSTOM TRAINER OVERRIDE ---
    class ImbalancedTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            # Grab the actual answers
            labels = inputs.pop("labels")
            # Make a prediction
            outputs = model(**inputs)
            logits = outputs.logits
            # Calculate the error using our Custom Penalty Weights!
            loss_fct = nn.CrossEntropyLoss(weight=class_weights_tensor)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss
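    # What the weighted loss does: plain cross-entropy is -log p(correct class); with weights it
    # becomes -w_c * log p(c), so (continuing the made-up example above) misclassifying a rare-class
    # row hurts 5x more than misclassifying a 'Normal' row. PyTorch's 'mean' reduction also divides
    # by the sum of the weights in the batch, which keeps the loss scale stable across batches.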
    # 7. Start Training!
    trainer = ImbalancedTrainer(          # UPDATED: Using the strict custom trainer
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics,  # NEW: Attach the strict grader
    )
print("🔥 Starting actual model training! (This might take a while depending on your computer)...")
trainer.train()
# 8. Save the final model
final_model_dir = os.path.join(artifacts_dir, "final_mindguard_model")
print(f"✅ Training complete. Saving the brain to {final_model_dir}...")
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
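    # A minimal sketch of reloading the saved model for inference later (the label mapping saved
    # above turns the predicted ID back into an emotion name):
    #   model = XLMRobertaForSequenceClassification.from_pretrained(final_model_dir)
    #   tokenizer = XLMRobertaTokenizer.from_pretrained(final_model_dir)
    #   inputs = tokenizer("I can't sleep and everything feels hopeless", return_tensors="pt")
    #   pred_id = model(**inputs).logits.argmax(-1).item()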
if __name__ == "__main__":
    train_mindguard_model()