import pandas as pd
import os
import json                                          # NEW: Saves the label mapping for inference
import torch
import numpy as np                                   # NEW: Needed for math operations on arrays
from torch import nn                                 # NEW: Needed for the custom loss function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight  # NEW: Calculates the penalty weights
from sklearn.metrics import accuracy_score, f1_score         # NEW: The strict F1 grading system
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

def train_mindguard_model():
    print("🚀 Initializing MindGuard Training Pipeline...")

    # --- BULLETPROOF PATHING ---
    # 1. Find exactly where this train.py script lives
    script_dir = os.path.dirname(os.path.abspath(__file__))
    
    # 2. Go up two folders (from src/core_model) to find the project root
    project_root = os.path.abspath(os.path.join(script_dir, "../../"))
    
    # 3. Define the exact absolute paths
    data_path = os.path.join(project_root, "data", "processed", "master_training_data.csv")
    artifacts_dir = os.path.join(project_root, "artifacts", "xlmr_weights")
    
    # Ensure artifacts directory exists
    os.makedirs(artifacts_dir, exist_ok=True)

    print(f"Loading data from {data_path}...")
    df = pd.read_csv(data_path, on_bad_lines='skip')
    
    # --- THE FIX: Pandas parsing safety ---
    df = df.dropna(subset=['text', 'label'])
    df['text'] = df['text'].astype(str)
    df['label'] = df['label'].astype(str)
    
    # --- THE FIX: Data Sanitizer ---
    # 1. Drop any rows where the label is just a number (e.g., "0" or "1")
    df = df[~df['label'].str.isnumeric()]
    # 2. Drop the corrupted 'admi' label
    df = df[df['label'] != 'admi']

    # ⚠️ UNCOMMENT THE LINE BELOW if you don't have a strong GPU and want to do a fast 2-minute test run!
    # df = df.sample(500, random_state=42) 

    # 2. Convert text labels (e.g., 'Anxiety') to numbers (e.g., 0, 1, 2)
    label_encoder = LabelEncoder()
    df['label_encoded'] = label_encoder.fit_transform(df['label'])
    num_labels = len(label_encoder.classes_)
    
    # Save the label mapping to disk so inference code can decode predictions
    # (label_mapping.json is our naming choice; adjust if your loader differs)
    mapping = {int(idx): label for idx, label in enumerate(label_encoder.classes_)}
    with open(os.path.join(artifacts_dir, "label_mapping.json"), "w") as f:
        json.dump(mapping, f, indent=2, ensure_ascii=False)
    print(f"Detected {num_labels} unique emotions: {mapping}")

    # 3. Split the data into Training (80%) and Testing (20%), stratified so
    # every emotion appears in both splits (otherwise a rare class could land
    # entirely in the test set and the class-weight tensor would be too small)
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label_encoded'])

    # --- NEW: CALCULATE PENALTY WEIGHTS FOR IMBALANCED DATA ---
    print("⚖️ Calculating Class Weights for Imbalanced Data...")
    unique_classes = np.unique(train_df['label_encoded'])
    weights = compute_class_weight(class_weight='balanced', classes=unique_classes, y=train_df['label_encoded'])
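    # 'balanced' assigns each class the weight n_samples / (n_classes * n_c),
    # so mistakes on rare emotions cost more than mistakes on common ones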
    
    # Automatically detect if a GPU is available locally, otherwise use CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    class_weights_tensor = torch.tensor(weights, dtype=torch.float).to(device)

    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    # 4. Load the XLM-RoBERTa Tokenizer
    print("Loading XLM-RoBERTa Tokenizer...")
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
    
    # Convert text into token IDs, padding every example to a fixed 128 tokens
    # (longer posts are truncated) so batch shapes stay uniform
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

    # Apply tokenization to both datasets
    print("Tokenizing the datasets (converting words to numbers)...")
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_val = val_dataset.map(tokenize_function, batched=True)

    # Rename label column so the Trainer understands it
    tokenized_train = tokenized_train.rename_column("label_encoded", "labels")
    tokenized_val = tokenized_val.rename_column("label_encoded", "labels")
    
    # --- THE FIX: Strip out the raw text columns so PyTorch only sees numbers ---
    tokenized_train = tokenized_train.remove_columns(["text", "label"])
    tokenized_val = tokenized_val.remove_columns(["text", "label"])
    
    # Formally convert them to PyTorch tensors
    tokenized_train.set_format("torch")
    tokenized_val.set_format("torch")

    # 5. Load the Deep Learning Model
    print("Loading XLM-RoBERTa Neural Network...")
    model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=num_labels)

    # --- NEW: STRICT SCORING METRICS ---
    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        # F1 Macro forces the AI to prove it learned the rare emotions, not just 'Normal'
        f1 = f1_score(labels, preds, average='macro') 
        acc = accuracy_score(labels, preds)
        return {'accuracy': acc, 'f1_macro': f1}

    # 6. Set up the Training Rules
    training_args = TrainingArguments(
        output_dir=artifacts_dir,                  # Uses absolute path
        eval_strategy="epoch",                     # Test the model at the end of every round
        learning_rate=3e-5,                        # UPDATED: Slightly higher to help learn rare classes
        per_device_train_batch_size=16,            # How many sentences to look at once
        num_train_epochs=5,                        # UPDATED: 5 epochs to give more time to study hard emotions
        warmup_steps=500,                          # NEW: Gentle warmup to prevent wild guessing early on
        weight_decay=0.01,
        save_strategy="epoch",
        metric_for_best_model="f1_macro",          # NEW: Tell AI to prioritize F1 over basic accuracy
        load_best_model_at_end=True                # NEW: Automatically save the smartest brain
        # overwrite_output_dir=True,
    )

    # --- NEW: CUSTOM TRAINER OVERRIDE ---
    class ImbalancedTrainer(Trainer):
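        # The stock Trainer uses the model's own unweighted loss; overriding
        # compute_loss is the supported hook for plugging in our weighted one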
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            # Grab the actual answers
            labels = inputs.pop("labels")
            # Make a prediction
            outputs = model(**inputs)
            logits = outputs.logits
            # Calculate the error using our Custom Penalty Weights!
            loss_fct = nn.CrossEntropyLoss(weight=class_weights_tensor)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss

    # 7. Start Training!
    trainer = ImbalancedTrainer(                   # UPDATED: Using the strict custom trainer
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics            # NEW: Attach the strict grader
    )

    print("🔥 Starting actual model training! (This might take a while depending on your computer)...")
    trainer.train()

    # 8. Save the final model
    final_model_dir = os.path.join(artifacts_dir, "final_mindguard_model")
    print(f"✅ Training complete. Saving the brain to {final_model_dir}...")
    trainer.save_model(final_model_dir)
    tokenizer.save_pretrained(final_model_dir)
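
# --- OPTIONAL: post-training sanity check (a minimal sketch) ---
# Loads the saved weights back and classifies one sentence. The function name
# and the plain argmax decoding are illustrative assumptions for a quick smoke
# test, not part of the training pipeline itself.
def sanity_check_prediction(text, model_dir):
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir)
    model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
        logits = model(**inputs).logits
    # Returns the numeric class id; decode it via the saved label_mapping.json
    return int(logits.argmax(dim=-1))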

if __name__ == "__main__":
    train_mindguard_model()