import pandas as pd
import os
import json                                          # NEW: Saves the label mapping for inference
import torch
import numpy as np                                   # NEW: Needed for math operations on arrays
from torch import nn                                 # NEW: Needed for the custom loss function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight  # NEW: Calculates the penalty weights
from sklearn.metrics import accuracy_score, f1_score         # NEW: The strict F1 grading system
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

def train_mindguard_model():
    print("🚀 Initializing MindGuard Training Pipeline...")

    # --- BULLETPROOF PATHING ---
    # 1. Find exactly where this train.py script lives
    script_dir = os.path.dirname(os.path.abspath(__file__))
    
    # 2. Go up two folders (from src/core_model) to find the project root
    project_root = os.path.abspath(os.path.join(script_dir, "../../"))
    
    # 3. Define the exact absolute paths
    data_path = os.path.join(project_root, "data", "processed", "master_training_data.csv")
    artifacts_dir = os.path.join(project_root, "artifacts", "xlmr_weights")
    
    # Ensure artifacts directory exists
    os.makedirs(artifacts_dir, exist_ok=True)

    print(f"Loading data from {data_path}...")
    df = pd.read_csv(data_path, on_bad_lines='skip')
    
    # --- THE FIX: Pandas parsing safety ---
    df = df.dropna(subset=['text', 'label'])
    df['text'] = df['text'].astype(str)
    df['label'] = df['label'].astype(str)
    
    # --- THE FIX: Data Sanitizer ---
    # 1. Drop any rows where the label is just a number (e.g., "0" or "1")
    df = df[~df['label'].str.isnumeric()]
    # 2. Drop the corrupted 'admi' label
    df = df[df['label'] != 'admi']

    # ⚠️ UNCOMMENT THE LINE BELOW if you don't have a strong GPU and want to do a fast 2-minute test run!
    # df = df.sample(500, random_state=42) 

    # 2. Convert text labels (e.g., 'Anxiety') to numbers (e.g., 0, 1, 2)
    label_encoder = LabelEncoder()
    df['label_encoded'] = label_encoder.fit_transform(df['label'])
    num_labels = len(label_encoder.classes_)
    
    # Save the label mapping to disk so inference code can decode predictions
    # (label_mapping.json is our naming choice; adjust if your loader differs)
    mapping = {int(idx): label for idx, label in enumerate(label_encoder.classes_)}
    with open(os.path.join(artifacts_dir, "label_mapping.json"), "w") as f:
        json.dump(mapping, f, indent=2, ensure_ascii=False)
    print(f"Detected {num_labels} unique emotions: {mapping}")

    # 3. Split the data into Training (80%) and Testing (20%), stratified so
    # every emotion appears in both splits (otherwise a rare class could land
    # entirely in the test set and the class-weight tensor would be too small)
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label_encoded'])

    # --- NEW: CALCULATE PENALTY WEIGHTS FOR IMBALANCED DATA ---
    print("⚖️ Calculating Class Weights for Imbalanced Data...")
    unique_classes = np.unique(train_df['label_encoded'])
    weights = compute_class_weight(class_weight='balanced', classes=unique_classes, y=train_df['label_encoded'])
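    # 'balanced' assigns each class the weight n_samples / (n_classes * n_c),
    # so mistakes on rare emotions cost more than mistakes on common ones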
    
    # Automatically detect if a GPU is available locally, otherwise use CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    class_weights_tensor = torch.tensor(weights, dtype=torch.float).to(device)

    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    # 4. Load the XLM-RoBERTa Tokenizer
    print("Loading XLM-RoBERTa Tokenizer...")
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
    
    # Convert text into token IDs, padding every example to a fixed 128 tokens
    # (longer posts are truncated) so batch shapes stay uniform
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

    # Apply tokenization to both datasets
    print("Tokenizing the datasets (converting words to numbers)...")
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_val = val_dataset.map(tokenize_function, batched=True)

    # Rename label column so the Trainer understands it
    tokenized_train = tokenized_train.rename_column("label_encoded", "labels")
    tokenized_val = tokenized_val.rename_column("label_encoded", "labels")
    
    # --- THE FIX: Strip out the raw text columns so PyTorch only sees numbers ---
    tokenized_train = tokenized_train.remove_columns(["text", "label"])
    tokenized_val = tokenized_val.remove_columns(["text", "label"])
    
    # Formally convert them to PyTorch tensors
    tokenized_train.set_format("torch")
    tokenized_val.set_format("torch")

    # 5. Load the Deep Learning Model
    print("Loading XLM-RoBERTa Neural Network...")
    model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=num_labels)

    # --- NEW: STRICT SCORING METRICS ---
    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        # F1 Macro forces the AI to prove it learned the rare emotions, not just 'Normal'
        f1 = f1_score(labels, preds, average='macro') 
        acc = accuracy_score(labels, preds)
        return {'accuracy': acc, 'f1_macro': f1}

    # 6. Set up the Training Rules
    training_args = TrainingArguments(
        output_dir=artifacts_dir,                  # Uses absolute path
        eval_strategy="epoch",                     # Test the model at the end of every round
        learning_rate=3e-5,                        # UPDATED: Slightly higher to help learn rare classes
        per_device_train_batch_size=16,            # How many sentences to look at once
        num_train_epochs=5,                        # UPDATED: 5 epochs to give more time to study hard emotions
        warmup_steps=500,                          # NEW: Gentle warmup to prevent wild guessing early on
        weight_decay=0.01,
        save_strategy="epoch",
        metric_for_best_model="f1_macro",          # NEW: Tell AI to prioritize F1 over basic accuracy
        load_best_model_at_end=True                # NEW: Automatically save the smartest brain
        # overwrite_output_dir=True,
    )

    # --- NEW: CUSTOM TRAINER OVERRIDE ---
    class ImbalancedTrainer(Trainer):
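        # The stock Trainer uses the model's own unweighted loss; overriding
        # compute_loss is the supported hook for plugging in our weighted one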
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            # Grab the actual answers
            labels = inputs.pop("labels")
            # Make a prediction
            outputs = model(**inputs)
            logits = outputs.logits
            # Calculate the error using our Custom Penalty Weights!
            loss_fct = nn.CrossEntropyLoss(weight=class_weights_tensor)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss

    # 7. Start Training!
    trainer = ImbalancedTrainer(                   # UPDATED: Using the strict custom trainer
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics            # NEW: Attach the strict grader
    )

    print("🔥 Starting actual model training! (This might take a while depending on your computer)...")
    trainer.train()

    # 8. Save the final model
    final_model_dir = os.path.join(artifacts_dir, "final_mindguard_model")
    print(f"✅ Training complete. Saving the brain to {final_model_dir}...")
    trainer.save_model(final_model_dir)
    tokenizer.save_pretrained(final_model_dir)
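
# --- OPTIONAL: post-training sanity check (a minimal sketch) ---
# Loads the saved weights back and classifies one sentence. The function name
# and the plain argmax decoding are illustrative assumptions for a quick smoke
# test, not part of the training pipeline itself.
def sanity_check_prediction(text, model_dir):
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir)
    model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
        logits = model(**inputs).logits
    # Returns the numeric class id; decode it via the saved label_mapping.json
    return int(logits.argmax(dim=-1))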

if __name__ == "__main__":
    train_mindguard_model()