File size: 10,973 Bytes
6a4dcb6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
"""
Evaluate the Deep Scan DistilBERT model accuracy.
Loads the fine-tuned model from deep_s3_model_hf/ and evaluates it
against a balanced test set built from the same data pipeline.
Outputs: Accuracy, Precision, Recall, F1, Confusion Matrix.
"""

import os
import sys
import random
import time
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

try:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    import torch
except ImportError:
    print("ERROR: transformers and torch are required. Run: pip install transformers torch")
    sys.exit(1)

import deep_ml_engine

# Directory holding the fine-tuned model + tokenizer, located next to this script.
MODEL_DIR = os.path.join(os.path.dirname(__file__), "deep_s3_model_hf")


def predict_batch(texts, tokenizer, model):
    """Classify a batch of texts with a direct forward pass (faster than pipeline).

    Args:
        texts: list of input strings; whitespace-only entries are replaced
            with the placeholder string "empty" before tokenization.
        tokenizer: tokenizer matching the fine-tuned model.
        model: sequence-classification model (should be in eval mode).

    Returns:
        A ``(preds, probs)`` pair: predicted class indices and per-class
        softmax probabilities, both as plain Python lists.
    """
    # Replace blank/whitespace-only inputs with a placeholder token.
    cleaned = ["empty" if not text.strip() else text for text in texts]
    encoded = tokenizer(
        cleaned, padding=True, truncation=True, max_length=128, return_tensors="pt"
    )
    with torch.no_grad():
        logits = model(**encoded).logits
    predictions = torch.argmax(logits, dim=-1).tolist()
    probabilities = torch.softmax(logits, dim=-1).tolist()
    return predictions, probabilities


def evaluate():
    """Run the full accuracy evaluation of the fine-tuned DistilBERT model.

    Steps:
      1. Rebuild the synthetic dataset via ``deep_ml_engine`` (the same
         pipeline used for training).
      2. Assemble a balanced evaluation set from unseen positives, the
         available negatives, and the core keyword/benign samples.
      3. Load the tokenizer and model from ``MODEL_DIR``.
      4. Batch-predict both the training slice and the balanced eval set.
      5. Print accuracy/precision/recall/F1, confusion matrices, per-keyword
         and benign-word breakdowns, and a final summary.

    Side effects: writes the report to stdout; exits with status 1 when the
    model directory is missing.
    """
    # Bail out early if the model has not been trained/saved yet.
    if not os.path.isdir(MODEL_DIR):
        print(f"ERROR: Model directory not found at {MODEL_DIR}")
        print("Train the model first with: python train_deep_model.py")
        sys.exit(1)

    print("=" * 64)
    print("   S3Shastra Deep Scanner β€” DistilBERT Accuracy Evaluation")
    print("=" * 64)

    # ── 1. Build the full dataset ──
    print("\n[1/5] Building evaluation dataset...")
    X_all, y_all = deep_ml_engine.build_dataset_synthetic()

    total_pos = sum(y_all)
    total_neg = len(y_all) - total_pos
    print(f"       Total raw samples:    {len(X_all)}")
    print(f"       Positive (sensitive): {total_pos}")
    print(f"       Negative (safe):      {total_neg}")

    # ── 2. Create balanced evaluation subsets ──
    print("\n[2/5] Creating balanced evaluation sets...")

    # Separate by class
    pos_samples = [(x, y) for x, y in zip(X_all, y_all) if y == 1]
    neg_samples = [(x, y) for x, y in zip(X_all, y_all) if y == 0]

    random.seed(42)
    # NOTE: pos_samples is never read afterwards, but this shuffle advances
    # the seeded RNG stream; dropping it would change every later shuffle
    # (and therefore the eval-set composition), so it is kept for
    # reproducibility of previously reported numbers.
    random.shuffle(pos_samples)
    random.shuffle(neg_samples)

    # Training set: the first 1000 the model actually trained on
    train_X = X_all[:1000]
    train_y = y_all[:1000]
    train_pos = sum(train_y)
    train_neg = len(train_y) - train_pos
    print(f"       Training set: {len(train_X)} samples ({train_pos} pos, {train_neg} neg)")

    # Balanced evaluation set: positives the model didn't train on
    # (index 1000+), capped at 500 below.
    unseen_pos = [(x, y) for x, y in zip(X_all[1000:], y_all[1000:]) if y == 1]

    # For the balanced test, we also add the keyword samples (first ~60) and benign words
    # since these are critical to get right
    keyword_samples = [(x, y) for x, y in zip(X_all[:61], y_all[:61])]  # ~31 keywords + 30 benign

    # If we don't have enough unseen negatives, take from the training negatives too
    all_neg = neg_samples.copy()
    random.shuffle(all_neg)

    balanced_eval = []
    # Add keyword/benign core samples
    balanced_eval.extend(keyword_samples)
    # Add unseen positive samples (cap at 500)
    balanced_eval.extend(unseen_pos[:500])
    # Add all available negatives (they're rare)
    balanced_eval.extend(all_neg[:500])

    random.shuffle(balanced_eval)
    eval_X = [s[0] for s in balanced_eval]
    eval_y = [s[1] for s in balanced_eval]

    eval_pos = sum(eval_y)
    eval_neg = len(eval_y) - eval_pos
    print(f"       Eval set:     {len(eval_X)} samples ({eval_pos} pos, {eval_neg} neg)")

    # ── 3. Load model ──
    print("\n[3/5] Loading fine-tuned DistilBERT model...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
    model.eval()  # disable dropout etc. for deterministic inference
    print("       Model loaded successfully.")

    # ── 4. Run predictions ──
    print(f"\n[4/5] Running predictions...")

    start_time = time.time()

    # A) Training set predictions (measures memorization of seen data)
    print(f"       Evaluating training set ({len(train_X)} samples)...")
    train_preds = []
    batch_size = 32
    for i in range(0, len(train_X), batch_size):
        batch = train_X[i:i+batch_size]
        preds, _ = predict_batch(batch, tokenizer, model)
        train_preds.extend(preds)
        done = min(i + batch_size, len(train_X))
        print(f"         {done}/{len(train_X)}", end="\r")
    print(f"         {len(train_X)}/{len(train_X)} done    ")

    # B) Balanced eval set predictions (measures generalization)
    print(f"       Evaluating balanced test set ({len(eval_X)} samples)...")
    eval_preds = []
    for i in range(0, len(eval_X), batch_size):
        batch = eval_X[i:i+batch_size]
        preds, _ = predict_batch(batch, tokenizer, model)
        eval_preds.extend(preds)
        done = min(i + batch_size, len(eval_X))
        print(f"         {done}/{len(eval_X)}", end="\r")
    print(f"         {len(eval_X)}/{len(eval_X)} done    ")

    elapsed = time.time() - start_time
    total_inferred = len(train_X) + len(eval_X)
    print(f"       Inference complete: {total_inferred} samples in {elapsed:.1f}s ({total_inferred/elapsed:.0f} samples/sec)")

    # ── 5. Calculate & display metrics ──
    print(f"\n[5/5] Computing metrics...\n")

    # ── Training Set Results ──
    y_train_true = np.array(train_y)
    y_train_pred = np.array(train_preds)

    tr_acc = accuracy_score(y_train_true, y_train_pred)
    tr_prec = precision_score(y_train_true, y_train_pred, zero_division=0)
    tr_rec = recall_score(y_train_true, y_train_pred, zero_division=0)
    tr_f1 = f1_score(y_train_true, y_train_pred, zero_division=0)
    tr_cm = confusion_matrix(y_train_true, y_train_pred)

    print("=" * 64)
    print("   TRAINING SET  (first 1000 samples β€” model saw these)")
    print("=" * 64)
    print(f"   Samples:    {len(train_X)} ({train_pos} sensitive, {train_neg} safe)")
    print(f"   Accuracy:   {tr_acc:.4f}  ({tr_acc*100:.2f}%)")
    print(f"   Precision:  {tr_prec:.4f}  ({tr_prec*100:.2f}%)")
    print(f"   Recall:     {tr_rec:.4f}  ({tr_rec*100:.2f}%)")
    print(f"   F1 Score:   {tr_f1:.4f}  ({tr_f1*100:.2f}%)")
    print(f"\n   Confusion Matrix:")
    print(f"                     Predicted Safe   Predicted Sensitive")
    # confusion_matrix shrinks when only one class appears in y_true/y_pred,
    # so guard before indexing into a 2x2 layout.
    if tr_cm.shape == (2, 2):
        print(f"   Actual Safe       {tr_cm[0][0]:>10}       {tr_cm[0][1]:>10}")
        print(f"   Actual Sensitive  {tr_cm[1][0]:>10}       {tr_cm[1][1]:>10}")
    else:
        print(f"   {tr_cm}")

    # ── Balanced Eval Set Results ──
    y_eval_true = np.array(eval_y)
    y_eval_pred = np.array(eval_preds)

    ev_acc = accuracy_score(y_eval_true, y_eval_pred)
    ev_prec = precision_score(y_eval_true, y_eval_pred, zero_division=0)
    ev_rec = recall_score(y_eval_true, y_eval_pred, zero_division=0)
    ev_f1 = f1_score(y_eval_true, y_eval_pred, zero_division=0)
    ev_cm = confusion_matrix(y_eval_true, y_eval_pred)

    print(f"\n{'=' * 64}")
    print("   BALANCED EVALUATION SET  (mixed seen + unseen data)")
    print("=" * 64)
    print(f"   Samples:    {len(eval_X)} ({eval_pos} sensitive, {eval_neg} safe)")
    print(f"   Accuracy:   {ev_acc:.4f}  ({ev_acc*100:.2f}%)")
    print(f"   Precision:  {ev_prec:.4f}  ({ev_prec*100:.2f}%)")
    print(f"   Recall:     {ev_rec:.4f}  ({ev_rec*100:.2f}%)")
    print(f"   F1 Score:   {ev_f1:.4f}  ({ev_f1*100:.2f}%)")
    print(f"\n   Confusion Matrix:")
    print(f"                     Predicted Safe   Predicted Sensitive")
    if ev_cm.shape == (2, 2):
        print(f"   Actual Safe       {ev_cm[0][0]:>10}       {ev_cm[0][1]:>10}")
        print(f"   Actual Sensitive  {ev_cm[1][0]:>10}       {ev_cm[1][1]:>10}")
    else:
        print(f"   {ev_cm}")

    print(f"\n   Classification Report:")
    print(classification_report(y_eval_true, y_eval_pred, target_names=["Safe (0)", "Sensitive (1)"], zero_division=0))

    # ── Keyword-level analysis: every sensitive keyword must be caught ──
    print("=" * 64)
    print("   KEYWORD-LEVEL ANALYSIS")
    print("=" * 64)
    print("   Testing each sensitive keyword individually:\n")
    kw_correct = 0
    kw_total = len(deep_ml_engine.SENSITIVE_KEYWORDS)
    for kw in deep_ml_engine.SENSITIVE_KEYWORDS:
        preds, probs = predict_batch([kw.lower()], tokenizer, model)
        pred = preds[0]
        conf = probs[0][pred] * 100  # confidence of the predicted class, in %
        status = "CORRECT" if pred == 1 else "MISSED"
        icon = "+" if pred == 1 else "X"
        if pred == 1:
            kw_correct += 1
        print(f"   [{icon}] {kw:<30s} -> {'Sensitive' if pred==1 else 'Safe':>10s} ({conf:.1f}% conf) [{status}]")

    print(f"\n   Keywords detected: {kw_correct}/{kw_total} ({kw_correct/kw_total*100:.1f}%)")

    # ── Benign word analysis: these must NOT be flagged (false positives) ──
    benign_words = ["app", "main", "index", "style", "script", "logo", "banner", "test", "data", "public"]
    print(f"\n   Testing benign/safe words:\n")
    bn_correct = 0
    for bw in benign_words:
        preds, probs = predict_batch([bw], tokenizer, model)
        pred = preds[0]
        conf = probs[0][pred] * 100
        status = "CORRECT" if pred == 0 else "FALSE POS"
        icon = "+" if pred == 0 else "!"
        if pred == 0:
            bn_correct += 1
        print(f"   [{icon}] {bw:<30s} -> {'Safe' if pred==0 else 'Sensitive':>10s} ({conf:.1f}% conf) [{status}]")

    print(f"\n   Benign correct: {bn_correct}/{len(benign_words)} ({bn_correct/len(benign_words)*100:.1f}%)")

    # ── Final Summary ──
    print(f"\n{'=' * 64}")
    print("   FINAL SUMMARY")
    print("=" * 64)
    print(f"   Model:             DistilBERT (distilbert-base-uncased)")
    print(f"   Fine-tuned on:     1000 samples (1 epoch, lr=2e-5)")
    print(f"   Dataset source:    Custom keywords + nvidia/Nemotron-PII")
    print(f"   Inference time:    {elapsed:.1f}s ({total_inferred/elapsed:.0f} samples/sec)")
    print(f"   ──────────────────────────────────────────────────────")
    print(f"   Training Accuracy:     {tr_acc*100:.2f}%")
    print(f"   Balanced Eval Acc:     {ev_acc*100:.2f}%")
    print(f"   Balanced Eval F1:      {ev_f1*100:.2f}%")
    print(f"   Keyword Detection:     {kw_correct}/{kw_total} ({kw_correct/kw_total*100:.1f}%)")
    print(f"   Benign Rejection:      {bn_correct}/{len(benign_words)} ({bn_correct/len(benign_words)*100:.1f}%)")
    print("=" * 64)


# Script entry point: run the evaluation only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    evaluate()