File size: 2,720 Bytes
d064478
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
{
  "model_name_or_path": "shadowops_qwen3_1p7b_model",
  "checkpoint_path": "shadowops_qwen3_1p7b_model",
  "eval_split": "validation",
  "sample_count": 100,
  "dataset_audit": {
    "train_sample_count": 500,
    "val_sample_count": 100,
    "train_val_overlap_count": 0
  },
  "model_metrics": null,
  "q_aware_baseline": {
    "label": "q_aware",
    "sample_count": 100,
    "exact_match": 0.99,
    "safety_accuracy": 1.0,
    "valid_action_rate": 1.0,
    "invalid_action_rate": 0.0,
    "invalid_output_rate": 0.0,
    "parse_failure_rate": 0.0,
    "unsafe_decision_rate": 0.0,
    "false_positive_rate": 0.0,
    "false_negative_rate": 0.0,
    "reward_mean": 1.93683846,
    "reward_std": 0.33417104399464115,
    "allow_precision": 1.0,
    "block_precision": 1.0,
    "fork_precision": 1.0,
    "quarantine_precision": 0.9565217391304348,
    "per_action_accuracy": {
      "ALLOW": 1.0,
      "BLOCK": 1.0,
      "FORK": 0.9642857142857143,
      "QUARANTINE": 1.0
    },
    "confusion_matrix": {
      "ALLOW": {
        "ALLOW": 35,
        "BLOCK": 0,
        "FORK": 0,
        "QUARANTINE": 0,
        "INVALID": 0
      },
      "BLOCK": {
        "ALLOW": 0,
        "BLOCK": 15,
        "FORK": 0,
        "QUARANTINE": 0,
        "INVALID": 0
      },
      "FORK": {
        "ALLOW": 0,
        "BLOCK": 0,
        "FORK": 27,
        "QUARANTINE": 1,
        "INVALID": 0
      },
      "QUARANTINE": {
        "ALLOW": 0,
        "BLOCK": 0,
        "FORK": 0,
        "QUARANTINE": 22,
        "INVALID": 0
      }
    },
    "avg_completion_length": 1.0,
    "action_distribution": {
      "ALLOW": 0.35,
      "BLOCK": 0.15,
      "FORK": 0.27,
      "QUARANTINE": 0.23
    },
    "normalized_action_distribution": {
      "ALLOW": 0.35,
      "BLOCK": 0.15,
      "FORK": 0.27,
      "QUARANTINE": 0.23
    },
    "invalid_output_count": 0,
    "multi_action_warnings": 0,
    "multi_action_warning_rate": 0.0,
    "entropy": 1.9383346690254595,
    "reward_breakdown": {
      "exact_correct": 50,
      "exact_conservative": 49,
      "minor_wrong": 1
    }
  },
  "delta_vs_q_aware": null,
  "training_gate": {
    "training_gate_status": "FAIL",
    "training_gate_passed": false,
    "reason": "No model metrics are available; checkpoint was not loaded or evaluation failed.",
    "recommended_next_action": "Run --evaluate-model with a valid --model-path after SFT/GRPO smoke training."
  },
  "training_gate_status": "FAIL",
  "training_gate_passed": false,
  "reason": "No model metrics are available; checkpoint was not loaded or evaluation failed.",
  "model_load_error": "Model stack unavailable. Check torch, datasets, transformers, trl, unsloth, CUDA, and checkpoint path."
}