deberta-tool-hallucination-span-detector / selected_span_model_config.json
Ali-Bhai's picture
Add final DeBERTa span-level tool hallucination detector
bf58e9a verified
{
"model_name": "span_deberta_v3_small_sliding_windows",
"base_model_name": "microsoft/deberta-v3-small",
"best_checkpoint_dir": "/content/drive/MyDrive/TLLM/03_models/span_deberta_v3_small_sliding_windows/best_checkpoint",
"last_checkpoint_dir": "/content/drive/MyDrive/TLLM/03_models/span_deberta_v3_small_sliding_windows/last_checkpoint",
"best_epoch": 8,
"selected_config": {
"threshold": 0.5,
"min_span_chars": 1,
"min_span_tokens": 1,
"merge_gap_chars": 1,
"strip_predicted_span_whitespace": true,
"drop_spans_without_alnum": true,
"score_name": "sum_non_O_probability"
},
"labels": [
"O",
"tool_output_conflict",
"overgeneration",
"missing_tool_action_recommendation"
],
"label2id": {
"O": 0,
"tool_output_conflict": 1,
"overgeneration": 2,
"missing_tool_action_recommendation": 3
},
"id2label": {
"0": "O",
"1": "tool_output_conflict",
"2": "overgeneration",
"3": "missing_tool_action_recommendation"
},
"final_validation_token_metrics": {
"O": {
"precision": 0.9997869091167383,
"recall": 0.9987582487759881,
"f1": 0.9992723142182701,
"support": 28186,
"tp": 28151,
"fp": 6,
"fn": 35
},
"tool_output_conflict": {
"precision": 0.7666666666666667,
"recall": 0.9504132231404959,
"f1": 0.8487084870848709,
"support": 121,
"tp": 115,
"fp": 35,
"fn": 6
},
"overgeneration": {
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"support": 428,
"tp": 428,
"fp": 0,
"fn": 0
},
"missing_tool_action_recommendation": {
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"support": 892,
"tp": 892,
"fp": 0,
"fn": 0
},
"accuracy": 0.9986161271812873,
"macro_f1": 0.9619952003257852,
"weighted_f1": 0.9986898158940638,
"support": 29627
},
"final_test_token_metrics": {
"O": {
"precision": 0.9994592359321818,
"recall": 0.9951225692025084,
"f1": 0.9972861881258828,
"support": 31574,
"tp": 31420,
"fp": 17,
"fn": 154
},
"tool_output_conflict": {
"precision": 0.4188679245283019,
"recall": 0.8671875,
"f1": 0.564885496183206,
"support": 128,
"tp": 111,
"fp": 154,
"fn": 17
},
"overgeneration": {
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"support": 406,
"tp": 406,
"fp": 0,
"fn": 0
},
"missing_tool_action_recommendation": {
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"support": 925,
"tp": 925,
"fp": 0,
"fn": 0
},
"accuracy": 0.9948233584597221,
"macro_f1": 0.8905429210772722,
"weighted_f1": 0.9957200208094352,
"support": 33033
},
"final_validation_metrics": {
"config": {
"threshold": 0.5,
"min_span_chars": 1,
"min_span_tokens": 1,
"merge_gap_chars": 1,
"strip_predicted_span_whitespace": true,
"drop_spans_without_alnum": true,
"score_name": "sum_non_O_probability"
},
"num_rows": 214,
"row_multiclass_metrics": {
"clean": {
"precision": 1.0,
"recall": 0.9611650485436893,
"f1": 0.9801980198019802,
"support": 103,
"tp": 99,
"fp": 0,
"fn": 4
},
"tool_output_conflict": {
"precision": 0.9024390243902439,
"recall": 1.0,
"f1": 0.9487179487179488,
"support": 37,
"tp": 37,
"fp": 4,
"fn": 0
},
"overgeneration": {
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"support": 37,
"tp": 37,
"fp": 0,
"fn": 0
},
"missing_tool_action_recommendation": {
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"support": 37,
"tp": 37,
"fp": 0,
"fn": 0
},
"accuracy": 0.9813084112149533,
"macro_f1": 0.9822289921299823,
"weighted_f1": 0.9816026174867667,
"support": 214
},
"binary_example_metrics": {
"accuracy": 0.9813084112149533,
"precision": 0.9652173913043478,
"recall": 1.0,
"f1": 0.9823008849557522,
"tp": 111,
"fp": 4,
"fn": 0,
"tn": 99
},
"exact_span_metrics": {
"precision": 0.8333333333333334,
"recall": 0.9009009009009009,
"f1": 0.8658008658008659,
"matched": 100,
"gold_total": 111,
"pred_total": 120
},
"overlap_span_metrics_iou_0_01": {
"precision": 0.925,
"recall": 1.0,
"f1": 0.961038961038961,
"matched": 111,
"gold_total": 111,
"pred_total": 120,
"iou_threshold": 0.01
},
"overlap_span_metrics_iou_0_50": {
"precision": 0.9083333333333333,
"recall": 0.9819819819819819,
"f1": 0.9437229437229437,
"matched": 109,
"gold_total": 111,
"pred_total": 120,
"iou_threshold": 0.5
},
"char_micro_metrics": {
"precision": 0.9916501556750636,
"recall": 0.9984326018808778,
"f1": 0.9950298210735586,
"overlap_chars": 7007,
"gold_chars": 7018,
"pred_chars": 7066
},
"per_type_char_micro_metrics": {
"clean": {
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"overlap_chars": 0,
"gold_chars": 0,
"pred_chars": 18
},
"overgeneration": {
"precision": 0.9948006932409013,
"recall": 1.0,
"f1": 0.9973935708079931,
"overlap_chars": 2296,
"gold_chars": 2296,
"pred_chars": 2308
},
"tool_output_conflict": {
"precision": 0.9037656903765691,
"recall": 0.9515418502202643,
"f1": 0.927038626609442,
"overlap_chars": 216,
"gold_chars": 227,
"pred_chars": 239
},
"missing_tool_action_recommendation": {
"precision": 0.998666962897134,
"recall": 1.0,
"f1": 0.9993330369052912,
"overlap_chars": 4495,
"gold_chars": 4495,
"pred_chars": 4501
}
},
"num_gold_spans": 111,
"num_predicted_spans": 120
},
"final_test_metrics": {
"config": {
"threshold": 0.5,
"min_span_chars": 1,
"min_span_tokens": 1,
"merge_gap_chars": 1,
"strip_predicted_span_whitespace": true,
"drop_spans_without_alnum": true,
"score_name": "sum_non_O_probability"
},
"num_rows": 207,
"row_multiclass_metrics": {
"clean": {
"precision": 0.9777777777777777,
"recall": 0.9166666666666666,
"f1": 0.946236559139785,
"support": 96,
"tp": 88,
"fp": 2,
"fn": 8
},
"tool_output_conflict": {
"precision": 0.813953488372093,
"recall": 0.9459459459459459,
"f1": 0.875,
"support": 37,
"tp": 35,
"fp": 8,
"fn": 2
},
"overgeneration": {
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"support": 37,
"tp": 37,
"fp": 0,
"fn": 0
},
"missing_tool_action_recommendation": {
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"support": 37,
"tp": 37,
"fp": 0,
"fn": 0
},
"accuracy": 0.9516908212560387,
"macro_f1": 0.9553091397849462,
"weighted_f1": 0.9527232351566153,
"support": 207
},
"binary_example_metrics": {
"accuracy": 0.9516908212560387,
"precision": 0.9316239316239316,
"recall": 0.9819819819819819,
"f1": 0.9561403508771931,
"tp": 109,
"fp": 8,
"fn": 2,
"tn": 88
},
"exact_span_metrics": {
"precision": 0.7357142857142858,
"recall": 0.9279279279279279,
"f1": 0.8207171314741037,
"matched": 103,
"gold_total": 111,
"pred_total": 140
},
"overlap_span_metrics_iou_0_01": {
"precision": 0.7785714285714286,
"recall": 0.9819819819819819,
"f1": 0.8685258964143425,
"matched": 109,
"gold_total": 111,
"pred_total": 140,
"iou_threshold": 0.01
},
"overlap_span_metrics_iou_0_50": {
"precision": 0.75,
"recall": 0.9459459459459459,
"f1": 0.8366533864541832,
"matched": 105,
"gold_total": 111,
"pred_total": 140,
"iou_threshold": 0.5
},
"char_micro_metrics": {
"precision": 0.971356003950896,
"recall": 0.9955169920462762,
"f1": 0.9832881016997572,
"overlap_chars": 6884,
"gold_chars": 6915,
"pred_chars": 7087
},
"per_type_char_micro_metrics": {
"clean": {
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"overlap_chars": 0,
"gold_chars": 0,
"pred_chars": 72
},
"missing_tool_action_recommendation": {
"precision": 0.9879728843210146,
"recall": 1.0,
"f1": 0.9939500604993949,
"overlap_chars": 4518,
"gold_chars": 4518,
"pred_chars": 4573
},
"tool_output_conflict": {
"precision": 0.7251908396946565,
"recall": 0.8597285067873304,
"f1": 0.7867494824016563,
"overlap_chars": 190,
"gold_chars": 221,
"pred_chars": 262
},
"overgeneration": {
"precision": 0.998165137614679,
"recall": 1.0,
"f1": 0.9990817263544537,
"overlap_chars": 2176,
"gold_chars": 2176,
"pred_chars": 2180
}
},
"num_gold_spans": 111,
"num_predicted_spans": 140
}
}