{ "token_analysis": { "recommended_threshold": 20, "best_f1": 0.805, "best_precision": 0.733, "best_recall": 0.892, "correlation_tokens_hhem": -0.54, "current_threshold": 30, "total_samples": 50, "hallucination_rate": 0.26, "mean_tokens": 181.1, "median_tokens": 128.0, "all_thresholds": [ { "precision": 0.733, "recall": 0.892, "f1": 0.805, "tp": 33, "fp": 12, "tn": 1, "fn": 4, "n_passed": 45, "n_refused": 5, "threshold": 20, "mean_hhem_above": 0.68 }, { "precision": 0.732, "recall": 0.811, "f1": 0.769, "tp": 30, "fp": 11, "tn": 2, "fn": 7, "n_passed": 41, "n_refused": 9, "threshold": 30, "mean_hhem_above": 0.675 }, { "precision": 0.718, "recall": 0.757, "f1": 0.737, "tp": 28, "fp": 11, "tn": 2, "fn": 9, "n_passed": 39, "n_refused": 11, "threshold": 40, "mean_hhem_above": 0.668 }, { "precision": 0.711, "recall": 0.73, "f1": 0.72, "tp": 27, "fp": 11, "tn": 2, "fn": 10, "n_passed": 38, "n_refused": 12, "threshold": 50, "mean_hhem_above": 0.666 }, { "precision": 0.694, "recall": 0.676, "f1": 0.685, "tp": 25, "fp": 11, "tn": 2, "fn": 12, "n_passed": 36, "n_refused": 14, "threshold": 60, "mean_hhem_above": 0.655 }, { "precision": 0.686, "recall": 0.649, "f1": 0.667, "tp": 24, "fp": 11, "tn": 2, "fn": 13, "n_passed": 35, "n_refused": 15, "threshold": 70, "mean_hhem_above": 0.646 }, { "precision": 0.667, "recall": 0.595, "f1": 0.629, "tp": 22, "fp": 11, "tn": 2, "fn": 15, "n_passed": 33, "n_refused": 17, "threshold": 80, "mean_hhem_above": 0.639 }, { "precision": 0.667, "recall": 0.541, "f1": 0.597, "tp": 20, "fp": 10, "tn": 3, "fn": 17, "n_passed": 30, "n_refused": 20, "threshold": 90, "mean_hhem_above": 0.642 }, { "precision": 0.655, "recall": 0.514, "f1": 0.576, "tp": 19, "fp": 10, "tn": 3, "fn": 18, "n_passed": 29, "n_refused": 21, "threshold": 100, "mean_hhem_above": 0.631 }, { "precision": 0.615, "recall": 0.432, "f1": 0.508, "tp": 16, "fp": 10, "tn": 3, "fn": 21, "n_passed": 26, "n_refused": 24, "threshold": 110, "mean_hhem_above": 0.616 }, { "precision": 0.615, "recall": 0.432, "f1": 0.508, "tp": 16, "fp": 10, "tn": 3, "fn": 21, "n_passed": 26, "n_refused": 24, "threshold": 120, "mean_hhem_above": 0.616 }, { "precision": 0.6, "recall": 0.405, "f1": 0.484, "tp": 15, "fp": 10, "tn": 3, "fn": 22, "n_passed": 25, "n_refused": 25, "threshold": 130, "mean_hhem_above": 0.604 }, { "precision": 0.583, "recall": 0.378, "f1": 0.459, "tp": 14, "fp": 10, "tn": 3, "fn": 23, "n_passed": 24, "n_refused": 26, "threshold": 140, "mean_hhem_above": 0.593 }, { "precision": 0.571, "recall": 0.324, "f1": 0.414, "tp": 12, "fp": 9, "tn": 4, "fn": 25, "n_passed": 21, "n_refused": 29, "threshold": 150, "mean_hhem_above": 0.589 }, { "precision": 0.5, "recall": 0.243, "f1": 0.327, "tp": 9, "fp": 9, "tn": 4, "fn": 28, "n_passed": 18, "n_refused": 32, "threshold": 160, "mean_hhem_above": 0.534 }, { "precision": 0.471, "recall": 0.216, "f1": 0.296, "tp": 8, "fp": 9, "tn": 4, "fn": 29, "n_passed": 17, "n_refused": 33, "threshold": 170, "mean_hhem_above": 0.508 }, { "precision": 0.438, "recall": 0.189, "f1": 0.264, "tp": 7, "fp": 9, "tn": 4, "fn": 30, "n_passed": 16, "n_refused": 34, "threshold": 180, "mean_hhem_above": 0.481 }, { "precision": 0.438, "recall": 0.189, "f1": 0.264, "tp": 7, "fp": 9, "tn": 4, "fn": 30, "n_passed": 16, "n_refused": 34, "threshold": 190, "mean_hhem_above": 0.481 } ] }, "chunk_analysis": { "recommended_threshold": 1, "best_f1": 0.851, "best_precision": 0.74, "best_recall": 1.0, "correlation_chunks_hhem": -0.514, "current_threshold": 1, "mean_chunks": 2.06, "all_thresholds": [ { "precision": 0.74, "recall": 1.0, "f1": 0.851, "tp": 37, "fp": 13, "tn": 0, "fn": 0, "n_passed": 50, "n_refused": 0, "threshold": 1, "mean_hhem_above": 0.679 }, { "precision": 0.522, "recall": 0.324, "f1": 0.4, "tp": 12, "fp": 11, "tn": 2, "fn": 25, "n_passed": 23, "n_refused": 27, "threshold": 2, "mean_hhem_above": 0.539 }, { "precision": 0.308, "recall": 0.108, "f1": 0.16, "tp": 4, "fp": 9, "tn": 4, "fn": 33, "n_passed": 13, "n_refused": 37, "threshold": 3, "mean_hhem_above": 0.398 }, { "precision": 0.375, "recall": 0.081, "f1": 0.133, "tp": 3, "fp": 5, "tn": 8, "fn": 34, "n_passed": 8, "n_refused": 42, "threshold": 4, "mean_hhem_above": 0.427 }, { "precision": 0.25, "recall": 0.027, "f1": 0.049, "tp": 1, "fp": 3, "tn": 10, "fn": 36, "n_passed": 4, "n_refused": 46, "threshold": 5, "mean_hhem_above": 0.402 } ] }, "combined_analysis": { "recommended_token_threshold": 20, "recommended_chunk_threshold": 1, "best_f1": 0.805, "best_precision": 0.733, "best_recall": 0.892, "current_token_threshold": 30, "current_chunk_threshold": 1, "all_combinations": [ { "precision": 0.733, "recall": 0.892, "f1": 0.805, "tp": 33, "fp": 12, "tn": 1, "fn": 4, "n_passed": 45, "n_refused": 5, "token_threshold": 20, "chunk_threshold": 1 }, { "precision": 0.522, "recall": 0.324, "f1": 0.4, "tp": 12, "fp": 11, "tn": 2, "fn": 25, "n_passed": 23, "n_refused": 27, "token_threshold": 20, "chunk_threshold": 2 }, { "precision": 0.308, "recall": 0.108, "f1": 0.16, "tp": 4, "fp": 9, "tn": 4, "fn": 33, "n_passed": 13, "n_refused": 37, "token_threshold": 20, "chunk_threshold": 3 }, { "precision": 0.732, "recall": 0.811, "f1": 0.769, "tp": 30, "fp": 11, "tn": 2, "fn": 7, "n_passed": 41, "n_refused": 9, "token_threshold": 30, "chunk_threshold": 1 }, { "precision": 0.522, "recall": 0.324, "f1": 0.4, "tp": 12, "fp": 11, "tn": 2, "fn": 25, "n_passed": 23, "n_refused": 27, "token_threshold": 30, "chunk_threshold": 2 }, { "precision": 0.308, "recall": 0.108, "f1": 0.16, "tp": 4, "fp": 9, "tn": 4, "fn": 33, "n_passed": 13, "n_refused": 37, "token_threshold": 30, "chunk_threshold": 3 }, { "precision": 0.718, "recall": 0.757, "f1": 0.737, "tp": 28, "fp": 11, "tn": 2, "fn": 9, "n_passed": 39, "n_refused": 11, "token_threshold": 40, "chunk_threshold": 1 }, { "precision": 0.522, "recall": 0.324, "f1": 0.4, "tp": 12, "fp": 11, "tn": 2, "fn": 25, "n_passed": 23, "n_refused": 27, "token_threshold": 40, "chunk_threshold": 2 }, { "precision": 0.308, "recall": 0.108, "f1": 0.16, "tp": 4, "fp": 9, "tn": 4, "fn": 33, "n_passed": 13, "n_refused": 37, "token_threshold": 40, "chunk_threshold": 3 }, { "precision": 0.711, "recall": 0.73, "f1": 0.72, "tp": 27, "fp": 11, "tn": 2, "fn": 10, "n_passed": 38, "n_refused": 12, "token_threshold": 50, "chunk_threshold": 1 }, { "precision": 0.522, "recall": 0.324, "f1": 0.4, "tp": 12, "fp": 11, "tn": 2, "fn": 25, "n_passed": 23, "n_refused": 27, "token_threshold": 50, "chunk_threshold": 2 }, { "precision": 0.308, "recall": 0.108, "f1": 0.16, "tp": 4, "fp": 9, "tn": 4, "fn": 33, "n_passed": 13, "n_refused": 37, "token_threshold": 50, "chunk_threshold": 3 }, { "precision": 0.667, "recall": 0.595, "f1": 0.629, "tp": 22, "fp": 11, "tn": 2, "fn": 15, "n_passed": 33, "n_refused": 17, "token_threshold": 75, "chunk_threshold": 1 }, { "precision": 0.522, "recall": 0.324, "f1": 0.4, "tp": 12, "fp": 11, "tn": 2, "fn": 25, "n_passed": 23, "n_refused": 27, "token_threshold": 75, "chunk_threshold": 2 }, { "precision": 0.308, "recall": 0.108, "f1": 0.16, "tp": 4, "fp": 9, "tn": 4, "fn": 33, "n_passed": 13, "n_refused": 37, "token_threshold": 75, "chunk_threshold": 3 }, { "precision": 0.655, "recall": 0.514, "f1": 0.576, "tp": 19, "fp": 10, "tn": 3, "fn": 18, "n_passed": 29, "n_refused": 21, "token_threshold": 100, "chunk_threshold": 1 }, { "precision": 0.524, "recall": 0.297, "f1": 0.379, "tp": 11, "fp": 10, "tn": 3, "fn": 26, "n_passed": 21, "n_refused": 29, "token_threshold": 100, "chunk_threshold": 2 }, { "precision": 0.308, "recall": 0.108, "f1": 0.16, "tp": 4, "fp": 9, "tn": 4, "fn": 33, "n_passed": 13, "n_refused": 37, "token_threshold": 100, "chunk_threshold": 3 } ] } }