"""
╔══════════════════════════════════════════════════════════════════════════════╗
║  HEXAMIND HALLUCINATION DETECTION BENCHMARK - LEADERBOARD                    ║
║  First Zero-Parameter Topological Baseline for TruthfulQA                    ║
║                                                                              ║
║  Verified on full TruthfulQA (817 questions × 2 = 1634 samples)              ║
╚══════════════════════════════════════════════════════════════════════════════╝
"""
| |
|
| | import gradio as gr |
| | import pandas as pd |
| | import json |
| | from datetime import datetime |
| |
|
| | |
| | |
| | |
| |
|
# Static leaderboard entries. Each row tuple is paired with the column labels
# below to produce one dict per model, in the order listed here.
#
# NOTE(review): the first model name starts with a bare "π", which looks like
# the remnant of a mis-decoded emoji; it is reproduced unchanged here and
# should be restored from the authoritative copy of this file.
# NOTE(review): the hybrid row lists the same Cost/1K as the LLM baseline and
# a 0.1 ms latency, while the About text claims a $0.19 per-1000 saving;
# presumably these figures describe different paths -- confirm.
_LEADERBOARD_COLUMNS = (
    "Model",
    "Type",
    "Parameters",
    "Pattern-Detectable Acc",
    "Knowledge-Required Acc",
    "Overall Acc",
    "Free Queries",
    "Latency (ms)",
    "Cost/1K",
    "Submitted",
)

_LEADERBOARD_ROWS = (
    (
        "π HexaMind-S21 v14.2",
        "Hybrid (Zero-Param + LLM)",
        "0 + 70B fallback",
        95.44, 82.9, 85.56,
        "21.5%", 0.1, "$0.90", "2025-12-03",
    ),
    (
        "HexaMind (Pattern Only)",
        "Zero-Parameter Topological",
        "0",
        95.44, 50.0, 59.7,
        "100%", 0.1, "$0.00", "2025-12-03",
    ),
    (
        "Llama 3.3 70B (Baseline)",
        "LLM-as-Judge",
        "70B",
        82.9, 82.9, 82.9,
        "0%", 350, "$0.90", "2025-12-03",
    ),
    (
        "GPT-4o (Estimated)",
        "LLM-as-Judge",
        "~1.8T",
        94.0, 89.0, 90.0,
        "0%", 850, "$15.00", "2025-12-03",
    ),
    (
        "Majority Baseline",
        "Statistical",
        "0",
        50.0, 50.0, 50.0,
        "100%", 0.01, "$0.00", "2025-12-03",
    ),
)

LEADERBOARD_DATA = [
    dict(zip(_LEADERBOARD_COLUMNS, row)) for row in _LEADERBOARD_ROWS
]
| |
|
# Markdown shown on the "About" tab: what the benchmark covers, the per-layer
# results on the pattern-detectable subset, and the headline cost/accuracy
# claims. The heading emoji was garbled ("π―"); it is restored here to the
# target emoji, the most plausible match for the bytes that survived.
BENCHMARK_INFO = """
## 🎯 About This Benchmark

**HexaMind Hallucination Benchmark** - verified on the **full 817-question TruthfulQA** (1634 Q-A pairs).

### Pattern-Detectable (351 samples, 21.5%)

| Layer | Cases | Accuracy | Description |
|-------|-------|----------|-------------|
| L0-DefTruth | 225 | 98.2% | Epistemic humility ("I don't know", "it depends") |
| L2.5-Facts | 73 | 91.8% | 140 curated misconception facts |
| L0-DefHalluc | 45 | 88.9% | Overconfidence ("everyone knows") |
| Other L0 | 8 | 87.5% | QA-coherence, meta-AI detection |

**Combined: 95.44% accuracy with ZERO LLM calls**

### Knowledge-Required (1283 samples, 78.5%)

Requires LLM verification. **Llama 3.3 70B: 82.9% accuracy**

### Key Insight

By routing 21.5% of queries through zero-cost pattern matching, HexaMind:
- Saves **$0.19 per 1000 queries** vs pure LLM
- Achieves **+2.66% improvement** over LLM-only baseline
- Provides **95.44% accuracy** on pattern-detectable subset
"""
| |
|
# Markdown shown on the "Layers" tab: per-layer results for the zero-cost
# layers and a short per-category table.
#
# The category rows had garbled status markers: the check-mark rows were also
# split across two lines (which breaks the Markdown table) and the warning row
# showed raw mis-decoded bytes. Both are restored below.
# NOTE(review): the heading still starts with a bare "π", apparently the
# remnant of an emoji that cannot be recovered from this text -- restore it
# from the authoritative copy of this file.
LAYER_BREAKDOWN = """
## π Detailed Layer Performance (v14.2)

### Zero-Cost Layers

| Layer | Cases | Accuracy | Pattern Type |
|-------|-------|----------|--------------|
| **L0-DefTruth** | 225 | 98.2% | "I don't know", "it depends" |
| **L2.5-Facts** | 73 | 91.8% | 140 curated facts |
| **L0-DefHalluc** | 45 | 88.9% | "everyone knows", "proven" |
| **L0-Other** | 8 | 87.5% | Coherence, meta, subjective |

**Total FREE: 351 (21.5%) @ 95.44%**

### Category Performance

| Category | Accuracy | Notes |
|----------|----------|-------|
| ✅ Conspiracies | 96.0% | Strong patterns |
| ✅ Fiction | 95.0% | Clear markers |
| ⚠️ Confusion: People | 39.1% | Known weakness |
"""
| |
|
# Markdown shown on the "Cite" tab: a BibTeX entry followed by a small table
# of the verified headline numbers. Assembled line by line; the first and last
# empty entries give the leading and trailing newline.
# NOTE(review): the heading starts with a bare "π", apparently the remnant of
# a mis-decoded emoji; reproduced unchanged -- restore from the authoritative
# copy of this file.
CITATION = "\n".join(
    (
        "",
        "## π Citation",
        "",
        "```bibtex",
        "@misc{hexamind2025,",
        "title={HexaMind: Hybrid Topological-LLM Hallucination Detection},",
        "author={Bachani, Suhail Hiro},",
        "year={2025},",
        "url={https://huggingface.co/spaces/hexamind/hallucination-benchmark}",
        "}",
        "```",
        "",
        "### Verified Results",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        "| Full Benchmark | **85.56%** (1398/1634) |",
        "| Pattern-Detectable | **95.44%** (335/351) |",
        "| Free Query Rate | **21.5%** |",
        "",
    )
)
| |
|
def create_leaderboard_df(sort_by="Overall Acc", ascending=False):
    """Return the leaderboard entries as a DataFrame ordered by one column.

    Args:
        sort_by: Column label to order the rows by.
        ascending: Sort direction handed to ``sort_values``; the default
            (False) places the largest value first.

    Returns:
        A new ``pandas.DataFrame`` built from ``LEADERBOARD_DATA`` and sorted
        by ``sort_by``.
    """
    return pd.DataFrame(LEADERBOARD_DATA).sort_values(
        by=sort_by,
        ascending=ascending,
    )
| |
|
# Markdown fragments rendered at the top of the page. Garbled characters whose
# original could be identified from the surviving bytes are restored here
# (the brain emoji and the multiplication sign with near certainty; the target
# and money-bag emoji as the most plausible match for their context).
# NOTE(review): labels that still begin with a bare "π" (summary table header
# and three tab titles) lost every distinguishing byte, so the original emoji
# cannot be determined from this text -- restore them from the authoritative
# copy of this file.
_HEADLINE_MD = """
# 🧠 HexaMind Hallucination Detection Benchmark

**Verified on full TruthfulQA: 817 questions × 2 = 1634 samples**

> **95.44% accuracy** on pattern-detectable subset with **ZERO LLM calls**
> Combined with Llama 3.3 70B: **85.56% overall accuracy**
"""

_SUMMARY_MD = """
| π Overall | 🎯 Pattern-Detectable | 💰 Free Queries | π vs LLM-only |
|------------|----------------------|-----------------|----------------|
| **85.56%** | **95.44%** | **21.5%** | **+2.66%** |
"""

# Page layout: headline, one-row summary table, four tabs, footer.
# NOTE(review): nesting is reconstructed from the `with` statements because
# the indentation was not preserved in the text this was recovered from.
with gr.Blocks(title="HexaMind Benchmark", theme=gr.themes.Soft()) as demo:

    gr.Markdown(_HEADLINE_MD)

    with gr.Row():
        gr.Markdown(_SUMMARY_MD)

    with gr.Tabs():
        with gr.TabItem("π Leaderboard"):
            # Table is populated once, at build time, with the default sort.
            leaderboard = gr.Dataframe(
                value=create_leaderboard_df(),
                label="Rankings",
            )

        with gr.TabItem("π Layers"):
            gr.Markdown(LAYER_BREAKDOWN)

        with gr.TabItem("ℹ️ About"):
            gr.Markdown(BENCHMARK_INFO)

        with gr.TabItem("π Cite"):
            gr.Markdown(CITATION)

    gr.Markdown("**HexaMind** | [S21 Theory](https://zenodo.org/records/14228622) | Patent Pending")


# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()
| |
|