| """ |
| ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| β HEXAMIND HALLUCINATION BENCHMARK - EVALUATION SCRIPT β |
| β Evaluate your model on Pattern-Detectable vs Knowledge-Required splits β |
| ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| |
| Usage: |
| from hexamind_benchmark import HexaMindBenchmark |
| |
| benchmark = HexaMindBenchmark() |
| results = benchmark.evaluate(your_model_function) |
| """ |

import json
import os
import re
import sys
import time
from dataclasses import dataclass
from datetime import datetime
from typing import Callable, Dict, List


@dataclass
class EvaluationResult:
    """Results from benchmark evaluation"""
    pattern_detectable_accuracy: float
    knowledge_required_accuracy: float
    overall_accuracy: float
    pattern_detectable_samples: int
    knowledge_required_samples: int
    total_samples: int
    avg_latency_ms: float

    def to_dict(self) -> Dict:
        return {
            "pattern_detectable_accuracy": round(self.pattern_detectable_accuracy, 2),
            "knowledge_required_accuracy": round(self.knowledge_required_accuracy, 2),
            "overall_accuracy": round(self.overall_accuracy, 2),
            "pattern_detectable_samples": self.pattern_detectable_samples,
            "knowledge_required_samples": self.knowledge_required_samples,
            "total_samples": self.total_samples,
            "avg_latency_ms": round(self.avg_latency_ms, 2)
        }

    def __str__(self) -> str:
        return f"""
╔════════════════════════════════════════════════════════════════╗
║             HEXAMIND BENCHMARK EVALUATION RESULTS              ║
╠════════════════════════════════════════════════════════════════╣
║  Pattern-Detectable Accuracy: {self.pattern_detectable_accuracy:6.2f}%  (n={self.pattern_detectable_samples:3d})                 ║
║  Knowledge-Required Accuracy: {self.knowledge_required_accuracy:6.2f}%  (n={self.knowledge_required_samples:3d})                 ║
║  ──────────────────────────────────────────────────────────    ║
║  Overall Accuracy:            {self.overall_accuracy:6.2f}%  (n={self.total_samples:3d})                 ║
║  Average Latency:             {self.avg_latency_ms:6.2f} ms                        ║
╚════════════════════════════════════════════════════════════════╝
"""


class HexaMindBenchmark:
    """
    HexaMind Hallucination Detection Benchmark

    Evaluates models on two splits:
    1. Pattern-Detectable: Questions where linguistic patterns reveal hallucinations
    2. Knowledge-Required: Questions requiring factual verification
    """

    def __init__(self, data_dir: str = "data"):
        self.data_dir = data_dir
        self.pattern_detectable = self._load_split("pattern_detectable.json")
        self.knowledge_required = self._load_split("knowledge_required.json")

    def _load_split(self, filename: str) -> List[Dict]:
        """Load a benchmark split from JSON"""
        filepath = os.path.join(self.data_dir, filename)
        if os.path.exists(filepath):
            with open(filepath, 'r') as f:
                return json.load(f)
        else:
            print(f"Warning: {filepath} not found. Using empty list.")
            return []
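
    # Each split file is assumed to be a JSON array of samples shaped like the
    # record below (field names come from evaluate()'s usage; the values here
    # are illustrative only):
    #
    #     {
    #         "question": "Does cracking your knuckles cause arthritis?",
    #         "answer": "No, that's a common misconception.",
    #         "ground_truth": 1    # 1 = trustworthy answer, 0 = hallucination
    #     }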

    def evaluate(
        self,
        model_fn: Callable[[str, str], bool],
        split: str = "all",
        verbose: bool = True
    ) -> EvaluationResult:
        """
        Evaluate a model on the benchmark.

        Args:
            model_fn: Function that takes (question, answer) and returns
                      True if the answer is trustworthy, False if it is a
                      hallucination.
            split: "all", "pattern_detectable", or "knowledge_required"
            verbose: Print progress

        Returns:
            EvaluationResult with accuracy metrics
        """
        # Select the samples for the requested split.
        if split == "all":
            pattern_samples = self.pattern_detectable
            knowledge_samples = self.knowledge_required
        elif split == "pattern_detectable":
            pattern_samples = self.pattern_detectable
            knowledge_samples = []
        elif split == "knowledge_required":
            pattern_samples = []
            knowledge_samples = self.knowledge_required
        else:
            raise ValueError(f"Unknown split: {split}")

        # Evaluate the Pattern-Detectable split.
        pattern_correct = 0
        pattern_total = 0
        latencies = []

        if pattern_samples:
            if verbose:
                print("Evaluating Pattern-Detectable split...")
            for i, sample in enumerate(pattern_samples):
                # perf_counter() is monotonic and high-resolution, so it is
                # more reliable than time.time() for latency measurement.
                start = time.perf_counter()
                prediction = model_fn(sample["question"], sample["answer"])
                latencies.append((time.perf_counter() - start) * 1000)

                # ground_truth == 1 marks a trustworthy answer.
                expected = sample["ground_truth"] == 1
                if prediction == expected:
                    pattern_correct += 1
                pattern_total += 1

                if verbose and (i + 1) % 50 == 0:
                    print(f"  {i + 1}/{len(pattern_samples)}...")

        # Evaluate the Knowledge-Required split.
        knowledge_correct = 0
        knowledge_total = 0

        if knowledge_samples:
            if verbose:
                print("Evaluating Knowledge-Required split...")
            for i, sample in enumerate(knowledge_samples):
                start = time.perf_counter()
                prediction = model_fn(sample["question"], sample["answer"])
                latencies.append((time.perf_counter() - start) * 1000)

                expected = sample["ground_truth"] == 1
                if prediction == expected:
                    knowledge_correct += 1
                knowledge_total += 1

                if verbose and (i + 1) % 100 == 0:
                    print(f"  {i + 1}/{len(knowledge_samples)}...")

        # Aggregate accuracies, guarding against empty splits.
        pattern_acc = (pattern_correct / pattern_total * 100) if pattern_total > 0 else 0
        knowledge_acc = (knowledge_correct / knowledge_total * 100) if knowledge_total > 0 else 0

        total_correct = pattern_correct + knowledge_correct
        total_samples = pattern_total + knowledge_total
        overall_acc = (total_correct / total_samples * 100) if total_samples > 0 else 0

        avg_latency = sum(latencies) / len(latencies) if latencies else 0

        result = EvaluationResult(
            pattern_detectable_accuracy=pattern_acc,
            knowledge_required_accuracy=knowledge_acc,
            overall_accuracy=overall_acc,
            pattern_detectable_samples=pattern_total,
            knowledge_required_samples=knowledge_total,
            total_samples=total_samples,
            avg_latency_ms=avg_latency
        )

        if verbose:
            print(result)

        return result

    def create_submission(
        self,
        result: EvaluationResult,
        model_name: str,
        model_type: str,
        parameters: str,
        cost_per_1k: str = "Unknown",
        contact: str = "",
        paper_link: str = ""
    ) -> Dict:
        """Create a submission JSON for the leaderboard"""
        submission = {
            "model_name": model_name,
            "model_type": model_type,
            "parameters": parameters,
            "pattern_detectable_accuracy": result.pattern_detectable_accuracy,
            "knowledge_required_accuracy": result.knowledge_required_accuracy,
            "overall_accuracy": result.overall_accuracy,
            "latency_ms": result.avg_latency_ms,
            "cost_per_1k": cost_per_1k,
            "submission_date": datetime.now().strftime("%Y-%m-%d"),
            "contact": contact,
            "paper_link": paper_link
        }

        return submission
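
    # Illustrative shape of the returned submission dict (all values below
    # are hypothetical, not measured results):
    #
    #     {
    #         "model_name": "HexaMind-S21",
    #         "model_type": "Zero-Parameter Topological",
    #         "overall_accuracy": 71.3,
    #         "submission_date": "2024-01-01",
    #         ...
    #     }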


class HexaMindBaseline:
    """
    Zero-parameter topological hallucination detector.
    Uses I Ching pattern matching grounded in S21 Vacuum Manifold Theory.
    Concretely, it applies regular-expression heuristics to the answer text.
    """

    # Phrases that typically signal a careful, trustworthy answer
    # (hedging, myth-busting, explicit negation).
    TRUTH_PATTERNS = [
        r"i don'?t know",
        r"it depends",
        r"there is no( scientific)? evidence",
        r"that's a myth",
        r"this is a common misconception",
        r"nothing (in particular |really |unusual )?(happens|occurs|will happen)",
        r"no[,.]? (it |that )?(?:does not|doesn't|is not|isn't|won't|will not)",
    ]

    # Phrases that typically signal confident folklore or overclaiming.
    HALLUC_PATTERNS = [
        r"^yes[,.]? ",
        r"will (bring|cause|give|attract) (you )?(good |bad )?luck",
        r"seven years of bad luck",
        r"(ancient|eastern|chinese) wisdom (says|teaches|tells)",
        r"studies (have )?show(n|s)? that",
        r"everyone knows (that )?",
        r"it'?s (a )?(well[- ])?known fact",
    ]
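
    # Design note: predict() checks TRUTH_PATTERNS before HALLUC_PATTERNS, so
    # an answer matching both lists is scored trustworthy, and answers that
    # match neither list also default to trustworthy.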

    def __init__(self):
        self.truth_re = [re.compile(p, re.IGNORECASE) for p in self.TRUTH_PATTERNS]
        self.halluc_re = [re.compile(p, re.IGNORECASE) for p in self.HALLUC_PATTERNS]

    def predict(self, question: str, answer: str) -> bool:
        """
        Returns True if the answer appears trustworthy, False if it is a
        likely hallucination.
        """
        # Trustworthiness signals take precedence.
        for pattern in self.truth_re:
            if pattern.search(answer):
                return True

        # Then look for hallucination signals.
        for pattern in self.halluc_re:
            if pattern.search(answer):
                return False

        # No pattern matched: default to trusting the answer.
        return True
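
    # Example calls (hypothetical strings, not drawn from the benchmark data):
    #
    #     baseline = HexaMindBaseline()
    #     baseline.predict("What happens if I break a mirror?",
    #                      "Yes, you get seven years of bad luck.")     # False
    #     baseline.predict("Do we use only 10% of our brains?",
    #                      "That's a myth; the whole brain is active.")  # True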
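

# Typical invocations (assuming the split JSON files exist under ./data):
#
#     python hexamind_benchmark.py --model hexamind --split all
#     python hexamind_benchmark.py --model random --output submission.json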
| if __name__ == "__main__": |
| import argparse |
| |
| parser = argparse.ArgumentParser(description="HexaMind Benchmark Evaluation") |
| parser.add_argument("--model", default="hexamind", help="Model to evaluate (hexamind|random)") |
| parser.add_argument("--split", default="all", help="Split to evaluate (all|pattern_detectable|knowledge_required)") |
| parser.add_argument("--output", default=None, help="Output JSON file for submission") |
| |
| args = parser.parse_args() |
| |
| |
| benchmark = HexaMindBenchmark() |
| |
| |
| if args.model == "hexamind": |
| model = HexaMindBaseline() |
| model_fn = model.predict |
| model_name = "HexaMind-S21" |
| model_type = "Zero-Parameter Topological" |
| params = "0" |
| elif args.model == "random": |
| import random |
| model_fn = lambda q, a: random.random() > 0.5 |
| model_name = "Random Baseline" |
| model_type = "Statistical" |
| params = "0" |
| else: |
| print(f"Unknown model: {args.model}") |
| exit(1) |
| |
| |
| result = benchmark.evaluate(model_fn, split=args.split) |
| |
| |
| if args.output: |
| submission = benchmark.create_submission( |
| result, model_name, model_type, params |
| ) |
| with open(args.output, 'w') as f: |
| json.dump(submission, f, indent=2) |
| print(f"Submission saved to {args.output}") |
|
|