# Spaces:

# aphoticshaman
# /

# g0-detector

# Sleeping

# File size: 7,853 Bytes

"""

G0 Hallucination Detector - Hugging Face Space

Detects when LLMs make things up using 3-criterion grounding analysis.

"""

import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import Optional
import time

# The embedding model is created a single time, at import, and then shared
# by every detection request through the module-level ``model`` name.
_EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

print("Loading embedding model...")
model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
print("Model loaded.")


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    A small epsilon (1e-8) is added to the denominator so that a zero-length
    vector yields 0.0 instead of a division-by-zero.
    """
    dot_product = np.dot(a, b)
    magnitude = np.linalg.norm(a) * np.linalg.norm(b) + 1e-8
    return float(dot_product / magnitude)


def compute_tracking(claim_emb: np.ndarray, source_embs: list[np.ndarray]) -> float:
    """
    TRACKING: Does the claim follow from the sources?

    Scores the claim by its best match: the highest cosine similarity between
    the claim embedding and any single source embedding. With no sources the
    score is 0.0.
    """
    best_match = 0.0
    if source_embs:
        best_match = max(cosine_similarity(claim_emb, emb) for emb in source_embs)
    return float(best_match)


def compute_intervention(claim: str, sources: list[str]) -> float:
    """
    INTERVENTION: Would changing sources change the claim?

    Approximated by keyword overlap: the fraction of distinct claim words
    that also occur somewhere in the sources.

    Words are lower-cased, split on whitespace, and stripped of leading and
    trailing punctuation, so that "lightbulb." in the claim matches
    "lightbulb" in a source. Interior punctuation ("wrought-iron", "world's")
    is left intact.

    Args:
        claim: The claim text.
        sources: Source passages to compare against.

    Returns:
        A value in [0, 1]; 0.0 when the claim contains no words.
    """
    import string  # local import: only this function needs it

    def keywords(text: str) -> set[str]:
        # Drop tokens that were pure punctuation (they strip to "").
        stripped = (token.strip(string.punctuation) for token in text.lower().split())
        return {word for word in stripped if word}

    claim_words = keywords(claim)
    if not claim_words:
        return 0.0

    source_words: set[str] = set()
    for src in sources:
        source_words |= keywords(src)

    return len(claim_words & source_words) / len(claim_words)


def compute_counterfactual(claim_emb: np.ndarray, source_embs: list[np.ndarray]) -> float:
    """
    COUNTERFACTUAL: In worlds without this source, would the claim still hold?

    Approximated by how unique the grounding is: the best source similarity
    is scaled by ``1 - runner_up + 0.1``, so the score is highest when one
    source matches the claim well and the next-best source does not.

    With fewer than two sources there is no runner-up to compare against, so
    the tracking score is returned instead.

    Note:
        The result is not confined to [0, 1]: it can be negative when the
        best similarity is negative, and can exceed 1 when the best
        similarity is close to 1 and the runner-up is close to 0 or below.
    """
    if len(source_embs) < 2:
        return compute_tracking(claim_emb, source_embs)

    similarities = [cosine_similarity(claim_emb, src) for src in source_embs]
    # At least two sources are guaranteed here, so the top two always exist.
    max_sim, second_max = sorted(similarities, reverse=True)[:2]

    # If only one source grounds it well, counterfactual dependence is high
    return max_sim * (1 - second_max + 0.1)


def detect_hallucination(claim: str, sources: str) -> dict:
    """
    Main detection function.

    G0 = (TRACKING × INTERVENTION × COUNTERFACTUAL)^(1/3)

    Each criterion is clamped to [0, 1] before entering the geometric mean.
    Cosine-based criteria can be negative, and the counterfactual criterion
    can exceed 1; without clamping, a negative product raised to 1/3 yields a
    complex number in Python and the threshold comparison below would raise
    TypeError. The per-criterion values in the returned dict are the raw
    (unclamped) scores, rounded to three decimals.

    Args:
        claim: The claim text to check.
        sources: Source passages, one per line; blank lines are ignored.

    Returns:
        A dict with keys ``g0_score``, ``verdict``, ``tracking``,
        ``intervention``, ``counterfactual`` and ``latency_ms``, where the
        grounding score means:

        - 1.0 = fully grounded (not a hallucination)
        - 0.0 = completely ungrounded (hallucination)
    """
    # perf_counter is monotonic, so the latency cannot go negative if the
    # wall clock is adjusted mid-request.
    start = time.perf_counter()

    def elapsed_ms() -> float:
        return round((time.perf_counter() - start) * 1000, 1)

    # Parse sources (one per line)
    source_list = [s.strip() for s in sources.strip().split('\n') if s.strip()]

    if not source_list:
        return {
            "g0_score": 0.0,
            "verdict": "HALLUCINATION (no sources provided)",
            "tracking": 0.0,
            "intervention": 0.0,
            "counterfactual": 0.0,
            "latency_ms": elapsed_ms()
        }

    # Compute embeddings
    claim_emb = model.encode(claim, convert_to_numpy=True)
    source_embs = [model.encode(src, convert_to_numpy=True) for src in source_list]

    # Compute three criteria
    tracking = compute_tracking(claim_emb, source_embs)
    intervention = compute_intervention(claim, source_list)
    counterfactual = compute_counterfactual(claim_emb, source_embs)

    # G0 = geometric mean of the three criteria, each limited to [0, 1] so the
    # product is never negative and the score stays in the documented range.
    bounded = [
        min(1.0, max(0.0, float(score)))
        for score in (tracking, intervention, counterfactual)
    ]
    g0 = (bounded[0] * bounded[1] * bounded[2]) ** (1/3)

    # Determine verdict
    if g0 >= 0.7:
        verdict = "GROUNDED - Claim is well-supported by sources"
    elif g0 >= 0.4:
        verdict = "PARTIAL - Claim has some support but may contain unsupported elements"
    else:
        verdict = "HALLUCINATION - Claim is not supported by provided sources"

    return {
        "g0_score": round(g0, 3),
        "verdict": verdict,
        "tracking": round(tracking, 3),
        "intervention": round(intervention, 3),
        "counterfactual": round(counterfactual, 3),
        "latency_ms": elapsed_ms()
    }


def format_output(result: dict) -> str:
    """Render a detection result dict as the Markdown report shown to the user."""
    score = result['g0_score']
    verdict = result['verdict']
    tracking = result['tracking']
    intervention = result['intervention']
    counterfactual = result['counterfactual']
    latency = result['latency_ms']

    # Three empty lines separate the major sections of the report.
    gap = ["", "", ""]
    report_lines = [
        "## Result",
        *gap,
        f"**G0 Score:** {score} (0 = hallucination, 1 = grounded)",
        *gap,
        f"**Verdict:** {verdict}",
        *gap,
        "### Component Scores",
        "",
        f"- **Tracking:** {tracking} - Does the claim follow from sources?",
        "",
        f"- **Intervention:** {intervention} - Would changing sources change the claim?",
        "",
        f"- **Counterfactual:** {counterfactual} - Is the claim uniquely grounded?",
        *gap,
        f"*Latency: {latency}ms*",
        "",
        "",
    ]
    return "\n".join(report_lines)


def run_detection(claim: str, sources: str) -> str:
    """Gradio callback: validate both text inputs, then return the Markdown report.

    A blank claim or blank sources field short-circuits with a prompt message
    instead of running detection; the claim is checked first.
    """
    required_fields = (
        (claim, "Please enter a claim to check."),
        (sources, "Please enter at least one source (one per line)."),
    )
    for text, prompt in required_fields:
        if not text.strip():
            return prompt

    return format_output(detect_hallucination(claim, sources))


# Example inputs: [claim, sources] pairs, with the individual source passages
# separated by a newline inside the second string.
examples = [
    # Claim restates facts found across both source lines.
    [
        "The Eiffel Tower was built in 1889 and is located in Paris, France.",
        "The Eiffel Tower is a wrought-iron lattice tower in Paris, France.\n"
        "It was constructed from 1887 to 1889 as the entrance arch for the 1889 World's Fair.",
    ],
    # Claim is contradicted by the second source line.
    [
        "The Great Wall of China is visible from space with the naked eye.",
        "The Great Wall of China is a series of fortifications built along the historical northern borders of China.\n"
        "Contrary to popular belief, it is not visible from space with the naked eye under normal conditions.",
    ],
    # Claim restates facts found in the second source line.
    [
        "Python was created by Guido van Rossum in 1991.",
        "Python is a high-level programming language.\n"
        "It was created by Guido van Rossum and first released in 1991.",
    ],
    # Claim attributes the invention to a different person than the sources do.
    [
        "Einstein invented the lightbulb.",
        "Albert Einstein was a theoretical physicist who developed the theory of relativity.\n"
        "Thomas Edison is credited with inventing the practical incandescent lightbulb in 1879.",
    ],
]

# Build Gradio interface
# Structure: an introductory Markdown header, a row with two columns (the
# claim/sources inputs plus the submit button in one, the Markdown result in
# the other), a set of clickable examples, and a Markdown footer.
with gr.Blocks(title="G0 Hallucination Detector", theme=gr.themes.Soft()) as demo:
    # Header: explains what the G0 score is and how to read it.
    gr.Markdown("""

    # G0 Hallucination Detector



    Detect when LLMs make things up. Enter a claim and the sources it should be grounded in.



    **G0 Score:** Geometric mean of three criteria:

    - **Tracking:** Does the claim follow from the sources?

    - **Intervention:** Would changing sources change the claim?

    - **Counterfactual:** In worlds without these sources, would the claim still hold?



    Score ranges: 0.0 (hallucination) → 1.0 (fully grounded)

    """)

    with gr.Row():
        # Input column: the two text fields passed to run_detection.
        with gr.Column():
            claim_input = gr.Textbox(
                label="Claim to verify",
                placeholder="Enter the claim you want to check...",
                lines=2
            )
            sources_input = gr.Textbox(
                label="Sources (one per line)",
                placeholder="Enter source texts, one per line...",
                lines=5
            )
            submit_btn = gr.Button("Detect Hallucination", variant="primary")

        # Output column: receives the Markdown string returned by run_detection.
        with gr.Column():
            output = gr.Markdown(label="Result")

    # Each entry of `examples` supplies values for the claim and sources boxes.
    gr.Examples(
        examples=examples,
        inputs=[claim_input, sources_input],
        label="Try these examples"
    )

    # Clicking the button runs detection on the current inputs and shows the report.
    submit_btn.click(
        fn=run_detection,
        inputs=[claim_input, sources_input],
        outputs=output
    )

    # Footer: support link, a short description of the method, and attribution.
    gr.Markdown("""

    ---



    **Free to use.** If this helps you, consider supporting: **[Cash App $ryancreating](https://cash.app/$ryancreating)**



    ---



    **How it works:** Uses sentence embeddings to measure semantic similarity between claims and sources,

    then computes a 3-criterion grounding metric.



    Built by Crystalline Labs

    """)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()