# g0-detector / app.py
# Uploaded by aphoticshaman via huggingface_hub (commit 9b3b8ab, verified)
"""
G0 Hallucination Detector - Hugging Face Space
Detects when LLMs make things up using 3-criterion grounding analysis.
"""
import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import Optional
import time
# Load the embedding model once at module import time so every request reuses
# the same in-memory weights instead of reloading per call.
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')  # sentence-transformers MiniLM encoder
print("Model loaded.")
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    A small epsilon (1e-8) is added to the norm product so that a
    zero-length vector yields ~0 instead of dividing by zero.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b) + 1e-8
    return float(np.dot(a, b) / denom)
def compute_tracking(claim_emb: np.ndarray, source_embs: list[np.ndarray]) -> float:
    """TRACKING criterion: does the claim follow from the sources?

    Scores the claim by its best match over all sources — the maximum
    cosine similarity between the claim embedding and any source
    embedding. Returns 0.0 when there are no sources at all.
    """
    if not source_embs:
        return 0.0
    return float(max(cosine_similarity(claim_emb, emb) for emb in source_embs))
def compute_intervention(claim: str, sources: list[str]) -> float:
    """INTERVENTION criterion: would changing the sources change the claim?

    Approximated lexically: the fraction of the claim's (lowercased,
    whitespace-split) words that also appear somewhere in the sources.
    Returns 0.0 for an empty claim.
    """
    claim_tokens = set(claim.lower().split())
    if not claim_tokens:
        return 0.0
    pooled: set[str] = set()
    for text in sources:
        pooled |= set(text.lower().split())
    return len(claim_tokens & pooled) / len(claim_tokens)
def compute_counterfactual(claim_emb: np.ndarray, source_embs: list[np.ndarray]) -> float:
    """COUNTERFACTUAL criterion: in worlds without this source, would the claim still hold?

    Approximated by how *unique* the grounding is: if only one source
    grounds the claim well (large gap between best and second-best
    similarity), counterfactual dependence is high.

    Bug fix: the raw formula ``max_sim * (1 - second_max + 0.1)`` could
    leave [0, 1] — e.g. 1.1 when a single source matches perfectly and the
    rest are orthogonal, or a negative value for anti-correlated
    embeddings — which corrupted the downstream geometric mean. The result
    is now clamped to [0, 1]. With fewer than two sources the score
    degenerates to the best (clamped) similarity, matching the previous
    fallback to tracking.
    """
    if not source_embs:
        return 0.0

    def _cos(a: np.ndarray, b: np.ndarray) -> float:
        # Epsilon-guarded cosine similarity (mirrors the module-level helper,
        # inlined so this function is self-contained).
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

    sims = sorted(_cos(claim_emb, emb) for emb in source_embs)
    top = sims[-1]
    if len(sims) == 1:
        # Single source: uniqueness is undefined, fall back to plain similarity.
        return min(1.0, max(0.0, top))
    runner_up = sims[-2]
    # Large best-vs-second gap => the claim leans on one unique source.
    return min(1.0, max(0.0, top * (1.0 - runner_up + 0.1)))
def detect_hallucination(claim: str, sources: str) -> dict:
    """Main detection function.

    G0 = (TRACKING × INTERVENTION × COUNTERFACTUAL)^(1/3)

    Args:
        claim: the statement to verify.
        sources: newline-separated source passages.

    Returns a dict with:
        - g0_score: 1.0 = fully grounded, 0.0 = completely ungrounded
        - verdict: human-readable classification
        - tracking / intervention / counterfactual: component scores in [0, 1]
        - latency_ms: wall-clock processing time in milliseconds

    Bug fix: cosine similarity can be negative for unrelated text, and in
    Python a negative float raised to the power 1/3 yields a *complex*
    number, which then crashes ``round()``. Each component is clamped to
    [0, 1] before the geometric mean, so g0 is always a real number in [0, 1].
    """
    start = time.time()
    # Parse sources (one per line), dropping blank lines.
    source_list = [s.strip() for s in sources.strip().split('\n') if s.strip()]
    if not source_list:
        # No evidence at all => by definition ungrounded.
        return {
            "g0_score": 0.0,
            "verdict": "HALLUCINATION (no sources provided)",
            "tracking": 0.0,
            "intervention": 0.0,
            "counterfactual": 0.0,
            "latency_ms": round((time.time() - start) * 1000, 1)
        }
    # Compute embeddings (module-level model, loaded once at startup).
    claim_emb = model.encode(claim, convert_to_numpy=True)
    source_embs = [model.encode(src, convert_to_numpy=True) for src in source_list]
    # Compute the three criteria, clamped into [0, 1] (see docstring).
    tracking = min(1.0, max(0.0, compute_tracking(claim_emb, source_embs)))
    intervention = min(1.0, max(0.0, compute_intervention(claim, source_list)))
    counterfactual = min(1.0, max(0.0, compute_counterfactual(claim_emb, source_embs)))
    # G0 = geometric mean of the three criteria.
    g0 = (tracking * intervention * counterfactual) ** (1 / 3)
    # Map the score to a verdict via fixed thresholds.
    if g0 >= 0.7:
        verdict = "GROUNDED - Claim is well-supported by sources"
    elif g0 >= 0.4:
        verdict = "PARTIAL - Claim has some support but may contain unsupported elements"
    else:
        verdict = "HALLUCINATION - Claim is not supported by provided sources"
    latency = round((time.time() - start) * 1000, 1)
    return {
        "g0_score": round(g0, 3),
        "verdict": verdict,
        "tracking": round(tracking, 3),
        "intervention": round(intervention, 3),
        "counterfactual": round(counterfactual, 3),
        "latency_ms": latency
    }
def format_output(result: dict) -> str:
    """Render a detect_hallucination() result dict as a Markdown report.

    Expects the keys g0_score, verdict, tracking, intervention,
    counterfactual and latency_ms, exactly as produced by
    detect_hallucination().
    """
    return f"""## Result
**G0 Score:** {result['g0_score']} (0 = hallucination, 1 = grounded)
**Verdict:** {result['verdict']}
### Component Scores
- **Tracking:** {result['tracking']} - Does the claim follow from sources?
- **Intervention:** {result['intervention']} - Would changing sources change the claim?
- **Counterfactual:** {result['counterfactual']} - Is the claim uniquely grounded?
*Latency: {result['latency_ms']}ms*
"""
def run_detection(claim: str, sources: str) -> str:
    """Gradio callback: validate the two text inputs, then run detection
    and render the result as Markdown."""
    if claim.strip() == "":
        return "Please enter a claim to check."
    if sources.strip() == "":
        return "Please enter at least one source (one per line)."
    return format_output(detect_hallucination(claim, sources))
# Example (claim, sources) inputs for the UI: two grounded claims, one
# popular misconception, and one false attribution.
examples = [
    [
        "The Eiffel Tower was built in 1889 and is located in Paris, France.",
        "The Eiffel Tower is a wrought-iron lattice tower in Paris, France.\nIt was constructed from 1887 to 1889 as the entrance arch for the 1889 World's Fair."
    ],
    [
        "The Great Wall of China is visible from space with the naked eye.",
        "The Great Wall of China is a series of fortifications built along the historical northern borders of China.\nContrary to popular belief, it is not visible from space with the naked eye under normal conditions."
    ],
    [
        "Python was created by Guido van Rossum in 1991.",
        "Python is a high-level programming language.\nIt was created by Guido van Rossum and first released in 1991."
    ],
    [
        "Einstein invented the lightbulb.",
        "Albert Einstein was a theoretical physicist who developed the theory of relativity.\nThomas Edison is credited with inventing the practical incandescent lightbulb in 1879."
    ]
]
# Build Gradio interface.
# NOTE(review): the original indentation was lost in this copy; the nesting
# below (Examples, click wiring and footer at Blocks level, after the input
# Row) is the conventional layout — confirm against the deployed Space.
with gr.Blocks(title="G0 Hallucination Detector", theme=gr.themes.Soft()) as demo:
    # Header / instructions.
    gr.Markdown("""
# G0 Hallucination Detector
Detect when LLMs make things up. Enter a claim and the sources it should be grounded in.
**G0 Score:** Geometric mean of three criteria:
- **Tracking:** Does the claim follow from the sources?
- **Intervention:** Would changing sources change the claim?
- **Counterfactual:** In worlds without these sources, would the claim still hold?
Score ranges: 0.0 (hallucination) → 1.0 (fully grounded)
""")
    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            claim_input = gr.Textbox(
                label="Claim to verify",
                placeholder="Enter the claim you want to check...",
                lines=2
            )
            sources_input = gr.Textbox(
                label="Sources (one per line)",
                placeholder="Enter source texts, one per line...",
                lines=5
            )
            submit_btn = gr.Button("Detect Hallucination", variant="primary")
        # Right column: Markdown-rendered result.
        with gr.Column():
            output = gr.Markdown(label="Result")
    # Clickable example pairs that pre-fill the two inputs.
    gr.Examples(
        examples=examples,
        inputs=[claim_input, sources_input],
        label="Try these examples"
    )
    # Wire the button to the detection callback.
    submit_btn.click(
        fn=run_detection,
        inputs=[claim_input, sources_input],
        outputs=output
    )
    # Footer: support link and a short method description.
    gr.Markdown("""
---
**Free to use.** If this helps you, consider supporting: **[Cash App $ryancreating](https://cash.app/$ryancreating)**
---
**How it works:** Uses sentence embeddings to measure semantic similarity between claims and sources,
then computes a 3-criterion grounding metric.
Built by Crystalline Labs
""")

if __name__ == "__main__":
    demo.launch()