aphoticshaman commited on
Commit
36571e1
·
verified ·
1 Parent(s): c02ae9b

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +51 -12
  2. app.py +225 -0
  3. deploy.sh +15 -0
  4. requirements.txt +4 -0
README.md CHANGED
@@ -1,12 +1,51 @@
1
- ---
2
- title: G0 Detector
3
- emoji: 👀
4
- colorFrom: blue
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 6.3.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: G0 Hallucination Detector
3
+ emoji: 🔍
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 4.44.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ short_description: Detect when LLMs hallucinate using 3-criterion grounding
12
+ ---
13
+
14
+ # G0 Hallucination Detector
15
+
16
+ Detect when LLMs make things up using a 3-criterion grounding metric.
17
+
18
+ ## How It Works
19
+
20
+ **G0 = (Tracking × Intervention × Counterfactual)^(1/3)**
21
+
22
+ - **Tracking:** Does the claim semantically follow from the sources?
23
+ - **Intervention:** Would changing the sources change the claim?
24
+ - **Counterfactual:** Is the claim uniquely dependent on these sources?
25
+
26
+ ## Scores
27
+
28
+ - **0.7 or higher:** Grounded - claim is well-supported
29
+ - **0.4 up to (but not including) 0.7:** Partial - some support, may contain unsupported elements
30
+ - **below 0.4:** Hallucination - claim not supported by sources
31
+
32
+ ## Use Cases
33
+
34
+ - Verify LLM outputs before production
35
+ - Audit RAG pipeline responses
36
+ - Research on hallucination detection
37
+
38
+ ## API
39
+
40
+ ```python
41
+ import gradio_client
42
+
43
+ client = gradio_client.Client("crystalline-labs/g0-detector")
44
+ result = client.predict(
45
+ claim="The Eiffel Tower was built in 1889",
46
+ sources="The Eiffel Tower was constructed from 1887 to 1889.",
47
+ api_name="/predict"
48
+ )
49
+ ```
50
+
51
+ Built by Crystalline Labs
app.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
G0 Hallucination Detector - Hugging Face Space
Detects when LLMs make things up using 3-criterion grounding analysis.
"""

import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import Optional
import time

# Load the SentenceTransformer embedding model once at import time so the
# (slow) download/initialization happens at Space startup, not per request.
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded.")
17
+
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity between vectors *a* and *b*.

    A small epsilon is added to the denominator so a zero-magnitude
    vector yields 0.0 instead of a division-by-zero error.
    """
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b) + 1e-8
    return float(numerator / denominator)
22
+
def compute_tracking(claim_emb: np.ndarray, source_embs: list[np.ndarray]) -> float:
    """
    TRACKING criterion: does the claim follow from the sources?

    Scores the claim by its best cosine similarity against any single
    source embedding; a high value means the claim tracks the source
    content. Returns 0.0 when no source embeddings are given.
    """
    if not source_embs:
        return 0.0
    return float(max(cosine_similarity(claim_emb, emb) for emb in source_embs))
33
+
def _tokenize(text: str) -> set[str]:
    """Lowercase *text*, split on whitespace, and strip surrounding punctuation.

    Stripping punctuation is the fix over naive ``split()``: it lets a
    source token like ``"1889."`` match a claim token ``"1889"``.
    """
    punct = '.,;:!?"\'()[]{}'
    words = set()
    for raw in text.lower().split():
        word = raw.strip(punct)
        if word:
            words.add(word)
    return words


def compute_intervention(claim: str, sources: list[str]) -> float:
    """
    INTERVENTION criterion: would changing the sources change the claim?

    Approximated by keyword overlap: the fraction of the claim's words
    that also appear in at least one source. Tokens are lowercased and
    stripped of surrounding punctuation so trailing periods/commas in
    sources no longer block matches (bug in the original version).

    Returns a value in [0, 1]; 0.0 for a claim with no usable words.
    """
    claim_words = _tokenize(claim)
    if not claim_words:
        return 0.0

    source_words = set()
    for src in sources:
        source_words.update(_tokenize(src))

    return len(claim_words & source_words) / len(claim_words)
50
+
def compute_counterfactual(claim_emb: np.ndarray, source_embs: list[np.ndarray]) -> float:
    """
    COUNTERFACTUAL criterion: in worlds without this source, would the
    claim still hold? Approximated by how *uniquely* the claim is grounded.

    With fewer than two sources uniqueness is undefined, so this falls
    back to the tracking score. Otherwise the best similarity is kept
    high when the runner-up source grounds the claim poorly (unique
    dependence) and damped when several sources ground it equally well.

    Returns a value clamped to [0, 1]. The clamp fixes a defect in the
    raw formula ``max_sim * (1 - second_max + 0.1)``, which could exceed
    1.0 (e.g. max_sim near 1 with second_max near 0), leaking outside
    the documented score range.
    """
    if len(source_embs) < 2:
        return compute_tracking(claim_emb, source_embs)

    similarities = [cosine_similarity(claim_emb, src) for src in source_embs]
    ranked = sorted(similarities, reverse=True)
    max_sim, second_max = ranked[0], ranked[1]

    # If only one source grounds it well, counterfactual dependence is high;
    # the +0.1 bonus can push the product above 1.0, hence the clamp.
    return min(1.0, max_sim * (1 - second_max + 0.1))
66
+
def detect_hallucination(claim: str, sources: str) -> dict:
    """
    Main detection function.

    G0 = (TRACKING × INTERVENTION × COUNTERFACTUAL)^(1/3)

    Args:
        claim: The statement to verify.
        sources: Newline-separated source passages the claim should be
            grounded in; blank lines are ignored.

    Returns:
        A dict with ``g0_score`` (1.0 = fully grounded, 0.0 = completely
        ungrounded / hallucination), the three component scores, a
        human-readable ``verdict``, and ``latency_ms``.
    """
    start = time.time()

    # Parse sources (one per line), dropping empty lines.
    source_list = [s.strip() for s in sources.strip().split('\n') if s.strip()]

    if not source_list:
        # Nothing to ground against: definitive hallucination verdict.
        return {
            "g0_score": 0.0,
            "verdict": "HALLUCINATION (no sources provided)",
            "tracking": 0.0,
            "intervention": 0.0,
            "counterfactual": 0.0,
            "latency_ms": round((time.time() - start) * 1000, 1)
        }

    # Compute embeddings. All sources are encoded in ONE batched call,
    # which is faster than the original per-source encode() loop.
    claim_emb = model.encode(claim, convert_to_numpy=True)
    source_embs = list(model.encode(source_list, convert_to_numpy=True))

    # Compute the three grounding criteria.
    tracking = compute_tracking(claim_emb, source_embs)
    intervention = compute_intervention(claim, source_list)
    counterfactual = compute_counterfactual(claim_emb, source_embs)

    # G0 = geometric mean of the three criteria. Clamp the product at 0:
    # cosine similarities can be negative, and a fractional power of a
    # negative float yields a complex number (crashing round() below).
    product = tracking * intervention * counterfactual
    g0 = max(product, 0.0) ** (1 / 3)

    # Map the continuous score onto a three-way verdict.
    if g0 >= 0.7:
        verdict = "GROUNDED - Claim is well-supported by sources"
    elif g0 >= 0.4:
        verdict = "PARTIAL - Claim has some support but may contain unsupported elements"
    else:
        verdict = "HALLUCINATION - Claim is not supported by provided sources"

    latency = round((time.time() - start) * 1000, 1)

    return {
        "g0_score": round(g0, 3),
        "verdict": verdict,
        "tracking": round(tracking, 3),
        "intervention": round(intervention, 3),
        "counterfactual": round(counterfactual, 3),
        "latency_ms": latency
    }
123
+
def format_output(result: dict) -> str:
    """Render a detection-result dict as a Markdown report string."""
    report_lines = [
        "## Result",
        "",
        f"**G0 Score:** {result['g0_score']} (0 = hallucination, 1 = grounded)",
        "",
        f"**Verdict:** {result['verdict']}",
        "",
        "### Component Scores",
        f"- **Tracking:** {result['tracking']} - Does the claim follow from sources?",
        f"- **Intervention:** {result['intervention']} - Would changing sources change the claim?",
        f"- **Counterfactual:** {result['counterfactual']} - Is the claim uniquely grounded?",
        "",
        f"*Latency: {result['latency_ms']}ms*",
        "",  # trailing newline, matching the original triple-quoted template
    ]
    return "\n".join(report_lines)
140
+
def run_detection(claim: str, sources: str) -> str:
    """Gradio wrapper: validate the inputs, run detection, return Markdown."""
    # Guard clauses for missing input keep the UI responsive with a hint
    # instead of running the model on empty text.
    if not claim.strip():
        return "Please enter a claim to check."
    if not sources.strip():
        return "Please enter at least one source (one per line)."

    return format_output(detect_hallucination(claim, sources))
151
+
# Example inputs: [claim, sources] pairs wired into gr.Examples below.
# Sources are a single string with one passage per line, matching the
# newline-splitting done in detect_hallucination.
examples = [
    [
        "The Eiffel Tower was built in 1889 and is located in Paris, France.",
        "The Eiffel Tower is a wrought-iron lattice tower in Paris, France.\nIt was constructed from 1887 to 1889 as the entrance arch for the 1889 World's Fair."
    ],
    [
        "The Great Wall of China is visible from space with the naked eye.",
        "The Great Wall of China is a series of fortifications built along the historical northern borders of China.\nContrary to popular belief, it is not visible from space with the naked eye under normal conditions."
    ],
    [
        "Python was created by Guido van Rossum in 1991.",
        "Python is a high-level programming language.\nIt was created by Guido van Rossum and first released in 1991."
    ],
    [
        "Einstein invented the lightbulb.",
        "Albert Einstein was a theoretical physicist who developed the theory of relativity.\nThomas Edison is credited with inventing the practical incandescent lightbulb in 1879."
    ]
]

# Build Gradio interface: two-column layout with inputs on the left and
# the Markdown result on the right.
with gr.Blocks(title="G0 Hallucination Detector", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # G0 Hallucination Detector

    Detect when LLMs make things up. Enter a claim and the sources it should be grounded in.

    **G0 Score:** Geometric mean of three criteria:
    - **Tracking:** Does the claim follow from the sources?
    - **Intervention:** Would changing sources change the claim?
    - **Counterfactual:** In worlds without these sources, would the claim still hold?

    Score ranges: 0.0 (hallucination) → 1.0 (fully grounded)
    """)

    with gr.Row():
        with gr.Column():
            # Input column: claim, sources, and the trigger button.
            claim_input = gr.Textbox(
                label="Claim to verify",
                placeholder="Enter the claim you want to check...",
                lines=2
            )
            sources_input = gr.Textbox(
                label="Sources (one per line)",
                placeholder="Enter source texts, one per line...",
                lines=5
            )
            submit_btn = gr.Button("Detect Hallucination", variant="primary")

        with gr.Column():
            # Output column: run_detection returns a Markdown string.
            output = gr.Markdown(label="Result")

    # Clickable canned examples that fill both inputs.
    gr.Examples(
        examples=examples,
        inputs=[claim_input, sources_input],
        label="Try these examples"
    )

    # Wire the button to the detection wrapper.
    submit_btn.click(
        fn=run_detection,
        inputs=[claim_input, sources_input],
        outputs=output
    )

    gr.Markdown("""
    ---
    **How it works:** Uses sentence embeddings to measure semantic similarity between claims and sources,
    then computes a 3-criterion grounding metric. [Source code on GitHub](https://github.com/crystalline-labs/g0-detector)

    Built by Crystalline Labs
    """)

# Launch the app when run directly (Spaces imports/executes this module).
if __name__ == "__main__":
    demo.launch()
deploy.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# Deploy G0 Hallucination Detector to Hugging Face Spaces.
#
# Requires: huggingface-cli (pip install huggingface_hub), git, and the
# HF_USERNAME environment variable set to your Hugging Face username.

# Fail on the first error, on unset variables, and on pipeline failures,
# so a failed step never silently cascades into a broken push.
set -euo pipefail

# Abort early with a clear message if HF_USERNAME is unset or empty;
# otherwise the remote URL below would be malformed.
: "${HF_USERNAME:?Set HF_USERNAME to your Hugging Face username}"

# Login (opens browser)
huggingface-cli login

# Create and push space
huggingface-cli repo create g0-detector --type space --space_sdk gradio -y
git init
# Ensure the local branch is named "main" regardless of git's configured
# default (older gits default to "master", which would break the push).
git branch -M main
git remote add origin "https://huggingface.co/spaces/$HF_USERNAME/g0-detector"
git add .
git commit -m "Initial deploy: G0 Hallucination Detector"
git push -u origin main

echo "Done! Your space will be live at: https://huggingface.co/spaces/$HF_USERNAME/g0-detector"
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ sentence-transformers>=2.2.0
3
+ numpy>=1.21.0
4
+ torch>=2.0.0