# g0-detector / app.py
# Uploaded by aphoticshaman via huggingface_hub (commit 9b3b8ab, verified)
"""
G0 Hallucination Detector - Hugging Face Space
Detects when LLMs make things up using 3-criterion grounding analysis.
"""
import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import Optional
import time
# Load the embedding model once at module import time so every request reuses
# the same in-memory weights instead of reloading per call.
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')  # sentence-transformers MiniLM encoder
print("Model loaded.")
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    A small epsilon (1e-8) is added to the norm product so that a
    zero-length vector yields ~0 instead of dividing by zero.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b) + 1e-8
    return float(np.dot(a, b) / denom)
def compute_tracking(claim_emb: np.ndarray, source_embs: list[np.ndarray]) -> float:
    """TRACKING criterion: does the claim follow from the sources?

    Scores the claim by its best match over all sources — the maximum
    cosine similarity between the claim embedding and any source
    embedding. Returns 0.0 when there are no sources at all.
    """
    if not source_embs:
        return 0.0
    return float(max(cosine_similarity(claim_emb, emb) for emb in source_embs))
def compute_intervention(claim: str, sources: list[str]) -> float:
    """INTERVENTION criterion: would changing the sources change the claim?

    Approximated lexically: the fraction of the claim's (lowercased,
    whitespace-split) words that also appear somewhere in the sources.
    Returns 0.0 for an empty claim.
    """
    claim_tokens = set(claim.lower().split())
    if not claim_tokens:
        return 0.0
    pooled: set[str] = set()
    for text in sources:
        pooled |= set(text.lower().split())
    return len(claim_tokens & pooled) / len(claim_tokens)
def compute_counterfactual(claim_emb: np.ndarray, source_embs: list[np.ndarray]) -> float:
    """COUNTERFACTUAL criterion: in worlds without this source, would the claim still hold?

    Approximated by how *unique* the grounding is: if only one source
    grounds the claim well (large gap between best and second-best
    similarity), counterfactual dependence is high.

    Bug fix: the raw formula ``max_sim * (1 - second_max + 0.1)`` could
    leave [0, 1] — e.g. 1.1 when a single source matches perfectly and the
    rest are orthogonal, or a negative value for anti-correlated
    embeddings — which corrupted the downstream geometric mean. The result
    is now clamped to [0, 1]. With fewer than two sources the score
    degenerates to the best (clamped) similarity, matching the previous
    fallback to tracking.
    """
    if not source_embs:
        return 0.0

    def _cos(a: np.ndarray, b: np.ndarray) -> float:
        # Epsilon-guarded cosine similarity (mirrors the module-level helper,
        # inlined so this function is self-contained).
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

    sims = sorted(_cos(claim_emb, emb) for emb in source_embs)
    top = sims[-1]
    if len(sims) == 1:
        # Single source: uniqueness is undefined, fall back to plain similarity.
        return min(1.0, max(0.0, top))
    runner_up = sims[-2]
    # Large best-vs-second gap => the claim leans on one unique source.
    return min(1.0, max(0.0, top * (1.0 - runner_up + 0.1)))
def detect_hallucination(claim: str, sources: str) -> dict:
    """Main detection function.

    G0 = (TRACKING × INTERVENTION × COUNTERFACTUAL)^(1/3)

    Args:
        claim: the statement to verify.
        sources: newline-separated source passages.

    Returns a dict with:
        - g0_score: 1.0 = fully grounded, 0.0 = completely ungrounded
        - verdict: human-readable classification
        - tracking / intervention / counterfactual: component scores in [0, 1]
        - latency_ms: wall-clock processing time in milliseconds

    Bug fix: cosine similarity can be negative for unrelated text, and in
    Python a negative float raised to the power 1/3 yields a *complex*
    number, which then crashes ``round()``. Each component is clamped to
    [0, 1] before the geometric mean, so g0 is always a real number in [0, 1].
    """
    start = time.time()
    # Parse sources (one per line), dropping blank lines.
    source_list = [s.strip() for s in sources.strip().split('\n') if s.strip()]
    if not source_list:
        # No evidence at all => by definition ungrounded.
        return {
            "g0_score": 0.0,
            "verdict": "HALLUCINATION (no sources provided)",
            "tracking": 0.0,
            "intervention": 0.0,
            "counterfactual": 0.0,
            "latency_ms": round((time.time() - start) * 1000, 1)
        }
    # Compute embeddings (module-level model, loaded once at startup).
    claim_emb = model.encode(claim, convert_to_numpy=True)
    source_embs = [model.encode(src, convert_to_numpy=True) for src in source_list]
    # Compute the three criteria, clamped into [0, 1] (see docstring).
    tracking = min(1.0, max(0.0, compute_tracking(claim_emb, source_embs)))
    intervention = min(1.0, max(0.0, compute_intervention(claim, source_list)))
    counterfactual = min(1.0, max(0.0, compute_counterfactual(claim_emb, source_embs)))
    # G0 = geometric mean of the three criteria.
    g0 = (tracking * intervention * counterfactual) ** (1 / 3)
    # Map the score to a verdict via fixed thresholds.
    if g0 >= 0.7:
        verdict = "GROUNDED - Claim is well-supported by sources"
    elif g0 >= 0.4:
        verdict = "PARTIAL - Claim has some support but may contain unsupported elements"
    else:
        verdict = "HALLUCINATION - Claim is not supported by provided sources"
    latency = round((time.time() - start) * 1000, 1)
    return {
        "g0_score": round(g0, 3),
        "verdict": verdict,
        "tracking": round(tracking, 3),
        "intervention": round(intervention, 3),
        "counterfactual": round(counterfactual, 3),
        "latency_ms": latency
    }
def format_output(result: dict) -> str:
    """Render a detect_hallucination() result dict as a Markdown report.

    Expects the keys g0_score, verdict, tracking, intervention,
    counterfactual and latency_ms, exactly as produced by
    detect_hallucination().
    """
    return f"""## Result
**G0 Score:** {result['g0_score']} (0 = hallucination, 1 = grounded)
**Verdict:** {result['verdict']}
### Component Scores
- **Tracking:** {result['tracking']} - Does the claim follow from sources?
- **Intervention:** {result['intervention']} - Would changing sources change the claim?
- **Counterfactual:** {result['counterfactual']} - Is the claim uniquely grounded?
*Latency: {result['latency_ms']}ms*
"""
def run_detection(claim: str, sources: str) -> str:
    """Gradio callback: validate the two text inputs, then run detection
    and render the result as Markdown."""
    if claim.strip() == "":
        return "Please enter a claim to check."
    if sources.strip() == "":
        return "Please enter at least one source (one per line)."
    return format_output(detect_hallucination(claim, sources))
# Example (claim, sources) inputs for the UI: two grounded claims, one
# popular misconception, and one false attribution.
examples = [
    [
        "The Eiffel Tower was built in 1889 and is located in Paris, France.",
        "The Eiffel Tower is a wrought-iron lattice tower in Paris, France.\nIt was constructed from 1887 to 1889 as the entrance arch for the 1889 World's Fair."
    ],
    [
        "The Great Wall of China is visible from space with the naked eye.",
        "The Great Wall of China is a series of fortifications built along the historical northern borders of China.\nContrary to popular belief, it is not visible from space with the naked eye under normal conditions."
    ],
    [
        "Python was created by Guido van Rossum in 1991.",
        "Python is a high-level programming language.\nIt was created by Guido van Rossum and first released in 1991."
    ],
    [
        "Einstein invented the lightbulb.",
        "Albert Einstein was a theoretical physicist who developed the theory of relativity.\nThomas Edison is credited with inventing the practical incandescent lightbulb in 1879."
    ]
]
# Build Gradio interface.
# NOTE(review): the original indentation was lost in this copy; the nesting
# below (Examples, click wiring and footer at Blocks level, after the input
# Row) is the conventional layout — confirm against the deployed Space.
with gr.Blocks(title="G0 Hallucination Detector", theme=gr.themes.Soft()) as demo:
    # Header / instructions.
    gr.Markdown("""
# G0 Hallucination Detector
Detect when LLMs make things up. Enter a claim and the sources it should be grounded in.
**G0 Score:** Geometric mean of three criteria:
- **Tracking:** Does the claim follow from the sources?
- **Intervention:** Would changing sources change the claim?
- **Counterfactual:** In worlds without these sources, would the claim still hold?
Score ranges: 0.0 (hallucination) → 1.0 (fully grounded)
""")
    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            claim_input = gr.Textbox(
                label="Claim to verify",
                placeholder="Enter the claim you want to check...",
                lines=2
            )
            sources_input = gr.Textbox(
                label="Sources (one per line)",
                placeholder="Enter source texts, one per line...",
                lines=5
            )
            submit_btn = gr.Button("Detect Hallucination", variant="primary")
        # Right column: Markdown-rendered result.
        with gr.Column():
            output = gr.Markdown(label="Result")
    # Clickable example pairs that pre-fill the two inputs.
    gr.Examples(
        examples=examples,
        inputs=[claim_input, sources_input],
        label="Try these examples"
    )
    # Wire the button to the detection callback.
    submit_btn.click(
        fn=run_detection,
        inputs=[claim_input, sources_input],
        outputs=output
    )
    # Footer: support link and a short method description.
    gr.Markdown("""
---
**Free to use.** If this helps you, consider supporting: **[Cash App $ryancreating](https://cash.app/$ryancreating)**
---
**How it works:** Uses sentence embeddings to measure semantic similarity between claims and sources,
then computes a 3-criterion grounding metric.
Built by Crystalline Labs
""")

if __name__ == "__main__":
    demo.launch()