Spaces:

heerjtdev
/

NLI

Sleeping

App Files Files Community

NLI / app.py

heerjtdev

Update app.py

82d9acf verified 12 days ago

raw

history blame contribute delete

13.7 kB

	# import gradio as gr
	# import torch
	# import torch.nn.functional as F
	# from sentence_transformers import SentenceTransformer, CrossEncoder, util

	# # Use ModernBERT-based NLI for maximum speed on Free Tier CPU
	# # This model is 20% faster and 40% lighter than standard DeBERTa
	# reasoning_model_name = 'dleemiller/finecat-nli-l'
	# similarity_model_name = 'all-MiniLM-L6-v2'

	# print("Initializing 2025 Lightweight Suite...")
	# sim_model = SentenceTransformer(similarity_model_name, device="cpu")
	# nli_model = CrossEncoder(reasoning_model_name, device="cpu")

	# def evaluate_response(kb, question, user_answer):
	# # 1. Topic Relevance (Bi-Encoder)
	# # We check if the answer even belongs in the same universe as the question
	# q_emb = sim_model.encode(question, convert_to_tensor=True)
	# a_emb = sim_model.encode(user_answer, convert_to_tensor=True)
	# rel_score = util.cos_sim(q_emb, a_emb).item()

	# # 2. Structured Reasoning (Cross-Encoder)
	# # We format the hypothesis to force the model to evaluate the ANSWER specifically
	# hypothesis = f"Based on the context, the answer to '{question}' is '{user_answer}'."

	# logits = nli_model.predict([(kb, hypothesis)])
	# probs = F.softmax(torch.tensor(logits), dim=1).tolist()[0]

	# # Label mapping for FineCat/DeBERTa: 0: contradiction, 1: entailment, 2: neutral
	# labels = ["CONTRADICTION", "ENTAILMENT", "NEUTRAL"]
	# max_idx = torch.tensor(logits).argmax().item()
	# verdict = labels[max_idx]
	# conf = probs[max_idx] * 100

	# # 3. Precision Logic Gate
	# if verdict == "CONTRADICTION" and conf > 40:
	# status = "❌ INCORRECT (Logic Conflict)"
	# elif verdict == "ENTAILMENT" and conf > 35:
	# status = "✅ CORRECT (Confirmed)"
	# elif rel_score > 0.40 and verdict != "CONTRADICTION":
	# status = "✅ CORRECT (Likely/Inferred)"
	# else:
	# status = "❌ WRONG / IRRELEVANT"

	# return status, f"{rel_score:.2f}", f"{verdict} ({conf:.1f}%)"

	# # UI Setup remains the same
	# demo = gr.Interface(
	# fn=evaluate_response,
	# inputs=["text", "text", "text"],
	# outputs=[gr.Textbox(label="Verdict"), gr.Label(label="Topic Similarity"), gr.Label(label="NLI Reasoning")],
	# title="Lightweight Reasoning Engine v3",
	# description="Using ModernBERT-distilled NLI for 2025-standard reasoning on CPU."
	# )

	# if __name__ == "__main__":
	# demo.launch()





	# import gradio as gr
	# import torch
	# import torch.nn.functional as F
	# from sentence_transformers import CrossEncoder

	# # --- CONFIGURATION ---
	# # GATE 1: Semantic Relevance (STS)
	# # Checks if the Answer is conversationally related to the Question.
	# relevance_model_name = 'cross-encoder/stsb-distilroberta-base'

	# # GATE 2: Fact Checking (NLI)
	# # Checks if the Answer is supported by the Knowledge Base.
	# nli_model_name = 'cross-encoder/nli-deberta-v3-xsmall'

	# print(f"Loading Models...\n1. {relevance_model_name}\n2. {nli_model_name}")
	# rel_model = CrossEncoder(relevance_model_name, device="cpu")
	# nli_model = CrossEncoder(nli_model_name, device="cpu")
	# print("✅ System Ready.")

	# def evaluate_response(kb, question, user_answer):
	# if not kb or not question or not user_answer:
	# return "⚠️ Error: Missing Input", {}, "N/A"

	# logs = {}

	# # --- GATE 1: RELEVANCE CHECK (STS) ---
	# rel_score = rel_model.predict([(question, user_answer)])

	# # FIX 1: Use .item() to safely extract float from numpy array
	# rel_score_val = rel_score.item()

	# logs['Gate 1 Model'] = relevance_model_name
	# logs['Gate 1 Raw Score'] = f"{rel_score_val:.4f}"

	# # Threshold: STS score > 0.15 usually implies relevance
	# RELEVANCE_THRESHOLD = 0.15

	# if rel_score_val < RELEVANCE_THRESHOLD:
	# status = "❌ INCORRECT (Irrelevant)"
	# logs['Verdict'] = "Blocked at Gate 1 (Answer unrelated to Question)"
	# return status, logs, "Blocked"

	# # --- GATE 2: FACT CHECKING (NLI) ---
	# nli_logits = nli_model.predict([(kb, user_answer)])

	# # FIX 2: Handle Dimensions safely
	# # Convert to tensor
	# nli_tensor = torch.tensor(nli_logits)

	# # If the model returns a batch dimension (e.g. [1, 3]), squeeze it to flat [3]
	# if nli_tensor.dim() > 1:
	# nli_tensor = nli_tensor.squeeze()

	# # Apply Softmax across the classes (now dim=0 is safe on a flat tensor)
	# nli_probs = F.softmax(nli_tensor, dim=0).tolist()

	# # Get the winner index
	# max_idx = nli_tensor.argmax().item()

	# # Standard NLI Labels
	# labels = ["Contradiction", "Entailment", "Neutral"]

	# # Safety check for model label count mismatch
	# if max_idx >= len(labels):
	# return "⚠️ Model Error", {"Error": "Label mismatch"}, "N/A"

	# nli_verdict = labels[max_idx]
	# nli_conf = nli_probs[max_idx] * 100

	# logs['Gate 2 Model'] = nli_model_name
	# logs['Gate 2 Probabilities'] = {
	# "Contradiction": f"{nli_probs[0]*100:.1f}%",
	# "Entailment": f"{nli_probs[1]*100:.1f}%",
	# "Neutral": f"{nli_probs[2]*100:.1f}%"
	# }
	# logs['Gate 2 Verdict'] = nli_verdict

	# # --- FINAL DECISION LOGIC ---
	# if nli_verdict == "Entailment":
	# status = "✅ CORRECT (Confirmed)"
	# logs['Final Outcome'] = "Answer is Relevant and Factual."

	# elif nli_verdict == "Contradiction":
	# status = "❌ INCORRECT (False Information)"
	# logs['Final Outcome'] = "Answer contradicts the text."

	# else: # Neutral
	# status = "❌ INCORRECT (Hallucination/Not in Text)"
	# logs['Final Outcome'] = "Answer not found in text."

	# return status, logs, f"{nli_verdict} ({nli_conf:.1f}%)"

	# # --- UI SETUP ---
	# with gr.Blocks(title="NLI Logic Engine v5", theme=gr.themes.Soft()) as demo:
	# gr.Markdown("## 🧠 Neural Logic Engine v5.1 (Bug Fixes Applied)")
	# gr.Markdown("Corrected Architecture: STS for Relevance + NLI for Fact Checking.")

	# with gr.Row():
	# with gr.Column(scale=1):
	# kb_input = gr.Textbox(label="Knowledge Base", lines=5, value="When a lion was resting in the jungle, a mouse began racing up and down his body for fun. The lion's sleep was disturbed, and he woke in anger.")
	# q_input = gr.Textbox(label="Question", value="What was the lion doing?")
	# a_input = gr.Textbox(label="User Answer", value="The lion was sleeping in the jungle.")
	# btn = gr.Button("Evaluate", variant="primary")

	# with gr.Column(scale=1):
	# verdict_out = gr.Textbox(label="Final Verdict", elem_classes="verdict")
	# nli_metric = gr.Label(label="NLI Confidence")
	# debug_log = gr.JSON(label="System Internals (Debug Log)")

	# btn.click(
	# fn=evaluate_response,
	# inputs=[kb_input, q_input, a_input],
	# outputs=[verdict_out, debug_log, nli_metric]
	# )

	# if __name__ == "__main__":
	# demo.launch()









	import gradio as gr
	import torch
	import torch.nn.functional as F
	from sentence_transformers import CrossEncoder
	import re

	# ==============================
	# CONFIGURATION
	# ==============================

	# Model identifiers handed to CrossEncoder in the LOAD MODELS section below.
	# RELEVANCE_MODEL scores text pairs (used for Gate 1 and Gate 2);
	# NLI_MODEL scores (premise, hypothesis) pairs (used for Gate 3).
	RELEVANCE_MODEL = "cross-encoder/stsb-distilroberta-base"
	NLI_MODEL = "cross-encoder/nli-deberta-v3-xsmall"

	# Cut-offs read by evaluate_response:
	# - RELEVANCE_THRESHOLD_QA: a question/answer score below this is rejected at Gate 1.
	# - RELEVANCE_THRESHOLD_KB: if the best KB-sentence score is below this, the
	#   answer is rejected at Gate 2 as not found in the text.
	# - ENTAILMENT_THRESHOLD: minimum entailment probability (softmax output)
	#   required for a "CORRECT" verdict at Gate 3.
	RELEVANCE_THRESHOLD_QA = 0.15
	RELEVANCE_THRESHOLD_KB = 0.30
	ENTAILMENT_THRESHOLD = 0.65

	# Device string passed as `device=` to both CrossEncoder constructors.
	DEVICE = "cpu"

	# ==============================
	# LOAD MODELS
	# ==============================

	# Both models are constructed once, at module import time, and reused by
	# every call to evaluate_response.
	print("Loading models...")
	rel_model = CrossEncoder(RELEVANCE_MODEL, device=DEVICE)
	nli_model = CrossEncoder(NLI_MODEL, device=DEVICE)
	print("✅ Models loaded")

	# ==============================
	# UTILITIES
	# ==============================

	def split_sentences(text):
	    """Break *text* into sentences at whitespace that follows '.', '!' or '?'.

	    Surrounding whitespace is removed first. An empty or whitespace-only
	    input yields an empty list; text without terminal punctuation comes
	    back as a single-element list.
	    """
	    stripped = text.strip()
	    # The look-behind keeps the terminating punctuation attached to its sentence.
	    return re.split(r'(?<=[.!?])\s+', stripped) if stripped else []

	def softmax_logits(logits):
	    """Convert raw classifier logits into probabilities.

	    Args:
	        logits: Class scores as a flat sequence ``[C]``, a single-row batch
	            ``[1, C]`` or a multi-row batch ``[N, C]``. Anything accepted by
	            ``torch.tensor`` works (nested lists, NumPy arrays, tensors).

	    Returns:
	        For flat or single-row input, a list of C probabilities summing to 1.
	        For a multi-row batch, a list of N such lists (one per row).
	    """
	    t = torch.tensor(logits)
	    if not t.is_floating_point():
	        # softmax requires a floating dtype; integer logits would raise.
	        t = t.float()
	    if t.dim() > 1:
	        # Drops the batch axis only when it has size 1; a no-op otherwise.
	        t = t.squeeze(0)
	    # Normalise over the class axis (the last one). Using dim=0 here would
	    # normalise across batch rows whenever more than one row is passed.
	    return F.softmax(t, dim=-1).tolist()

	# ==============================
	# CORE EVALUATION FUNCTION
	# ==============================

	def evaluate_response(kb, question, user_answer):
	    """Grade *user_answer* against *kb* through three sequential gates.

	    Gate 1 scores the (question, answer) pair with ``rel_model``; Gate 2
	    scores every KB sentence against the answer with the same model and
	    keeps the best one; Gate 3 runs ``nli_model`` with that sentence as the
	    premise and the answer as the hypothesis. Each gate can end the
	    evaluation early.

	    Args:
	        kb: Knowledge-base text the answer must be supported by.
	        question: The question that was asked.
	        user_answer: The answer to grade.

	    Returns:
	        A ``(status, logs, metric)`` tuple: a verdict string, a dict of
	        per-gate debug information, and a short score/confidence string.
	        When any input is falsy the logs dict is empty and the metric is
	        ``"N/A"``.
	    """
	    logs = {}

	    # ------------------------------
	    # INPUT VALIDATION
	    # ------------------------------
	    if not kb or not question or not user_answer:
	        return "⚠️ ERROR: Missing input", {}, "N/A"

	    logs["Inputs"] = {
	        "Question": question,
	        "User Answer": user_answer,
	        "KB Length (chars)": len(kb)
	    }

	    # ------------------------------
	    # GATE 1 — QUESTION ↔ ANSWER RELEVANCE
	    # ------------------------------
	    # predict() is given a single pair; .item() extracts the lone score.
	    qa_score = rel_model.predict([(question, user_answer)]).item()

	    logs["Gate 1 — QA Relevance"] = {
	        "Model": RELEVANCE_MODEL,
	        "Score": round(qa_score, 4),
	        "Threshold": RELEVANCE_THRESHOLD_QA
	    }

	    if qa_score < RELEVANCE_THRESHOLD_QA:
	        logs["Final Decision"] = "Blocked at Gate 1 (Irrelevant Answer)"
	        return (
	            "❌ INCORRECT (Irrelevant)",
	            logs,
	            f"Relevance {qa_score:.2f}"
	        )

	    # ------------------------------
	    # GATE 2 — KB SENTENCE SELECTION (STS)
	    # ------------------------------
	    kb_sentences = split_sentences(kb)
	    logs["KB Processing"] = {
	        "Total Sentences": len(kb_sentences),
	        "Sentences": kb_sentences
	    }

	    if not kb_sentences:
	        # Reached when kb is truthy but contains only whitespace.
	        logs["Final Decision"] = "Empty KB after sentence split"
	        return "❌ INCORRECT (Empty KB)", logs, "N/A"

	    sentence_pairs = [(s, user_answer) for s in kb_sentences]
	    sim_scores = rel_model.predict(sentence_pairs)

	    best_idx = int(sim_scores.argmax())
	    best_sentence = kb_sentences[best_idx]
	    best_score = float(sim_scores[best_idx])

	    logs["Gate 2 — KB Sentence Selection"] = {
	        "Model": RELEVANCE_MODEL,
	        "Best Sentence": best_sentence,
	        "Best Similarity Score": round(best_score, 4),
	        "Threshold": RELEVANCE_THRESHOLD_KB,
	        "All Scores": [
	            {"sentence": s, "score": round(float(sc), 4)}
	            for s, sc in zip(kb_sentences, sim_scores)
	        ]
	    }

	    if best_score < RELEVANCE_THRESHOLD_KB:
	        logs["Final Decision"] = "Answer not grounded in KB"
	        return (
	            "❌ INCORRECT (Not Found in Text)",
	            logs,
	            f"KB Similarity {best_score:.2f}"
	        )

	    # ------------------------------
	    # GATE 3 — NLI (Sentence ↔ Answer)
	    # ------------------------------
	    nli_logits = nli_model.predict([(best_sentence, user_answer)])
	    probs = softmax_logits(nli_logits)

	    # NOTE(review): this index order is assumed to match the label mapping of
	    # NLI_MODEL's output head — confirm against the model card.
	    labels = ["Contradiction", "Entailment", "Neutral"]
	    verdict_idx = int(torch.tensor(probs).argmax())
	    verdict = labels[verdict_idx]
	    confidence = probs[verdict_idx] * 100

	    logs["Gate 3 — NLI Verification"] = {
	        "Model": NLI_MODEL,
	        "Premise": best_sentence,
	        "Hypothesis": user_answer,
	        "Probabilities": {
	            "Contradiction": f"{probs[0]*100:.2f}%",
	            "Entailment": f"{probs[1]*100:.2f}%",
	            "Neutral": f"{probs[2]*100:.2f}%"
	        },
	        "Verdict": verdict,
	        "Confidence": f"{confidence:.2f}%",
	        "Entailment Threshold": f"{ENTAILMENT_THRESHOLD*100:.0f}%"
	    }

	    # ------------------------------
	    # FINAL DECISION
	    # ------------------------------
	    if verdict == "Entailment" and probs[1] >= ENTAILMENT_THRESHOLD:
	        logs["Final Decision"] = "Answer is Supported by Text"
	        return (
	            "✅ CORRECT (Confirmed)",
	            logs,
	            f"Entailment {confidence:.1f}%"
	        )

	    if verdict == "Contradiction":
	        logs["Final Decision"] = "Answer Contradicts Text"
	        return (
	            "❌ INCORRECT (Contradiction)",
	            logs,
	            f"Contradiction {confidence:.1f}%"
	        )

	    # Two cases land here: a genuine Neutral verdict, or an Entailment verdict
	    # whose probability is under ENTAILMENT_THRESHOLD. `confidence` belongs to
	    # the winning class, so label the metric with that class instead of always
	    # calling it "Neutral".
	    if verdict == "Entailment":
	        metric = (
	            f"Entailment {confidence:.1f}% "
	            f"(below {ENTAILMENT_THRESHOLD*100:.0f}% threshold)"
	        )
	    else:
	        metric = f"Neutral {confidence:.1f}%"

	    logs["Final Decision"] = "Answer Not Explicitly Stated"
	    return (
	        "❌ INCORRECT (Neutral / Not in Text)",
	        logs,
	        metric
	    )

	# ==============================
	# GRADIO UI
	# ==============================

	with gr.Blocks(theme=gr.themes.Soft(), title="Neural Logic Engine v6") as demo:
	    # Page header followed by a summary of the three-gate pipeline.
	    gr.Markdown("## 🧠 Neural Logic Engine v6")
	    gr.Markdown(
	        "Architecture:\n"
	        "- Gate 1: Question ↔ Answer relevance (STS)\n"
	        "- Gate 2: KB sentence grounding (STS)\n"
	        "- Gate 3: Sentence-level NLI verification\n"
	        "- Fully logged, deterministic decisions"
	    )

	    with gr.Row():
	        # Left column: the three text inputs and the trigger button.
	        with gr.Column(scale=1):
	            kb_input = gr.Textbox(
	                value=(
	                    "When a lion was resting in the jungle, a mouse began racing up and down his body for fun. "
	                    "The lion's sleep was disturbed, and he woke in anger."
	                ),
	                lines=6,
	                label="Knowledge Base",
	            )
	            q_input = gr.Textbox(
	                value="What was the lion doing?",
	                label="Question",
	            )
	            a_input = gr.Textbox(
	                value="The lion was sleeping in the jungle.",
	                label="User Answer",
	            )
	            btn = gr.Button("Evaluate", variant="primary")

	        # Right column: verdict, confidence metric and the full debug log.
	        with gr.Column(scale=1):
	            verdict_out = gr.Textbox(label="Final Verdict")
	            confidence_out = gr.Label(label="Model Confidence")
	            debug_log = gr.JSON(label="System Internals (FULL DEBUG LOG)")

	    # evaluate_response returns (status, logs, metric); the outputs are listed
	    # in that same order.
	    btn.click(
	        evaluate_response,
	        [kb_input, q_input, a_input],
	        [verdict_out, debug_log, confidence_out],
	    )

	# Start the web server only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()