# NLI / app.py
# heerjtdev's picture
# Update app.py
# 82d9acf verified
# import gradio as gr
# import torch
# import torch.nn.functional as F
# from sentence_transformers import SentenceTransformer, CrossEncoder, util
# # Use ModernBERT-based NLI for maximum speed on Free Tier CPU
# # This model is 20% faster and 40% lighter than standard DeBERTa
# reasoning_model_name = 'dleemiller/finecat-nli-l'
# similarity_model_name = 'all-MiniLM-L6-v2'
# print("Initializing 2025 Lightweight Suite...")
# sim_model = SentenceTransformer(similarity_model_name, device="cpu")
# nli_model = CrossEncoder(reasoning_model_name, device="cpu")
# def evaluate_response(kb, question, user_answer):
# # 1. Topic Relevance (Bi-Encoder)
# # We check if the answer even belongs in the same universe as the question
# q_emb = sim_model.encode(question, convert_to_tensor=True)
# a_emb = sim_model.encode(user_answer, convert_to_tensor=True)
# rel_score = util.cos_sim(q_emb, a_emb).item()
# # 2. Structured Reasoning (Cross-Encoder)
# # We format the hypothesis to force the model to evaluate the ANSWER specifically
# hypothesis = f"Based on the context, the answer to '{question}' is '{user_answer}'."
# logits = nli_model.predict([(kb, hypothesis)])
# probs = F.softmax(torch.tensor(logits), dim=1).tolist()[0]
# # Label mapping for FineCat/DeBERTa: 0: contradiction, 1: entailment, 2: neutral
# labels = ["CONTRADICTION", "ENTAILMENT", "NEUTRAL"]
# max_idx = torch.tensor(logits).argmax().item()
# verdict = labels[max_idx]
# conf = probs[max_idx] * 100
# # 3. Precision Logic Gate
# if verdict == "CONTRADICTION" and conf > 40:
# status = "❌ INCORRECT (Logic Conflict)"
# elif verdict == "ENTAILMENT" and conf > 35:
# status = "✅ CORRECT (Confirmed)"
# elif rel_score > 0.40 and verdict != "CONTRADICTION":
# status = "✅ CORRECT (Likely/Inferred)"
# else:
# status = "❌ WRONG / IRRELEVANT"
# return status, f"{rel_score:.2f}", f"{verdict} ({conf:.1f}%)"
# # UI Setup remains the same
# demo = gr.Interface(
# fn=evaluate_response,
# inputs=["text", "text", "text"],
# outputs=[gr.Textbox(label="Verdict"), gr.Label(label="Topic Similarity"), gr.Label(label="NLI Reasoning")],
# title="Lightweight Reasoning Engine v3",
# description="Using ModernBERT-distilled NLI for 2025-standard reasoning on CPU."
# )
# if __name__ == "__main__":
# demo.launch()
# import gradio as gr
# import torch
# import torch.nn.functional as F
# from sentence_transformers import CrossEncoder
# # --- CONFIGURATION ---
# # GATE 1: Semantic Relevance (STS)
# # Checks if the Answer is conversationally related to the Question.
# relevance_model_name = 'cross-encoder/stsb-distilroberta-base'
# # GATE 2: Fact Checking (NLI)
# # Checks if the Answer is supported by the Knowledge Base.
# nli_model_name = 'cross-encoder/nli-deberta-v3-xsmall'
# print(f"Loading Models...\n1. {relevance_model_name}\n2. {nli_model_name}")
# rel_model = CrossEncoder(relevance_model_name, device="cpu")
# nli_model = CrossEncoder(nli_model_name, device="cpu")
# print("✅ System Ready.")
# def evaluate_response(kb, question, user_answer):
# if not kb or not question or not user_answer:
# return "⚠️ Error: Missing Input", {}, "N/A"
# logs = {}
# # --- GATE 1: RELEVANCE CHECK (STS) ---
# rel_score = rel_model.predict([(question, user_answer)])
# # FIX 1: Use .item() to safely extract float from numpy array
# rel_score_val = rel_score.item()
# logs['Gate 1 Model'] = relevance_model_name
# logs['Gate 1 Raw Score'] = f"{rel_score_val:.4f}"
# # Threshold: STS score > 0.15 usually implies relevance
# RELEVANCE_THRESHOLD = 0.15
# if rel_score_val < RELEVANCE_THRESHOLD:
# status = "❌ INCORRECT (Irrelevant)"
# logs['Verdict'] = "Blocked at Gate 1 (Answer unrelated to Question)"
# return status, logs, "Blocked"
# # --- GATE 2: FACT CHECKING (NLI) ---
# nli_logits = nli_model.predict([(kb, user_answer)])
# # FIX 2: Handle Dimensions safely
# # Convert to tensor
# nli_tensor = torch.tensor(nli_logits)
# # If the model returns a batch dimension (e.g. [1, 3]), squeeze it to flat [3]
# if nli_tensor.dim() > 1:
# nli_tensor = nli_tensor.squeeze()
# # Apply Softmax across the classes (now dim=0 is safe on a flat tensor)
# nli_probs = F.softmax(nli_tensor, dim=0).tolist()
# # Get the winner index
# max_idx = nli_tensor.argmax().item()
# # Standard NLI Labels
# labels = ["Contradiction", "Entailment", "Neutral"]
# # Safety check for model label count mismatch
# if max_idx >= len(labels):
# return "⚠️ Model Error", {"Error": "Label mismatch"}, "N/A"
# nli_verdict = labels[max_idx]
# nli_conf = nli_probs[max_idx] * 100
# logs['Gate 2 Model'] = nli_model_name
# logs['Gate 2 Probabilities'] = {
# "Contradiction": f"{nli_probs[0]*100:.1f}%",
# "Entailment": f"{nli_probs[1]*100:.1f}%",
# "Neutral": f"{nli_probs[2]*100:.1f}%"
# }
# logs['Gate 2 Verdict'] = nli_verdict
# # --- FINAL DECISION LOGIC ---
# if nli_verdict == "Entailment":
# status = "✅ CORRECT (Confirmed)"
# logs['Final Outcome'] = "Answer is Relevant and Factual."
# elif nli_verdict == "Contradiction":
# status = "❌ INCORRECT (False Information)"
# logs['Final Outcome'] = "Answer contradicts the text."
# else: # Neutral
# status = "❌ INCORRECT (Hallucination/Not in Text)"
# logs['Final Outcome'] = "Answer not found in text."
# return status, logs, f"{nli_verdict} ({nli_conf:.1f}%)"
# # --- UI SETUP ---
# with gr.Blocks(title="NLI Logic Engine v5", theme=gr.themes.Soft()) as demo:
# gr.Markdown("## 🧠 Neural Logic Engine v5.1 (Bug Fixes Applied)")
# gr.Markdown("Corrected Architecture: STS for Relevance + NLI for Fact Checking.")
# with gr.Row():
# with gr.Column(scale=1):
# kb_input = gr.Textbox(label="Knowledge Base", lines=5, value="When a lion was resting in the jungle, a mouse began racing up and down his body for fun. The lion's sleep was disturbed, and he woke in anger.")
# q_input = gr.Textbox(label="Question", value="What was the lion doing?")
# a_input = gr.Textbox(label="User Answer", value="The lion was sleeping in the jungle.")
# btn = gr.Button("Evaluate", variant="primary")
# with gr.Column(scale=1):
# verdict_out = gr.Textbox(label="Final Verdict", elem_classes="verdict")
# nli_metric = gr.Label(label="NLI Confidence")
# debug_log = gr.JSON(label="System Internals (Debug Log)")
# btn.click(
# fn=evaluate_response,
# inputs=[kb_input, q_input, a_input],
# outputs=[verdict_out, debug_log, nli_metric]
# )
# if __name__ == "__main__":
# demo.launch()
import gradio as gr
import torch
import torch.nn.functional as F
from sentence_transformers import CrossEncoder
import re
# ==============================
# CONFIGURATION
# ==============================
# Cross-encoder checkpoint used for both similarity gates
# (question/answer relevance and KB sentence selection).
RELEVANCE_MODEL = "cross-encoder/stsb-distilroberta-base"
# Cross-encoder checkpoint used for the final NLI verification gate.
NLI_MODEL = "cross-encoder/nli-deberta-v3-xsmall"

# Minimum question/answer similarity score; lower scores are rejected
# as irrelevant at Gate 1.
RELEVANCE_THRESHOLD_QA = 0.15
# Minimum similarity between the answer and its best-matching KB sentence;
# lower scores are rejected as not grounded at Gate 2.
RELEVANCE_THRESHOLD_KB = 0.30
# Minimum entailment probability required to accept an answer at Gate 3.
ENTAILMENT_THRESHOLD = 0.65

# Device string handed to both CrossEncoder constructors.
DEVICE = "cpu"

# ==============================
# LOAD MODELS
# ==============================
# Both models are loaded once at import time and shared by every request.
print("Loading models...")
rel_model = CrossEncoder(RELEVANCE_MODEL, device=DEVICE)
nli_model = CrossEncoder(NLI_MODEL, device=DEVICE)
# The check mark was mis-encoded ("βœ…") in the original message.
print("✅ Models loaded")
# ==============================
# UTILITIES
# ==============================
def split_sentences(text):
    """Break *text* into sentences.

    Surrounding whitespace is removed first. The remaining text is cut at
    every run of whitespace that directly follows '.', '!' or '?', so each
    returned sentence keeps its terminating punctuation.

    Args:
        text: The string to split.

    Returns:
        A list of sentence strings; an empty list when *text* is empty or
        contains only whitespace.
    """
    trimmed = text.strip()
    return re.split(r'(?<=[.!?])\s+', trimmed) if trimmed else []
def softmax_logits(logits):
    """Convert raw classifier logits into probabilities.

    The softmax is always taken over the last axis (the class axis), so
    each row of a multi-row batch is normalised independently. The previous
    implementation normalised over axis 0, which mixed the rows of any
    batch containing more than one item.

    Args:
        logits: Class scores as a nested list, NumPy array or tensor, shaped
            either ``[num_classes]`` or ``[batch, num_classes]``.

    Returns:
        A flat list of probabilities when the input is one-dimensional or a
        batch holding a single row; otherwise a list with one probability
        list per row.
    """
    t = torch.as_tensor(logits)
    # softmax is not defined for integer tensors, so promote them first.
    if not t.is_floating_point():
        t = t.float()
    probs = F.softmax(t, dim=-1)
    # Drop the batch axis only when it holds a single row, so callers that
    # score one pair still receive a flat list.
    if probs.dim() > 1 and probs.shape[0] == 1:
        probs = probs.squeeze(0)
    return probs.tolist()
# ==============================
# CORE EVALUATION FUNCTION
# ==============================
def evaluate_response(kb, question, user_answer):
    """Grade a user's answer against a knowledge base through three gates.

    Gate 1 scores the question/answer pair with the relevance model and
    rejects answers below ``RELEVANCE_THRESHOLD_QA``.
    Gate 2 scores the answer against every KB sentence, keeps the
    best-matching sentence, and rejects answers whose best score is below
    ``RELEVANCE_THRESHOLD_KB``.
    Gate 3 runs the NLI model with that sentence as premise and the answer
    as hypothesis; the answer is accepted only when entailment is the top
    class and its probability reaches ``ENTAILMENT_THRESHOLD``.

    Args:
        kb: Knowledge-base text the answer must be supported by.
        question: The question that was asked.
        user_answer: The answer to grade.

    Returns:
        A ``(status, logs, confidence_label)`` tuple: a human-readable
        verdict string, a dict recording every gate's inputs and scores, and
        a short label describing the score that decided the outcome.
    """
    logs = {}

    # ------------------------------
    # INPUT VALIDATION
    # ------------------------------
    # Whitespace-only fields carry no content, so they are treated as
    # missing rather than being sent through the models.
    if any(
        not (isinstance(value, str) and value.strip())
        for value in (kb, question, user_answer)
    ):
        return "⚠️ ERROR: Missing input", {}, "N/A"

    logs["Inputs"] = {
        "Question": question,
        "User Answer": user_answer,
        "KB Length (chars)": len(kb),
    }

    # ------------------------------
    # GATE 1 — QUESTION ↔ ANSWER RELEVANCE
    # ------------------------------
    qa_score = rel_model.predict([(question, user_answer)]).item()
    logs["Gate 1 — QA Relevance"] = {
        "Model": RELEVANCE_MODEL,
        "Score": round(qa_score, 4),
        "Threshold": RELEVANCE_THRESHOLD_QA,
    }
    if qa_score < RELEVANCE_THRESHOLD_QA:
        logs["Final Decision"] = "Blocked at Gate 1 (Irrelevant Answer)"
        return (
            "❌ INCORRECT (Irrelevant)",
            logs,
            f"Relevance {qa_score:.2f}",
        )

    # ------------------------------
    # GATE 2 — KB SENTENCE SELECTION (STS)
    # ------------------------------
    kb_sentences = split_sentences(kb)
    logs["KB Processing"] = {
        "Total Sentences": len(kb_sentences),
        "Sentences": kb_sentences,
    }
    # Defensive: the validation above already rejects a blank KB.
    if not kb_sentences:
        logs["Final Decision"] = "Empty KB after sentence split"
        return "❌ INCORRECT (Empty KB)", logs, "N/A"

    sentence_pairs = [(s, user_answer) for s in kb_sentences]
    sim_scores = rel_model.predict(sentence_pairs)
    best_idx = int(sim_scores.argmax())
    best_sentence = kb_sentences[best_idx]
    best_score = float(sim_scores[best_idx])
    logs["Gate 2 — KB Sentence Selection"] = {
        "Model": RELEVANCE_MODEL,
        "Best Sentence": best_sentence,
        "Best Similarity Score": round(best_score, 4),
        "Threshold": RELEVANCE_THRESHOLD_KB,
        "All Scores": [
            {"sentence": s, "score": round(float(sc), 4)}
            for s, sc in zip(kb_sentences, sim_scores)
        ],
    }
    if best_score < RELEVANCE_THRESHOLD_KB:
        logs["Final Decision"] = "Answer not grounded in KB"
        return (
            "❌ INCORRECT (Not Found in Text)",
            logs,
            f"KB Similarity {best_score:.2f}",
        )

    # ------------------------------
    # GATE 3 — NLI (Sentence ↔ Answer)
    # ------------------------------
    nli_logits = nli_model.predict([(best_sentence, user_answer)])
    probs = softmax_logits(nli_logits)
    # Index order the code assumes for the NLI model's output classes.
    labels = ["Contradiction", "Entailment", "Neutral"]
    verdict_idx = max(range(len(probs)), key=probs.__getitem__)
    verdict = labels[verdict_idx]
    confidence = probs[verdict_idx] * 100
    logs["Gate 3 — NLI Verification"] = {
        "Model": NLI_MODEL,
        "Premise": best_sentence,
        "Hypothesis": user_answer,
        "Probabilities": {
            label: f"{prob*100:.2f}%" for label, prob in zip(labels, probs)
        },
        "Verdict": verdict,
        "Confidence": f"{confidence:.2f}%",
        "Entailment Threshold": f"{ENTAILMENT_THRESHOLD*100:.0f}%",
    }

    # ------------------------------
    # FINAL DECISION
    # ------------------------------
    if verdict == "Entailment":
        if probs[1] >= ENTAILMENT_THRESHOLD:
            logs["Final Decision"] = "Answer is Supported by Text"
            return (
                "✅ CORRECT (Confirmed)",
                logs,
                f"Entailment {confidence:.1f}%",
            )
        # Entailment won but is too weak to accept. The answer is still
        # rejected, but the label names the class the percentage belongs to
        # instead of presenting the entailment probability as "Neutral".
        logs["Final Decision"] = (
            "Answer Not Explicitly Stated (entailment below threshold)"
        )
        return (
            "❌ INCORRECT (Neutral / Not in Text)",
            logs,
            f"Entailment {confidence:.1f}% (below threshold)",
        )

    if verdict == "Contradiction":
        logs["Final Decision"] = "Answer Contradicts Text"
        return (
            "❌ INCORRECT (Contradiction)",
            logs,
            f"Contradiction {confidence:.1f}%",
        )

    logs["Final Decision"] = "Answer Not Explicitly Stated"
    return (
        "❌ INCORRECT (Neutral / Not in Text)",
        logs,
        f"Neutral {confidence:.1f}%",
    )
# ==============================
# GRADIO UI
# ==============================
# Two-column page: the inputs and the Evaluate button on the left, the
# verdict, confidence label and JSON debug log on the right.
with gr.Blocks(theme=gr.themes.Soft(), title="Neural Logic Engine v6") as demo:
    gr.Markdown("## 🧠 Neural Logic Engine v6")
    gr.Markdown(
        "\n".join(
            (
                "**Architecture:**",
                "- Gate 1: Question ↔ Answer relevance (STS)",
                "- Gate 2: KB sentence grounding (STS)",
                "- Gate 3: Sentence-level NLI verification",
                "- Fully logged, deterministic decisions",
            )
        )
    )

    with gr.Row():
        # Left column: everything the user fills in.
        with gr.Column(scale=1):
            kb_input = gr.Textbox(
                value=(
                    "When a lion was resting in the jungle, a mouse began racing up and down his body for fun. "
                    "The lion's sleep was disturbed, and he woke in anger."
                ),
                lines=6,
                label="Knowledge Base",
            )
            q_input = gr.Textbox(value="What was the lion doing?", label="Question")
            a_input = gr.Textbox(
                value="The lion was sleeping in the jungle.",
                label="User Answer",
            )
            btn = gr.Button("Evaluate", variant="primary")

        # Right column: the three values returned by evaluate_response.
        with gr.Column(scale=1):
            verdict_out = gr.Textbox(label="Final Verdict")
            confidence_out = gr.Label(label="Model Confidence")
            debug_log = gr.JSON(label="System Internals (FULL DEBUG LOG)")

    # The outputs are listed in the order evaluate_response returns them:
    # status string, log dict, confidence label.
    btn.click(
        evaluate_response,
        inputs=[kb_input, q_input, a_input],
        outputs=[verdict_out, debug_log, confidence_out],
    )

if __name__ == "__main__":
    demo.launch()