heerjtdev committed
Commit d9658ea · verified · 1 Parent(s): 4898472

Update app.py

Files changed (1)
  1. app.py +82 -80
app.py CHANGED
@@ -63,111 +63,113 @@
 
 
 
-
-
 import gradio as gr
 import torch
 import torch.nn.functional as F
 from sentence_transformers import CrossEncoder
 
 # --- CONFIGURATION ---
-# Model 1: QA Relevance Validator
-# This model is trained on MS MARCO. It predicts how well a passage answers a query.
-# High score = The answer addresses the question directly.
-# Low score = Irrelevant (e.g., Q: "What did the lion do?", A: "The mouse's name is Lucy")
-qa_model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
-
-# Model 2: Fact Checker (NLI)
-# We use a DeBERTa-v3-xsmall or similar high-performance NLI model.
-# It is very robust at detecting Hallucinations vs Entailment.
 nli_model_name = 'cross-encoder/nli-deberta-v3-xsmall'
 
-print("Initializing Reasoning Engines...")
-qa_model = CrossEncoder(qa_model_name, device="cpu")
 nli_model = CrossEncoder(nli_model_name, device="cpu")
-print("System Ready.")
 
 def evaluate_response(kb, question, user_answer):
     if not kb or not question or not user_answer:
-        return "⚠️ Missing Input", "N/A", "N/A"
 
-    # --- GATE 1: Question-Answer Relevance Check ---
-    # We ask the model: "Is 'user_answer' a relevant response to 'question'?"
-    # MS-MARCO models output unbounded logits. Usually > 0 means relevant.
-    qa_scores = qa_model.predict([(question, user_answer)])
-    qa_score = qa_scores.item()
 
-    # Sigmoid to make it easier to read (0-100%)
-    qa_confidence = (1 / (1 + torch.exp(torch.tensor(-qa_score)))).item() * 100
-
-    # Strict Relevance Threshold (Adjustable)
-    # If the QA score is too low, we reject it immediately as irrelevant.
-    is_relevant = qa_score > 1.0  # Logit threshold (approx 73% confidence)
-
-    if not is_relevant:
-        return (
-            "❌ INCORRECT (Irrelevant Answer)",
-            f"Low Relevance ({qa_confidence:.1f}%)",
-            "Skipped (Not an answer)"
-        )
-
-    # --- GATE 2: Knowledge Base Verification (NLI) ---
-    # Now that we know it IS an answer, we check if it is TRUE based on the KB.
-    # Premise = KB
-    # Hypothesis = user_answer (Clean check, no complex prompt engineering needed)
     nli_logits = nli_model.predict([(kb, user_answer)])
     nli_probs = F.softmax(torch.tensor(nli_logits), dim=0).tolist()
 
-    # Label mapping for this specific model: 0: Contradiction, 1: Entailment, 2: Neutral
-    # Note: Different models map differently. For 'cross-encoder/nli-deberta-v3-xsmall':
-    # Label 0 = Contradiction, Label 1 = Entailment, Label 2 = Neutral
-    labels = ["CONTRADICTION", "ENTAILMENT", "NEUTRAL"]
     max_idx = torch.tensor(nli_logits).argmax().item()
-    verdict_label = labels[max_idx]
-    verdict_conf = nli_probs[max_idx] * 100
-
-    # --- FINAL VERDICT LOGIC ---
-    status = ""
-
-    if verdict_label == "ENTAILMENT":
         status = "✅ CORRECT (Confirmed)"
-    elif verdict_label == "CONTRADICTION":
-        status = "❌ INCORRECT (Factually False)"
-    else:  # NEUTRAL
-        # It answers the question, but the fact isn't in the text (Hallucination)
-        status = " INCORRECT (Not in text)"
-
-    return (
-        status,
-        f"High Relevance ({qa_confidence:.1f}%)",
-        f"{verdict_label} ({verdict_conf:.1f}%)"
-    )
 
 # --- UI SETUP ---
-with gr.Blocks(title="Lightweight Reasoning Engine v4", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 🧠 Neural Answer Checker v4 (Double-Gate Logic)")
-    gr.Markdown("This system uses two distinct brains: one checks if you answered the *Question*, the other checks if your answer matches the *Text*.")
 
     with gr.Row():
-        kb_input = gr.Textbox(label="Knowledge Base (Context)", lines=6, placeholder="Paste story here...", value="When a lion was resting in the jungle, a mouse began racing up and down his body for fun. The lion's sleep was disturbed, and he woke in anger.")
-
-    with gr.Row():
-        q_input = gr.Textbox(label="Question", placeholder="e.g., What was the lion doing?")
-        a_input = gr.Textbox(label="User Answer", placeholder="e.g., He was sleeping.")
-
-    check_btn = gr.Button("Evaluate Answer", variant="primary")
-
-    with gr.Row():
-        verdict_output = gr.Textbox(label="Final Verdict", elem_classes="verdict")
-
-    with gr.Row():
-        qa_metric = gr.Label(label="Gate 1: QA Relevance")
-        nli_metric = gr.Label(label="Gate 2: Fact Check")
-
-    check_btn.click(
         fn=evaluate_response,
         inputs=[kb_input, q_input, a_input],
-        outputs=[verdict_output, qa_metric, nli_metric]
     )
 
 if __name__ == "__main__":
 
 
 
 
 import gradio as gr
 import torch
 import torch.nn.functional as F
 from sentence_transformers import CrossEncoder
+import time
 
 # --- CONFIGURATION ---
+# GATE 1: Relevance (Is the answer related to the question?)
+# We switch from MS-MARCO (Search) to STS (Semantic Similarity).
+# This prevents the "Lion Sleeping" failure.
+relevance_model_name = 'cross-encoder/stsb-distilroberta-base'
+
+# GATE 2: Fact Checking (Is the answer supported by the text?)
+# DeBERTa-v3 is state-of-the-art for NLI.
 nli_model_name = 'cross-encoder/nli-deberta-v3-xsmall'
 
+print(f"Loading Models...\n1. {relevance_model_name}\n2. {nli_model_name}")
+# We load them once.
+rel_model = CrossEncoder(relevance_model_name, device="cpu")
 nli_model = CrossEncoder(nli_model_name, device="cpu")
+print("System Ready.")
 
 def evaluate_response(kb, question, user_answer):
     if not kb or not question or not user_answer:
+        return "⚠️ Error: Missing Input", {}, "N/A"
 
+    logs = {}  # Dictionary to store debug info
+
+    # --- GATE 1: RELEVANCE CHECK (STS) ---
+    # Does the answer make sense in the context of the question?
+    # STS models output a score from 0.0 to 1.0 (usually).
+    rel_score = rel_model.predict([(question, user_answer)])
+
+    # Check if the model output is raw logits or normalized
+    # STSb models usually output 0-1. If not, we clip/normalize.
+    rel_score_val = float(rel_score)
+    logs['Gate 1 Model'] = relevance_model_name
+    logs['Gate 1 Raw Score'] = f"{rel_score_val:.4f}"
+
+    # Threshold: STS scores are usually tighter.
+    # > 0.15 is usually enough to say "These sentences are related".
+    # "Lion sleeping" vs "What lion doing" should score ~0.4 - 0.6
+    RELEVANCE_THRESHOLD = 0.15
 
+    if rel_score_val < RELEVANCE_THRESHOLD:
+        status = "❌ INCORRECT (Irrelevant)"
+        logs['Verdict'] = "Blocked at Gate 1 (Answer unrelated to Question)"
+        return status, logs, "Blocked"
+
+    # --- GATE 2: FACT CHECKING (NLI) ---
+    # Does the Knowledge Base entail the Answer?
     nli_logits = nli_model.predict([(kb, user_answer)])
     nli_probs = F.softmax(torch.tensor(nli_logits), dim=0).tolist()
 
+    # DeBERTa-v3-xsmall Labels: 0: Contradiction, 1: Entailment, 2: Neutral
+    labels = ["Contradiction", "Entailment", "Neutral"]
     max_idx = torch.tensor(nli_logits).argmax().item()
+    nli_verdict = labels[max_idx]
+    nli_conf = nli_probs[max_idx] * 100
+
+    logs['Gate 2 Model'] = nli_model_name
+    logs['Gate 2 Probabilities'] = {
+        "Contradiction": f"{nli_probs[0]*100:.1f}%",
+        "Entailment": f"{nli_probs[1]*100:.1f}%",
+        "Neutral": f"{nli_probs[2]*100:.1f}%"
+    }
+    logs['Gate 2 Verdict'] = nli_verdict
+
+    # --- FINAL DECISION LOGIC ---
+    if nli_verdict == "Entailment":
         status = "✅ CORRECT (Confirmed)"
+        logs['Final Outcome'] = "Answer is Relevant and Factual."
+
+    elif nli_verdict == "Contradiction":
+        status = "❌ INCORRECT (False Information)"
+        logs['Final Outcome'] = "Answer contradicts the text."
+
+    else:  # Neutral
+        # The answer is relevant to the question, but the TEXT doesn't mention it.
+        # e.g., "The lion likes pizza." (Relevant topic, but hallucinated fact)
+        status = " INCORRECT (Hallucination/Not in Text)"
+        logs['Final Outcome'] = "Answer not found in text."
+
+    return status, logs, f"{nli_verdict} ({nli_conf:.1f}%)"
 
 # --- UI SETUP ---
+with gr.Blocks(title="NLI Logic Engine v5 (Debug Enabled)", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## 🧠 Neural Logic Engine v5")
+    gr.Markdown("Corrected Architecture: Uses **STS (Semantic Similarity)** for Relevance and **NLI** for Fact Checking.")
 
     with gr.Row():
+        with gr.Column(scale=1):
+            kb_input = gr.Textbox(label="Knowledge Base", lines=5, value="When a lion was resting in the jungle, a mouse began racing up and down his body for fun. The lion's sleep was disturbed, and he woke in anger.")
+            q_input = gr.Textbox(label="Question", value="What was the lion doing?")
+            a_input = gr.Textbox(label="User Answer", value="The lion was sleeping in the jungle.")
+            btn = gr.Button("Evaluate", variant="primary")
+
+        with gr.Column(scale=1):
+            verdict_out = gr.Textbox(label="Final Verdict", elem_classes="verdict")
+            nli_metric = gr.Label(label="NLI Confidence")
+            # JSON output for full transparency
+            debug_log = gr.JSON(label="System Internals (Debug Log)")
+
+    btn.click(
         fn=evaluate_response,
         inputs=[kb_input, q_input, a_input],
+        outputs=[verdict_out, debug_log, nli_metric]
     )
 
 if __name__ == "__main__":
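
Below is a minimal standalone sketch (not part of the commit) that exercises the same double-gate flow outside the Gradio UI, for anyone reviewing the new logic. It assumes the two cross-encoder checkpoints referenced above can be downloaded and that torch and sentence-transformers are installed; the sample answers and the 0.15 threshold mirror the values used in app.py and are illustrative only. The NLI logits are squeezed to a flat 3-element vector before the softmax, since CrossEncoder.predict() can return a (1, 3) array for a single input pair.

import torch
import torch.nn.functional as F
from sentence_transformers import CrossEncoder

# Same checkpoints as app.py: Gate 1 = STS relevance, Gate 2 = NLI fact check.
rel_model = CrossEncoder('cross-encoder/stsb-distilroberta-base', device="cpu")
nli_model = CrossEncoder('cross-encoder/nli-deberta-v3-xsmall', device="cpu")

kb = ("When a lion was resting in the jungle, a mouse began racing up and down "
      "his body for fun. The lion's sleep was disturbed, and he woke in anger.")
question = "What was the lion doing?"

answers = [
    "The lion was sleeping in the jungle.",  # should pass both gates (Entailment)
    "The lion was eating a pizza.",          # relevant topic, but not in the text (Neutral)
    "My favourite colour is blue.",          # should be blocked at Gate 1 (irrelevant)
]

for answer in answers:
    # Gate 1: semantic similarity between question and answer, roughly in [0, 1].
    rel = float(rel_model.predict([(question, answer)])[0])
    if rel < 0.15:  # same threshold as app.py
        print(f"{answer!r}: blocked at Gate 1 (relevance {rel:.2f})")
        continue

    # Gate 2: does the knowledge base entail the answer?
    logits = torch.tensor(nli_model.predict([(kb, answer)])).squeeze()  # -> shape (3,)
    probs = F.softmax(logits, dim=-1)
    label = ["Contradiction", "Entailment", "Neutral"][int(probs.argmax())]
    print(f"{answer!r}: relevance {rel:.2f}, NLI {label} ({probs.max().item() * 100:.1f}%)")

With these settings the first answer should clear Gate 1 (the "Lion Sleeping" case the new comments mention) and come back as Entailment, while the last one should be rejected before the NLI model is called at all.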