Update app.py
app.py
CHANGED
@@ -60,27 +60,21 @@
 
 
 
-
-
-
 import gradio as gr
 import torch
 import torch.nn.functional as F
 from sentence_transformers import CrossEncoder
-import time
 
 # --- CONFIGURATION ---
-# GATE 1: Relevance (
-#
-# This prevents the "Lion Sleeping" failure.
+# GATE 1: Semantic Relevance (STS)
+# Checks if the Answer is conversationally related to the Question.
 relevance_model_name = 'cross-encoder/stsb-distilroberta-base'
 
-# GATE 2: Fact Checking (
-#
+# GATE 2: Fact Checking (NLI)
+# Checks if the Answer is supported by the Knowledge Base.
 nli_model_name = 'cross-encoder/nli-deberta-v3-xsmall'
 
 print(f"Loading Models...\n1. {relevance_model_name}\n2. {nli_model_name}")
-# We load them once.
 rel_model = CrossEncoder(relevance_model_name, device="cpu")
 nli_model = CrossEncoder(nli_model_name, device="cpu")
 print("✅ System Ready.")
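Note on the label order used later in this diff: the hard-coded `labels` list in Gate 2 assumes the checkpoint scores classes in the order contradiction, entailment, neutral. A quick sanity check, assuming the checkpoint publishes its mapping in `config.json` (standard for Hugging Face models):

    from transformers import AutoConfig

    # Inspect the class order the NLI checkpoint was trained with,
    # rather than trusting a hard-coded list.
    config = AutoConfig.from_pretrained('cross-encoder/nli-deberta-v3-xsmall')
    print(config.id2label)  # expected: {0: 'contradiction', 1: 'entailment', 2: 'neutral'}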
@@ -89,22 +83,18 @@ def evaluate_response(kb, question, user_answer):
     if not kb or not question or not user_answer:
         return "⚠️ Error: Missing Input", {}, "N/A"
 
     logs = {}
 
     # --- GATE 1: RELEVANCE CHECK (STS) ---
-    # Does the answer make sense in the context of the question?
-    # STS models output a score from 0.0 to 1.0 (usually).
     rel_score = rel_model.predict([(question, user_answer)])
 
-    #
-
-
+    # FIX 1: Use .item() to safely extract float from numpy array
+    rel_score_val = rel_score.item()
+
     logs['Gate 1 Model'] = relevance_model_name
     logs['Gate 1 Raw Score'] = f"{rel_score_val:.4f}"
 
-    # Threshold: STS
-    # > 0.15 is usually enough to say "These sentences are related".
-    # "Lion sleeping" vs "What lion doing" should score ~0.4 - 0.6
+    # Threshold: STS score > 0.15 usually implies relevance
    RELEVANCE_THRESHOLD = 0.15
 
     if rel_score_val < RELEVANCE_THRESHOLD:
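Why FIX 1 is needed: `CrossEncoder.predict` returns a NumPy array (one score per input pair), not a float, and formatting that raw array with `:.4f` raises a TypeError. A minimal sketch with a dummy array standing in for the model call:

    import numpy as np

    # Stand-in for rel_model.predict([(question, user_answer)]),
    # which yields one score for the single (question, answer) pair.
    rel_score = np.array([0.4321], dtype=np.float32)

    # .item() is a safe scalar extraction; it would raise if the
    # array held more than one element.
    rel_score_val = rel_score.item()
    print(f"{rel_score_val:.4f}")  # -> 0.4321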
@@ -113,13 +103,29 @@ def evaluate_response(kb, question, user_answer):
         return status, logs, "Blocked"
 
     # --- GATE 2: FACT CHECKING (NLI) ---
-    # Does the Knowledge Base entail the Answer?
     nli_logits = nli_model.predict([(kb, user_answer)])
-    nli_probs = F.softmax(torch.tensor(nli_logits), dim=0).tolist()
 
-    #
+    # FIX 2: Handle Dimensions safely
+    # Convert to tensor
+    nli_tensor = torch.tensor(nli_logits)
+
+    # If the model returns a batch dimension (e.g. [1, 3]), squeeze it to flat [3]
+    if nli_tensor.dim() > 1:
+        nli_tensor = nli_tensor.squeeze()
+
+    # Apply Softmax across the classes (now dim=0 is safe on a flat tensor)
+    nli_probs = F.softmax(nli_tensor, dim=0).tolist()
+
+    # Get the winner index
+    max_idx = nli_tensor.argmax().item()
+
+    # Standard NLI Labels
     labels = ["Contradiction", "Entailment", "Neutral"]
-
+
+    # Safety check for model label count mismatch
+    if max_idx >= len(labels):
+        return "⚠️ Model Error", {"Error": "Label mismatch"}, "N/A"
+
     nli_verdict = labels[max_idx]
     nli_conf = nli_probs[max_idx] * 100
 
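FIX 2 in isolation: the squeeze guard makes the softmax robust whether `predict` returns flat `[3]` logits or a batched `[1, 3]`. A self-contained sketch with dummy logits (no model download needed):

    import torch
    import torch.nn.functional as F

    # Dummy stand-in for nli_model.predict([(kb, user_answer)]);
    # some library versions return shape (1, 3), others a flat (3,).
    nli_logits = [[-1.2, 3.4, 0.1]]

    nli_tensor = torch.tensor(nli_logits)
    if nli_tensor.dim() > 1:   # (1, 3) -> (3,)
        nli_tensor = nli_tensor.squeeze()

    nli_probs = F.softmax(nli_tensor, dim=0).tolist()
    max_idx = nli_tensor.argmax().item()

    labels = ["Contradiction", "Entailment", "Neutral"]
    print(labels[max_idx], f"{nli_probs[max_idx] * 100:.1f}%")  # Entailment 95.5%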
@@ -141,17 +147,15 @@ def evaluate_response(kb, question, user_answer):
         logs['Final Outcome'] = "Answer contradicts the text."
 
     else: # Neutral
-        # The answer is relevant to the question, but the TEXT doesn't mention it.
-        # e.g., "The lion likes pizza." (Relevant topic, but hallucinated fact)
         status = "❌ INCORRECT (Hallucination/Not in Text)"
         logs['Final Outcome'] = "Answer not found in text."
 
     return status, logs, f"{nli_verdict} ({nli_conf:.1f}%)"
 
 # --- UI SETUP ---
-with gr.Blocks(title="NLI Logic Engine v5 (Debug Enabled)", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 🧠 Neural Logic Engine v5")
-    gr.Markdown("Corrected Architecture:
+with gr.Blocks(title="NLI Logic Engine v5", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## 🧠 Neural Logic Engine v5.1 (Bug Fixes Applied)")
+    gr.Markdown("Corrected Architecture: STS for Relevance + NLI for Fact Checking.")
 
     with gr.Row():
         with gr.Column(scale=1):
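The Entailment and Contradiction branches fall between the hunks shown above. For orientation only, a hypothetical condensed version of the full three-way mapping; the status strings for the first two branches are assumed, not taken from the diff:

    # Hypothetical sketch; only the Neutral branch appears verbatim in the diff.
    def verdict_to_status(nli_verdict: str) -> str:
        if nli_verdict == "Entailment":
            return "✅ CORRECT"                               # assumed wording
        elif nli_verdict == "Contradiction":
            return "❌ INCORRECT (Contradicts Text)"          # assumed wording
        else:  # Neutral: on-topic answer, but the fact is not in the text
            return "❌ INCORRECT (Hallucination/Not in Text)"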
@@ -163,7 +167,6 @@ with gr.Blocks(title="NLI Logic Engine v5 (Debug Enabled)", theme=gr.themes.Soft
         with gr.Column(scale=1):
             verdict_out = gr.Textbox(label="Final Verdict", elem_classes="verdict")
             nli_metric = gr.Label(label="NLI Confidence")
-            # JSON output for full transparency
             debug_log = gr.JSON(label="System Internals (Debug Log)")
 
     btn.click(
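The diff cuts off inside `btn.click(`. Based on `evaluate_response`'s return order (status, logs, verdict string), the wiring presumably resembles the following; the input component names `kb_in`, `q_in`, and `ans_in` are placeholders, not from the source:

    # Hypothetical wiring for the truncated call; the three input
    # component names are assumed.
    btn.click(
        fn=evaluate_response,
        inputs=[kb_in, q_in, ans_in],
        outputs=[verdict_out, debug_log, nli_metric],
    )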