heerjtdev committed
Commit 82d9acf · verified · 1 Parent(s): 8189a78

Update app.py

Files changed (1)
  1. app.py +310 -83
app.py CHANGED
@@ -60,120 +60,347 @@
-import gradio as gr
-import torch
-import torch.nn.functional as F
-from sentence_transformers import CrossEncoder
-
-# --- CONFIGURATION ---
-# GATE 1: Semantic Relevance (STS)
-# Checks if the Answer is conversationally related to the Question.
-relevance_model_name = 'cross-encoder/stsb-distilroberta-base'
-
-# GATE 2: Fact Checking (NLI)
-# Checks if the Answer is supported by the Knowledge Base.
-nli_model_name = 'cross-encoder/nli-deberta-v3-xsmall'
-
-print(f"Loading Models...\n1. {relevance_model_name}\n2. {nli_model_name}")
-rel_model = CrossEncoder(relevance_model_name, device="cpu")
-nli_model = CrossEncoder(nli_model_name, device="cpu")
-print("✅ System Ready.")
-
-def evaluate_response(kb, question, user_answer):
-    if not kb or not question or not user_answer:
-        return "⚠️ Error: Missing Input", {}, "N/A"
-
-    logs = {}
-
-    # --- GATE 1: RELEVANCE CHECK (STS) ---
-    rel_score = rel_model.predict([(question, user_answer)])
-
-    # FIX 1: Use .item() to safely extract float from numpy array
-    rel_score_val = rel_score.item()
-
-    logs['Gate 1 Model'] = relevance_model_name
-    logs['Gate 1 Raw Score'] = f"{rel_score_val:.4f}"
-
-    # Threshold: STS score > 0.15 usually implies relevance
-    RELEVANCE_THRESHOLD = 0.15
-
-    if rel_score_val < RELEVANCE_THRESHOLD:
-        status = "❌ INCORRECT (Irrelevant)"
-        logs['Verdict'] = "Blocked at Gate 1 (Answer unrelated to Question)"
-        return status, logs, "Blocked"
-
-    # --- GATE 2: FACT CHECKING (NLI) ---
-    nli_logits = nli_model.predict([(kb, user_answer)])
-
-    # FIX 2: Handle dimensions safely
-    # Convert to tensor
-    nli_tensor = torch.tensor(nli_logits)
-
-    # If the model returns a batch dimension (e.g. [1, 3]), squeeze it to flat [3]
-    if nli_tensor.dim() > 1:
-        nli_tensor = nli_tensor.squeeze()
-
-    # Apply softmax across the classes (now dim=0 is safe on a flat tensor)
-    nli_probs = F.softmax(nli_tensor, dim=0).tolist()
-
-    # Get the winner index
-    max_idx = nli_tensor.argmax().item()
-
-    # Standard NLI labels
-    labels = ["Contradiction", "Entailment", "Neutral"]
-
-    # Safety check for model label count mismatch
-    if max_idx >= len(labels):
-        return "⚠️ Model Error", {"Error": "Label mismatch"}, "N/A"
-
-    nli_verdict = labels[max_idx]
-    nli_conf = nli_probs[max_idx] * 100
-
-    logs['Gate 2 Model'] = nli_model_name
-    logs['Gate 2 Probabilities'] = {
-        "Contradiction": f"{nli_probs[0]*100:.1f}%",
-        "Entailment": f"{nli_probs[1]*100:.1f}%",
-        "Neutral": f"{nli_probs[2]*100:.1f}%"
-    }
-    logs['Gate 2 Verdict'] = nli_verdict
-
-    # --- FINAL DECISION LOGIC ---
-    if nli_verdict == "Entailment":
-        status = "✅ CORRECT (Confirmed)"
-        logs['Final Outcome'] = "Answer is Relevant and Factual."
-
-    elif nli_verdict == "Contradiction":
-        status = "❌ INCORRECT (False Information)"
-        logs['Final Outcome'] = "Answer contradicts the text."
-
-    else:  # Neutral
-        status = "❌ INCORRECT (Hallucination/Not in Text)"
-        logs['Final Outcome'] = "Answer not found in text."
-
-    return status, logs, f"{nli_verdict} ({nli_conf:.1f}%)"
-
-# --- UI SETUP ---
-with gr.Blocks(title="NLI Logic Engine v5", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 🧠 Neural Logic Engine v5.1 (Bug Fixes Applied)")
-    gr.Markdown("Corrected Architecture: STS for Relevance + NLI for Fact Checking.")
-
     with gr.Row():
         with gr.Column(scale=1):
-            kb_input = gr.Textbox(label="Knowledge Base", lines=5, value="When a lion was resting in the jungle, a mouse began racing up and down his body for fun. The lion's sleep was disturbed, and he woke in anger.")
-            q_input = gr.Textbox(label="Question", value="What was the lion doing?")
-            a_input = gr.Textbox(label="User Answer", value="The lion was sleeping in the jungle.")
             btn = gr.Button("Evaluate", variant="primary")
-
         with gr.Column(scale=1):
-            verdict_out = gr.Textbox(label="Final Verdict", elem_classes="verdict")
-            nli_metric = gr.Label(label="NLI Confidence")
-            debug_log = gr.JSON(label="System Internals (Debug Log)")
 
     btn.click(
         fn=evaluate_response,
         inputs=[kb_input, q_input, a_input],
-        outputs=[verdict_out, debug_log, nli_metric]
     )
 
 if __name__ == "__main__":
-    demo.launch()
 
+# import gradio as gr
+# import torch
+# import torch.nn.functional as F
+# from sentence_transformers import CrossEncoder
+
+# # --- CONFIGURATION ---
+# # GATE 1: Semantic Relevance (STS)
+# # Checks if the Answer is conversationally related to the Question.
+# relevance_model_name = 'cross-encoder/stsb-distilroberta-base'
+
+# # GATE 2: Fact Checking (NLI)
+# # Checks if the Answer is supported by the Knowledge Base.
+# nli_model_name = 'cross-encoder/nli-deberta-v3-xsmall'
+
+# print(f"Loading Models...\n1. {relevance_model_name}\n2. {nli_model_name}")
+# rel_model = CrossEncoder(relevance_model_name, device="cpu")
+# nli_model = CrossEncoder(nli_model_name, device="cpu")
+# print("✅ System Ready.")
+
+# def evaluate_response(kb, question, user_answer):
+#     if not kb or not question or not user_answer:
+#         return "⚠️ Error: Missing Input", {}, "N/A"
+
+#     logs = {}
+
+#     # --- GATE 1: RELEVANCE CHECK (STS) ---
+#     rel_score = rel_model.predict([(question, user_answer)])
+
+#     # FIX 1: Use .item() to safely extract float from numpy array
+#     rel_score_val = rel_score.item()
+
+#     logs['Gate 1 Model'] = relevance_model_name
+#     logs['Gate 1 Raw Score'] = f"{rel_score_val:.4f}"
+
+#     # Threshold: STS score > 0.15 usually implies relevance
+#     RELEVANCE_THRESHOLD = 0.15
+
+#     if rel_score_val < RELEVANCE_THRESHOLD:
+#         status = "❌ INCORRECT (Irrelevant)"
+#         logs['Verdict'] = "Blocked at Gate 1 (Answer unrelated to Question)"
+#         return status, logs, "Blocked"
+
+#     # --- GATE 2: FACT CHECKING (NLI) ---
+#     nli_logits = nli_model.predict([(kb, user_answer)])
+
+#     # FIX 2: Handle dimensions safely
+#     # Convert to tensor
+#     nli_tensor = torch.tensor(nli_logits)
+
+#     # If the model returns a batch dimension (e.g. [1, 3]), squeeze it to flat [3]
+#     if nli_tensor.dim() > 1:
+#         nli_tensor = nli_tensor.squeeze()
+
+#     # Apply softmax across the classes (now dim=0 is safe on a flat tensor)
+#     nli_probs = F.softmax(nli_tensor, dim=0).tolist()
+
+#     # Get the winner index
+#     max_idx = nli_tensor.argmax().item()
+
+#     # Standard NLI labels
+#     labels = ["Contradiction", "Entailment", "Neutral"]
+
+#     # Safety check for model label count mismatch
+#     if max_idx >= len(labels):
+#         return "⚠️ Model Error", {"Error": "Label mismatch"}, "N/A"
+
+#     nli_verdict = labels[max_idx]
+#     nli_conf = nli_probs[max_idx] * 100
+
+#     logs['Gate 2 Model'] = nli_model_name
+#     logs['Gate 2 Probabilities'] = {
+#         "Contradiction": f"{nli_probs[0]*100:.1f}%",
+#         "Entailment": f"{nli_probs[1]*100:.1f}%",
+#         "Neutral": f"{nli_probs[2]*100:.1f}%"
+#     }
+#     logs['Gate 2 Verdict'] = nli_verdict
+
+#     # --- FINAL DECISION LOGIC ---
+#     if nli_verdict == "Entailment":
+#         status = "✅ CORRECT (Confirmed)"
+#         logs['Final Outcome'] = "Answer is Relevant and Factual."
+
+#     elif nli_verdict == "Contradiction":
+#         status = "❌ INCORRECT (False Information)"
+#         logs['Final Outcome'] = "Answer contradicts the text."
+
+#     else:  # Neutral
+#         status = "❌ INCORRECT (Hallucination/Not in Text)"
+#         logs['Final Outcome'] = "Answer not found in text."
+
+#     return status, logs, f"{nli_verdict} ({nli_conf:.1f}%)"
+
+# # --- UI SETUP ---
+# with gr.Blocks(title="NLI Logic Engine v5", theme=gr.themes.Soft()) as demo:
+#     gr.Markdown("## 🧠 Neural Logic Engine v5.1 (Bug Fixes Applied)")
+#     gr.Markdown("Corrected Architecture: STS for Relevance + NLI for Fact Checking.")
+
+#     with gr.Row():
+#         with gr.Column(scale=1):
+#             kb_input = gr.Textbox(label="Knowledge Base", lines=5, value="When a lion was resting in the jungle, a mouse began racing up and down his body for fun. The lion's sleep was disturbed, and he woke in anger.")
+#             q_input = gr.Textbox(label="Question", value="What was the lion doing?")
+#             a_input = gr.Textbox(label="User Answer", value="The lion was sleeping in the jungle.")
+#             btn = gr.Button("Evaluate", variant="primary")
+
+#         with gr.Column(scale=1):
+#             verdict_out = gr.Textbox(label="Final Verdict", elem_classes="verdict")
+#             nli_metric = gr.Label(label="NLI Confidence")
+#             debug_log = gr.JSON(label="System Internals (Debug Log)")
+
+#     btn.click(
+#         fn=evaluate_response,
+#         inputs=[kb_input, q_input, a_input],
+#         outputs=[verdict_out, debug_log, nli_metric]
+#     )
+
+# if __name__ == "__main__":
+#     demo.launch()
+
+
+import gradio as gr
+import torch
+import torch.nn.functional as F
+from sentence_transformers import CrossEncoder
+import re
+
+# ==============================
+# CONFIGURATION
+# ==============================
+
+RELEVANCE_MODEL = "cross-encoder/stsb-distilroberta-base"
+NLI_MODEL = "cross-encoder/nli-deberta-v3-xsmall"
+
+RELEVANCE_THRESHOLD_QA = 0.15
+RELEVANCE_THRESHOLD_KB = 0.30
+ENTAILMENT_THRESHOLD = 0.65
+
+DEVICE = "cpu"
+
+# ==============================
+# LOAD MODELS
+# ==============================
+
+print("Loading models...")
+rel_model = CrossEncoder(RELEVANCE_MODEL, device=DEVICE)
+nli_model = CrossEncoder(NLI_MODEL, device=DEVICE)
+print("✅ Models loaded")
+
+# ==============================
+# UTILITIES
+# ==============================
+
+def split_sentences(text):
+    text = text.strip()
+    if not text:
+        return []
+    return re.split(r'(?<=[.!?])\s+', text)
+
+def softmax_logits(logits):
+    t = torch.tensor(logits)
+    if t.dim() > 1:
+        t = t.squeeze(0)
+    probs = F.softmax(t, dim=0).tolist()
+    return probs
+
+# ==============================
+# CORE EVALUATION FUNCTION
+# ==============================
+
+def evaluate_response(kb, question, user_answer):
+    logs = {}
+
+    # ------------------------------
+    # INPUT VALIDATION
+    # ------------------------------
+    if not kb or not question or not user_answer:
+        return "⚠️ ERROR: Missing input", {}, "N/A"
+
+    logs["Inputs"] = {
+        "Question": question,
+        "User Answer": user_answer,
+        "KB Length (chars)": len(kb)
+    }
+
+    # ------------------------------
+    # GATE 1 - QUESTION ↔ ANSWER RELEVANCE
+    # ------------------------------
+    qa_score = rel_model.predict([(question, user_answer)]).item()
+
+    logs["Gate 1 - QA Relevance"] = {
+        "Model": RELEVANCE_MODEL,
+        "Score": round(qa_score, 4),
+        "Threshold": RELEVANCE_THRESHOLD_QA
+    }
+
+    if qa_score < RELEVANCE_THRESHOLD_QA:
+        logs["Final Decision"] = "Blocked at Gate 1 (Irrelevant Answer)"
+        return (
+            "❌ INCORRECT (Irrelevant)",
+            logs,
+            f"Relevance {qa_score:.2f}"
+        )
+
+    # ------------------------------
+    # GATE 2 - KB SENTENCE SELECTION (STS)
+    # ------------------------------
+    kb_sentences = split_sentences(kb)
+    logs["KB Processing"] = {
+        "Total Sentences": len(kb_sentences),
+        "Sentences": kb_sentences
+    }
+
+    if not kb_sentences:
+        logs["Final Decision"] = "Empty KB after sentence split"
+        return "❌ INCORRECT (Empty KB)", logs, "N/A"
+
+    sentence_pairs = [(s, user_answer) for s in kb_sentences]
+    sim_scores = rel_model.predict(sentence_pairs)
+
+    best_idx = int(sim_scores.argmax())
+    best_sentence = kb_sentences[best_idx]
+    best_score = float(sim_scores[best_idx])
+
+    logs["Gate 2 - KB Sentence Selection"] = {
+        "Model": RELEVANCE_MODEL,
+        "Best Sentence": best_sentence,
+        "Best Similarity Score": round(best_score, 4),
+        "Threshold": RELEVANCE_THRESHOLD_KB,
+        "All Scores": [
+            {"sentence": s, "score": round(float(sc), 4)}
+            for s, sc in zip(kb_sentences, sim_scores)
+        ]
+    }
+
+    if best_score < RELEVANCE_THRESHOLD_KB:
+        logs["Final Decision"] = "Answer not grounded in KB"
+        return (
+            "❌ INCORRECT (Not Found in Text)",
+            logs,
+            f"KB Similarity {best_score:.2f}"
+        )
+
+    # ------------------------------
+    # GATE 3 - NLI (Sentence ↔ Answer)
+    # ------------------------------
+    nli_logits = nli_model.predict([(best_sentence, user_answer)])
+    probs = softmax_logits(nli_logits)
+
+    labels = ["Contradiction", "Entailment", "Neutral"]
+    verdict_idx = int(torch.tensor(probs).argmax())
+    verdict = labels[verdict_idx]
+    confidence = probs[verdict_idx] * 100
+
+    logs["Gate 3 - NLI Verification"] = {
+        "Model": NLI_MODEL,
+        "Premise": best_sentence,
+        "Hypothesis": user_answer,
+        "Probabilities": {
+            "Contradiction": f"{probs[0]*100:.2f}%",
+            "Entailment": f"{probs[1]*100:.2f}%",
+            "Neutral": f"{probs[2]*100:.2f}%"
+        },
+        "Verdict": verdict,
+        "Confidence": f"{confidence:.2f}%",
+        "Entailment Threshold": f"{ENTAILMENT_THRESHOLD*100:.0f}%"
+    }
+
+    # ------------------------------
+    # FINAL DECISION
+    # ------------------------------
+    if verdict == "Entailment" and probs[1] >= ENTAILMENT_THRESHOLD:
+        logs["Final Decision"] = "Answer is Supported by Text"
+        return (
+            "✅ CORRECT (Confirmed)",
+            logs,
+            f"Entailment {confidence:.1f}%"
+        )
+
+    if verdict == "Contradiction":
+        logs["Final Decision"] = "Answer Contradicts Text"
+        return (
+            "❌ INCORRECT (Contradiction)",
+            logs,
+            f"Contradiction {confidence:.1f}%"
+        )
+
+    logs["Final Decision"] = "Answer Not Explicitly Stated"
+    return (
+        "❌ INCORRECT (Neutral / Not in Text)",
+        logs,
+        f"Neutral {confidence:.1f}%"
+    )
+
+# ==============================
+# GRADIO UI
+# ==============================
+
+with gr.Blocks(title="Neural Logic Engine v6", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## 🧠 Neural Logic Engine v6")
+    gr.Markdown(
+        "**Architecture:**\n"
+        "- Gate 1: Question ↔ Answer relevance (STS)\n"
+        "- Gate 2: KB sentence grounding (STS)\n"
+        "- Gate 3: Sentence-level NLI verification\n"
+        "- Fully logged, deterministic decisions"
+    )
+
     with gr.Row():
         with gr.Column(scale=1):
+            kb_input = gr.Textbox(
+                label="Knowledge Base",
+                lines=6,
+                value="When a lion was resting in the jungle, a mouse began racing up and down his body for fun. "
+                      "The lion's sleep was disturbed, and he woke in anger."
+            )
+            q_input = gr.Textbox(
+                label="Question",
+                value="What was the lion doing?"
+            )
+            a_input = gr.Textbox(
+                label="User Answer",
+                value="The lion was sleeping in the jungle."
+            )
             btn = gr.Button("Evaluate", variant="primary")
+
         with gr.Column(scale=1):
+            verdict_out = gr.Textbox(label="Final Verdict")
+            confidence_out = gr.Label(label="Model Confidence")
+            debug_log = gr.JSON(label="System Internals (FULL DEBUG LOG)")
 
     btn.click(
         fn=evaluate_response,
         inputs=[kb_input, q_input, a_input],
+        outputs=[verdict_out, debug_log, confidence_out]
     )
 
 if __name__ == "__main__":
+    demo.launch()
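
For reference, a minimal smoke test of the new three-gate pipeline outside the Gradio UI. This is a sketch, assuming the updated file is saved as app.py and importable as `app` (importing it loads both CrossEncoder models, so the first run downloads weights from the Hugging Face Hub); the expected verdicts in the comments are illustrative and depend on the actual model scores against the 0.15 / 0.30 / 0.65 thresholds.

from app import evaluate_response  # hypothetical import path; assumes the file above is app.py

kb = ("When a lion was resting in the jungle, a mouse began racing up and down "
      "his body for fun. The lion's sleep was disturbed, and he woke in anger.")
question = "What was the lion doing?"

# Grounded answer: should clear Gate 1 (QA relevance), Gate 2 (KB grounding)
# and Gate 3 (NLI), provided the entailment probability reaches 0.65.
status, logs, confidence = evaluate_response(kb, question, "The lion was sleeping in the jungle.")
print(status, "|", confidence)

# Off-topic answer: expected to be blocked at Gate 1 (STS score below 0.15).
status, logs, confidence = evaluate_response(kb, question, "Bananas are rich in potassium.")
print(status, "|", confidence)

# Contradicting answer: expected to reach Gate 3 and come back as Contradiction.
status, logs, confidence = evaluate_response(kb, question, "The lion stayed wide awake the whole time.")
print(status, "|", confidence)

The `logs` dict returned alongside the verdict carries the full per-gate debug trail (scores, selected KB sentence, NLI probabilities), which is the same object rendered in the UI's JSON panel.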