Update app.py
app.py CHANGED
@@ -1,57 +1,174 @@
-import gradio as gr
-import torch
-import torch.nn.functional as F
-from sentence_transformers import SentenceTransformer, CrossEncoder, util
-
-# Use ModernBERT-based NLI for maximum speed on Free Tier CPU
-# This model is 20% faster and 40% lighter than standard DeBERTa
-reasoning_model_name = 'dleemiller/finecat-nli-l'
-similarity_model_name = 'all-MiniLM-L6-v2'
-
-print("Initializing 2025 Lightweight Suite...")
-sim_model = SentenceTransformer(similarity_model_name, device="cpu")
-nli_model = CrossEncoder(reasoning_model_name, device="cpu")
-
-def evaluate_response(kb, question, user_answer):
-    # 1. Topic Relevance (Bi-Encoder)
-    # We check if the answer even belongs in the same universe as the question
-    q_emb = sim_model.encode(question, convert_to_tensor=True)
-    a_emb = sim_model.encode(user_answer, convert_to_tensor=True)
-    rel_score = util.cos_sim(q_emb, a_emb).item()
-
-    # 2. Structured Reasoning (Cross-Encoder)
-    # We format the hypothesis to force the model to evaluate the ANSWER specifically
-    hypothesis = f"Based on the context, the answer to '{question}' is '{user_answer}'."
-
-    logits = nli_model.predict([(kb, hypothesis)])
-    probs = F.softmax(torch.tensor(logits), dim=1).tolist()[0]
-
-    # Label mapping for FineCat/DeBERTa: 0: contradiction, 1: entailment, 2: neutral
-    labels = ["CONTRADICTION", "ENTAILMENT", "NEUTRAL"]
-    max_idx = torch.tensor(logits).argmax().item()
-    verdict = labels[max_idx]
-    conf = probs[max_idx] * 100
-
-    # 3. Precision Logic Gate
-    if verdict == "CONTRADICTION" and conf > 40:
-        status = "❌ INCORRECT (Logic Conflict)"
-    elif verdict == "ENTAILMENT" and conf > 35:
-        status = "✅ CORRECT (Confirmed)"
-    elif rel_score > 0.40 and verdict != "CONTRADICTION":
-        status = "✅ CORRECT (Likely/Inferred)"
-    else:
-        status = "❌ WRONG / IRRELEVANT"
-
-    return status, f"{rel_score:.2f}", f"{verdict} ({conf:.1f}%)"
-
-# UI Setup remains the same
-demo = gr.Interface(
-    fn=evaluate_response,
-    inputs=["text", "text", "text"],
-    outputs=[gr.Textbox(label="Verdict"), gr.Label(label="Topic Similarity"), gr.Label(label="NLI Reasoning")],
-    title="Lightweight Reasoning Engine v3",
-    description="Using ModernBERT-distilled NLI for 2025-standard reasoning on CPU."
-)
-
-if __name__ == "__main__":
-    demo.launch()
+# import gradio as gr
+# import torch
+# import torch.nn.functional as F
+# from sentence_transformers import SentenceTransformer, CrossEncoder, util
+
+# # Use ModernBERT-based NLI for maximum speed on Free Tier CPU
+# # This model is 20% faster and 40% lighter than standard DeBERTa
+# reasoning_model_name = 'dleemiller/finecat-nli-l'
+# similarity_model_name = 'all-MiniLM-L6-v2'
+
+# print("Initializing 2025 Lightweight Suite...")
+# sim_model = SentenceTransformer(similarity_model_name, device="cpu")
+# nli_model = CrossEncoder(reasoning_model_name, device="cpu")
+
+# def evaluate_response(kb, question, user_answer):
+#     # 1. Topic Relevance (Bi-Encoder)
+#     # We check if the answer even belongs in the same universe as the question
+#     q_emb = sim_model.encode(question, convert_to_tensor=True)
+#     a_emb = sim_model.encode(user_answer, convert_to_tensor=True)
+#     rel_score = util.cos_sim(q_emb, a_emb).item()
+
+#     # 2. Structured Reasoning (Cross-Encoder)
+#     # We format the hypothesis to force the model to evaluate the ANSWER specifically
+#     hypothesis = f"Based on the context, the answer to '{question}' is '{user_answer}'."
+
+#     logits = nli_model.predict([(kb, hypothesis)])
+#     probs = F.softmax(torch.tensor(logits), dim=1).tolist()[0]
+
+#     # Label mapping for FineCat/DeBERTa: 0: contradiction, 1: entailment, 2: neutral
+#     labels = ["CONTRADICTION", "ENTAILMENT", "NEUTRAL"]
+#     max_idx = torch.tensor(logits).argmax().item()
+#     verdict = labels[max_idx]
+#     conf = probs[max_idx] * 100
+
+#     # 3. Precision Logic Gate
+#     if verdict == "CONTRADICTION" and conf > 40:
+#         status = "❌ INCORRECT (Logic Conflict)"
+#     elif verdict == "ENTAILMENT" and conf > 35:
+#         status = "✅ CORRECT (Confirmed)"
+#     elif rel_score > 0.40 and verdict != "CONTRADICTION":
+#         status = "✅ CORRECT (Likely/Inferred)"
+#     else:
+#         status = "❌ WRONG / IRRELEVANT"
+
+#     return status, f"{rel_score:.2f}", f"{verdict} ({conf:.1f}%)"
+
+# # UI Setup remains the same
+# demo = gr.Interface(
+#     fn=evaluate_response,
+#     inputs=["text", "text", "text"],
+#     outputs=[gr.Textbox(label="Verdict"), gr.Label(label="Topic Similarity"), gr.Label(label="NLI Reasoning")],
+#     title="Lightweight Reasoning Engine v3",
+#     description="Using ModernBERT-distilled NLI for 2025-standard reasoning on CPU."
+# )
+
+# if __name__ == "__main__":
+#     demo.launch()
+
+
+
+
+
+
+
+
+
+
+import gradio as gr
+import torch
+import torch.nn.functional as F
+from sentence_transformers import CrossEncoder
+
+# --- CONFIGURATION ---
+# Model 1: QA Relevance Validator
+# This model is trained on MS MARCO. It predicts how well a passage answers a query.
+# High score = The answer addresses the question directly.
+# Low score = Irrelevant (e.g., Q: "What did the lion do?", A: "The mouse's name is Lucy")
+qa_model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
+
+# Model 2: Fact Checker (NLI)
+# We use a DeBERTa-v3-xsmall or similar high-performance NLI model.
+# It is very robust at detecting Hallucinations vs Entailment.
+nli_model_name = 'cross-encoder/nli-deberta-v3-xsmall'
+
+print("Initializing Reasoning Engines...")
+qa_model = CrossEncoder(qa_model_name, device="cpu")
+nli_model = CrossEncoder(nli_model_name, device="cpu")
+print("System Ready.")
+
+def evaluate_response(kb, question, user_answer):
+    if not kb or not question or not user_answer:
+        return "⚠️ Missing Input", "N/A", "N/A"
+
+    # --- GATE 1: Question-Answer Relevance Check ---
+    # We ask the model: "Is 'user_answer' a relevant response to 'question'?"
+    # MS-MARCO models output unbounded logits. Usually > 0 means relevant.
+    qa_scores = qa_model.predict([(question, user_answer)])
+    qa_score = qa_scores.item()
+
+    # Sigmoid to make it easier to read (0-100%)
+    qa_confidence = (1 / (1 + torch.exp(torch.tensor(-qa_score)))).item() * 100
+
+    # Strict Relevance Threshold (Adjustable)
+    # If the QA score is too low, we reject it immediately as irrelevant.
+    is_relevant = qa_score > 1.0  # Logit threshold (approx 73% confidence)
+
+    if not is_relevant:
+        return (
+            "❌ INCORRECT (Irrelevant Answer)",
+            f"Low Relevance ({qa_confidence:.1f}%)",
+            "Skipped (Not an answer)"
+        )
+
+    # --- GATE 2: Knowledge Base Verification (NLI) ---
+    # Now that we know it IS an answer, we check if it is TRUE based on the KB.
+    # Premise = KB
+    # Hypothesis = user_answer (Clean check, no complex prompt engineering needed)
+    nli_logits = nli_model.predict([(kb, user_answer)])
+    nli_probs = F.softmax(torch.tensor(nli_logits), dim=1)[0].tolist()  # predict() returns shape (1, 3)
+
+    # Label mapping for this specific model: 0: Contradiction, 1: Entailment, 2: Neutral
+    # Note: Different models map differently. For 'cross-encoder/nli-deberta-v3-xsmall':
+    # Label 0 = Contradiction, Label 1 = Entailment, Label 2 = Neutral
+    labels = ["CONTRADICTION", "ENTAILMENT", "NEUTRAL"]
+    max_idx = torch.tensor(nli_logits).argmax().item()
+    verdict_label = labels[max_idx]
+    verdict_conf = nli_probs[max_idx] * 100
+
+    # --- FINAL VERDICT LOGIC ---
+    status = ""
+
+    if verdict_label == "ENTAILMENT":
+        status = "✅ CORRECT (Confirmed)"
+    elif verdict_label == "CONTRADICTION":
+        status = "❌ INCORRECT (Factually False)"
+    else:  # NEUTRAL
+        # It answers the question, but the fact isn't in the text (Hallucination)
+        status = "❌ INCORRECT (Not in text)"
+
+    return (
+        status,
+        f"High Relevance ({qa_confidence:.1f}%)",
+        f"{verdict_label} ({verdict_conf:.1f}%)"
+    )
+
+# --- UI SETUP ---
+with gr.Blocks(title="Lightweight Reasoning Engine v4", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## 🧠 Neural Answer Checker v4 (Double-Gate Logic)")
+    gr.Markdown("This system uses two distinct brains: one checks if you answered the *Question*, the other checks if your answer matches the *Text*.")
+
+    with gr.Row():
+        kb_input = gr.Textbox(label="Knowledge Base (Context)", lines=6, placeholder="Paste story here...", value="When a lion was resting in the jungle, a mouse began racing up and down his body for fun. The lion's sleep was disturbed, and he woke in anger.")
+
+    with gr.Row():
+        q_input = gr.Textbox(label="Question", placeholder="e.g., What was the lion doing?")
+        a_input = gr.Textbox(label="User Answer", placeholder="e.g., He was sleeping.")
+
+    check_btn = gr.Button("Evaluate Answer", variant="primary")
+
+    with gr.Row():
+        verdict_output = gr.Textbox(label="Final Verdict", elem_classes="verdict")
+
+    with gr.Row():
+        qa_metric = gr.Label(label="Gate 1: QA Relevance")
+        nli_metric = gr.Label(label="Gate 2: Fact Check")
+
+    check_btn.click(
+        fn=evaluate_response,
+        inputs=[kb_input, q_input, a_input],
+        outputs=[verdict_output, qa_metric, nli_metric]
+    )
+
+if __name__ == "__main__":
+    demo.launch()