# import gradio as gr
# import torch
# import torch.nn.functional as F
# from sentence_transformers import SentenceTransformer, CrossEncoder, util

# # Use a ModernBERT-based NLI model for speed on free-tier CPU;
# # it is reported to be roughly 20% faster and 40% lighter than standard DeBERTa
# reasoning_model_name = 'dleemiller/finecat-nli-l' 
# similarity_model_name = 'all-MiniLM-L6-v2'

# print("Initializing 2025 Lightweight Suite...")
# sim_model = SentenceTransformer(similarity_model_name, device="cpu")
# nli_model = CrossEncoder(reasoning_model_name, device="cpu")

# def evaluate_response(kb, question, user_answer):
#     # 1. Topic Relevance (Bi-Encoder)
#     # We check if the answer even belongs in the same universe as the question
#     q_emb = sim_model.encode(question, convert_to_tensor=True)
#     a_emb = sim_model.encode(user_answer, convert_to_tensor=True)
#     rel_score = util.cos_sim(q_emb, a_emb).item()

#     # 2. Structured Reasoning (Cross-Encoder)
#     # We format the hypothesis to force the model to evaluate the ANSWER specifically
#     hypothesis = f"Based on the context, the answer to '{question}' is '{user_answer}'."
    
#     logits = nli_model.predict([(kb, hypothesis)])
#     probs = F.softmax(torch.tensor(logits), dim=1).tolist()[0]
    
#     # Label mapping for FineCat/DeBERTa: 0: contradiction, 1: entailment, 2: neutral
#     labels = ["CONTRADICTION", "ENTAILMENT", "NEUTRAL"]
#     max_idx = torch.tensor(logits).argmax().item()
#     verdict = labels[max_idx]
#     conf = probs[max_idx] * 100

#     # 3. Precision Logic Gate
#     if verdict == "CONTRADICTION" and conf > 40:
#         status = "❌ INCORRECT (Logic Conflict)"
#     elif verdict == "ENTAILMENT" and conf > 35:
#         status = "✅ CORRECT (Confirmed)"
#     elif rel_score > 0.40 and verdict != "CONTRADICTION":
#         status = "✅ CORRECT (Likely/Inferred)"
#     else:
#         status = "❌ WRONG / IRRELEVANT"

#     return status, f"{rel_score:.2f}", f"{verdict} ({conf:.1f}%)"

# # UI Setup remains the same
# demo = gr.Interface(
#     fn=evaluate_response,
#     inputs=["text", "text", "text"],
#     outputs=[gr.Textbox(label="Verdict"), gr.Label(label="Topic Similarity"), gr.Label(label="NLI Reasoning")],
#     title="Lightweight Reasoning Engine v3",
#     description="Using ModernBERT-distilled NLI for 2025-standard reasoning on CPU."
# )

# if __name__ == "__main__":
#     demo.launch()





# import gradio as gr
# import torch
# import torch.nn.functional as F
# from sentence_transformers import CrossEncoder

# # --- CONFIGURATION ---
# # GATE 1: Semantic Relevance (STS)
# # Checks if the Answer is conversationally related to the Question.
# relevance_model_name = 'cross-encoder/stsb-distilroberta-base'

# # GATE 2: Fact Checking (NLI)
# # Checks if the Answer is supported by the Knowledge Base.
# nli_model_name = 'cross-encoder/nli-deberta-v3-xsmall'

# print(f"Loading Models...\n1. {relevance_model_name}\n2. {nli_model_name}")
# rel_model = CrossEncoder(relevance_model_name, device="cpu")
# nli_model = CrossEncoder(nli_model_name, device="cpu")
# print("✅ System Ready.")

# def evaluate_response(kb, question, user_answer):
#     if not kb or not question or not user_answer:
#         return "⚠️ Error: Missing Input", {}, "N/A"

#     logs = {} 

#     # --- GATE 1: RELEVANCE CHECK (STS) ---
#     rel_score = rel_model.predict([(question, user_answer)])
    
#     # FIX 1: Use .item() to safely extract float from numpy array
#     rel_score_val = rel_score.item()
    
#     logs['Gate 1 Model'] = relevance_model_name
#     logs['Gate 1 Raw Score'] = f"{rel_score_val:.4f}"

#     # Threshold: STS score > 0.15 usually implies relevance
#     RELEVANCE_THRESHOLD = 0.15
    
#     if rel_score_val < RELEVANCE_THRESHOLD:
#         status = "❌ INCORRECT (Irrelevant)"
#         logs['Verdict'] = "Blocked at Gate 1 (Answer unrelated to Question)"
#         return status, logs, "Blocked"

#     # --- GATE 2: FACT CHECKING (NLI) ---
#     nli_logits = nli_model.predict([(kb, user_answer)])
    
#     # FIX 2: Handle Dimensions safely
#     # Convert to tensor
#     nli_tensor = torch.tensor(nli_logits)
    
#     # If the model returns a batch dimension (e.g. [1, 3]), squeeze it to flat [3]
#     if nli_tensor.dim() > 1:
#         nli_tensor = nli_tensor.squeeze()
        
#     # Apply Softmax across the classes (now dim=0 is safe on a flat tensor)
#     nli_probs = F.softmax(nli_tensor, dim=0).tolist()
    
#     # Get the winner index
#     max_idx = nli_tensor.argmax().item()
    
#     # Standard NLI Labels
#     labels = ["Contradiction", "Entailment", "Neutral"]
    
#     # Safety check for model label count mismatch
#     if max_idx >= len(labels):
#         return "⚠️ Model Error", {"Error": "Label mismatch"}, "N/A"

#     nli_verdict = labels[max_idx]
#     nli_conf = nli_probs[max_idx] * 100

#     logs['Gate 2 Model'] = nli_model_name
#     logs['Gate 2 Probabilities'] = {
#         "Contradiction": f"{nli_probs[0]*100:.1f}%",
#         "Entailment": f"{nli_probs[1]*100:.1f}%",
#         "Neutral": f"{nli_probs[2]*100:.1f}%"
#     }
#     logs['Gate 2 Verdict'] = nli_verdict

#     # --- FINAL DECISION LOGIC ---
#     if nli_verdict == "Entailment":
#         status = "✅ CORRECT (Confirmed)"
#         logs['Final Outcome'] = "Answer is Relevant and Factual."
        
#     elif nli_verdict == "Contradiction":
#         status = "❌ INCORRECT (False Information)"
#         logs['Final Outcome'] = "Answer contradicts the text."
        
#     else: # Neutral
#         status = "❌ INCORRECT (Hallucination/Not in Text)"
#         logs['Final Outcome'] = "Answer not found in text."

#     return status, logs, f"{nli_verdict} ({nli_conf:.1f}%)"

# # --- UI SETUP ---
# with gr.Blocks(title="NLI Logic Engine v5", theme=gr.themes.Soft()) as demo:
#     gr.Markdown("## 🧠 Neural Logic Engine v5.1 (Bug Fixes Applied)")
#     gr.Markdown("Corrected Architecture: STS for Relevance + NLI for Fact Checking.")
    
#     with gr.Row():
#         with gr.Column(scale=1):
#             kb_input = gr.Textbox(label="Knowledge Base", lines=5, value="When a lion was resting in the jungle, a mouse began racing up and down his body for fun. The lion's sleep was disturbed, and he woke in anger.")
#             q_input = gr.Textbox(label="Question", value="What was the lion doing?")
#             a_input = gr.Textbox(label="User Answer", value="The lion was sleeping in the jungle.")
#             btn = gr.Button("Evaluate", variant="primary")
        
#         with gr.Column(scale=1):
#             verdict_out = gr.Textbox(label="Final Verdict", elem_classes="verdict")
#             nli_metric = gr.Label(label="NLI Confidence")
#             debug_log = gr.JSON(label="System Internals (Debug Log)")

#     btn.click(
#         fn=evaluate_response,
#         inputs=[kb_input, q_input, a_input],
#         outputs=[verdict_out, debug_log, nli_metric]
#     )

# if __name__ == "__main__":
#     demo.launch()









import gradio as gr
import torch
import torch.nn.functional as F
from sentence_transformers import CrossEncoder
import re

# ==============================
# CONFIGURATION
# ==============================

RELEVANCE_MODEL = "cross-encoder/stsb-distilroberta-base"
NLI_MODEL = "cross-encoder/nli-deberta-v3-xsmall"

RELEVANCE_THRESHOLD_QA = 0.15   # Gate 1: min STS score between question and answer
RELEVANCE_THRESHOLD_KB = 0.30   # Gate 2: min STS score between answer and best KB sentence
ENTAILMENT_THRESHOLD = 0.65     # Gate 3: min entailment probability to accept an answer

DEVICE = "cpu"

# ==============================
# LOAD MODELS
# ==============================

print("Loading models...")
rel_model = CrossEncoder(RELEVANCE_MODEL, device=DEVICE)
nli_model = CrossEncoder(NLI_MODEL, device=DEVICE)
print("✅ Models loaded")
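# Note: CrossEncoder fetches weights from the Hugging Face Hub on first use,
# so the initial startup needs network access; later runs hit the local cache.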

# ==============================
# UTILITIES
# ==============================

def split_sentences(text):
    # Split on terminal punctuation (. ! ?) followed by whitespace.
    text = text.strip()
    if not text:
        return []
    return re.split(r'(?<=[.!?])\s+', text)

def softmax_logits(logits):
    # Normalize raw NLI logits (possibly batched as shape [1, 3]) into a
    # flat probability list over the three classes.
    t = torch.tensor(logits)
    if t.dim() > 1:
        t = t.squeeze(0)
    return F.softmax(t, dim=0).tolist()
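
# Usage sketch for the helpers above (illustrative values, not from a real run):
#   split_sentences("The lion slept. A mouse ran!")
#       -> ["The lion slept.", "A mouse ran!"]
#   softmax_logits([[1.0, 2.0, 0.5]])
#       -> three probabilities summing to 1.0, with index 1 the largest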

# ==============================
# CORE EVALUATION FUNCTION
# ==============================

def evaluate_response(kb, question, user_answer):
    logs = {}

    # ------------------------------
    # INPUT VALIDATION
    # ------------------------------
    if not kb or not question or not user_answer:
        return "⚠️ ERROR: Missing input", {}, "N/A"

    logs["Inputs"] = {
        "Question": question,
        "User Answer": user_answer,
        "KB Length (chars)": len(kb)
    }

    # ------------------------------
    # GATE 1 - QUESTION ↔ ANSWER RELEVANCE
    # ------------------------------
    qa_score = rel_model.predict([(question, user_answer)]).item()

    logs["Gate 1 - QA Relevance"] = {
        "Model": RELEVANCE_MODEL,
        "Score": round(qa_score, 4),
        "Threshold": RELEVANCE_THRESHOLD_QA
    }

    if qa_score < RELEVANCE_THRESHOLD_QA:
        logs["Final Decision"] = "Blocked at Gate 1 (Irrelevant Answer)"
        return (
            "❌ INCORRECT (Irrelevant)",
            logs,
            f"Relevance {qa_score:.2f}"
        )

    # ------------------------------
    # GATE 2 - KB SENTENCE SELECTION (STS)
    # ------------------------------
    kb_sentences = split_sentences(kb)
    logs["KB Processing"] = {
        "Total Sentences": len(kb_sentences),
        "Sentences": kb_sentences
    }

    if not kb_sentences:
        logs["Final Decision"] = "Empty KB after sentence split"
        return "❌ INCORRECT (Empty KB)", logs, "N/A"

    sentence_pairs = [(s, user_answer) for s in kb_sentences]
    sim_scores = rel_model.predict(sentence_pairs)

    best_idx = int(sim_scores.argmax())
    best_sentence = kb_sentences[best_idx]
    best_score = float(sim_scores[best_idx])

    logs["Gate 2 - KB Sentence Selection"] = {
        "Model": RELEVANCE_MODEL,
        "Best Sentence": best_sentence,
        "Best Similarity Score": round(best_score, 4),
        "Threshold": RELEVANCE_THRESHOLD_KB,
        "All Scores": [
            {"sentence": s, "score": round(float(sc), 4)}
            for s, sc in zip(kb_sentences, sim_scores)
        ]
    }

    if best_score < RELEVANCE_THRESHOLD_KB:
        logs["Final Decision"] = "Answer not grounded in KB"
        return (
            "❌ INCORRECT (Not Found in Text)",
            logs,
            f"KB Similarity {best_score:.2f}"
        )

    # ------------------------------
    # GATE 3 - NLI (Sentence ↔ Answer)
    # ------------------------------
    nli_logits = nli_model.predict([(best_sentence, user_answer)])
    probs = softmax_logits(nli_logits)

    labels = ["Contradiction", "Entailment", "Neutral"]
    verdict_idx = int(torch.tensor(probs).argmax())
    verdict = labels[verdict_idx]
    confidence = probs[verdict_idx] * 100

    logs["Gate 3 - NLI Verification"] = {
        "Model": NLI_MODEL,
        "Premise": best_sentence,
        "Hypothesis": user_answer,
        "Probabilities": {
            "Contradiction": f"{probs[0]*100:.2f}%",
            "Entailment": f"{probs[1]*100:.2f}%",
            "Neutral": f"{probs[2]*100:.2f}%"
        },
        "Verdict": verdict,
        "Confidence": f"{confidence:.2f}%",
        "Entailment Threshold": f"{ENTAILMENT_THRESHOLD*100:.0f}%"
    }

    # ------------------------------
    # FINAL DECISION
    # ------------------------------
    if verdict == "Entailment" and probs[1] >= ENTAILMENT_THRESHOLD:
        logs["Final Decision"] = "Answer is Supported by Text"
        return (
            "✅ CORRECT (Confirmed)",
            logs,
            f"Entailment {confidence:.1f}%"
        )

    if verdict == "Contradiction":
        logs["Final Decision"] = "Answer Contradicts Text"
        return (
            "❌ INCORRECT (Contradiction)",
            logs,
            f"Contradiction {confidence:.1f}%"
        )

    logs["Final Decision"] = "Answer Not Explicitly Stated"
    return (
        "❌ INCORRECT (Neutral / Not in Text)",
        logs,
        f"Neutral {confidence:.1f}%"
    )
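
# Direct-call sketch (bypasses the Gradio UI; the verdict shown is
# illustrative and depends on live model scores):
#
#   status, logs, metric = evaluate_response(
#       "The lion was resting in the jungle.",
#       "What was the lion doing?",
#       "The lion was resting.",
#   )
#   # status -> "✅ CORRECT (Confirmed)" only if Gate 3 entailment >= 0.65
#   # metric -> a string like "Entailment XX.X%" (value varies per run)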

# ==============================
# GRADIO UI
# ==============================

with gr.Blocks(title="Neural Logic Engine v6", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🧠 Neural Logic Engine v6")
    gr.Markdown(
        "**Architecture:**\n"
        "- Gate 1: Question ↔ Answer relevance (STS)\n"
        "- Gate 2: KB sentence grounding (STS)\n"
        "- Gate 3: Sentence-level NLI verification\n"
        "- Fully logged, deterministic decisions"
    )

    with gr.Row():
        with gr.Column(scale=1):
            kb_input = gr.Textbox(
                label="Knowledge Base",
                lines=6,
                value="When a lion was resting in the jungle, a mouse began racing up and down his body for fun. "
                      "The lion's sleep was disturbed, and he woke in anger."
            )
            q_input = gr.Textbox(
                label="Question",
                value="What was the lion doing?"
            )
            a_input = gr.Textbox(
                label="User Answer",
                value="The lion was sleeping in the jungle."
            )
            btn = gr.Button("Evaluate", variant="primary")

        with gr.Column(scale=1):
            verdict_out = gr.Textbox(label="Final Verdict")
            confidence_out = gr.Label(label="Model Confidence")
            debug_log = gr.JSON(label="System Internals (FULL DEBUG LOG)")

    btn.click(
        fn=evaluate_response,
        inputs=[kb_input, q_input, a_input],
        outputs=[verdict_out, debug_log, confidence_out]
    )

if __name__ == "__main__":
    demo.launch()