"""Neural Logic Engine v6 — three-gate QA answer verification on CPU.

Pipeline:
  Gate 1: Question <-> Answer relevance (STS cross-encoder).
  Gate 2: Ground the answer in the best-matching KB sentence (STS).
  Gate 3: Sentence-level NLI verification (entailment / contradiction /
          neutral) of the answer against that best sentence.

NOTE(review): two earlier commented-out revisions (v3 bi-encoder+NLI and
v5 two-gate) were removed from this file; they remain in version control
history.
"""

import re

import torch
import torch.nn.functional as F

import gradio as gr
from sentence_transformers import CrossEncoder

# ==============================
# CONFIGURATION
# ==============================
RELEVANCE_MODEL = "cross-encoder/stsb-distilroberta-base"
NLI_MODEL = "cross-encoder/nli-deberta-v3-xsmall"

RELEVANCE_THRESHOLD_QA = 0.15  # Gate 1: min Question<->Answer STS score
RELEVANCE_THRESHOLD_KB = 0.30  # Gate 2: min KB-sentence<->Answer STS score
ENTAILMENT_THRESHOLD = 0.65    # Gate 3: min entailment probability to confirm
DEVICE = "cpu"

# Class order emitted by cross-encoder/nli-deberta-v3-xsmall
# (contradiction, entailment, neutral) — hoisted out of the request path.
NLI_LABELS = ["Contradiction", "Entailment", "Neutral"]

# ==============================
# LOAD MODELS
# ==============================
print("Loading models...")
rel_model = CrossEncoder(RELEVANCE_MODEL, device=DEVICE)
nli_model = CrossEncoder(NLI_MODEL, device=DEVICE)
print("✅ Models loaded")


# ==============================
# UTILITIES
# ==============================
def split_sentences(text):
    """Split *text* into sentences at ., ! or ? followed by whitespace.

    Returns an empty list for empty / whitespace-only input.
    """
    text = text.strip()
    if not text:
        return []
    return re.split(r'(?<=[.!?])\s+', text)


def softmax_logits(logits):
    """Return softmax probabilities (flat Python list) for raw NLI logits.

    Accepts either a flat logit vector or a batched ``[1, C]`` array —
    the leading batch dimension is squeezed away before the softmax.
    """
    t = torch.tensor(logits)
    if t.dim() > 1:
        t = t.squeeze(0)
    probs = F.softmax(t, dim=0).tolist()
    return probs


# ==============================
# CORE EVALUATION FUNCTION
# ==============================
def evaluate_response(kb, question, user_answer):
    """Evaluate *user_answer* for *question* against knowledge base *kb*.

    Returns a 3-tuple matching the Gradio outputs:
      (final verdict string, debug-log dict, confidence/metric string).
    """
    logs = {}

    # ------------------------------
    # INPUT VALIDATION
    # ------------------------------
    if not kb or not question or not user_answer:
        return "⚠️ ERROR: Missing input", {}, "N/A"

    logs["Inputs"] = {
        "Question": question,
        "User Answer": user_answer,
        "KB Length (chars)": len(kb)
    }

    # ------------------------------
    # GATE 1 — QUESTION ↔ ANSWER RELEVANCE
    # ------------------------------
    # .item() safely extracts the float from the single-element array
    # returned by CrossEncoder.predict.
    qa_score = rel_model.predict([(question, user_answer)]).item()
    logs["Gate 1 — QA Relevance"] = {
        "Model": RELEVANCE_MODEL,
        "Score": round(qa_score, 4),
        "Threshold": RELEVANCE_THRESHOLD_QA
    }
    if qa_score < RELEVANCE_THRESHOLD_QA:
        logs["Final Decision"] = "Blocked at Gate 1 (Irrelevant Answer)"
        return (
            "❌ INCORRECT (Irrelevant)",
            logs,
            f"Relevance {qa_score:.2f}"
        )

    # ------------------------------
    # GATE 2 — KB SENTENCE SELECTION (STS)
    # ------------------------------
    kb_sentences = split_sentences(kb)
    logs["KB Processing"] = {
        "Total Sentences": len(kb_sentences),
        "Sentences": kb_sentences
    }
    if not kb_sentences:
        logs["Final Decision"] = "Empty KB after sentence split"
        return "❌ INCORRECT (Empty KB)", logs, "N/A"

    # Score every KB sentence against the answer; keep the best match.
    sentence_pairs = [(s, user_answer) for s in kb_sentences]
    sim_scores = rel_model.predict(sentence_pairs)
    best_idx = int(sim_scores.argmax())
    best_sentence = kb_sentences[best_idx]
    best_score = float(sim_scores[best_idx])

    logs["Gate 2 — KB Sentence Selection"] = {
        "Model": RELEVANCE_MODEL,
        "Best Sentence": best_sentence,
        "Best Similarity Score": round(best_score, 4),
        "Threshold": RELEVANCE_THRESHOLD_KB,
        "All Scores": [
            {"sentence": s, "score": round(float(sc), 4)}
            for s, sc in zip(kb_sentences, sim_scores)
        ]
    }
    if best_score < RELEVANCE_THRESHOLD_KB:
        logs["Final Decision"] = "Answer not grounded in KB"
        return (
            "❌ INCORRECT (Not Found in Text)",
            logs,
            f"KB Similarity {best_score:.2f}"
        )

    # ------------------------------
    # GATE 3 — NLI (Sentence ↔ Answer)
    # ------------------------------
    nli_logits = nli_model.predict([(best_sentence, user_answer)])
    probs = softmax_logits(nli_logits)

    # Safety check (restored from v5): a differently-headed model would
    # emit a different class count and break the indexing below.
    if len(probs) != len(NLI_LABELS):
        return "⚠️ Model Error", {"Error": "Label mismatch"}, "N/A"

    # First index of the max probability — a pure-Python argmax; no need
    # to round-trip the list through a torch tensor.
    verdict_idx = max(range(len(probs)), key=probs.__getitem__)
    verdict = NLI_LABELS[verdict_idx]
    confidence = probs[verdict_idx] * 100

    logs["Gate 3 — NLI Verification"] = {
        "Model": NLI_MODEL,
        "Premise": best_sentence,
        "Hypothesis": user_answer,
        "Probabilities": {
            "Contradiction": f"{probs[0]*100:.2f}%",
            "Entailment": f"{probs[1]*100:.2f}%",
            "Neutral": f"{probs[2]*100:.2f}%"
        },
        "Verdict": verdict,
        "Confidence": f"{confidence:.2f}%",
        "Entailment Threshold": f"{ENTAILMENT_THRESHOLD*100:.0f}%"
    }

    # ------------------------------
    # FINAL DECISION
    # ------------------------------
    if verdict == "Entailment" and probs[1] >= ENTAILMENT_THRESHOLD:
        logs["Final Decision"] = "Answer is Supported by Text"
        return (
            "✅ CORRECT (Confirmed)",
            logs,
            f"Entailment {confidence:.1f}%"
        )

    if verdict == "Contradiction":
        logs["Final Decision"] = "Answer Contradicts Text"
        return (
            "❌ INCORRECT (Contradiction)",
            logs,
            f"Contradiction {confidence:.1f}%"
        )

    logs["Final Decision"] = "Answer Not Explicitly Stated"
    return (
        "❌ INCORRECT (Neutral / Not in Text)",
        logs,
        f"Neutral {confidence:.1f}%"
    )


# ==============================
# GRADIO UI
# ==============================
with gr.Blocks(title="Neural Logic Engine v6", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🧠 Neural Logic Engine v6")
    gr.Markdown(
        "**Architecture:**\n"
        "- Gate 1: Question ↔ Answer relevance (STS)\n"
        "- Gate 2: KB sentence grounding (STS)\n"
        "- Gate 3: Sentence-level NLI verification\n"
        "- Fully logged, deterministic decisions"
    )

    with gr.Row():
        with gr.Column(scale=1):
            kb_input = gr.Textbox(
                label="Knowledge Base",
                lines=6,
                value="When a lion was resting in the jungle, a mouse began racing up and down his body for fun. "
                      "The lion's sleep was disturbed, and he woke in anger."
            )
            q_input = gr.Textbox(
                label="Question",
                value="What was the lion doing?"
            )
            a_input = gr.Textbox(
                label="User Answer",
                value="The lion was sleeping in the jungle."
            )
            btn = gr.Button("Evaluate", variant="primary")

        with gr.Column(scale=1):
            verdict_out = gr.Textbox(label="Final Verdict")
            confidence_out = gr.Label(label="Model Confidence")
            debug_log = gr.JSON(label="System Internals (FULL DEBUG LOG)")

    btn.click(
        fn=evaluate_response,
        inputs=[kb_input, q_input, a_input],
        outputs=[verdict_out, debug_log, confidence_out]
    )

if __name__ == "__main__":
    demo.launch()