| import gradio as gr | |
| import torch | |
| import torch.nn.functional as F | |
| from sentence_transformers import CrossEncoder | |
| import re | |
| # ============================== | |
| # CONFIGURATION | |
| # ============================== | |
| RELEVANCE_MODEL = "cross-encoder/stsb-distilroberta-base" | |
| NLI_MODEL = "cross-encoder/nli-deberta-v3-xsmall" | |
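# NOTE: the thresholds below are empirical tuning knobs, not calibrated
# probabilities. The STS cross-encoder emits scores roughly in [0, 1]; raise
# the RELEVANCE_THRESHOLD_* values to be stricter about off-topic answers,
# and raise ENTAILMENT_THRESHOLD to demand higher NLI confidence.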
| RELEVANCE_THRESHOLD_QA = 0.15 | |
| RELEVANCE_THRESHOLD_KB = 0.30 | |
| ENTAILMENT_THRESHOLD = 0.65 | |
| DEVICE = "cpu" | |
| # ============================== | |
| # LOAD MODELS | |
| # ============================== | |
| print("Loading models...") | |
| rel_model = CrossEncoder(RELEVANCE_MODEL, device=DEVICE) | |
| nli_model = CrossEncoder(NLI_MODEL, device=DEVICE) | |
| print("β Models loaded") | |
| # ============================== | |
| # UTILITIES | |
| # ============================== | |
| def split_sentences(text): | |
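    """Naively split text into sentences at ., ! or ? followed by whitespace.

    Good enough for short knowledge-base paragraphs; abbreviations such as
    "Dr. Smith" will be over-split, which merely adds an extra candidate
    sentence for the grounding gate.
    """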
| text = text.strip() | |
| if not text: | |
| return [] | |
| return re.split(r'(?<=[.!?])\s+', text) | |
| def softmax_logits(logits): | |
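    """Convert raw NLI logits (possibly batched as [1, num_labels]) into a
    flat list of probabilities, squeezing away the batch dimension first."""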
| t = torch.tensor(logits) | |
| if t.dim() > 1: | |
| t = t.squeeze(0) | |
| probs = F.softmax(t, dim=0).tolist() | |
| return probs | |
| # ============================== | |
| # CORE EVALUATION FUNCTION | |
| # ============================== | |
| def evaluate_response(kb, question, user_answer): | |
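    """Run the three-gate pipeline and return (verdict, debug log, confidence).

    Gate 1: is the answer on-topic for the question? (STS cross-encoder)
    Gate 2: which KB sentence best grounds the answer? (STS cross-encoder)
    Gate 3: does that sentence entail the answer? (NLI cross-encoder)
    """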
| logs = {} | |
| # ------------------------------ | |
| # INPUT VALIDATION | |
| # ------------------------------ | |
| if not kb or not question or not user_answer: | |
| return "β οΈ ERROR: Missing input", {}, "N/A" | |
| logs["Inputs"] = { | |
| "Question": question, | |
| "User Answer": user_answer, | |
| "KB Length (chars)": len(kb) | |
| } | |
| # ------------------------------ | |
    # GATE 1: QUESTION → ANSWER RELEVANCE
| # ------------------------------ | |
| qa_score = rel_model.predict([(question, user_answer)]).item() | |
| logs["Gate 1 β QA Relevance"] = { | |
| "Model": RELEVANCE_MODEL, | |
| "Score": round(qa_score, 4), | |
| "Threshold": RELEVANCE_THRESHOLD_QA | |
| } | |
| if qa_score < RELEVANCE_THRESHOLD_QA: | |
| logs["Final Decision"] = "Blocked at Gate 1 (Irrelevant Answer)" | |
| return ( | |
| "β INCORRECT (Irrelevant)", | |
| logs, | |
| f"Relevance {qa_score:.2f}" | |
| ) | |
| # ------------------------------ | |
    # GATE 2: KB SENTENCE SELECTION (STS)
| # ------------------------------ | |
| kb_sentences = split_sentences(kb) | |
| logs["KB Processing"] = { | |
| "Total Sentences": len(kb_sentences), | |
| "Sentences": kb_sentences | |
| } | |
| if not kb_sentences: | |
| logs["Final Decision"] = "Empty KB after sentence split" | |
| return "β INCORRECT (Empty KB)", logs, "N/A" | |
| sentence_pairs = [(s, user_answer) for s in kb_sentences] | |
| sim_scores = rel_model.predict(sentence_pairs) | |
| best_idx = int(sim_scores.argmax()) | |
| best_sentence = kb_sentences[best_idx] | |
| best_score = float(sim_scores[best_idx]) | |
| logs["Gate 2 β KB Sentence Selection"] = { | |
| "Model": RELEVANCE_MODEL, | |
| "Best Sentence": best_sentence, | |
| "Best Similarity Score": round(best_score, 4), | |
| "Threshold": RELEVANCE_THRESHOLD_KB, | |
| "All Scores": [ | |
| {"sentence": s, "score": round(float(sc), 4)} | |
| for s, sc in zip(kb_sentences, sim_scores) | |
| ] | |
| } | |
| if best_score < RELEVANCE_THRESHOLD_KB: | |
| logs["Final Decision"] = "Answer not grounded in KB" | |
| return ( | |
| "β INCORRECT (Not Found in Text)", | |
| logs, | |
| f"KB Similarity {best_score:.2f}" | |
| ) | |
| # ------------------------------ | |
    # GATE 3: NLI (Sentence → Answer)
| # ------------------------------ | |
| nli_logits = nli_model.predict([(best_sentence, user_answer)]) | |
| probs = softmax_logits(nli_logits) | |
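    # Label order per the model card for the cross-encoder NLI models:
    # index 0 = contradiction, 1 = entailment, 2 = neutral.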
| labels = ["Contradiction", "Entailment", "Neutral"] | |
    verdict_idx = probs.index(max(probs))
| verdict = labels[verdict_idx] | |
| confidence = probs[verdict_idx] * 100 | |
| logs["Gate 3 β NLI Verification"] = { | |
| "Model": NLI_MODEL, | |
| "Premise": best_sentence, | |
| "Hypothesis": user_answer, | |
| "Probabilities": { | |
| "Contradiction": f"{probs[0]*100:.2f}%", | |
| "Entailment": f"{probs[1]*100:.2f}%", | |
| "Neutral": f"{probs[2]*100:.2f}%" | |
| }, | |
| "Verdict": verdict, | |
| "Confidence": f"{confidence:.2f}%", | |
| "Entailment Threshold": f"{ENTAILMENT_THRESHOLD*100:.0f}%" | |
| } | |
| # ------------------------------ | |
| # FINAL DECISION | |
| # ------------------------------ | |
| if verdict == "Entailment" and probs[1] >= ENTAILMENT_THRESHOLD: | |
| logs["Final Decision"] = "Answer is Supported by Text" | |
| return ( | |
| "β CORRECT (Confirmed)", | |
| logs, | |
| f"Entailment {confidence:.1f}%" | |
| ) | |
| if verdict == "Contradiction": | |
| logs["Final Decision"] = "Answer Contradicts Text" | |
| return ( | |
| "β INCORRECT (Contradiction)", | |
| logs, | |
| f"Contradiction {confidence:.1f}%" | |
| ) | |
| logs["Final Decision"] = "Answer Not Explicitly Stated" | |
| return ( | |
| "β INCORRECT (Neutral / Not in Text)", | |
| logs, | |
| f"Neutral {confidence:.1f}%" | |
| ) | |
| # ============================== | |
| # GRADIO UI | |
| # ============================== | |
| with gr.Blocks(title="Neural Logic Engine v6", theme=gr.themes.Soft()) as demo: | |
| gr.Markdown("## π§ Neural Logic Engine v6") | |
| gr.Markdown( | |
| "**Architecture:**\n" | |
| "- Gate 1: Question β Answer relevance (STS)\n" | |
| "- Gate 2: KB sentence grounding (STS)\n" | |
| "- Gate 3: Sentence-level NLI verification\n" | |
| "- Fully logged, deterministic decisions" | |
| ) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| kb_input = gr.Textbox( | |
| label="Knowledge Base", | |
| lines=6, | |
| value="When a lion was resting in the jungle, a mouse began racing up and down his body for fun. " | |
| "The lion's sleep was disturbed, and he woke in anger." | |
| ) | |
| q_input = gr.Textbox( | |
| label="Question", | |
| value="What was the lion doing?" | |
| ) | |
| a_input = gr.Textbox( | |
| label="User Answer", | |
| value="The lion was sleeping in the jungle." | |
| ) | |
| btn = gr.Button("Evaluate", variant="primary") | |
| with gr.Column(scale=1): | |
| verdict_out = gr.Textbox(label="Final Verdict") | |
| confidence_out = gr.Label(label="Model Confidence") | |
| debug_log = gr.JSON(label="System Internals (FULL DEBUG LOG)") | |
| btn.click( | |
| fn=evaluate_response, | |
| inputs=[kb_input, q_input, a_input], | |
| outputs=[verdict_out, debug_log, confidence_out] | |
| ) | |
| if __name__ == "__main__": | |
| demo.launch() | |