heerjtdev committed on
Commit 4898472 · verified · 1 Parent(s): 2b3f70d

Update app.py

Files changed (1): app.py (+159, -42)

app.py CHANGED
@@ -1,57 +1,174 @@
 import gradio as gr
 import torch
 import torch.nn.functional as F
-from sentence_transformers import SentenceTransformer, CrossEncoder, util

-# Use ModernBERT-based NLI for maximum speed on Free Tier CPU
-# This model is 20% faster and 40% lighter than standard DeBERTa
-reasoning_model_name = 'dleemiller/finecat-nli-l'
-similarity_model_name = 'all-MiniLM-L6-v2'

-print("Initializing 2025 Lightweight Suite...")
-sim_model = SentenceTransformer(similarity_model_name, device="cpu")
-nli_model = CrossEncoder(reasoning_model_name, device="cpu")

 def evaluate_response(kb, question, user_answer):
-    # 1. Topic Relevance (Bi-Encoder)
-    # We check if the answer even belongs in the same universe as the question
-    q_emb = sim_model.encode(question, convert_to_tensor=True)
-    a_emb = sim_model.encode(user_answer, convert_to_tensor=True)
-    rel_score = util.cos_sim(q_emb, a_emb).item()

-    # 2. Structured Reasoning (Cross-Encoder)
-    # We format the hypothesis to force the model to evaluate the ANSWER specifically
-    hypothesis = f"Based on the context, the answer to '{question}' is '{user_answer}'."

-    logits = nli_model.predict([(kb, hypothesis)])
-    probs = F.softmax(torch.tensor(logits), dim=1).tolist()[0]

-    # Label mapping for FineCat/DeBERTa: 0: contradiction, 1: entailment, 2: neutral
     labels = ["CONTRADICTION", "ENTAILMENT", "NEUTRAL"]
-    max_idx = torch.tensor(logits).argmax().item()
-    verdict = labels[max_idx]
-    conf = probs[max_idx] * 100
-
-    # 3. Precision Logic Gate
-    if verdict == "CONTRADICTION" and conf > 40:
-        status = "❌ INCORRECT (Logic Conflict)"
-    elif verdict == "ENTAILMENT" and conf > 35:
         status = "βœ… CORRECT (Confirmed)"
-    elif rel_score > 0.40 and verdict != "CONTRADICTION":
-        status = "βœ… CORRECT (Likely/Inferred)"
-    else:
-        status = "❌ WRONG / IRRELEVANT"
-
-    return status, f"{rel_score:.2f}", f"{verdict} ({conf:.1f}%)"
-
-# UI Setup remains the same
-demo = gr.Interface(
-    fn=evaluate_response,
-    inputs=["text", "text", "text"],
-    outputs=[gr.Textbox(label="Verdict"), gr.Label(label="Topic Similarity"), gr.Label(label="NLI Reasoning")],
-    title="Lightweight Reasoning Engine v3",
-    description="Using ModernBERT-distilled NLI for 2025-standard reasoning on CPU."
-)

 if __name__ == "__main__":
     demo.launch()
 
+# import gradio as gr
+# import torch
+# import torch.nn.functional as F
+# from sentence_transformers import SentenceTransformer, CrossEncoder, util
+
+# # Use ModernBERT-based NLI for maximum speed on Free Tier CPU
+# # This model is 20% faster and 40% lighter than standard DeBERTa
+# reasoning_model_name = 'dleemiller/finecat-nli-l'
+# similarity_model_name = 'all-MiniLM-L6-v2'
+
+# print("Initializing 2025 Lightweight Suite...")
+# sim_model = SentenceTransformer(similarity_model_name, device="cpu")
+# nli_model = CrossEncoder(reasoning_model_name, device="cpu")
+
+# def evaluate_response(kb, question, user_answer):
+#     # 1. Topic Relevance (Bi-Encoder)
+#     # We check if the answer even belongs in the same universe as the question
+#     q_emb = sim_model.encode(question, convert_to_tensor=True)
+#     a_emb = sim_model.encode(user_answer, convert_to_tensor=True)
+#     rel_score = util.cos_sim(q_emb, a_emb).item()
+
+#     # 2. Structured Reasoning (Cross-Encoder)
+#     # We format the hypothesis to force the model to evaluate the ANSWER specifically
+#     hypothesis = f"Based on the context, the answer to '{question}' is '{user_answer}'."
+
+#     logits = nli_model.predict([(kb, hypothesis)])
+#     probs = F.softmax(torch.tensor(logits), dim=1).tolist()[0]
+
+#     # Label mapping for FineCat/DeBERTa: 0: contradiction, 1: entailment, 2: neutral
+#     labels = ["CONTRADICTION", "ENTAILMENT", "NEUTRAL"]
+#     max_idx = torch.tensor(logits).argmax().item()
+#     verdict = labels[max_idx]
+#     conf = probs[max_idx] * 100
+
+#     # 3. Precision Logic Gate
+#     if verdict == "CONTRADICTION" and conf > 40:
+#         status = "❌ INCORRECT (Logic Conflict)"
+#     elif verdict == "ENTAILMENT" and conf > 35:
+#         status = "βœ… CORRECT (Confirmed)"
+#     elif rel_score > 0.40 and verdict != "CONTRADICTION":
+#         status = "βœ… CORRECT (Likely/Inferred)"
+#     else:
+#         status = "❌ WRONG / IRRELEVANT"
+
+#     return status, f"{rel_score:.2f}", f"{verdict} ({conf:.1f}%)"
+
+# # UI Setup remains the same
+# demo = gr.Interface(
+#     fn=evaluate_response,
+#     inputs=["text", "text", "text"],
+#     outputs=[gr.Textbox(label="Verdict"), gr.Label(label="Topic Similarity"), gr.Label(label="NLI Reasoning")],
+#     title="Lightweight Reasoning Engine v3",
+#     description="Using ModernBERT-distilled NLI for 2025-standard reasoning on CPU."
+# )
+
+# if __name__ == "__main__":
+#     demo.launch()
+
+
+
+
+
+
+
+
+
+
 import gradio as gr
 import torch
 import torch.nn.functional as F
+from sentence_transformers import CrossEncoder

+# --- CONFIGURATION ---
+# Model 1: QA Relevance Validator
+# This model is trained on MS MARCO. It predicts how well a passage answers a query.
+# High score = the answer addresses the question directly.
+# Low score = irrelevant (e.g., Q: "What did the lion do?", A: "The mouse's name is Lucy")
+qa_model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'

+# Model 2: Fact Checker (NLI)
+# We use DeBERTa-v3-xsmall, a compact, high-performance NLI model
+# that is robust at separating entailment from hallucination.
+nli_model_name = 'cross-encoder/nli-deberta-v3-xsmall'
+
+print("Initializing Reasoning Engines...")
+qa_model = CrossEncoder(qa_model_name, device="cpu")
+nli_model = CrossEncoder(nli_model_name, device="cpu")
+print("System Ready.")

 def evaluate_response(kb, question, user_answer):
+    if not kb or not question or not user_answer:
+        return "⚠️ Missing Input", "N/A", "N/A"

+    # --- GATE 1: Question-Answer Relevance Check ---
+    # We ask the model: "Is 'user_answer' a relevant response to 'question'?"
+    # MS-MARCO models output unbounded logits; above 0 usually means relevant.
+    qa_scores = qa_model.predict([(question, user_answer)])  # shape (1,)
+    qa_score = qa_scores.item()

+    # Sigmoid maps the raw logit to a readable 0-100% confidence
+    qa_confidence = torch.sigmoid(torch.tensor(qa_score)).item() * 100
+
+    # Strict relevance threshold (adjustable):
+    # if the QA score is too low, reject the answer immediately as irrelevant.
+    is_relevant = qa_score > 1.0  # logit threshold (approx. 73% confidence)
+
+    if not is_relevant:
+        return (
+            "❌ INCORRECT (Irrelevant Answer)",
+            f"Low Relevance ({qa_confidence:.1f}%)",
+            "Skipped (Not an answer)"
+        )
+
+    # --- GATE 2: Knowledge Base Verification (NLI) ---
+    # Now that we know it IS an answer, we check whether it is TRUE based on the KB.
+    # Premise = kb
+    # Hypothesis = user_answer (a clean check, no complex prompt engineering needed)
+    nli_logits = nli_model.predict([(kb, user_answer)])  # shape (1, 3)
+    nli_probs = F.softmax(torch.tensor(nli_logits), dim=1).tolist()[0]

+    # Label mapping for 'cross-encoder/nli-deberta-v3-xsmall':
+    # 0 = contradiction, 1 = entailment, 2 = neutral.
+    # Note: other NLI models may order their labels differently; check the model card.
     labels = ["CONTRADICTION", "ENTAILMENT", "NEUTRAL"]
+    max_idx = torch.tensor(nli_logits).argmax().item()
+    verdict_label = labels[max_idx]
+    verdict_conf = nli_probs[max_idx] * 100
+
+    # --- FINAL VERDICT LOGIC ---
+    status = ""
+
+    if verdict_label == "ENTAILMENT":
         status = "βœ… CORRECT (Confirmed)"
+    elif verdict_label == "CONTRADICTION":
+        status = "❌ INCORRECT (Factually False)"
+    else:  # NEUTRAL
+        # It answers the question, but the fact isn't in the text (hallucination)
+        status = "❌ INCORRECT (Not in text)"
+
+    return (
+        status,
+        f"High Relevance ({qa_confidence:.1f}%)",
+        f"{verdict_label} ({verdict_conf:.1f}%)"
+    )
+
+# --- UI SETUP ---
+with gr.Blocks(title="Lightweight Reasoning Engine v4", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## 🧠 Neural Answer Checker v4 (Double-Gate Logic)")
+    gr.Markdown("This system uses two distinct brains: one checks if you answered the *Question*, the other checks if your answer matches the *Text*.")
+
+    with gr.Row():
+        kb_input = gr.Textbox(label="Knowledge Base (Context)", lines=6, placeholder="Paste story here...", value="When a lion was resting in the jungle, a mouse began racing up and down his body for fun. The lion's sleep was disturbed, and he woke in anger.")
+
+    with gr.Row():
+        q_input = gr.Textbox(label="Question", placeholder="e.g., What was the lion doing?")
+        a_input = gr.Textbox(label="User Answer", placeholder="e.g., He was sleeping.")
+
+    check_btn = gr.Button("Evaluate Answer", variant="primary")
+
+    with gr.Row():
+        verdict_output = gr.Textbox(label="Final Verdict", elem_classes="verdict")
+
+    with gr.Row():
+        qa_metric = gr.Label(label="Gate 1: QA Relevance")
+        nli_metric = gr.Label(label="Gate 2: Fact Check")
+
+    check_btn.click(
+        fn=evaluate_response,
+        inputs=[kb_input, q_input, a_input],
+        outputs=[verdict_output, qa_metric, nli_metric]
+    )

 if __name__ == "__main__":
     demo.launch()
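
Gate 1 can be exercised outside the app. Below is a minimal sketch of the relevance check: the checkpoint name and the 1.0 logit threshold are taken from the diff above, while the two (question, answer) pairs are made up for illustration.

import math
from sentence_transformers import CrossEncoder

qa_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device="cpu")

# One on-topic and one off-topic (question, answer) pair (illustrative samples)
pairs = [
    ("What was the lion doing?", "He was sleeping."),
    ("What was the lion doing?", "The mouse's name is Lucy."),
]
logits = qa_model.predict(pairs)  # unbounded relevance logits, shape (2,)

for (question, answer), logit in zip(pairs, logits):
    confidence = 1 / (1 + math.exp(-logit))  # same sigmoid mapping as the app
    print(f"{answer!r}: logit={logit:.2f}, confidence={confidence:.1%}, relevant={logit > 1.0}")

The "approx. 73% confidence" note on the threshold is just sigmoid(1.0) ≈ 0.731.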
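
Gate 2 depends on the label order of the NLI head, which the comments pin to contradiction/entailment/neutral. A quick sanity check of that mapping, assuming the same cross-encoder/nli-deberta-v3-xsmall checkpoint; the premise/hypothesis pairs are illustrative only.

import torch
import torch.nn.functional as F
from sentence_transformers import CrossEncoder

nli_model = CrossEncoder('cross-encoder/nli-deberta-v3-xsmall', device="cpu")

pairs = [
    ("A man is eating pizza.", "A man is eating food."),  # expect ENTAILMENT
    ("A man is eating pizza.", "The man is sleeping."),   # expect CONTRADICTION
]
logits = torch.tensor(nli_model.predict(pairs))  # shape (2, 3): one row of 3 logits per pair
probs = F.softmax(logits, dim=1)                 # normalize over the class axis

labels = ["CONTRADICTION", "ENTAILMENT", "NEUTRAL"]  # claimed index order for this checkpoint
for (premise, hypothesis), p in zip(pairs, probs):
    idx = p.argmax().item()
    print(f"{hypothesis!r} -> {labels[idx]} ({p[idx].item():.1%})")

Because predict returns one row of three logits per pair, the softmax must run over the class axis (dim=1 here); evaluate_response handles its single (1, 3) prediction the same way.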
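
To watch both gates end to end, evaluate_response can be smoke-tested without launching the UI. This is a sketch: the inputs mirror the placeholders in the Gradio form, and the comments only mark the intended path through the gates; actual verdicts depend on the two checkpoints and the 1.0 relevance threshold.

kb = ("When a lion was resting in the jungle, a mouse began racing up and down "
      "his body for fun. The lion's sleep was disturbed, and he woke in anger.")
question = "What was the lion doing?"

answers = [
    "He was resting.",            # should pass Gate 1 and Gate 2
    "The mouse's name is Lucy.",  # should fail Gate 1 (does not address the question)
    "He was eating a zebra.",     # should pass Gate 1 but fail Gate 2 (not in the text)
]
for answer in answers:
    status, relevance, fact_check = evaluate_response(kb, question, answer)
    print(f"{answer!r}: {status} | {relevance} | {fact_check}")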