Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -5,39 +5,40 @@ from sentence_transformers import CrossEncoder
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import re
 import hashlib
+import json
 
 # ============================================================
 # DEVICE
 # ============================================================
-
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 # ============================================================
 # MODELS
 # ============================================================
-
 SIM_MODEL_NAME = "cross-encoder/stsb-distilroberta-base"
 NLI_MODEL_NAME = "cross-encoder/nli-deberta-v3-xsmall"
 LLM_NAME = "google/flan-t5-base"
 
+print("Loading similarity + NLI models...")
 sim_model = CrossEncoder(SIM_MODEL_NAME, device=DEVICE)
 nli_model = CrossEncoder(NLI_MODEL_NAME, device=DEVICE)
 
+print("Loading LLM for atomic fact extraction...")
 llm_tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
 llm_model = AutoModelForSeq2SeqLM.from_pretrained(LLM_NAME).to(DEVICE)
 
+print("✅ All models loaded")
+
 # ============================================================
-#
+# CONFIGURATION
 # ============================================================
-
-SIM_THRESHOLD = 0.60
+SIM_THRESHOLD_REQUIRED = 0.55
 CONTRADICTION_THRESHOLD = 0.70
 SCHEMA_CACHE = {}
 
 # ============================================================
 # UTILITIES
 # ============================================================
-
 def split_sentences(text):
     return re.split(r'(?<=[.!?])\s+', text.strip())
 
@@ -50,35 +51,21 @@ def softmax_logits(logits):
 def hash_key(kb, question):
     return hashlib.sha256((kb + question).encode()).hexdigest()
 
-def infer_question_type(question):
-    q = question.lower()
-    if q.startswith("how"):
-        return "METHOD"
-    if q.startswith("why"):
-        return "REASON"
-    return "FACT"
-
 def decompose_answer(answer):
-
+    """Split answer into atomic claims."""
+    parts = re.split(r'\b(?:and|because|before|after|while|then|so)\b', answer)
+    return [p.strip() for p in parts if p.strip()]
 
 # ============================================================
-#
+# LLM FACT EXTRACTION
 # ============================================================
-
-
-
-
+def generate_atomic_facts(kb, question):
+    """
+    Ask LLM to extract 1-5 atomic facts from KB that directly answer the question.
+    Returns JSON: {"facts": [ ... ]}
+    """
     prompt = f"""
-
-
-STRICT RULES:
-- Extract ONLY the direct action that answers the question.
-- DO NOT include background events.
-- DO NOT include earlier or later story details.
-- Use ACTIVE VERBS.
-- Keep answers short (one clause).
-
-Question type: {q_type}
+Extract atomic facts that directly answer the question.
 
 Knowledge Base:
 {kb}
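Note on the new decompose_answer: it splits a student answer into clause-level claims at common conjunctions, and each claim is later scored independently for coverage and contradictions. A quick illustration with a hypothetical input (the behavior follows directly from the re.split pattern above):

    >>> decompose_answer("The mouse cut the net and freed the lion because it remembered his kindness")
    ['The mouse cut the net', 'freed the lion', 'it remembered his kindness']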
@@ -86,27 +73,29 @@ Knowledge Base:
 Question:
 {question}
 
-Return the answer as bullet points.
+RULES:
+- Return 1-5 short factual statements that directly answer the question.
+- Output strictly in JSON format: {{"facts": ["fact1", "fact2", ...]}}
+- Do not include unrelated events or explanations.
+- Each fact should be self-contained.
 """
-
     inputs = llm_tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)
     outputs = llm_model.generate(
         **inputs,
-        max_new_tokens=
-
-
+        max_new_tokens=128,
+        do_sample=False,
+        temperature=0.0
     )
-
     raw = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-
-
-
-
-
-
+
+    try:
+        data = json.loads(raw)
+        facts = data.get("facts", [])
+    except:
+        # fallback: parse line by line if JSON fails
+        facts = [line.strip("-• ").strip() for line in raw.split("\n") if len(line.strip()) > 3]
+
     return {
-        "question_type": q_type,
         "required_concepts": facts,
         "raw_llm_output": raw
     }
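Note on the try/except parse: small instruction-tuned models such as google/flan-t5-base often return plain bullet text instead of the requested JSON, so the fallback branch is expected to run regularly. A quick illustration with a hypothetical model output:

    >>> raw = "- The mouse cut the net\n- The mouse freed the lion"   # json.loads(raw) raises here, so the except branch runs
    >>> [line.strip("-• ").strip() for line in raw.split("\n") if len(line.strip()) > 3]
    ['The mouse cut the net', 'The mouse freed the lion']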
@@ -114,106 +103,81 @@ Return the answer as bullet points.
 # ============================================================
 # CORE EVALUATION
 # ============================================================
-
 def evaluate_answer(answer, question, kb):
-    logs = {
-
-        "question": question,
-        "answer": answer,
-        "kb_length": len(kb)
-    }
-    }
-
+    logs = {"inputs": {"question": question, "answer": answer, "kb_length": len(kb)}}
+
     key = hash_key(kb, question)
-
     if key not in SCHEMA_CACHE:
-        schema =
-
-        # HARD FILTER: must contain an ACTION VERB
-        action_schema = []
-        for s in schema["required_concepts"]:
-            if re.search(r'\b(bit|cut|free|help|rescue|save)\b', s.lower()):
-                action_schema.append(s)
-
-        # Fallback: extract action sentences directly from KB
-        if not action_schema:
-            action_schema = [
-                s for s in split_sentences(kb)
-                if re.search(r'\b(bit|cut|free|help|rescue|save)\b', s.lower())
-            ]
-
-        schema["required_concepts"] = action_schema[:2]
+        schema = generate_atomic_facts(kb, question)
         SCHEMA_CACHE[key] = schema
-
+
     schema = SCHEMA_CACHE[key]
     logs["schema"] = schema
-
+
     claims = decompose_answer(answer)
     logs["claims"] = claims
-
+
     # ---------------- COVERAGE ----------------
     coverage = []
    covered_all = True
-
     for concept in schema["required_concepts"]:
-
-
-
-
+        if claims:
+            scores = sim_model.predict([(concept, c) for c in claims])
+            best = float(scores.max())
+            ok = best >= SIM_THRESHOLD_REQUIRED
+        else:
+            best = 0.0
+            ok = False
         coverage.append({
             "concept": concept,
             "similarity": round(best, 3),
             "covered": ok
         })
-
         if not ok:
             covered_all = False
-
     logs["coverage"] = coverage
-
+
     # ---------------- CONTRADICTIONS ----------------
     contradictions = []
-
+    kb_sents = split_sentences(kb)
     for claim in claims:
-        for
-            probs = softmax_logits(nli_model.predict([(
+        for sent in kb_sents:
+            probs = softmax_logits(nli_model.predict([(sent, claim)]))
             if probs[0] > CONTRADICTION_THRESHOLD:
                 contradictions.append({
                     "claim": claim,
-                    "
+                    "sentence": sent,
                     "confidence": round(probs[0] * 100, 1)
                 })
-
     logs["contradictions"] = contradictions
-
-    # ---------------- VERDICT ----------------
+
+    # ---------------- FINAL VERDICT ----------------
     if contradictions:
-        verdict = "❌ INCORRECT"
+        verdict = "❌ INCORRECT (Contradiction)"
     elif covered_all:
         verdict = "✅ CORRECT"
     else:
         verdict = "⚠️ PARTIALLY CORRECT"
-
+
     logs["final_verdict"] = verdict
     return verdict, logs
 
 # ============================================================
 # GRADIO UI
 # ============================================================
-
 def run(answer, question, kb):
     return evaluate_answer(answer, question, kb)
 
 with gr.Blocks(title="Competitive Exam Answer Checker") as demo:
-    gr.Markdown("## 🧠 Competitive Exam Answer Checker")
-
-    kb = gr.Textbox(label="Knowledge Base", lines=
+    gr.Markdown("## 🧠 Competitive Exam Answer Checker (Robust General Version)")
+
+    kb = gr.Textbox(label="Knowledge Base", lines=10)
     question = gr.Textbox(label="Question")
     answer = gr.Textbox(label="Student Answer")
-
+
     verdict = gr.Textbox(label="Verdict")
     debug = gr.JSON(label="Debug Logs")
-
+
     btn = gr.Button("Evaluate")
     btn.click(run, [answer, question, kb], [verdict, debug])
 
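The hunk context above references softmax_logits, which lives in an unchanged region of app.py and therefore never appears in the diff. For reference, a minimal sketch of such a helper, assuming the NLI cross-encoder returns one row of three logits per sentence pair; for cross-encoder/nli-deberta-v3-xsmall the label order is (contradiction, entailment, neutral), which is why evaluate_answer reads probs[0] as the contradiction probability. The body below is an assumption, not code from this commit:

    import numpy as np

    def softmax_logits(logits):
        # nli_model.predict([(premise, hypothesis)]) -> logits of shape (1, 3)
        logits = np.asarray(logits).reshape(-1)
        exps = np.exp(logits - logits.max())  # subtract max for numerical stability
        return exps / exps.sum()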
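For completeness, a minimal smoke test of the committed pipeline. All inputs are hypothetical; it assumes the file above is saved as app.py, that torch, gradio, sentence-transformers, and transformers are installed, and that app.py calls demo.launch() only under a __main__ guard (otherwise importing it starts the Gradio UI):

    from app import evaluate_answer

    kb = (
        "A lion was trapped in a hunter's net. "
        "A small mouse bit through the ropes and freed the lion."
    )
    question = "How was the lion freed?"
    answer = "The mouse bit through the net and freed the lion."

    verdict, logs = evaluate_answer(answer, question, kb)
    print(verdict)                 # ✅ CORRECT / ⚠️ PARTIALLY CORRECT / ❌ INCORRECT (Contradiction)
    print(logs["coverage"])        # best cross-encoder similarity per required fact
    print(logs["contradictions"])  # NLI contradiction hits above CONTRADICTION_THRESHOLD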