Spaces: Runtime error
Update app.py
app.py
CHANGED
@@ -15,31 +15,29 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # ============================================================
 # MODELS
 # ============================================================
-SIM_MODEL_NAME = "cross-encoder/stsb-distilroberta-base"
 NLI_MODEL_NAME = "cross-encoder/nli-deberta-v3-xsmall"
 LLM_NAME = "google/flan-t5-base"
 
-print("Loading …
-sim_model = CrossEncoder(SIM_MODEL_NAME, device=DEVICE)
+print("Loading NLI model...")
 nli_model = CrossEncoder(NLI_MODEL_NAME, device=DEVICE)
 
-print("Loading LLM for …
+print("Loading LLM for schema extraction...")
 llm_tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
 llm_model = AutoModelForSeq2SeqLM.from_pretrained(LLM_NAME).to(DEVICE)
 
 print("✅ All models loaded")
 
 # ============================================================
-# …
+# CONFIG
 # ============================================================
-
-CONTRADICTION_THRESHOLD = 0.…
+ENTAILMENT_THRESHOLD = 0.6
+CONTRADICTION_THRESHOLD = 0.8
 SCHEMA_CACHE = {}
 
 # ============================================================
 # UTILITIES
 # ============================================================
-def split_sentences(text):
+def split_sentences(text: str):
     return re.split(r'(?<=[.!?])\s+', text.strip())
 
 def softmax_logits(logits):
@@ -51,55 +49,71 @@ def softmax_logits(logits):
 def hash_key(kb, question):
     return hashlib.sha256((kb + question).encode()).hexdigest()
 
-def decompose_answer(answer):
-    """
-    …
+def decompose_answer(answer: str):
+    """
+    Conservative sentence-based decomposition.
+    Avoids fragments that break NLI.
+    """
+    sentences = split_sentences(answer)
+    return [s.strip() for s in sentences if len(s.split()) >= 5]
 
 # ============================================================
-# LLM …
+# LLM SCHEMA EXTRACTION (GENERALISABLE)
 # ============================================================
-def generate_atomic_facts(kb, question):
+def generate_atomic_facts(kb: str, question: str):
     """
-    …
-    Returns JSON: {"facts": [ ... ]}
+    Extract minimal checkable propositions from the KB.
     """
-    prompt = f"""
-From the Knowledge Base, extract the character transformation of Matilda.
+
+    prompt = """
+You are constructing a grading schema.
 
-…
+Task:
+From the Knowledge Base, extract the MINIMAL set of factual propositions
+that a correct answer to the Question must entail.
 
+Rules:
+- Use ONLY information present in the knowledge base.
+- Do NOT restate or paraphrase the question.
+- Do NOT add explanations.
+- Each fact must be independently checkable.
+- Prefer concrete states, events, causes, or outcomes.
+- Return between 2 and 6 facts.
+
+Output STRICTLY in valid JSON:
 {
   "facts": [
-    "…
-    "…
-    "…
-    "As a result of hardship, she became mature, humble, and grateful"
+    "fact 1",
+    "fact 2",
+    "fact 3"
   ]
 }
 
+Knowledge Base:
+<<<KB>>>
+
+Question:
+<<<QUESTION>>>
 """
+
+    prompt = prompt.replace("<<<KB>>>", kb).replace("<<<QUESTION>>>", question)
+
     inputs = llm_tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)
     outputs = llm_model.generate(
         **inputs,
-        max_new_tokens=…
+        max_new_tokens=192,
         do_sample=False,
         temperature=0.0
     )
+
     raw = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
 
     try:
         data = json.loads(raw)
         facts = data.get("facts", [])
-    except:
-        …
+    except Exception:
+        facts = []
+
     return {
         "required_concepts": facts,
         "raw_llm_output": raw
@@ -108,68 +122,68 @@ Output strictly as JSON:
 # ============================================================
 # CORE EVALUATION
 # ============================================================
-def evaluate_answer(answer, question, kb):
-    logs = {
-        …
+def evaluate_answer(answer: str, question: str, kb: str):
+    logs = {
+        "inputs": {
+            "question": question,
+            "answer": answer,
+            "kb_length": len(kb)
+        }
+    }
+
     key = hash_key(kb, question)
     if key not in SCHEMA_CACHE:
-        …
+        SCHEMA_CACHE[key] = generate_atomic_facts(kb, question)
+
     schema = SCHEMA_CACHE[key]
     logs["schema"] = schema
 
     claims = decompose_answer(answer)
     logs["claims"] = claims
 
     # ---------------- COVERAGE ----------------
     coverage = []
     covered_all = True
-    for concept in schema["required_concepts"]:
-        if claims:
-            …
-            entailment = probs[2]
-            ok = entailment > 0.6
-            best = entailment
-            # best = float(scores.max())
-            # ok = best >= SIM_THRESHOLD_REQUIRED
-        else:
-            best = 0.0
-            ok = False
+    for concept in schema["required_concepts"]:
+        best_entailment = 0.0
+
+        for claim in claims:
+            probs = softmax_logits(nli_model.predict([(claim, concept)]))
+            best_entailment = max(best_entailment, probs[2])  # entailment
+
+        ok = best_entailment >= ENTAILMENT_THRESHOLD
         coverage.append({
             "concept": concept,
-            "…
+            "entailment": round(best_entailment, 3),
             "covered": ok
         })
+
         if not ok:
             covered_all = False
+
     logs["coverage"] = coverage
 
     # ---------------- CONTRADICTIONS ----------------
     contradictions = []
     kb_sents = split_sentences(kb)
+
     for claim in claims:
         for sent in kb_sents:
             probs = softmax_logits(nli_model.predict([(sent, claim)]))
-
             contradiction = probs[0]
             entailment = probs[2]
 
-            …
-            # if probs[0] > CONTRADICTION_THRESHOLD:
+            # Conservative contradiction rule
+            if contradiction >= CONTRADICTION_THRESHOLD and entailment < 0.2:
                 contradictions.append({
                     "claim": claim,
                     "sentence": sent,
-                    "confidence": round(…
+                    "confidence": round(contradiction * 100, 1)
                 })
+
     logs["contradictions"] = contradictions
 
     # ---------------- FINAL VERDICT ----------------
     if contradictions:
         verdict = "❌ INCORRECT (Contradiction)"
@@ -177,7 +191,7 @@ def evaluate_answer(answer, question, kb):
         verdict = "✅ CORRECT"
     else:
         verdict = "⚠️ PARTIALLY CORRECT"
 
     logs["final_verdict"] = verdict
     return verdict, logs
 
@@ -188,16 +202,16 @@ def run(answer, question, kb):
     return evaluate_answer(answer, question, kb)
 
 with gr.Blocks(title="Competitive Exam Answer Checker") as demo:
-    gr.Markdown("## 🧠 Competitive Exam Answer Checker…
+    gr.Markdown("## 🧠 Competitive Exam Answer Checker")
 
     kb = gr.Textbox(label="Knowledge Base", lines=10)
     question = gr.Textbox(label="Question")
    answer = gr.Textbox(label="Student Answer")
 
     verdict = gr.Textbox(label="Verdict")
     debug = gr.JSON(label="Debug Logs")
 
     btn = gr.Button("Evaluate")
     btn.click(run, [answer, question, kb], [verdict, debug])
 
-demo.launch()
+demo.launch()
app.py after this change (regions untouched by the commit are elided):

# ============================================================
# MODELS
# ============================================================
NLI_MODEL_NAME = "cross-encoder/nli-deberta-v3-xsmall"
LLM_NAME = "google/flan-t5-base"

print("Loading NLI model...")
nli_model = CrossEncoder(NLI_MODEL_NAME, device=DEVICE)

print("Loading LLM for schema extraction...")
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(LLM_NAME).to(DEVICE)

print("✅ All models loaded")
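One caveat on the NLI model, flagged as an assumption because it rests on the label mapping published for the cross-encoder NLI family, which is ['contradiction', 'entailment', 'neutral']: the evaluation code below reads probs[2] as the entailment probability, which under that mapping would be the neutral class, with entailment at index 1. A defensive sketch that resolves the indices from the model config instead of hard-coding them:

    # Sketch, assuming CrossEncoder exposes the underlying HF config as
    # nli_model.model.config (it wraps AutoModelForSequenceClassification).
    id2label = nli_model.model.config.id2label        # e.g. {0: "contradiction", ...}
    idx = {label.lower(): i for i, label in id2label.items()}
    CONTRA_IDX = idx["contradiction"]
    ENTAIL_IDX = idx["entailment"]                    # then use probs[ENTAIL_IDX]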
# ============================================================
# CONFIG
# ============================================================
ENTAILMENT_THRESHOLD = 0.6
CONTRADICTION_THRESHOLD = 0.8
SCHEMA_CACHE = {}

# ============================================================
# UTILITIES
# ============================================================
def split_sentences(text: str):
    return re.split(r'(?<=[.!?])\s+', text.strip())

def softmax_logits(logits):
    ...  # body unchanged by this commit; not shown in the diff

def hash_key(kb, question):
    return hashlib.sha256((kb + question).encode()).hexdigest()
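The sentence splitter is deliberately simple; it breaks after any period, so abbreviations split too eagerly. An illustrative call (made-up input):

    split_sentences("Dr. Watson arrived. He was late.")
    # -> ['Dr.', 'Watson arrived.', 'He was late.']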
def decompose_answer(answer: str):
    """
    Conservative sentence-based decomposition.
    Avoids fragments that break NLI.
    """
    sentences = split_sentences(answer)
    return [s.strip() for s in sentences if len(s.split()) >= 5]
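The five-word floor drops fragments that tend to produce unstable NLI scores. For example (illustrative input, not from the app):

    decompose_answer("Matilda was proud. She lost everything and learned humility over ten years.")
    # -> ['She lost everything and learned humility over ten years.']
    # "Matilda was proud." has only 3 words, so it is filtered out.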
# ============================================================
# LLM SCHEMA EXTRACTION (GENERALISABLE)
# ============================================================
def generate_atomic_facts(kb: str, question: str):
    """
    Extract minimal checkable propositions from the KB.
    """

    prompt = """
You are constructing a grading schema.

Task:
From the Knowledge Base, extract the MINIMAL set of factual propositions
that a correct answer to the Question must entail.

Rules:
- Use ONLY information present in the knowledge base.
- Do NOT restate or paraphrase the question.
- Do NOT add explanations.
- Each fact must be independently checkable.
- Prefer concrete states, events, causes, or outcomes.
- Return between 2 and 6 facts.

Output STRICTLY in valid JSON:
{
  "facts": [
    "fact 1",
    "fact 2",
    "fact 3"
  ]
}

Knowledge Base:
<<<KB>>>

Question:
<<<QUESTION>>>
"""

    prompt = prompt.replace("<<<KB>>>", kb).replace("<<<QUESTION>>>", question)

    inputs = llm_tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)
    outputs = llm_model.generate(
        **inputs,
        max_new_tokens=192,
        do_sample=False,
        temperature=0.0
    )

    raw = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)

    try:
        data = json.loads(raw)
        facts = data.get("facts", [])
    except Exception:
        facts = []

    return {
        "required_concepts": facts,
        "raw_llm_output": raw
    }
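A practical note on the try/except above: flan-t5-base often emits near-JSON rather than strictly valid JSON, in which case json.loads raises and the schema silently comes back empty. One conservative salvage, sketched with a hypothetical helper (extract_facts_loose is not part of the app), is to harvest quoted spans when strict parsing fails:

    import json
    import re

    def extract_facts_loose(raw: str):
        """Hypothetical fallback: strict JSON first, then quoted spans."""
        try:
            return json.loads(raw).get("facts", [])
        except Exception:
            # Accept any double-quoted span of 10+ characters as a candidate fact.
            return re.findall(r'"([^"]{10,})"', raw)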
# ============================================================
# CORE EVALUATION
# ============================================================
def evaluate_answer(answer: str, question: str, kb: str):
    logs = {
        "inputs": {
            "question": question,
            "answer": answer,
            "kb_length": len(kb)
        }
    }

    key = hash_key(kb, question)
    if key not in SCHEMA_CACHE:
        SCHEMA_CACHE[key] = generate_atomic_facts(kb, question)

    schema = SCHEMA_CACHE[key]
    logs["schema"] = schema

    claims = decompose_answer(answer)
    logs["claims"] = claims

    # ---------------- COVERAGE ----------------
    coverage = []
    covered_all = True

    for concept in schema["required_concepts"]:
        best_entailment = 0.0

        for claim in claims:
            probs = softmax_logits(nli_model.predict([(claim, concept)]))
            best_entailment = max(best_entailment, probs[2])  # entailment

        ok = best_entailment >= ENTAILMENT_THRESHOLD
        coverage.append({
            "concept": concept,
            "entailment": round(best_entailment, 3),
            "covered": ok
        })

        if not ok:
            covered_all = False

    logs["coverage"] = coverage

    # ---------------- CONTRADICTIONS ----------------
    contradictions = []
    kb_sents = split_sentences(kb)

    for claim in claims:
        for sent in kb_sents:
            probs = softmax_logits(nli_model.predict([(sent, claim)]))
            contradiction = probs[0]
            entailment = probs[2]

            # Conservative contradiction rule
            if contradiction >= CONTRADICTION_THRESHOLD and entailment < 0.2:
                contradictions.append({
                    "claim": claim,
                    "sentence": sent,
                    "confidence": round(contradiction * 100, 1)
                })

    logs["contradictions"] = contradictions

    # ---------------- FINAL VERDICT ----------------
    if contradictions:
        verdict = "❌ INCORRECT (Contradiction)"
    elif covered_all:
        verdict = "✅ CORRECT"
    else:
        verdict = "⚠️ PARTIALLY CORRECT"

    logs["final_verdict"] = verdict
    return verdict, logs
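Two things worth noting about evaluate_answer. First, the verdict logic: a single confident contradiction overrides coverage, full coverage yields CORRECT, and anything else is PARTIALLY CORRECT. Second, performance: the contradiction scan issues one nli_model.predict call per (KB sentence, claim) pair, which is slow on CPU Spaces; CrossEncoder.predict accepts a list of pairs, so the same scores can come from one batched call. A sketch (batch size is illustrative; assumes softmax_logits accepts a single row of logits, as in the per-pair calls):

    # Batched variant of the contradiction scan (optimization sketch,
    # not the original code).
    pairs = [(sent, claim) for claim in claims for sent in kb_sents]
    all_logits = nli_model.predict(pairs, batch_size=32)
    for (sent, claim), row in zip(pairs, all_logits):
        probs = softmax_logits(row)
        if probs[0] >= CONTRADICTION_THRESHOLD and probs[2] < 0.2:
            contradictions.append({
                "claim": claim,
                "sentence": sent,
                "confidence": round(probs[0] * 100, 1),
            })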
# ...

def run(answer, question, kb):
    return evaluate_answer(answer, question, kb)

with gr.Blocks(title="Competitive Exam Answer Checker") as demo:
    gr.Markdown("## 🧠 Competitive Exam Answer Checker")

    kb = gr.Textbox(label="Knowledge Base", lines=10)
    question = gr.Textbox(label="Question")
    answer = gr.Textbox(label="Student Answer")

    verdict = gr.Textbox(label="Verdict")
    debug = gr.JSON(label="Debug Logs")

    btn = gr.Button("Evaluate")
    btn.click(run, [answer, question, kb], [verdict, debug])

demo.launch()
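Outside the UI, the pipeline can also be exercised directly, for example as a smoke test; a minimal sketch with made-up inputs:

    # Made-up example inputs; any KB/question/answer triple works.
    kb_text = ("Matilda was once vain and careless. After losing the borrowed "
               "necklace, ten years of hard labour made her mature and humble.")
    verdict, logs = evaluate_answer(
        answer="Years of hardship made Matilda humble and mature.",
        question="How did Matilda's character change?",
        kb=kb_text,
    )
    print(verdict)            # one of the three verdict strings
    print(logs["coverage"])   # per-concept entailment scores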