Update app.py
app.py
CHANGED
@@ -5,7 +5,6 @@ from sentence_transformers import CrossEncoder
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import re
 import hashlib
-import json
 
 # ============================================================
 # DEVICE
@@ -21,23 +20,24 @@ SIM_MODEL_NAME = "cross-encoder/stsb-distilroberta-base"
 NLI_MODEL_NAME = "cross-encoder/nli-deberta-v3-xsmall"
 LLM_NAME = "google/flan-t5-base"
 
-print("Loading similarity …")
+print("Loading similarity model...")
 sim_model = CrossEncoder(SIM_MODEL_NAME, device=DEVICE)
+
+print("Loading NLI model...")
 nli_model = CrossEncoder(NLI_MODEL_NAME, device=DEVICE)
 
 print("Loading LLM for schema generation...")
 llm_tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
 llm_model = AutoModelForSeq2SeqLM.from_pretrained(LLM_NAME).to(DEVICE)
 
-print("✅ All models loaded…")
+print("✅ All models loaded")
 
 # ============================================================
-# …
+# CONFIG
 # ============================================================
 
 SIM_THRESHOLD_REQUIRED = 0.55
 CONTRADICTION_THRESHOLD = 0.70
-
 SCHEMA_CACHE = {}
 
 # ============================================================
@@ -56,13 +56,34 @@ def softmax_logits(logits):
 def hash_key(kb, question):
     return hashlib.sha256((kb + question).encode()).hexdigest()
 
+def infer_question_type(question):
+    q = question.lower().strip()
+    if q.startswith("how"):
+        return "METHOD"
+    if q.startswith("why"):
+        return "REASON"
+    if q.startswith("when") or q.startswith("where"):
+        return "FACT"
+    return "FACT"
+
 # ============================================================
-# LLM SCHEMA GENERATION (…)
+# LLM SCHEMA GENERATION (HARDENED)
 # ============================================================
 
 def generate_schema_with_llm(kb, question):
+    q_type = infer_question_type(question)
+
     prompt = f"""
-…
+You are extracting the correct answer to a competitive exam question.
+
+RULES:
+- ONLY extract facts that DIRECTLY answer the question.
+- IGNORE unrelated events.
+- If the question asks "how", extract the METHOD.
+- Use short, atomic factual sentences.
+- Do NOT summarize the story.
+
+Question Type: {q_type}
 
 Knowledge Base:
 {kb}
@@ -70,7 +91,7 @@ Knowledge Base:
 Question:
 {question}
 
-Write 1–3 short factual bullet points. Do NOT explain.
+Return 1–3 bullet points that directly answer the question.
 """
 
     inputs = llm_tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)
@@ -78,35 +99,46 @@ Write 1–3 short factual bullet points. Do NOT explain.
     outputs = llm_model.generate(
         **inputs,
         max_new_tokens=128,
-        …
-        …
+        do_sample=False,
+        temperature=0.0
     )
 
-    …
+    raw = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-    # Extract bullet-like facts
     facts = [
         line.strip("-• ").strip()
-        for line in …
-        if len(line.strip()) > …
+        for line in raw.split("\n")
+        if len(line.strip()) > 4
    ]
 
    return {
-        "question_type": …,
+        "question_type": q_type,
        "required_concepts": facts,
-        "forbidden_concepts": [],
        "allow_extra_info": True,
-        "raw_llm_output": …
+        "raw_llm_output": raw
    }
 
+# ============================================================
+# SCHEMA VALIDATION (CRITICAL)
+# ============================================================
+
+def validate_schema(schema, question):
+    q_words = set(question.lower().split())
+    valid = []
+
+    for c in schema["required_concepts"]:
+        if q_words & set(c.lower().split()):
+            valid.append(c)
+
+    return valid
 
 # ============================================================
 # ANSWER DECOMPOSITION
 # ============================================================
 
 def decompose_answer(answer):
-    …
-    return […]
+    parts = re.split(r'\b(?:and|because|before|after|while)\b', answer)
+    return [p.strip() for p in parts if p.strip()]
 
 # ============================================================
 # CORE EVALUATION
@@ -121,42 +153,53 @@ def evaluate_answer(answer, question, kb):
        }
    }
 
-    # ---------------- SCHEMA ----------------
    key = hash_key(kb, question)
+
    if key not in SCHEMA_CACHE:
-        SCHEMA_CACHE[key] = generate_schema_with_llm(kb, question)
+        schema = generate_schema_with_llm(kb, question)
+        validated = validate_schema(schema, question)
+
+        if not validated:
+            # fallback: keyword-based extraction
+            validated = [
+                s for s in split_sentences(kb)
+                if any(w in s.lower() for w in question.lower().split())
+            ][:2]
+
+        schema["required_concepts"] = validated
+        SCHEMA_CACHE[key] = schema
 
    schema = SCHEMA_CACHE[key]
    logs["schema"] = schema
 
-    # ---------------- ANSWER CLAIMS ----------------
    claims = decompose_answer(answer)
    logs["claims"] = claims
 
-    # ---------------- … ----------------
+    # ---------------- COVERAGE ----------------
    coverage = []
    covered_all = True
 
-    for concept in schema.get("required_concepts", []):
+    for concept in schema["required_concepts"]:
        scores = sim_model.predict([(concept, c) for c in claims])
        best = float(scores.max())
-        …
+        ok = best >= SIM_THRESHOLD_REQUIRED
+
        coverage.append({
            "concept": concept,
            "similarity": round(best, 3),
-            "covered": …
+            "covered": ok
        })
-        if best < SIM_THRESHOLD_REQUIRED:
+
+        if not ok:
            covered_all = False
 
    logs["coverage"] = coverage
 
-    # ---------------- … ----------------
+    # ---------------- CONTRADICTIONS ----------------
    contradictions = []
-    relevant_kb = schema.get("required_concepts", [])
 
    for claim in claims:
-        for sent in relevant_kb:
+        for sent in schema["required_concepts"]:
            probs = softmax_logits(nli_model.predict([(sent, claim)]))
            if probs[0] > CONTRADICTION_THRESHOLD:
                contradictions.append({
@@ -167,7 +210,7 @@ def evaluate_answer(answer, question, kb):
 
    logs["contradictions"] = contradictions
 
-    # ---------------- FINAL … ----------------
+    # ---------------- FINAL VERDICT ----------------
    if contradictions:
        verdict = "❌ INCORRECT (Contradiction)"
    elif covered_all:
@@ -178,7 +221,6 @@ def evaluate_answer(answer, question, kb):
    logs["final_verdict"] = verdict
    return verdict, logs
 
-
 # ============================================================
 # GRADIO UI
 # ============================================================
@@ -187,9 +229,9 @@ def run(answer, question, kb):
    return evaluate_answer(answer, question, kb)
 
 with gr.Blocks(title="Competitive Exam Answer Checker") as demo:
-    gr.Markdown("## 🧠 Competitive Exam Answer Checker…")
+    gr.Markdown("## 🧠 Competitive Exam Answer Checker")
 
-    kb = gr.Textbox(label="Knowledge Base", lines=…)
+    kb = gr.Textbox(label="Knowledge Base", lines=8)
    question = gr.Textbox(label="Question")
    answer = gr.Textbox(label="Student Answer")
 
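Review note on the new generate() call: with do_sample=False, decoding is greedy and temperature is never used, so temperature=0.0 is a no-op; recent transformers releases also log a warning about generation flags that are set but unused. If that warning matters, dropping the argument is enough. A minimal equivalent call:

outputs = llm_model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=False,  # greedy decoding; temperature would have no effect here
)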
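The word-overlap test in validate_schema is permissive: any shared token keeps a concept, stopwords included, and question.split() leaves punctuation attached to words. A condensed copy of the function with two invented toy calls shows both behaviors:

def validate_schema(schema, question):
    q_words = set(question.lower().split())
    return [c for c in schema["required_concepts"]
            if q_words & set(c.lower().split())]

print(validate_schema({"required_concepts": ["It dug under the fence."]},
                      "How did the fox escape?"))
# ['It dug under the fence.']  (the only shared token is "the";
# a stopword filter would make the check stricter)

print(validate_schema({"required_concepts": ["Digging happened."]},
                      "Who escaped?"))
# []  (an empty result triggers the keyword fallback in
# evaluate_answer, which keeps at most two KB sentences)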
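decompose_answer now splits an answer into claims on a fixed set of connectives. A quick check with an invented sentence:

import re

def decompose_answer(answer):
    parts = re.split(r'\b(?:and|because|before|after|while)\b', answer)
    return [p.strip() for p in parts if p.strip()]

print(decompose_answer("The fox dug a tunnel and escaped while the farmer slept."))
# ['The fox dug a tunnel', 'escaped', 'the farmer slept.']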
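The fallback path calls split_sentences, and the contradiction loop calls softmax_logits; both are defined in unchanged parts of app.py and never appear in the diff. A plausible minimal reconstruction, for reading the hunks only (the committed helpers may differ):

import re
import numpy as np

def split_sentences(text):
    # naive split on sentence-final punctuation
    return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]

def softmax_logits(logits):
    # flatten one (1, 3) row of CrossEncoder NLI logits into probabilities
    logits = np.asarray(logits).reshape(-1)
    exp = np.exp(logits - logits.max())
    return exp / exp.sum()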
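probs[0] > CONTRADICTION_THRESHOLD hardcodes the contradiction index. For the cross-encoder/nli-deberta-v3 family the documented label order is (contradiction, entailment, neutral), so index 0 should be correct, but reading the index from the checkpoint config would survive a model swap. A hypothetical hardening of the inner loop body:

# `sent` and `claim` come from the enclosing loops
id2label = nli_model.model.config.id2label  # CrossEncoder wraps an HF model
contra_idx = next((i for i, name in id2label.items()
                   if name.lower().startswith("contra")), 0)

probs = softmax_logits(nli_model.predict([(sent, claim)]))
if probs[contra_idx] > CONTRADICTION_THRESHOLD:
    ...  # record the contradiction as before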
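Finally, the changed evaluation path can be exercised without the Gradio UI (invented toy inputs; the models must be loaded, and the exact verdict strings other than the contradiction one sit outside the diff):

kb = ("The fox escaped by digging a tunnel under the fence. "
      "The farmer found the tunnel the next morning.")
question = "How did the fox escape?"
answer = "It escaped by digging a tunnel under the fence."

verdict, logs = evaluate_answer(answer, question, kb)
print(verdict)           # full coverage and no contradictions yields the success verdict
print(logs["coverage"])  # per-concept best similarity and covered flags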