Update app.py
app.py CHANGED
@@ -20,23 +20,17 @@ SIM_MODEL_NAME = "cross-encoder/stsb-distilroberta-base"
 NLI_MODEL_NAME = "cross-encoder/nli-deberta-v3-xsmall"
 LLM_NAME = "google/flan-t5-base"
 
-print("Loading similarity model...")
 sim_model = CrossEncoder(SIM_MODEL_NAME, device=DEVICE)
-
-print("Loading NLI model...")
 nli_model = CrossEncoder(NLI_MODEL_NAME, device=DEVICE)
 
-print("Loading LLM for schema generation...")
 llm_tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
 llm_model = AutoModelForSeq2SeqLM.from_pretrained(LLM_NAME).to(DEVICE)
 
-print("✅ All models loaded")
-
 # ============================================================
 # CONFIG
 # ============================================================
 
-
+SIM_THRESHOLD = 0.60
 CONTRADICTION_THRESHOLD = 0.70
 SCHEMA_CACHE = {}
 
@@ -57,33 +51,34 @@ def hash_key(kb, question):
     return hashlib.sha256((kb + question).encode()).hexdigest()
 
 def infer_question_type(question):
-    q = question.lower()
+    q = question.lower()
     if q.startswith("how"):
         return "METHOD"
     if q.startswith("why"):
         return "REASON"
-    if q.startswith("when") or q.startswith("where"):
-        return "FACT"
     return "FACT"
 
+def decompose_answer(answer):
+    return [s.strip() for s in split_sentences(answer) if len(s.strip()) > 3]
+
 # ============================================================
-# …
+# 🔥 ACTION-FOCUSED SCHEMA GENERATION (FIXED)
 # ============================================================
 
 def generate_schema_with_llm(kb, question):
     q_type = infer_question_type(question)
 
     prompt = f"""
-You are extracting the …
+You are extracting the exact answer to a competitive exam question.
 
-RULES:
-- ONLY …
-- …
-- …
-- Use …
-- …
+STRICT RULES:
+- Extract ONLY the direct action that answers the question.
+- DO NOT include background events.
+- DO NOT include earlier or later story details.
+- Use ACTIVE VERBS.
+- Keep answers short (one clause).
 
-Question …
+Question type: {q_type}
 
 Knowledge Base:
 {kb}
@@ -91,16 +86,15 @@ Knowledge Base:
 Question:
 {question}
 
-Return 1–3 bullet points that directly answer the question.
+Return the answer as bullet points.
 """
 
     inputs = llm_tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)
-
     outputs = llm_model.generate(
         **inputs,
-        max_new_tokens=…
-        …
-        …
+        max_new_tokens=80,
+        temperature=0.0,
+        do_sample=False
     )
 
     raw = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
@@ -114,32 +108,9 @@ Return 1–3 bullet points that directly answer the question.
     return {
         "question_type": q_type,
         "required_concepts": facts,
-        "allow_extra_info": True,
         "raw_llm_output": raw
     }
 
-# ============================================================
-# SCHEMA VALIDATION (CRITICAL)
-# ============================================================
-
-def validate_schema(schema, question):
-    q_words = set(question.lower().split())
-    valid = []
-
-    for c in schema["required_concepts"]:
-        if q_words & set(c.lower().split()):
-            valid.append(c)
-
-    return valid
-
-# ============================================================
-# ANSWER DECOMPOSITION
-# ============================================================
-
-def decompose_answer(answer):
-    parts = re.split(r'\b(?:and|because|before|after|while)\b', answer)
-    return [p.strip() for p in parts if p.strip()]
-
 # ============================================================
 # CORE EVALUATION
 # ============================================================
@@ -157,16 +128,21 @@ def evaluate_answer(answer, question, kb):
 
     if key not in SCHEMA_CACHE:
         schema = generate_schema_with_llm(kb, question)
-        validated = validate_schema(schema, question)
 
-        …
-        …
-        …
+        # HARD FILTER: must contain an ACTION VERB
+        action_schema = []
+        for s in schema["required_concepts"]:
+            if re.search(r'\b(bit|cut|free|help|rescue|save)\b', s.lower()):
+                action_schema.append(s)
+
+        # Fallback: extract action sentences directly from KB
+        if not action_schema:
+            action_schema = [
                 s for s in split_sentences(kb)
-            if …
-        ]
+                if re.search(r'\b(bit|cut|free|help|rescue|save)\b', s.lower())
+            ]
 
-        schema["required_concepts"] = …
+        schema["required_concepts"] = action_schema[:2]
         SCHEMA_CACHE[key] = schema
 
     schema = SCHEMA_CACHE[key]
@@ -182,7 +158,7 @@ def evaluate_answer(answer, question, kb):
     for concept in schema["required_concepts"]:
         scores = sim_model.predict([(concept, c) for c in claims])
         best = float(scores.max())
-        ok = best >= …
+        ok = best >= SIM_THRESHOLD
 
         coverage.append({
             "concept": concept,
@@ -199,20 +175,20 @@ def evaluate_answer(answer, question, kb):
     contradictions = []
 
     for claim in claims:
-        for …
-            probs = softmax_logits(nli_model.predict([(…
+        for ref in schema["required_concepts"]:
+            probs = softmax_logits(nli_model.predict([(ref, claim)]))
             if probs[0] > CONTRADICTION_THRESHOLD:
                 contradictions.append({
                     "claim": claim,
-                    "…
+                    "against": ref,
                     "confidence": round(probs[0] * 100, 1)
                 })
 
     logs["contradictions"] = contradictions
 
-    # ----------------
+    # ---------------- VERDICT ----------------
    if contradictions:
-        verdict = "❌ INCORRECT …
+        verdict = "❌ INCORRECT"
     elif covered_all:
         verdict = "✅ CORRECT"
     else:
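
The hunks call two helpers that live elsewhere in app.py and never appear in the diff: split_sentences and softmax_logits. A minimal sketch of what they would have to do for the code above to work — the bodies below are assumptions, not the Space's actual implementations:

import re
import numpy as np

def split_sentences(text):
    # Assumed: naive split on sentence-ending punctuation.
    return [s.strip() for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s.strip()]

def softmax_logits(logits):
    # Assumed: numerically stable softmax over one row of CrossEncoder
    # NLI logits, flattened so probs[0] indexes the first label.
    z = np.asarray(logits, dtype=float).reshape(-1)
    z -= z.max()
    e = np.exp(z)
    return e / e.sum()

split_sentences also backs the new decompose_answer, which now splits the answer into whole sentences instead of clause-splitting on conjunctions as the deleted version did.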
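The contradiction check treats probs[0] as the contradiction probability. The sentence-transformers model card for cross-encoder/nli-deberta-v3-xsmall lists the label order as contradiction, entailment, neutral, which is consistent with that indexing; a small guard makes the assumption explicit if the model is ever swapped (label names assumed from the model card, reusing the nli_model loaded in app.py):

# Assumed label order for cross-encoder/nli-deberta-v3-xsmall.
NLI_LABELS = ["contradiction", "entailment", "neutral"]

probs = softmax_logits(nli_model.predict([("The mouse cut the net.",
                                           "The mouse never helped the lion.")]))
assert len(probs) == len(NLI_LABELS)
print(dict(zip(NLI_LABELS, probs)))  # probs[0] carries the contradiction mass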