heerjtdev commited on
Commit
e2daaeb
·
verified ·
1 Parent(s): 4134c06

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +176 -7
app.py CHANGED
@@ -1,20 +1,189 @@
1
  import gradio as gr
2
- from pipeline.evaluator import evaluate_answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
- def run(answer, question, kb):
5
- schema = load_schema(kb, question)
6
- verdict, logs = evaluate_answer(answer, question, kb, schema, MODELS)
7
  return verdict, logs
8
 
9
- with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
 
10
  kb = gr.Textbox(label="Knowledge Base", lines=6)
11
  question = gr.Textbox(label="Question")
12
  answer = gr.Textbox(label="Student Answer")
13
 
14
  verdict = gr.Textbox(label="Verdict")
15
- logs = gr.JSON(label="Debug Logs")
16
 
17
  btn = gr.Button("Evaluate")
18
- btn.click(run, [answer, question, kb], [verdict, logs])
19
 
20
  demo.launch()
 
1
  import gradio as gr
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from sentence_transformers import CrossEncoder
5
+ import re
6
+ import hashlib
7
+ import json
8
+
9
# ============================================================
# MODEL LOADING (ONCE)
# ============================================================

# Force CPU: keeps the app runnable on GPU-less (free-tier) hardware.
DEVICE = "cpu"

# Cross-encoder checkpoints: one for semantic similarity scoring,
# one for natural-language-inference (contradiction detection).
SIM_MODEL_NAME = "cross-encoder/stsb-distilroberta-base"
NLI_MODEL_NAME = "cross-encoder/nli-deberta-v3-xsmall"

print("Loading models...")
sim_model = CrossEncoder(SIM_MODEL_NAME, device=DEVICE)
nli_model = CrossEncoder(NLI_MODEL_NAME, device=DEVICE)
print("✅ Models ready")

# ============================================================
# CONFIGURATION
# ============================================================

# Minimum similarity for a required concept to count as "covered".
SIM_THRESHOLD_REQUIRED = 0.55
# NOTE(review): the two thresholds below are declared but not
# referenced anywhere in the visible code — confirm intended use.
SIM_THRESHOLD_FORBIDDEN = 0.60
ENTAILMENT_THRESHOLD = 0.65

# Keyed by hash_key(kb, question); lives only for the process lifetime.
SCHEMA_CACHE = {} # in-memory cache (HF-safe)
32
+
33
+ # ============================================================
34
+ # UTILITIES
35
+ # ============================================================
36
+
37
def split_sentences(text):
    """Split *text* into sentences on terminal punctuation (. ! ?).

    Returns a list of non-empty sentence strings. Empty or
    whitespace-only input yields an empty list — the previous
    version returned [''], and downstream code would then score
    that empty string as if it were a real KB sentence.
    """
    return [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]
39
+
40
def softmax_logits(logits):
    """Return the softmax of raw logits as a plain Python list."""
    scores = torch.tensor(logits)
    # A single-pair predict() comes back batched as shape (1, k);
    # drop the leading batch axis before normalizing.
    scores = scores.squeeze(0) if scores.dim() > 1 else scores
    return F.softmax(scores, dim=0).tolist()
45
+
46
def hash_key(kb, question):
    """Stable cache key for a (kb, question) pair.

    A unit-separator character (U+001F) is inserted between the two
    strings: the previous version hashed the bare concatenation, so
    distinct pairs such as ("ab", "c") and ("a", "bc") collided and
    could serve each other's cached schema.
    """
    return hashlib.sha256(f"{kb}\x1f{question}".encode()).hexdigest()
48
+
49
+ # ============================================================
50
+ # QUESTION CLASSIFIER
51
+ # ============================================================
52
+
53
def classify_question(question):
    """Heuristically bucket a question as FACT, DEFINITION, or EXPLANATION."""
    normalized = question.lower()
    if normalized.startswith(("what was", "who")):
        return "FACT"
    if normalized.startswith("define"):
        return "DEFINITION"
    if "explain" in normalized or "why" in normalized:
        return "EXPLANATION"
    # No pattern matched: treat as a plain factual question.
    return "FACT"
62
+
63
+ # ============================================================
64
+ # SCHEMA GENERATION (AUTO, NO LLM)
65
+ # ============================================================
66
+
67
def generate_schema(kb, question):
    """Auto-generate a grading schema directly from the KB.

    Deterministic (no LLM call): the single KB sentence scored most
    similar to the question by the cross-encoder becomes the one
    required concept.

    Returns a dict with "question_type", "required_concepts",
    "forbidden_concepts", and "allow_extra_info" keys.
    """
    sentences = split_sentences(kb)
    q_type = classify_question(question)

    # Guard: an empty/blank KB would leave no sentences to score and
    # make argmax() fail on an empty prediction array — fall back to
    # an empty requirement list instead of crashing.
    required = []
    meaningful = [s for s in sentences if s.strip()]
    if meaningful:
        # Score each (sentence, question) pair; the best-scoring
        # sentence is taken as the required concept.
        scores = sim_model.predict([(s, question) for s in meaningful])
        required = [meaningful[int(scores.argmax())]]

    return {
        "question_type": q_type,
        "required_concepts": required,
        "forbidden_concepts": [],
        "allow_extra_info": True,
    }
87
+
88
+ # ============================================================
89
+ # ANSWER DECOMPOSITION
90
+ # ============================================================
91
+
92
def decompose_answer(answer):
    """Break a free-text answer into clauses on common conjunctions."""
    parts = re.split(r'\b(?:and|before|after|because|while)\b', answer)
    stripped = (part.strip() for part in parts)
    return [clause for clause in stripped if clause]
95
+
96
+ # ============================================================
97
+ # CORE EVALUATION
98
+ # ============================================================
99
+
100
def evaluate_answer(answer, question, kb):
    """Grade a student *answer* against the knowledge base *kb*.

    Returns (verdict, logs): *verdict* is a human-readable string and
    *logs* is a dict of every intermediate result for debugging.
    """
    logs = {}

    # --------------------
    # SCHEMA LOAD / CREATE
    # --------------------
    # Schema generation is deterministic, so results are cached
    # per (kb, question) pair for the life of the process.
    key = hash_key(kb, question)
    if key not in SCHEMA_CACHE:
        SCHEMA_CACHE[key] = generate_schema(kb, question)

    schema = SCHEMA_CACHE[key]
    logs["schema"] = schema

    # --------------------
    # ANSWER PARSING
    # --------------------
    claims = decompose_answer(answer)
    logs["answer_claims"] = claims

    # --------------------
    # REQUIRED CONCEPT CHECK
    # --------------------
    required = schema["required_concepts"]
    coverage = []

    for req in required:
        if claims:
            scores = sim_model.predict([(req, c) for c in claims])
            best = float(scores.max())
        else:
            # Empty answer: nothing to compare, so nothing is covered.
            # (Previously .max() on the empty score array would raise.)
            best = 0.0
        coverage.append({
            "concept": req,
            "max_similarity": round(best, 3),
            "covered": best >= SIM_THRESHOLD_REQUIRED
        })

    logs["required_coverage"] = coverage
    covered_all = all(c["covered"] for c in coverage)

    # --------------------
    # CONTRADICTION CHECK (NLI)
    # --------------------
    kb_sentences = split_sentences(kb)
    contradictions = []

    for claim in claims:
        for sent in kb_sentences:
            probs = softmax_logits(nli_model.predict([(sent, claim)]))
            # NOTE(review): assumes index 0 of this NLI model's label
            # order is "contradiction" — confirm against the model card.
            if probs[0] > 0.70:
                contradictions.append({
                    "claim": claim,
                    "sentence": sent,
                    "confidence": round(probs[0] * 100, 1)
                })

    logs["contradictions"] = contradictions

    # --------------------
    # FINAL DECISION
    # --------------------
    # Contradictions trump coverage; full coverage with no
    # contradictions is correct; anything else is partial credit.
    if covered_all and not contradictions:
        verdict = "✅ CORRECT"
    elif contradictions:
        verdict = "❌ INCORRECT (Contradiction)"
    else:
        verdict = "⚠️ PARTIALLY CORRECT"

    logs["final_decision"] = verdict

    return verdict, logs
168
 
169
+ # ============================================================
170
+ # GRADIO UI
171
+ # ============================================================
172
+
173
def run(answer, question, kb):
    """Gradio click handler: forward UI inputs to the evaluation engine."""
    verdict, logs = evaluate_answer(answer, question, kb)
    return verdict, logs
175
+
176
with gr.Blocks(title="Competitive Exam Answer Checker") as demo:
    gr.Markdown("## 🧠 Competitive Exam Answer Checker (Single-File Engine)")

    # Inputs: grading context, the question, and the student's answer.
    kb = gr.Textbox(label="Knowledge Base", lines=6)
    question = gr.Textbox(label="Question")
    answer = gr.Textbox(label="Student Answer")

    # Outputs: final verdict plus the engine's intermediate debug logs.
    verdict = gr.Textbox(label="Verdict")
    debug = gr.JSON(label="Debug Logs")

    # Note the input order the handler expects: (answer, question, kb).
    btn = gr.Button("Evaluate")
    btn.click(run, [answer, question, kb], [verdict, debug])

demo.launch()