heerjtdev commited on
Commit
2307277
·
verified ·
1 Parent(s): 0e4e76a

Update app.py

Browse files

extensive evaluations

Files changed (1) hide show
  1. app.py +173 -51
app.py CHANGED
@@ -1,3 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import json
3
  import re
@@ -12,83 +124,93 @@ from sentence_transformers import SentenceTransformer, util
12
  GEMINI_API_KEY = "<REDACTED — leaked key; rotate it and load from an environment variable>"
13
  genai.configure(api_key=GEMINI_API_KEY)
14
 
15
- # UPDATED: Use a supported 2026 model
16
  MODEL = genai.GenerativeModel("gemini-2.5-flash")
17
 
18
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
19
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
20
  SIM_THRESHOLD = 0.55
21
 
22
- print("Loading embedding model...")
23
  embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
24
  print("✅ Ready")
25
 
26
  # ============================================================
27
- # LOGIC
28
  # ============================================================
29
- def get_evaluation_data(kb, question):
30
- """Gets both intent and rubric in one single API request."""
31
  prompt = f"""
32
- Acting as an examiner, analyze the Knowledge Base (KB) and Question.
33
- 1. Determine the intent (FACTUAL, EXPLANATORY, PROCESS, or COMPARISON).
34
- 2. Create a rubric of 3-5 atomic grading criteria based ONLY on the KB.
35
 
36
- KB: {kb}
 
 
 
 
 
 
 
 
 
 
37
  Question: {question}
 
38
 
39
- OUTPUT JSON ONLY:
40
  {{
41
- "intent": "LABEL",
42
- "criteria": ["criterion 1", "criterion 2"]
 
 
 
 
 
 
43
  }}
44
  """
45
  try:
46
  response = MODEL.generate_content(prompt)
47
- # Handle cases where model might wrap JSON in backticks
48
  clean_text = re.sub(r'```json|```', '', response.text).strip()
49
  return json.loads(clean_text)
50
  except Exception as e:
51
- print(f"API Error: {e}")
52
- return {"intent": "ERROR", "criteria": []}
53
 
54
- def evaluate(answer, question, kb):
55
- # API Call
56
- data = get_evaluation_data(kb, question)
57
- rubric = data.get("criteria", [])
58
 
59
- if not rubric:
60
- return {"error": "Could not generate rubric. Check API status."}
61
 
62
- # Semantic Matching (Local)
 
63
  sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', answer) if len(s.strip()) > 5]
64
- if not sents:
65
- return {"error": "Answer is too short to evaluate."}
66
-
67
- ans_emb = embedder.encode(sents, convert_to_tensor=True)
68
- results = []
69
- for crit in rubric:
70
- crit_emb = embedder.encode(crit, convert_to_tensor=True)
71
- sims = util.cos_sim(crit_emb, ans_emb)[0]
72
- score = float(torch.max(sims)) if sims.numel() else 0.0
73
- results.append({"criterion": crit, "satisfied": score >= SIM_THRESHOLD})
74
-
75
- # Verdict
76
- hits = sum(r["satisfied"] for r in results)
77
- verdict = "✅ CORRECT" if hits == len(results) else "⚠️ PARTIAL" if hits > 0 else "❌ INCORRECT"
78
-
79
- return {
80
- "intent": data.get("intent"),
81
- "rubric_results": results,
82
- "final_verdict": verdict
83
- }
84
-
85
- # UI
86
- with gr.Blocks() as demo:
87
- gr.Markdown("## 🧠 Gemini 2.5 Answer Grader")
88
- kb_input = gr.Textbox(label="Knowledge Base", lines=5)
89
- q_input = gr.Textbox(label="Question")
90
- a_input = gr.Textbox(label="Student Answer", lines=4)
91
- out = gr.JSON(label="Evaluation Result")
92
- gr.Button("Evaluate").click(evaluate, [a_input, q_input, kb_input], out)
93
 
94
  demo.launch()
 
1
+ # import os
2
+ # import json
3
+ # import re
4
+ # import torch
5
+ # import gradio as gr
6
+ # import google.generativeai as genai
7
+ # from sentence_transformers import SentenceTransformer, util
8
+
9
+ # # ============================================================
10
+ # # CONFIG
11
+ # # ============================================================
12
+ # GEMINI_API_KEY = "<REDACTED — leaked key; rotate it and load from an environment variable>"
13
+ # genai.configure(api_key=GEMINI_API_KEY)
14
+
15
+ # # UPDATED: Use a supported 2026 model
16
+ # MODEL = genai.GenerativeModel("gemini-2.5-flash")
17
+
18
+ # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
19
+ # EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
20
+ # SIM_THRESHOLD = 0.55
21
+
22
+ # print("Loading embedding model...")
23
+ # embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
24
+ # print("✅ Ready")
25
+
26
+ # # ============================================================
27
+ # # LOGIC
28
+ # # ============================================================
29
+ # def get_evaluation_data(kb, question):
30
+ # """Gets both intent and rubric in one single API request."""
31
+ # prompt = f"""
32
+ # Acting as an examiner, analyze the Knowledge Base (KB) and Question.
33
+ # 1. Determine the intent (FACTUAL, EXPLANATORY, PROCESS, or COMPARISON).
34
+ # 2. Create a rubric of 3-5 atomic grading criteria based ONLY on the KB.
35
+
36
+ # KB: {kb}
37
+ # Question: {question}
38
+
39
+ # OUTPUT JSON ONLY:
40
+ # {{
41
+ # "intent": "LABEL",
42
+ # "criteria": ["criterion 1", "criterion 2"]
43
+ # }}
44
+ # """
45
+ # try:
46
+ # response = MODEL.generate_content(prompt)
47
+ # # Handle cases where model might wrap JSON in backticks
48
+ # clean_text = re.sub(r'```json|```', '', response.text).strip()
49
+ # return json.loads(clean_text)
50
+ # except Exception as e:
51
+ # print(f"API Error: {e}")
52
+ # return {"intent": "ERROR", "criteria": []}
53
+
54
+ # def evaluate(answer, question, kb):
55
+ # # API Call
56
+ # data = get_evaluation_data(kb, question)
57
+ # rubric = data.get("criteria", [])
58
+
59
+ # if not rubric:
60
+ # return {"error": "Could not generate rubric. Check API status."}
61
+
62
+ # # Semantic Matching (Local)
63
+ # sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', answer) if len(s.strip()) > 5]
64
+ # if not sents:
65
+ # return {"error": "Answer is too short to evaluate."}
66
+
67
+ # ans_emb = embedder.encode(sents, convert_to_tensor=True)
68
+ # results = []
69
+ # for crit in rubric:
70
+ # crit_emb = embedder.encode(crit, convert_to_tensor=True)
71
+ # sims = util.cos_sim(crit_emb, ans_emb)[0]
72
+ # score = float(torch.max(sims)) if sims.numel() else 0.0
73
+ # results.append({"criterion": crit, "satisfied": score >= SIM_THRESHOLD})
74
+
75
+ # # Verdict
76
+ # hits = sum(r["satisfied"] for r in results)
77
+ # verdict = "✅ CORRECT" if hits == len(results) else "⚠️ PARTIAL" if hits > 0 else "❌ INCORRECT"
78
+
79
+ # return {
80
+ # "intent": data.get("intent"),
81
+ # "rubric_results": results,
82
+ # "final_verdict": verdict
83
+ # }
84
+
85
+ # # UI
86
+ # with gr.Blocks() as demo:
87
+ # gr.Markdown("## 🧠 Gemini 2.5 Answer Grader")
88
+ # kb_input = gr.Textbox(label="Knowledge Base", lines=5)
89
+ # q_input = gr.Textbox(label="Question")
90
+ # a_input = gr.Textbox(label="Student Answer", lines=4)
91
+ # out = gr.JSON(label="Evaluation Result")
92
+ # gr.Button("Evaluate").click(evaluate, [a_input, q_input, kb_input], out)
93
+
94
+ # demo.launch()
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
  import os
114
  import json
115
  import re
 
124
# ============================================================
# CONFIG
# ============================================================
# SECURITY: never hard-code API keys in source — the previous literal key is
# exposed in the repository history and must be rotated. Load the key from
# the environment (e.g. a Hugging Face Space secret named GEMINI_API_KEY).
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
if not GEMINI_API_KEY:
    print("⚠️ GEMINI_API_KEY is not set — Gemini API calls will fail.")
genai.configure(api_key=GEMINI_API_KEY)

# Gemini model used for the single rubric/grading request.
MODEL = genai.GenerativeModel("gemini-2.5-flash")

# Local embedding model used for the semantic cross-check of the rubric.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Cosine-similarity cutoff retained from the earlier version; the current
# cross-check reports raw similarity instead of thresholding — TODO confirm
# whether this constant should still be applied.
SIM_THRESHOLD = 0.55

print("Loading local embedding auditor...")
embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
print("✅ Ready")
136
 
137
# ============================================================
# SOPHISTICATED EVALUATION LOGIC
# ============================================================
def get_advanced_evaluation(kb, question, answer):
    """Ask Gemini for a full grading report in one API request.

    Args:
        kb: Reference material the answer is graded against.
        question: The question being answered.
        answer: The student answer to grade.

    Returns:
        The parsed JSON report (dict with keys 'intent', 'rubric',
        'irrelevant_snippets', 'contradictions', 'suggested_mark',
        'feedback') on success, or {"error": "<message>"} when the API
        call or JSON parsing fails.
    """
    # FIX: the task text previously asked for 'total_score' while the schema
    # below demands 'suggested_mark' — the contradiction is removed so the
    # model receives consistent instructions.
    prompt = f"""
You are a Senior Academic Evaluator. Compare the Answer against the Knowledge Base (KB) for the specific Question.

TASK:
1. Identify 'intent' (e.g., FACTUAL, PROCEDURAL).
2. Create a 'rubric' of 3-5 criteria from the KB.
3. For each criterion:
   - Determine if 'satisfied' (true/false).
   - Provide a 'confidence' score (0-100) based on how clearly the answer matches the KB.
4. Extract 'irrelevant_snippets': Parts of the answer that don't help answer the question.
5. Extract 'contradictions': Parts of the answer that factually conflict with the KB.
6. Suggest a 'suggested_mark' (0-100) and 'feedback'.

Knowledge Base: {kb}
Question: {question}
Student Answer: {answer}

STRICT JSON OUTPUT ONLY:
{{
  "intent": "...",
  "rubric": [
    {{"criterion": "...", "satisfied": true, "confidence": 95}}
  ],
  "irrelevant_snippets": ["...", "..."],
  "contradictions": ["...", "..."],
  "suggested_mark": 85,
  "feedback": "..."
}}
"""
    try:
        response = MODEL.generate_content(prompt)
        # The model sometimes wraps its JSON in ```json fences; strip them
        # before parsing.
        clean_text = re.sub(r'```json|```', '', response.text).strip()
        return json.loads(clean_text)
    except Exception as e:
        # Surface the failure to the caller as data instead of raising
        # into the UI layer.
        return {"error": str(e)}
 
176
 
177
def evaluate(kb, question, answer):
    """Grade `answer` against `kb` for `question` and return a JSON report.

    Performs one Gemini API call, then augments each rubric item with a
    local embedding-based similarity score so the user can cross-check
    the LLM's judgement without extra API cost.
    """
    # Perform the single heavy-duty API call.
    eval_data = get_advanced_evaluation(kb, question, answer)

    # FIX: guard against the model returning a non-dict JSON payload
    # (e.g. a bare list), which would break the dict operations below.
    if not isinstance(eval_data, dict):
        return {"error": f"Unexpected response type: {type(eval_data).__name__}"}
    if "error" in eval_data:
        return eval_data

    # --- Local Semantic Cross-Check (local, no API cost) ---
    # This helps catch if Gemini was "too nice" or missed a nuance.
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', answer) if len(s.strip()) > 5]
    if sents:
        ans_emb = embedder.encode(sents, convert_to_tensor=True)
        for item in eval_data.get("rubric", []):
            # FIX: skip malformed rubric entries instead of raising
            # KeyError when the LLM omits the 'criterion' key.
            if not isinstance(item, dict) or "criterion" not in item:
                continue
            crit_emb = embedder.encode(item["criterion"], convert_to_tensor=True)
            sims = util.cos_sim(crit_emb, ans_emb)[0]
            max_sim = float(torch.max(sims)) if sims.numel() else 0.0
            # We add this 'local_check' to the JSON so the user can compare
            # it against the LLM's own 'satisfied'/'confidence' verdicts.
            item["local_semantic_similarity"] = round(max_sim * 100, 1)

    return eval_data
197
+
198
# ============================================================
# IMPROVED UI
# ============================================================
# Two-column Gradio layout: grading inputs (KB, question, answer) on the
# left, the raw JSON grading report on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎓 Advanced AI Grading System (Gemini 2.5)")

    with gr.Row():
        with gr.Column():
            kb_input = gr.Textbox(label="1. Reference Material (KB)", lines=8, placeholder="Paste the factual source here...")
            q_input = gr.Textbox(label="2. Question", placeholder="What are you asking?")
            a_input = gr.Textbox(label="3. Student Answer", lines=8, placeholder="Paste the answer to grade...")
            btn = gr.Button("🔍 Run Deep Analysis", variant="primary")

        with gr.Column():
            out = gr.JSON(label="Grading Report & Forensic Analysis")

    # Wire the button to evaluate(kb, question, answer) -> JSON report.
    btn.click(evaluate, [kb_input, q_input, a_input], out)

# Start the Gradio server (blocking call).
demo.launch()