Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -288,6 +288,12 @@
|
|
| 288 |
|
| 289 |
|
| 290 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
|
| 292 |
|
| 293 |
|
|
@@ -634,6 +640,367 @@
|
|
| 634 |
|
| 635 |
|
| 636 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 637 |
|
| 638 |
|
| 639 |
|
|
@@ -648,6 +1015,8 @@ import fitz # PyMuPDF
|
|
| 648 |
import torch
|
| 649 |
import os
|
| 650 |
import numpy as np
|
|
|
|
|
|
|
| 651 |
|
| 652 |
# --- IMPORT SESSION OPTIONS ---
|
| 653 |
from onnxruntime import SessionOptions, GraphOptimizationLevel
|
|
@@ -668,7 +1037,7 @@ PROVIDERS = ["CPUExecutionProvider"]
|
|
| 668 |
print(f"β‘ Running on: {PROVIDERS}")
|
| 669 |
|
| 670 |
# ---------------------------------------------------------
|
| 671 |
-
# 1. OPTIMIZED EMBEDDINGS (BGE-SMALL)
|
| 672 |
# ---------------------------------------------------------
|
| 673 |
class OnnxBgeEmbeddings(Embeddings):
|
| 674 |
def __init__(self):
|
|
@@ -697,11 +1066,54 @@ class OnnxBgeEmbeddings(Embeddings):
|
|
| 697 |
|
| 698 |
|
| 699 |
# ---------------------------------------------------------
|
| 700 |
-
# 2.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 701 |
# ---------------------------------------------------------
|
| 702 |
class LLMEvaluator:
|
| 703 |
def __init__(self):
|
| 704 |
-
# Qwen 2.5 0.5B is fast but needs "Few-Shot" examples to be strict.
|
| 705 |
self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
|
| 706 |
self.local_dir = "onnx_qwen_local"
|
| 707 |
|
|
@@ -731,53 +1143,72 @@ class LLMEvaluator:
|
|
| 731 |
session_options=sess_options
|
| 732 |
)
|
| 733 |
|
| 734 |
-
def
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
Your job is to check if the Student Answer is FACTUALLY present in the Context.
|
| 738 |
-
|
| 739 |
-
GRADING ALGORITHM:
|
| 740 |
-
1. IF the Student Answer mentions things NOT in the Context -> PENALTY (-50% of the marks).
|
| 741 |
-
2. IF the Student Answer interprets the text opposite to its meaning -> PENALTY (-100% of the marks).
|
| 742 |
-
3. IF the Student Answer is generic fluff -> SCORE: 0.
|
| 743 |
-
|
| 744 |
-
--- EXAMPLE 1 (HALLUCINATION) ---
|
| 745 |
-
Context: The sky is blue due to Rayleigh scattering.
|
| 746 |
-
Question: Why is the sky blue?
|
| 747 |
-
Student Answer: Because the ocean reflects the water into the sky.
|
| 748 |
-
Analysis: The Context mentions 'Rayleigh scattering'. The student mentions 'ocean reflection'. These are different. The student is hallucinating outside facts.
|
| 749 |
-
Score: 0/{max_marks}
|
| 750 |
-
|
| 751 |
-
--- EXAMPLE 2 (CONTRADICTION) ---
|
| 752 |
-
Context: One must efface one's own personality. Good prose is like a windowpane.
|
| 753 |
-
Question: What does the author mean?
|
| 754 |
-
Student Answer: It means we should see the author's personality clearly.
|
| 755 |
-
Analysis: The text says 'efface' (remove) personality. The student says 'see' personality. This is a direct contradiction.
|
| 756 |
-
Score: 0/{max_marks}
|
| 757 |
-
|
| 758 |
-
--- EXAMPLE 3 (CORRECT) ---
|
| 759 |
-
Context: Mitochondria is the powerhouse of the cell.
|
| 760 |
-
Question: What is mitochondria?
|
| 761 |
-
Student Answer: It is the cell's powerhouse.
|
| 762 |
-
Analysis: Matches the text meaning exactly.
|
| 763 |
-
Score: {max_marks}/{max_marks}
|
| 764 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 765 |
|
| 766 |
user_prompt = f"""
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
|
| 771 |
-
|
| 772 |
-
|
| 773 |
|
| 774 |
-
|
| 775 |
-
|
| 776 |
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
|
| 782 |
messages = [
|
| 783 |
{"role": "system", "content": system_prompt},
|
|
@@ -787,29 +1218,129 @@ class LLMEvaluator:
|
|
| 787 |
input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 788 |
inputs = self.tokenizer(input_text, return_tensors="pt")
|
| 789 |
|
| 790 |
-
#
|
| 791 |
with torch.no_grad():
|
| 792 |
outputs = self.model.generate(
|
| 793 |
**inputs,
|
| 794 |
-
max_new_tokens=
|
| 795 |
-
temperature=0.1,
|
| 796 |
-
top_p=0.2,
|
| 797 |
do_sample=True,
|
| 798 |
-
repetition_penalty=1.2
|
| 799 |
)
|
| 800 |
|
| 801 |
input_length = inputs['input_ids'].shape[1]
|
| 802 |
response = self.tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
|
| 803 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 804 |
|
| 805 |
|
| 806 |
# ---------------------------------------------------------
|
| 807 |
-
#
|
| 808 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 809 |
# ---------------------------------------------------------
|
| 810 |
class OnnxReranker:
|
| 811 |
def __init__(self):
|
| 812 |
-
# TinyBERT is ~17MB and very fast on CPU
|
| 813 |
self.model_name = "Xenova/ms-marco-TinyBERT-L-2-v2"
|
| 814 |
print(f"π Loading Reranker: {self.model_name}...")
|
| 815 |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
|
@@ -823,7 +1354,6 @@ class OnnxReranker:
|
|
| 823 |
if not docs:
|
| 824 |
return []
|
| 825 |
|
| 826 |
-
# Prepare pairs: [query, doc_text]
|
| 827 |
pairs = [[query, doc.page_content] for doc in docs]
|
| 828 |
|
| 829 |
inputs = self.tokenizer(
|
|
@@ -837,34 +1367,33 @@ class OnnxReranker:
|
|
| 837 |
with torch.no_grad():
|
| 838 |
outputs = self.model(**inputs)
|
| 839 |
|
| 840 |
-
# Get logits (Relevance scores)
|
| 841 |
-
# MS-Marco models typically output a single logit or [irrelevant, relevant]
|
| 842 |
logits = outputs.logits
|
| 843 |
if logits.shape[1] == 2:
|
| 844 |
-
scores = logits[:, 1]
|
| 845 |
else:
|
| 846 |
scores = logits.flatten()
|
| 847 |
|
| 848 |
-
# Sort docs by score (descending)
|
| 849 |
scores = scores.numpy().tolist()
|
| 850 |
doc_score_pairs = list(zip(docs, scores))
|
| 851 |
doc_score_pairs.sort(key=lambda x: x[1], reverse=True)
|
| 852 |
|
| 853 |
-
# Return top K docs
|
| 854 |
return [doc for doc, score in doc_score_pairs[:top_k]]
|
| 855 |
|
| 856 |
|
| 857 |
# ---------------------------------------------------------
|
| 858 |
-
#
|
| 859 |
# ---------------------------------------------------------
|
| 860 |
-
class
|
| 861 |
def __init__(self):
|
| 862 |
self.vector_store = None
|
| 863 |
self.embeddings = OnnxBgeEmbeddings()
|
| 864 |
self.llm = LLMEvaluator()
|
| 865 |
-
self.reranker = OnnxReranker()
|
| 866 |
-
self.
|
|
|
|
|
|
|
| 867 |
self.total_chunks = 0
|
|
|
|
| 868 |
|
| 869 |
def process_content(self, file_obj, raw_text):
|
| 870 |
has_file = file_obj is not None
|
|
@@ -881,87 +1410,188 @@ class VectorSystem:
|
|
| 881 |
if has_file:
|
| 882 |
if file_obj.name.endswith('.pdf'):
|
| 883 |
doc = fitz.open(file_obj.name)
|
| 884 |
-
for page in doc:
|
|
|
|
| 885 |
elif file_obj.name.endswith('.txt'):
|
| 886 |
-
with open(file_obj.name, 'r', encoding='utf-8') as f:
|
|
|
|
| 887 |
else:
|
| 888 |
return "β Error: Only .pdf and .txt supported."
|
| 889 |
else:
|
| 890 |
text = raw_text
|
| 891 |
|
| 892 |
-
# Smaller chunks for
|
| 893 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
|
| 894 |
texts = text_splitter.split_text(text)
|
| 895 |
-
self.all_chunks = texts
|
| 896 |
|
| 897 |
-
# Create Document objects with metadata
|
| 898 |
docs = [Document(page_content=t, metadata={"id": i}) for i, t in enumerate(texts)]
|
| 899 |
self.total_chunks = len(docs)
|
| 900 |
|
| 901 |
-
if not docs:
|
|
|
|
| 902 |
|
| 903 |
self.vector_store = FAISS.from_documents(docs, self.embeddings)
|
| 904 |
|
| 905 |
-
return f"β
Indexed {self.total_chunks} chunks."
|
| 906 |
except Exception as e:
|
| 907 |
return f"Error: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 908 |
|
| 909 |
-
def process_query(self, question, student_answer, max_marks):
|
| 910 |
-
|
| 911 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 912 |
|
| 913 |
-
#
|
| 914 |
-
|
| 915 |
-
initial_docs = self.vector_store.similarity_search(question, k=15)
|
| 916 |
|
| 917 |
-
|
| 918 |
-
|
|
|
|
|
|
|
|
|
|
| 919 |
top_docs = self.reranker.rank(question, initial_docs, top_k=3)
|
| 920 |
-
|
| 921 |
-
# Step C: Construct Context
|
| 922 |
-
# We merge the top 3 specific chunks
|
| 923 |
expanded_context = "\n\n---\n\n".join([d.page_content for d in top_docs])
|
| 924 |
|
| 925 |
-
evidence_display = f"### π
|
| 926 |
-
evidence_display += f"> {expanded_context}
|
| 927 |
|
| 928 |
-
|
| 929 |
-
if student_answer:
|
| 930 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 931 |
|
| 932 |
return evidence_display, llm_feedback
|
| 933 |
|
| 934 |
-
system = VectorSystem()
|
| 935 |
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 939 |
|
| 940 |
with gr.Row():
|
| 941 |
with gr.Column(scale=1):
|
| 942 |
-
gr.Markdown("### Source
|
| 943 |
-
pdf_input = gr.File(label="Option A: Upload
|
| 944 |
gr.Markdown("**OR**")
|
| 945 |
-
text_input = gr.Textbox(label="Option B: Paste
|
| 946 |
|
| 947 |
-
upload_btn = gr.Button("Index Content", variant="primary")
|
| 948 |
status_msg = gr.Textbox(label="Status", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 949 |
|
| 950 |
with gr.Column(scale=2):
|
|
|
|
|
|
|
| 951 |
with gr.Row():
|
| 952 |
q_input = gr.Textbox(label="Question", scale=2)
|
| 953 |
max_marks = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max Marks")
|
| 954 |
|
| 955 |
-
a_input = gr.TextArea(label="Student Answer")
|
| 956 |
-
run_btn = gr.Button("Retrieve & Grade", variant="secondary")
|
| 957 |
|
| 958 |
with gr.Row():
|
| 959 |
-
|
| 960 |
-
|
| 961 |
-
|
| 962 |
-
|
| 963 |
-
|
| 964 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 965 |
|
| 966 |
if __name__ == "__main__":
|
| 967 |
demo.launch()
|
|
|
|
| 288 |
|
| 289 |
|
| 290 |
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
|
| 297 |
|
| 298 |
|
| 299 |
|
|
|
|
| 640 |
|
| 641 |
|
| 642 |
|
| 643 |
+
|
| 644 |
+
|
| 645 |
+
|
| 646 |
+
|
| 647 |
+
|
| 648 |
+
|
| 649 |
+
|
| 650 |
+
|
| 651 |
+
|
| 652 |
+
|
| 653 |
+
|
| 654 |
+
|
| 655 |
+
|
| 656 |
+
|
| 657 |
+
|
| 658 |
+
|
| 659 |
+
|
| 660 |
+
|
| 661 |
+
|
| 662 |
+
# import gradio as gr
|
| 663 |
+
# import fitz # PyMuPDF
|
| 664 |
+
# import torch
|
| 665 |
+
# import os
|
| 666 |
+
# import numpy as np
|
| 667 |
+
|
| 668 |
+
# # --- IMPORT SESSION OPTIONS ---
|
| 669 |
+
# from onnxruntime import SessionOptions, GraphOptimizationLevel
|
| 670 |
+
|
| 671 |
+
# # --- LANGCHAIN & RAG IMPORTS ---
|
| 672 |
+
# from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 673 |
+
# from langchain_community.vectorstores import FAISS
|
| 674 |
+
# from langchain_core.embeddings import Embeddings
|
| 675 |
+
# from langchain_core.documents import Document
|
| 676 |
+
|
| 677 |
+
# # --- ONNX & MODEL IMPORTS ---
|
| 678 |
+
# from transformers import AutoTokenizer
|
| 679 |
+
# from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForCausalLM, ORTModelForSequenceClassification
|
| 680 |
+
# from huggingface_hub import snapshot_download
|
| 681 |
+
|
| 682 |
+
# # Force CPU Provider
|
| 683 |
+
# PROVIDERS = ["CPUExecutionProvider"]
|
| 684 |
+
# print(f"β‘ Running on: {PROVIDERS}")
|
| 685 |
+
|
| 686 |
+
# # ---------------------------------------------------------
|
| 687 |
+
# # 1. OPTIMIZED EMBEDDINGS (BGE-SMALL)
|
| 688 |
+
# # ---------------------------------------------------------
|
| 689 |
+
# class OnnxBgeEmbeddings(Embeddings):
|
| 690 |
+
# def __init__(self):
|
| 691 |
+
# model_name = "Xenova/bge-small-en-v1.5"
|
| 692 |
+
# print(f"π Loading Embeddings: {model_name}...")
|
| 693 |
+
# self.tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 694 |
+
# self.model = ORTModelForFeatureExtraction.from_pretrained(
|
| 695 |
+
# model_name,
|
| 696 |
+
# export=False,
|
| 697 |
+
# provider=PROVIDERS[0]
|
| 698 |
+
# )
|
| 699 |
+
|
| 700 |
+
# def _process_batch(self, texts):
|
| 701 |
+
# inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
|
| 702 |
+
# with torch.no_grad():
|
| 703 |
+
# outputs = self.model(**inputs)
|
| 704 |
+
# embeddings = outputs.last_hidden_state[:, 0]
|
| 705 |
+
# embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
|
| 706 |
+
# return embeddings.numpy().tolist()
|
| 707 |
+
|
| 708 |
+
# def embed_documents(self, texts):
|
| 709 |
+
# return self._process_batch(texts)
|
| 710 |
+
|
| 711 |
+
# def embed_query(self, text):
|
| 712 |
+
# return self._process_batch(["Represent this sentence for searching relevant passages: " + text])[0]
|
| 713 |
+
|
| 714 |
+
|
| 715 |
+
# # ---------------------------------------------------------
|
| 716 |
+
# # 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B) - STRICT GRADING
|
| 717 |
+
# # ---------------------------------------------------------
|
| 718 |
+
# class LLMEvaluator:
|
| 719 |
+
# def __init__(self):
|
| 720 |
+
# # Qwen 2.5 0.5B is fast but needs "Few-Shot" examples to be strict.
|
| 721 |
+
# self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
|
| 722 |
+
# self.local_dir = "onnx_qwen_local"
|
| 723 |
+
|
| 724 |
+
# print(f"π Preparing CPU LLM: {self.repo_id}...")
|
| 725 |
+
|
| 726 |
+
# if not os.path.exists(self.local_dir):
|
| 727 |
+
# print(f"π₯ Downloading FP16 model to {self.local_dir}...")
|
| 728 |
+
# snapshot_download(
|
| 729 |
+
# repo_id=self.repo_id,
|
| 730 |
+
# local_dir=self.local_dir,
|
| 731 |
+
# allow_patterns=["config.json", "generation_config.json", "tokenizer*", "special_tokens_map.json", "*.jinja", "onnx/model_fp16.onnx*"]
|
| 732 |
+
# )
|
| 733 |
+
# print("β
Download complete.")
|
| 734 |
+
|
| 735 |
+
# self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
|
| 736 |
+
|
| 737 |
+
# sess_options = SessionOptions()
|
| 738 |
+
# sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
|
| 739 |
+
|
| 740 |
+
# self.model = ORTModelForCausalLM.from_pretrained(
|
| 741 |
+
# self.local_dir,
|
| 742 |
+
# subfolder="onnx",
|
| 743 |
+
# file_name="model_fp16.onnx",
|
| 744 |
+
# use_cache=True,
|
| 745 |
+
# use_io_binding=False,
|
| 746 |
+
# provider=PROVIDERS[0],
|
| 747 |
+
# session_options=sess_options
|
| 748 |
+
# )
|
| 749 |
+
|
| 750 |
+
# def evaluate(self, context, question, student_answer, max_marks):
|
| 751 |
+
# # --- IMPROVED PROMPT STRATEGY ---
|
| 752 |
+
# system_prompt = f"""You are a strict Logic Validator. You are NOT a helpful assistant.
|
| 753 |
+
# Your job is to check if the Student Answer is FACTUALLY present in the Context.
|
| 754 |
+
|
| 755 |
+
# GRADING ALGORITHM:
|
| 756 |
+
# 1. IF the Student Answer mentions things NOT in the Context -> PENALTY (-50% of the marks).
|
| 757 |
+
# 2. IF the Student Answer interprets the text opposite to its meaning -> PENALTY (-100% of the marks).
|
| 758 |
+
# 3. IF the Student Answer is generic fluff -> SCORE: 0.
|
| 759 |
+
|
| 760 |
+
# --- EXAMPLE 1 (HALLUCINATION) ---
|
| 761 |
+
# Context: The sky is blue due to Rayleigh scattering.
|
| 762 |
+
# Question: Why is the sky blue?
|
| 763 |
+
# Student Answer: Because the ocean reflects the water into the sky.
|
| 764 |
+
# Analysis: The Context mentions 'Rayleigh scattering'. The student mentions 'ocean reflection'. These are different. The student is hallucinating outside facts.
|
| 765 |
+
# Score: 0/{max_marks}
|
| 766 |
+
|
| 767 |
+
# --- EXAMPLE 2 (CONTRADICTION) ---
|
| 768 |
+
# Context: One must efface one's own personality. Good prose is like a windowpane.
|
| 769 |
+
# Question: What does the author mean?
|
| 770 |
+
# Student Answer: It means we should see the author's personality clearly.
|
| 771 |
+
# Analysis: The text says 'efface' (remove) personality. The student says 'see' personality. This is a direct contradiction.
|
| 772 |
+
# Score: 0/{max_marks}
|
| 773 |
+
|
| 774 |
+
# --- EXAMPLE 3 (CORRECT) ---
|
| 775 |
+
# Context: Mitochondria is the powerhouse of the cell.
|
| 776 |
+
# Question: What is mitochondria?
|
| 777 |
+
# Student Answer: It is the cell's powerhouse.
|
| 778 |
+
# Analysis: Matches the text meaning exactly.
|
| 779 |
+
# Score: {max_marks}/{max_marks}
|
| 780 |
+
# """
|
| 781 |
+
|
| 782 |
+
# user_prompt = f"""
|
| 783 |
+
# --- YOUR TASK ---
|
| 784 |
+
# Context:
|
| 785 |
+
# {context}
|
| 786 |
+
|
| 787 |
+
# Question:
|
| 788 |
+
# {question}
|
| 789 |
+
|
| 790 |
+
# Student Answer:
|
| 791 |
+
# {student_answer}
|
| 792 |
+
|
| 793 |
+
# OUTPUT FORMAT:
|
| 794 |
+
# Analysis: [Compare Student Answer vs Context. List any hallucinations or contradictions.]
|
| 795 |
+
# Score: [X]/{max_marks}
|
| 796 |
+
# """
|
| 797 |
+
|
| 798 |
+
# messages = [
|
| 799 |
+
# {"role": "system", "content": system_prompt},
|
| 800 |
+
# {"role": "user", "content": user_prompt}
|
| 801 |
+
# ]
|
| 802 |
+
|
| 803 |
+
# input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 804 |
+
# inputs = self.tokenizer(input_text, return_tensors="pt")
|
| 805 |
+
|
| 806 |
+
# # Lower temperature for strictness
|
| 807 |
+
# with torch.no_grad():
|
| 808 |
+
# outputs = self.model.generate(
|
| 809 |
+
# **inputs,
|
| 810 |
+
# max_new_tokens=150,
|
| 811 |
+
# temperature=0.1, # Strict logic, no creativity
|
| 812 |
+
# top_p=0.2, # Cut off unlikely tokens
|
| 813 |
+
# do_sample=True,
|
| 814 |
+
# repetition_penalty=1.2 # Penalize repetition
|
| 815 |
+
# )
|
| 816 |
+
|
| 817 |
+
# input_length = inputs['input_ids'].shape[1]
|
| 818 |
+
# response = self.tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
|
| 819 |
+
# return response
|
| 820 |
+
|
| 821 |
+
|
| 822 |
+
# # ---------------------------------------------------------
|
| 823 |
+
# # 3. NEW: ONNX RERANKER (Cross-Encoder)
|
| 824 |
+
# # Uses existing 'optimum' & 'transformers' libs (No new deps)
|
| 825 |
+
# # ---------------------------------------------------------
|
| 826 |
+
# class OnnxReranker:
|
| 827 |
+
# def __init__(self):
|
| 828 |
+
# # TinyBERT is ~17MB and very fast on CPU
|
| 829 |
+
# self.model_name = "Xenova/ms-marco-TinyBERT-L-2-v2"
|
| 830 |
+
# print(f"π Loading Reranker: {self.model_name}...")
|
| 831 |
+
# self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
| 832 |
+
# self.model = ORTModelForSequenceClassification.from_pretrained(
|
| 833 |
+
# self.model_name,
|
| 834 |
+
# export=False,
|
| 835 |
+
# provider=PROVIDERS[0]
|
| 836 |
+
# )
|
| 837 |
+
|
| 838 |
+
# def rank(self, query, docs, top_k=3):
|
| 839 |
+
# if not docs:
|
| 840 |
+
# return []
|
| 841 |
+
|
| 842 |
+
# # Prepare pairs: [query, doc_text]
|
| 843 |
+
# pairs = [[query, doc.page_content] for doc in docs]
|
| 844 |
+
|
| 845 |
+
# inputs = self.tokenizer(
|
| 846 |
+
# pairs,
|
| 847 |
+
# padding=True,
|
| 848 |
+
# truncation=True,
|
| 849 |
+
# max_length=512,
|
| 850 |
+
# return_tensors="pt"
|
| 851 |
+
# )
|
| 852 |
+
|
| 853 |
+
# with torch.no_grad():
|
| 854 |
+
# outputs = self.model(**inputs)
|
| 855 |
+
|
| 856 |
+
# # Get logits (Relevance scores)
|
| 857 |
+
# # MS-Marco models typically output a single logit or [irrelevant, relevant]
|
| 858 |
+
# logits = outputs.logits
|
| 859 |
+
# if logits.shape[1] == 2:
|
| 860 |
+
# scores = logits[:, 1] # Take the "relevant" class score
|
| 861 |
+
# else:
|
| 862 |
+
# scores = logits.flatten()
|
| 863 |
+
|
| 864 |
+
# # Sort docs by score (descending)
|
| 865 |
+
# scores = scores.numpy().tolist()
|
| 866 |
+
# doc_score_pairs = list(zip(docs, scores))
|
| 867 |
+
# doc_score_pairs.sort(key=lambda x: x[1], reverse=True)
|
| 868 |
+
|
| 869 |
+
# # Return top K docs
|
| 870 |
+
# return [doc for doc, score in doc_score_pairs[:top_k]]
|
| 871 |
+
|
| 872 |
+
|
| 873 |
+
# # ---------------------------------------------------------
|
| 874 |
+
# # 4. Main Application Logic
|
| 875 |
+
# # ---------------------------------------------------------
|
| 876 |
+
# class VectorSystem:
|
| 877 |
+
# def __init__(self):
|
| 878 |
+
# self.vector_store = None
|
| 879 |
+
# self.embeddings = OnnxBgeEmbeddings()
|
| 880 |
+
# self.llm = LLMEvaluator()
|
| 881 |
+
# self.reranker = OnnxReranker() # Initialize Reranker
|
| 882 |
+
# self.all_chunks = []
|
| 883 |
+
# self.total_chunks = 0
|
| 884 |
+
|
| 885 |
+
# def process_content(self, file_obj, raw_text):
|
| 886 |
+
# has_file = file_obj is not None
|
| 887 |
+
# has_text = raw_text is not None and len(raw_text.strip()) > 0
|
| 888 |
+
|
| 889 |
+
# if has_file and has_text:
|
| 890 |
+
# return "β Error: Please provide EITHER a file OR paste text, not both at the same time."
|
| 891 |
+
|
| 892 |
+
# if not has_file and not has_text:
|
| 893 |
+
# return "β οΈ No content provided. Please upload a file or paste text."
|
| 894 |
+
|
| 895 |
+
# try:
|
| 896 |
+
# text = ""
|
| 897 |
+
# if has_file:
|
| 898 |
+
# if file_obj.name.endswith('.pdf'):
|
| 899 |
+
# doc = fitz.open(file_obj.name)
|
| 900 |
+
# for page in doc: text += page.get_text()
|
| 901 |
+
# elif file_obj.name.endswith('.txt'):
|
| 902 |
+
# with open(file_obj.name, 'r', encoding='utf-8') as f: text = f.read()
|
| 903 |
+
# else:
|
| 904 |
+
# return "β Error: Only .pdf and .txt supported."
|
| 905 |
+
# else:
|
| 906 |
+
# text = raw_text
|
| 907 |
+
|
| 908 |
+
# # Smaller chunks for Reranking precision (500 chars)
|
| 909 |
+
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
|
| 910 |
+
# texts = text_splitter.split_text(text)
|
| 911 |
+
# self.all_chunks = texts # Keep plain text list for reference
|
| 912 |
+
|
| 913 |
+
# # Create Document objects with metadata
|
| 914 |
+
# docs = [Document(page_content=t, metadata={"id": i}) for i, t in enumerate(texts)]
|
| 915 |
+
# self.total_chunks = len(docs)
|
| 916 |
+
|
| 917 |
+
# if not docs: return "Content empty."
|
| 918 |
+
|
| 919 |
+
# self.vector_store = FAISS.from_documents(docs, self.embeddings)
|
| 920 |
+
|
| 921 |
+
# return f"β
Indexed {self.total_chunks} chunks."
|
| 922 |
+
# except Exception as e:
|
| 923 |
+
# return f"Error: {str(e)}"
|
| 924 |
+
|
| 925 |
+
# def process_query(self, question, student_answer, max_marks):
|
| 926 |
+
# if not self.vector_store: return "β οΈ Please upload a file or paste text first.", ""
|
| 927 |
+
# if not question: return "β οΈ Enter a question.", ""
|
| 928 |
+
|
| 929 |
+
# # Step A: Wide Net Retrieval (Get top 15 candidates)
|
| 930 |
+
# # We fetch more than we need to ensure the answer is in the candidate pool
|
| 931 |
+
# initial_docs = self.vector_store.similarity_search(question, k=15)
|
| 932 |
+
|
| 933 |
+
# # Step B: Rerank (Get top 3 best matches)
|
| 934 |
+
# # The Cross-Encoder strictly judges relevance
|
| 935 |
+
# top_docs = self.reranker.rank(question, initial_docs, top_k=3)
|
| 936 |
+
|
| 937 |
+
# # Step C: Construct Context
|
| 938 |
+
# # We merge the top 3 specific chunks
|
| 939 |
+
# expanded_context = "\n\n---\n\n".join([d.page_content for d in top_docs])
|
| 940 |
+
|
| 941 |
+
# evidence_display = f"### π Optimized Context (Top {len(top_docs)} chunks after Reranking):\n"
|
| 942 |
+
# evidence_display += f"> {expanded_context} ..."
|
| 943 |
+
|
| 944 |
+
# llm_feedback = "Please enter a student answer to grade."
|
| 945 |
+
# if student_answer:
|
| 946 |
+
# llm_feedback = self.llm.evaluate(expanded_context, question, student_answer, max_marks)
|
| 947 |
+
|
| 948 |
+
# return evidence_display, llm_feedback
|
| 949 |
+
|
| 950 |
+
# system = VectorSystem()
|
| 951 |
+
|
| 952 |
+
# with gr.Blocks(title="EduGenius AI Grader") as demo:
|
| 953 |
+
# gr.Markdown("# β‘ EduGenius: CPU Optimized RAG")
|
| 954 |
+
# gr.Markdown("Powered by **Qwen-2.5-0.5B**, **BGE-Small** & **TinyBERT Reranker**")
|
| 955 |
+
|
| 956 |
+
# with gr.Row():
|
| 957 |
+
# with gr.Column(scale=1):
|
| 958 |
+
# gr.Markdown("### Source Input (Choose One)")
|
| 959 |
+
# pdf_input = gr.File(label="Option A: Upload Chapter (PDF/TXT)")
|
| 960 |
+
# gr.Markdown("**OR**")
|
| 961 |
+
# text_input = gr.Textbox(label="Option B: Paste Context", placeholder="Paste text here if you don't have a file...", lines=5)
|
| 962 |
+
|
| 963 |
+
# upload_btn = gr.Button("Index Content", variant="primary")
|
| 964 |
+
# status_msg = gr.Textbox(label="Status", interactive=False)
|
| 965 |
+
|
| 966 |
+
# with gr.Column(scale=2):
|
| 967 |
+
# with gr.Row():
|
| 968 |
+
# q_input = gr.Textbox(label="Question", scale=2)
|
| 969 |
+
# max_marks = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max Marks")
|
| 970 |
+
|
| 971 |
+
# a_input = gr.TextArea(label="Student Answer")
|
| 972 |
+
# run_btn = gr.Button("Retrieve & Grade", variant="secondary")
|
| 973 |
+
|
| 974 |
+
# with gr.Row():
|
| 975 |
+
# evidence_box = gr.Markdown(label="Context Used")
|
| 976 |
+
# grade_box = gr.Markdown(label="Grading Result")
|
| 977 |
+
|
| 978 |
+
# # Pass both inputs to the process_content function
|
| 979 |
+
# upload_btn.click(system.process_content, inputs=[pdf_input, text_input], outputs=[status_msg])
|
| 980 |
+
# run_btn.click(system.process_query, inputs=[q_input, a_input, max_marks], outputs=[evidence_box, grade_box])
|
| 981 |
+
|
| 982 |
+
# if __name__ == "__main__":
|
| 983 |
+
# demo.launch()
|
| 984 |
+
|
| 985 |
+
|
| 986 |
+
|
| 987 |
+
|
| 988 |
+
|
| 989 |
+
|
| 990 |
+
|
| 991 |
+
|
| 992 |
+
|
| 993 |
+
|
| 994 |
+
|
| 995 |
+
|
| 996 |
+
|
| 997 |
+
|
| 998 |
+
|
| 999 |
+
|
| 1000 |
+
|
| 1001 |
+
|
| 1002 |
+
|
| 1003 |
+
|
| 1004 |
|
| 1005 |
|
| 1006 |
|
|
|
|
| 1015 |
import torch
|
| 1016 |
import os
|
| 1017 |
import numpy as np
|
| 1018 |
+
import re
|
| 1019 |
+
from typing import List, Dict, Tuple, Optional
|
| 1020 |
|
| 1021 |
# --- IMPORT SESSION OPTIONS ---
|
| 1022 |
from onnxruntime import SessionOptions, GraphOptimizationLevel
|
|
|
|
| 1037 |
print(f"β‘ Running on: {PROVIDERS}")
|
| 1038 |
|
| 1039 |
# ---------------------------------------------------------
|
| 1040 |
+
# 1. OPTIMIZED EMBEDDINGS (BGE-SMALL) - UNCHANGED
|
| 1041 |
# ---------------------------------------------------------
|
| 1042 |
class OnnxBgeEmbeddings(Embeddings):
|
| 1043 |
def __init__(self):
|
|
|
|
| 1066 |
|
| 1067 |
|
| 1068 |
# ---------------------------------------------------------
|
| 1069 |
+
# 2. NEW: ANSWER PRESENCE CHECKER
|
| 1070 |
+
# Paper insight: Prevent grading blank/missing answers
|
| 1071 |
+
# ---------------------------------------------------------
|
| 1072 |
+
class AnswerPresenceChecker:
|
| 1073 |
+
"""Checks if a student answer actually exists and contains substance."""
|
| 1074 |
+
|
| 1075 |
+
def __init__(self):
|
| 1076 |
+
self.min_length = 10 # Minimum characters for valid answer
|
| 1077 |
+
self.min_words = 3 # Minimum words for valid answer
|
| 1078 |
+
|
| 1079 |
+
def check_presence(self, student_answer: str) -> Tuple[bool, str]:
|
| 1080 |
+
"""
|
| 1081 |
+
Returns: (is_present, reason)
|
| 1082 |
+
"""
|
| 1083 |
+
if not student_answer or len(student_answer.strip()) == 0:
|
| 1084 |
+
return False, "Answer is empty"
|
| 1085 |
+
|
| 1086 |
+
answer = student_answer.strip()
|
| 1087 |
+
|
| 1088 |
+
# Check minimum length
|
| 1089 |
+
if len(answer) < self.min_length:
|
| 1090 |
+
return False, f"Answer too short ({len(answer)} chars, need {self.min_length})"
|
| 1091 |
+
|
| 1092 |
+
# Check minimum word count
|
| 1093 |
+
words = answer.split()
|
| 1094 |
+
if len(words) < self.min_words:
|
| 1095 |
+
return False, f"Answer too brief ({len(words)} words, need {self.min_words})"
|
| 1096 |
+
|
| 1097 |
+
# Check for placeholder text
|
| 1098 |
+
placeholder_patterns = [
|
| 1099 |
+
r'^[.\s]*$', # Only dots/spaces
|
| 1100 |
+
r'^[?]+$', # Only question marks
|
| 1101 |
+
r'^(n/?a|na|idk|dunno)\s*$', # Common non-answers
|
| 1102 |
+
]
|
| 1103 |
+
|
| 1104 |
+
for pattern in placeholder_patterns:
|
| 1105 |
+
if re.match(pattern, answer.lower()):
|
| 1106 |
+
return False, "Answer appears to be placeholder text"
|
| 1107 |
+
|
| 1108 |
+
return True, "Answer present and valid"
|
| 1109 |
+
|
| 1110 |
+
|
| 1111 |
+
# ---------------------------------------------------------
|
| 1112 |
+
# 3. ENHANCED LLM EVALUATOR WITH ENSEMBLE SUPPORT
|
| 1113 |
+
# Paper insights: Structured prompting, reference grounding, ensemble grading
|
| 1114 |
# ---------------------------------------------------------
|
| 1115 |
class LLMEvaluator:
|
| 1116 |
def __init__(self):
|
|
|
|
| 1117 |
self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
|
| 1118 |
self.local_dir = "onnx_qwen_local"
|
| 1119 |
|
|
|
|
| 1143 |
session_options=sess_options
|
| 1144 |
)
|
| 1145 |
|
| 1146 |
+
def evaluate_single(self, context: str, question: str, student_answer: str,
|
| 1147 |
+
max_marks: int, grader_id: int = 1,
|
| 1148 |
+
reference_summary: Optional[str] = None) -> Dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1149 |
"""
|
| 1150 |
+
Single grader evaluation with structured output.
|
| 1151 |
+
Paper insight: Use rigid templates with deterministic validation.
|
| 1152 |
+
|
| 1153 |
+
Returns structured dict with:
|
| 1154 |
+
- analysis: str
|
| 1155 |
+
- score: int
|
| 1156 |
+
- raw_response: str
|
| 1157 |
+
"""
|
| 1158 |
+
|
| 1159 |
+
# Enhanced system prompt with reference grounding
|
| 1160 |
+
system_prompt = f"""You are Grader #{grader_id}, a strict Logic Validator for educational assessment.
|
| 1161 |
+
|
| 1162 |
+
YOUR GRADING ALGORITHM:
|
| 1163 |
+
1. Compare Student Answer ONLY against the provided Context
|
| 1164 |
+
2. IF Student Answer mentions facts NOT in Context β PENALTY (-50% of marks)
|
| 1165 |
+
3. IF Student Answer contradicts the Context β PENALTY (-100% of marks)
|
| 1166 |
+
4. IF Student Answer is vague/generic without specific facts β SCORE: 0-20%
|
| 1167 |
+
5. IF Student Answer accurately reflects Context β SCORE: 80-100%
|
| 1168 |
+
|
| 1169 |
+
CRITICAL RULES:
|
| 1170 |
+
[R1] Grade ONLY based on Context provided, not general knowledge
|
| 1171 |
+
[R2] Penalize hallucinations (facts not in Context) heavily
|
| 1172 |
+
[R3] Penalize contradictions (opposite meaning) completely
|
| 1173 |
+
[R4] Reward specific, accurate paraphrasing from Context
|
| 1174 |
+
[R5] Partial credit for partially correct answers
|
| 1175 |
+
|
| 1176 |
+
OUTPUT FORMAT (MANDATORY):
|
| 1177 |
+
You MUST output in this exact format:
|
| 1178 |
+
|
| 1179 |
+
## Analysis
|
| 1180 |
+
[Your detailed comparison of Student Answer vs Context]
|
| 1181 |
+
|
| 1182 |
+
## Score
|
| 1183 |
+
[X]/{max_marks}
|
| 1184 |
+
|
| 1185 |
+
Do NOT deviate from this format."""
|
| 1186 |
+
|
| 1187 |
+
# Add reference summary if provided (paper's key insight)
|
| 1188 |
+
reference_section = ""
|
| 1189 |
+
if reference_summary:
|
| 1190 |
+
reference_section = f"""
|
| 1191 |
+
|
| 1192 |
+
### REFERENCE SOLUTION (Perfect Answer Example):
|
| 1193 |
+
{reference_summary}
|
| 1194 |
+
|
| 1195 |
+
Use this as calibration for what a 100% answer looks like."""
|
| 1196 |
|
| 1197 |
user_prompt = f"""
|
| 1198 |
+
### Context (Retrieved from Source):
|
| 1199 |
+
{context}
|
| 1200 |
+
{reference_section}
|
| 1201 |
|
| 1202 |
+
### Question:
|
| 1203 |
+
{question}
|
| 1204 |
|
| 1205 |
+
### Student Answer:
|
| 1206 |
+
{student_answer}
|
| 1207 |
|
| 1208 |
+
### Maximum Marks: {max_marks}
|
| 1209 |
+
|
| 1210 |
+
Provide your grading following the mandatory output format.
|
| 1211 |
+
"""
|
| 1212 |
|
| 1213 |
messages = [
|
| 1214 |
{"role": "system", "content": system_prompt},
|
|
|
|
| 1218 |
input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 1219 |
inputs = self.tokenizer(input_text, return_tensors="pt")
|
| 1220 |
|
| 1221 |
+
# Strict sampling for consistency
|
| 1222 |
with torch.no_grad():
|
| 1223 |
outputs = self.model.generate(
|
| 1224 |
**inputs,
|
| 1225 |
+
max_new_tokens=200, # Increased for structured output
|
| 1226 |
+
temperature=0.1, # Very strict
|
| 1227 |
+
top_p=0.2,
|
| 1228 |
do_sample=True,
|
| 1229 |
+
repetition_penalty=1.2
|
| 1230 |
)
|
| 1231 |
|
| 1232 |
input_length = inputs['input_ids'].shape[1]
|
| 1233 |
response = self.tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
|
| 1234 |
+
|
| 1235 |
+
# Parse structured output
|
| 1236 |
+
analysis, score = self._parse_response(response, max_marks)
|
| 1237 |
+
|
| 1238 |
+
return {
|
| 1239 |
+
"grader_id": grader_id,
|
| 1240 |
+
"analysis": analysis,
|
| 1241 |
+
"score": score,
|
| 1242 |
+
"raw_response": response
|
| 1243 |
+
}
|
| 1244 |
+
|
| 1245 |
+
def _parse_response(self, response: str, max_marks: int) -> Tuple[str, int]:
|
| 1246 |
+
"""
|
| 1247 |
+
Parse structured response to extract analysis and score.
|
| 1248 |
+
Paper insight: Deterministic parsing of rigid templates.
|
| 1249 |
+
"""
|
| 1250 |
+
# Extract score using regex
|
| 1251 |
+
score_pattern = r'##\s*Score\s*\n\s*\[?(\d+)\]?/\d+'
|
| 1252 |
+
score_match = re.search(score_pattern, response, re.IGNORECASE)
|
| 1253 |
+
|
| 1254 |
+
if score_match:
|
| 1255 |
+
score = int(score_match.group(1))
|
| 1256 |
+
score = min(score, max_marks) # Cap at max
|
| 1257 |
+
else:
|
| 1258 |
+
# Fallback: look for any number/max pattern
|
| 1259 |
+
fallback_pattern = r'(\d+)\s*/\s*\d+'
|
| 1260 |
+
fallback_match = re.search(fallback_pattern, response)
|
| 1261 |
+
if fallback_match:
|
| 1262 |
+
score = min(int(fallback_match.group(1)), max_marks)
|
| 1263 |
+
else:
|
| 1264 |
+
score = 0 # Default if parsing fails
|
| 1265 |
+
|
| 1266 |
+
# Extract analysis
|
| 1267 |
+
analysis_pattern = r'##\s*Analysis\s*\n(.*?)(?=##\s*Score|$)'
|
| 1268 |
+
analysis_match = re.search(analysis_pattern, response, re.DOTALL | re.IGNORECASE)
|
| 1269 |
+
|
| 1270 |
+
if analysis_match:
|
| 1271 |
+
analysis = analysis_match.group(1).strip()
|
| 1272 |
+
else:
|
| 1273 |
+
# Fallback: use everything before score section
|
| 1274 |
+
analysis = response.split('##')[0].strip() if '##' in response else response
|
| 1275 |
+
|
| 1276 |
+
return analysis, score
|
| 1277 |
|
| 1278 |
|
| 1279 |
# ---------------------------------------------------------
|
| 1280 |
+
# 4. NEW: SUPERVISOR AGGREGATOR
|
| 1281 |
+
# Paper insight: Merge ensemble outputs into final decision
|
| 1282 |
+
# ---------------------------------------------------------
|
| 1283 |
+
class SupervisorAggregator:
    """
    Aggregates multiple grader outputs into a final consensus grade.

    The reference paper uses another LLM call as supervisor; we use
    statistical aggregation (median + disagreement spread) for CPU
    efficiency.
    """

    def aggregate(self, grader_results: List[Dict], max_marks: int) -> Dict:
        """
        Aggregate K grader results into a final score.

        Args:
            grader_results: per-grader dicts, each with at least
                'score' (int) and 'analysis' (str).
            max_marks: maximum attainable score for the question.

        Returns dict with:
            - final_score: int (median of ensemble)
            - individual_scores: List[int]
            - disagreement: int (max - min score)
            - needs_review: bool (disagreement >= 40% of max_marks)
            - consensus_analysis: str (merged per-grader analyses)
            - grader_details: the raw grader_results
        """
        # Robustness fix: an empty ensemble previously crashed
        # (np.median([]) -> NaN -> int() raises; max()/min() on empty raise).
        if not grader_results:
            return {
                "final_score": 0,
                "individual_scores": [],
                "disagreement": 0,
                "needs_review": True,  # no graders ran -> a human must look
                "consensus_analysis": "No grader results to aggregate.",
                "grader_details": [],
            }

        scores = [r['score'] for r in grader_results]

        # Median is robust to a single outlier grader (paper uses a
        # supervisor LLM call instead). int() truncates a .5 median.
        final_score = int(np.median(scores))

        # Spread between the harshest and the most lenient grader.
        disagreement = max(scores) - min(scores)

        # Flag for manual review if disagreement is too high.
        # Paper uses Dmax thresholds; we use 40% of max marks.
        needs_review = disagreement >= (0.4 * max_marks)

        consensus_analysis = self._merge_analyses(grader_results, final_score, disagreement)

        return {
            "final_score": final_score,
            "individual_scores": scores,
            "disagreement": disagreement,
            "needs_review": needs_review,
            "consensus_analysis": consensus_analysis,
            "grader_details": grader_results,
        }

    def _merge_analyses(self, results: List[Dict], final_score: int, disagreement: int) -> str:
        """Concatenate per-grader analyses into one consensus report."""

        output = f"**Ensemble Grading Results** (Final: {final_score}, Disagreement: ±{disagreement})\n\n"

        for i, result in enumerate(results, 1):
            output += f"**Grader {i} ({result['score']} points):**\n{result['analysis']}\n\n"

        if disagreement > 0:
            output += f"\n⚠️ **Note:** Graders disagreed by {disagreement} points. "
            # NOTE(review): this fixed threshold (5 points) is independent of
            # the 40%-of-max needs_review flag above -- confirm intent.
            if disagreement >= 5:
                output += "Consider manual review."

        return output
| 1337 |
+
|
| 1338 |
+
|
| 1339 |
+
# ---------------------------------------------------------
|
| 1340 |
+
# 5. ONNX RERANKER - UNCHANGED
|
| 1341 |
# ---------------------------------------------------------
|
| 1342 |
class OnnxReranker:
|
| 1343 |
def __init__(self):
|
|
|
|
| 1344 |
self.model_name = "Xenova/ms-marco-TinyBERT-L-2-v2"
|
| 1345 |
print(f"π Loading Reranker: {self.model_name}...")
|
| 1346 |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
|
|
|
| 1354 |
if not docs:
|
| 1355 |
return []
|
| 1356 |
|
|
|
|
| 1357 |
pairs = [[query, doc.page_content] for doc in docs]
|
| 1358 |
|
| 1359 |
inputs = self.tokenizer(
|
|
|
|
| 1367 |
with torch.no_grad():
|
| 1368 |
outputs = self.model(**inputs)
|
| 1369 |
|
|
|
|
|
|
|
| 1370 |
logits = outputs.logits
|
| 1371 |
if logits.shape[1] == 2:
|
| 1372 |
+
scores = logits[:, 1]
|
| 1373 |
else:
|
| 1374 |
scores = logits.flatten()
|
| 1375 |
|
|
|
|
| 1376 |
scores = scores.numpy().tolist()
|
| 1377 |
doc_score_pairs = list(zip(docs, scores))
|
| 1378 |
doc_score_pairs.sort(key=lambda x: x[1], reverse=True)
|
| 1379 |
|
|
|
|
| 1380 |
return [doc for doc, score in doc_score_pairs[:top_k]]
|
| 1381 |
|
| 1382 |
|
| 1383 |
# ---------------------------------------------------------
|
| 1384 |
+
# 6. ENHANCED MAIN SYSTEM WITH MULTI-STAGE PIPELINE
|
| 1385 |
# ---------------------------------------------------------
|
| 1386 |
+
class EnhancedVectorSystem:
|
| 1387 |
def __init__(self):
    """Wire up all pipeline stages; model components load on construction."""
    # FAISS index; None until process_content() has indexed some text.
    self.vector_store = None
    # Pipeline components (each constructor loads its own model weights).
    self.embeddings = OnnxBgeEmbeddings()
    self.llm = LLMEvaluator()
    self.reranker = OnnxReranker()
    self.presence_checker = AnswerPresenceChecker()
    self.supervisor = SupervisorAggregator()
    # Raw split chunks of the indexed source and their count.
    self.all_chunks = []
    self.total_chunks = 0
    self.reference_summary = None  # Store reference answer summary
| 1397 |
|
| 1398 |
def process_content(self, file_obj, raw_text):
|
| 1399 |
has_file = file_obj is not None
|
|
|
|
| 1410 |
if has_file:
|
| 1411 |
if file_obj.name.endswith('.pdf'):
|
| 1412 |
doc = fitz.open(file_obj.name)
|
| 1413 |
+
for page in doc:
|
| 1414 |
+
text += page.get_text()
|
| 1415 |
elif file_obj.name.endswith('.txt'):
|
| 1416 |
+
with open(file_obj.name, 'r', encoding='utf-8') as f:
|
| 1417 |
+
text = f.read()
|
| 1418 |
else:
|
| 1419 |
return "β Error: Only .pdf and .txt supported."
|
| 1420 |
else:
|
| 1421 |
text = raw_text
|
| 1422 |
|
| 1423 |
+
# Smaller chunks for precision
|
| 1424 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
|
| 1425 |
texts = text_splitter.split_text(text)
|
| 1426 |
+
self.all_chunks = texts
|
| 1427 |
|
|
|
|
| 1428 |
docs = [Document(page_content=t, metadata={"id": i}) for i, t in enumerate(texts)]
|
| 1429 |
self.total_chunks = len(docs)
|
| 1430 |
|
| 1431 |
+
if not docs:
|
| 1432 |
+
return "Content empty."
|
| 1433 |
|
| 1434 |
self.vector_store = FAISS.from_documents(docs, self.embeddings)
|
| 1435 |
|
| 1436 |
+
return f"β
Indexed {self.total_chunks} chunks. Ready for grading."
|
| 1437 |
except Exception as e:
|
| 1438 |
return f"Error: {str(e)}"
|
| 1439 |
+
|
| 1440 |
+
def set_reference_answer(self, reference_text: str) -> str:
    """Store (or clear) the reference answer used to calibrate grading.

    Paper insight: reference grounding prevents over-grading.

    Returns a short status message for the UI.
    """
    # Normalize: None / empty / whitespace-only all clear the reference.
    cleaned = (reference_text or "").strip()
    if not cleaned:
        self.reference_summary = None
        return "ℹ️ Reference answer cleared."

    self.reference_summary = cleaned
    return f"✅ Reference answer set ({len(self.reference_summary)} chars). Will be used to calibrate grading."
| 1451 |
|
| 1452 |
+
def process_query(self, question, student_answer, max_marks, enable_ensemble=True):
    """
    Multi-stage grading pipeline.

    Stages: (1) answer-presence gate, (2) retrieval + reranking,
    (3) single or K=3 ensemble LLM grading, (4) supervisor aggregation.

    Args:
        question: the exam question to grade against.
        student_answer: the student's free-text answer.
        max_marks: maximum marks for this question.
        enable_ensemble: run 3 independent graders and aggregate when
            True; a single grader otherwise.

    Returns:
        (evidence_markdown, feedback_markdown) tuple for the UI.
    """
    # Input guards
    if not self.vector_store:
        return "⚠️ Please upload a file or paste text first.", ""
    if not question:
        return "⚠️ Enter a question.", ""

    # Stage 1: Presence check (paper insight). This also rejects
    # empty/whitespace answers, so the later "is the answer empty?"
    # re-check the original carried was unreachable and is removed.
    is_present, presence_reason = self.presence_checker.check_presence(student_answer)
    if not is_present:
        return (f"⚠️ **No valid answer detected:** {presence_reason}",
                f"**Score: 0/{max_marks}**\n\nNo answer to grade.")

    # Stage 2: Retrieval + reranking (wide recall, then precise top-3).
    initial_docs = self.vector_store.similarity_search(question, k=15)
    top_docs = self.reranker.rank(question, initial_docs, top_k=3)
    expanded_context = "\n\n---\n\n".join(d.page_content for d in top_docs)

    evidence_display = f"### 📚 Retrieved Context (Top {len(top_docs)} chunks):\n"
    evidence_display += f"> {expanded_context[:500]}..."

    # Stages 3 + 4: grading (ensemble is the paper's key innovation).
    if enable_ensemble:
        llm_feedback = self._grade_ensemble(expanded_context, question, student_answer, max_marks)
    else:
        llm_feedback = self._grade_single(expanded_context, question, student_answer, max_marks)

    return evidence_display, llm_feedback

def _grade_ensemble(self, context, question, student_answer, max_marks):
    """Run K=3 independent graders, then aggregate via the supervisor."""
    grader_results = []
    for grader_id in range(1, 4):  # K=3 ensemble
        grader_results.append(self.llm.evaluate_single(
            context=context,
            question=question,
            student_answer=student_answer,
            max_marks=max_marks,
            grader_id=grader_id,
            reference_summary=self.reference_summary,
        ))

    final_result = self.supervisor.aggregate(grader_results, max_marks)

    feedback = f"# 📊 Final Grade: {final_result['final_score']}/{max_marks}\n\n"
    if final_result['needs_review']:
        feedback += "⚠️ **Manual Review Recommended** (High grader disagreement)\n\n"
    feedback += final_result['consensus_analysis']

    # Append ensemble statistics for transparency.
    feedback += f"\n\n---\n**Grading Statistics:**\n"
    feedback += f"- Individual Scores: {final_result['individual_scores']}\n"
    feedback += f"- Score Range: {min(final_result['individual_scores'])}-{max(final_result['individual_scores'])}\n"
    feedback += f"- Disagreement: ±{final_result['disagreement']} points\n"
    return feedback

def _grade_single(self, context, question, student_answer, max_marks):
    """Single-grader mode (kept for comparison with the ensemble)."""
    result = self.llm.evaluate_single(
        context=context,
        question=question,
        student_answer=student_answer,
        max_marks=max_marks,
        grader_id=1,
        reference_summary=self.reference_summary,
    )
    return f"# 📊 Grade: {result['score']}/{max_marks}\n\n{result['analysis']}"
|
| 1523 |
|
|
|
|
| 1524 |
|
| 1525 |
+
# ---------------------------------------------------------
# 7. GRADIO INTERFACE
# ---------------------------------------------------------
# Single shared pipeline instance backing every UI event.
system = EnhancedVectorSystem()

with gr.Blocks(title="EduGenius AI Grader - Enhanced", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ EduGenius: Enhanced RAG-Based Grader")
    gr.Markdown("Powered by **Ensemble Grading**, **Reference Grounding** & **Presence Checking**")
    gr.Markdown("*Implements multi-stage pipeline from research: arXiv:2601.00730*")

    with gr.Row():
        # Left panel: source-content ingestion + optional reference answer.
        with gr.Column(scale=1):
            gr.Markdown("### 📄 Source Content")
            pdf_input = gr.File(label="Option A: Upload Document (PDF/TXT)")
            gr.Markdown("**OR**")
            text_input = gr.Textbox(label="Option B: Paste Text", placeholder="Paste context here...", lines=5)

            upload_btn = gr.Button("📥 Index Content", variant="primary")
            status_msg = gr.Textbox(label="Status", interactive=False)

            gr.Markdown("---")
            gr.Markdown("### 🎯 Reference Answer (Optional)")
            gr.Markdown("*Providing a reference answer improves grading accuracy*")
            reference_input = gr.Textbox(
                label="Perfect Answer Example",
                placeholder="What would a 100% answer look like?",
                lines=3
            )
            ref_btn = gr.Button("Set Reference", variant="secondary")
            ref_status = gr.Textbox(label="Reference Status", interactive=False)

        # Right panel: question/answer entry, grading controls, results.
        with gr.Column(scale=2):
            gr.Markdown("### ❓ Grading Interface")

            with gr.Row():
                q_input = gr.Textbox(label="Question", scale=2)
                max_marks = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max Marks")

            a_input = gr.TextArea(label="Student Answer", lines=4)

            with gr.Row():
                ensemble_check = gr.Checkbox(label="Enable Ensemble Grading (K=3)", value=True)
                run_btn = gr.Button("🚀 Grade Answer", variant="primary", scale=2)

            gr.Markdown("---")

            with gr.Row():
                with gr.Column():
                    evidence_box = gr.Markdown(label="📚 Retrieved Context")
                with gr.Column():
                    grade_box = gr.Markdown(label="📊 Grading Result")

    # Wire UI events to the pipeline methods.
    upload_btn.click(
        system.process_content,
        inputs=[pdf_input, text_input],
        outputs=[status_msg]
    )

    ref_btn.click(
        system.set_reference_answer,
        inputs=[reference_input],
        outputs=[ref_status]
    )

    run_btn.click(
        system.process_query,
        inputs=[q_input, a_input, max_marks, ensemble_check],
        outputs=[evidence_box, grade_box]
    )

if __name__ == "__main__":
    demo.launch()