Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -288,6 +288,12 @@
|
|
| 288 |
|
| 289 |
|
| 290 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
|
| 292 |
|
| 293 |
|
|
@@ -634,6 +640,367 @@
|
|
| 634 |
|
| 635 |
|
| 636 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 637 |
|
| 638 |
|
| 639 |
|
|
@@ -648,6 +1015,8 @@ import fitz # PyMuPDF
|
|
| 648 |
import torch
|
| 649 |
import os
|
| 650 |
import numpy as np
|
|
|
|
|
|
|
| 651 |
|
| 652 |
# --- IMPORT SESSION OPTIONS ---
|
| 653 |
from onnxruntime import SessionOptions, GraphOptimizationLevel
|
|
@@ -668,7 +1037,7 @@ PROVIDERS = ["CPUExecutionProvider"]
|
|
| 668 |
print(f"β‘ Running on: {PROVIDERS}")
|
| 669 |
|
| 670 |
# ---------------------------------------------------------
|
| 671 |
-
# 1. OPTIMIZED EMBEDDINGS (BGE-SMALL)
|
| 672 |
# ---------------------------------------------------------
|
| 673 |
class OnnxBgeEmbeddings(Embeddings):
|
| 674 |
def __init__(self):
|
|
@@ -697,11 +1066,54 @@ class OnnxBgeEmbeddings(Embeddings):
|
|
| 697 |
|
| 698 |
|
| 699 |
# ---------------------------------------------------------
|
| 700 |
-
# 2.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 701 |
# ---------------------------------------------------------
|
| 702 |
class LLMEvaluator:
|
| 703 |
def __init__(self):
|
| 704 |
-
# Qwen 2.5 0.5B is fast but needs "Few-Shot" examples to be strict.
|
| 705 |
self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
|
| 706 |
self.local_dir = "onnx_qwen_local"
|
| 707 |
|
|
@@ -731,53 +1143,72 @@ class LLMEvaluator:
|
|
| 731 |
session_options=sess_options
|
| 732 |
)
|
| 733 |
|
| 734 |
-
def
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
Your job is to check if the Student Answer is FACTUALLY present in the Context.
|
| 738 |
-
|
| 739 |
-
GRADING ALGORITHM:
|
| 740 |
-
1. IF the Student Answer mentions things NOT in the Context -> PENALTY (-50% of the marks).
|
| 741 |
-
2. IF the Student Answer interprets the text opposite to its meaning -> PENALTY (-100% of the marks).
|
| 742 |
-
3. IF the Student Answer is generic fluff -> SCORE: 0.
|
| 743 |
-
|
| 744 |
-
--- EXAMPLE 1 (HALLUCINATION) ---
|
| 745 |
-
Context: The sky is blue due to Rayleigh scattering.
|
| 746 |
-
Question: Why is the sky blue?
|
| 747 |
-
Student Answer: Because the ocean reflects the water into the sky.
|
| 748 |
-
Analysis: The Context mentions 'Rayleigh scattering'. The student mentions 'ocean reflection'. These are different. The student is hallucinating outside facts.
|
| 749 |
-
Score: 0/{max_marks}
|
| 750 |
-
|
| 751 |
-
--- EXAMPLE 2 (CONTRADICTION) ---
|
| 752 |
-
Context: One must efface one's own personality. Good prose is like a windowpane.
|
| 753 |
-
Question: What does the author mean?
|
| 754 |
-
Student Answer: It means we should see the author's personality clearly.
|
| 755 |
-
Analysis: The text says 'efface' (remove) personality. The student says 'see' personality. This is a direct contradiction.
|
| 756 |
-
Score: 0/{max_marks}
|
| 757 |
-
|
| 758 |
-
--- EXAMPLE 3 (CORRECT) ---
|
| 759 |
-
Context: Mitochondria is the powerhouse of the cell.
|
| 760 |
-
Question: What is mitochondria?
|
| 761 |
-
Student Answer: It is the cell's powerhouse.
|
| 762 |
-
Analysis: Matches the text meaning exactly.
|
| 763 |
-
Score: {max_marks}/{max_marks}
|
| 764 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 765 |
|
| 766 |
user_prompt = f"""
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
|
| 771 |
-
|
| 772 |
-
|
| 773 |
|
| 774 |
-
|
| 775 |
-
|
| 776 |
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
|
| 782 |
messages = [
|
| 783 |
{"role": "system", "content": system_prompt},
|
|
@@ -787,29 +1218,129 @@ class LLMEvaluator:
|
|
| 787 |
input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 788 |
inputs = self.tokenizer(input_text, return_tensors="pt")
|
| 789 |
|
| 790 |
-
#
|
| 791 |
with torch.no_grad():
|
| 792 |
outputs = self.model.generate(
|
| 793 |
**inputs,
|
| 794 |
-
max_new_tokens=
|
| 795 |
-
temperature=0.1,
|
| 796 |
-
top_p=0.2,
|
| 797 |
do_sample=True,
|
| 798 |
-
repetition_penalty=1.2
|
| 799 |
)
|
| 800 |
|
| 801 |
input_length = inputs['input_ids'].shape[1]
|
| 802 |
response = self.tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
|
| 803 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 804 |
|
| 805 |
|
| 806 |
# ---------------------------------------------------------
|
| 807 |
-
#
|
| 808 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 809 |
# ---------------------------------------------------------
|
| 810 |
class OnnxReranker:
|
| 811 |
def __init__(self):
|
| 812 |
-
# TinyBERT is ~17MB and very fast on CPU
|
| 813 |
self.model_name = "Xenova/ms-marco-TinyBERT-L-2-v2"
|
| 814 |
print(f"π Loading Reranker: {self.model_name}...")
|
| 815 |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
|
@@ -823,7 +1354,6 @@ class OnnxReranker:
|
|
| 823 |
if not docs:
|
| 824 |
return []
|
| 825 |
|
| 826 |
-
# Prepare pairs: [query, doc_text]
|
| 827 |
pairs = [[query, doc.page_content] for doc in docs]
|
| 828 |
|
| 829 |
inputs = self.tokenizer(
|
|
@@ -837,34 +1367,33 @@ class OnnxReranker:
|
|
| 837 |
with torch.no_grad():
|
| 838 |
outputs = self.model(**inputs)
|
| 839 |
|
| 840 |
-
# Get logits (Relevance scores)
|
| 841 |
-
# MS-Marco models typically output a single logit or [irrelevant, relevant]
|
| 842 |
logits = outputs.logits
|
| 843 |
if logits.shape[1] == 2:
|
| 844 |
-
scores = logits[:, 1]
|
| 845 |
else:
|
| 846 |
scores = logits.flatten()
|
| 847 |
|
| 848 |
-
# Sort docs by score (descending)
|
| 849 |
scores = scores.numpy().tolist()
|
| 850 |
doc_score_pairs = list(zip(docs, scores))
|
| 851 |
doc_score_pairs.sort(key=lambda x: x[1], reverse=True)
|
| 852 |
|
| 853 |
-
# Return top K docs
|
| 854 |
return [doc for doc, score in doc_score_pairs[:top_k]]
|
| 855 |
|
| 856 |
|
| 857 |
# ---------------------------------------------------------
|
| 858 |
-
#
|
| 859 |
# ---------------------------------------------------------
|
| 860 |
-
class
|
| 861 |
def __init__(self):
|
| 862 |
self.vector_store = None
|
| 863 |
self.embeddings = OnnxBgeEmbeddings()
|
| 864 |
self.llm = LLMEvaluator()
|
| 865 |
-
self.reranker = OnnxReranker()
|
| 866 |
-
self.
|
|
|
|
|
|
|
| 867 |
self.total_chunks = 0
|
|
|
|
| 868 |
|
| 869 |
def process_content(self, file_obj, raw_text):
|
| 870 |
has_file = file_obj is not None
|
|
@@ -881,87 +1410,188 @@ class VectorSystem:
|
|
| 881 |
if has_file:
|
| 882 |
if file_obj.name.endswith('.pdf'):
|
| 883 |
doc = fitz.open(file_obj.name)
|
| 884 |
-
for page in doc:
|
|
|
|
| 885 |
elif file_obj.name.endswith('.txt'):
|
| 886 |
-
with open(file_obj.name, 'r', encoding='utf-8') as f:
|
|
|
|
| 887 |
else:
|
| 888 |
return "β Error: Only .pdf and .txt supported."
|
| 889 |
else:
|
| 890 |
text = raw_text
|
| 891 |
|
| 892 |
-
# Smaller chunks for
|
| 893 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
|
| 894 |
texts = text_splitter.split_text(text)
|
| 895 |
-
self.all_chunks = texts
|
| 896 |
|
| 897 |
-
# Create Document objects with metadata
|
| 898 |
docs = [Document(page_content=t, metadata={"id": i}) for i, t in enumerate(texts)]
|
| 899 |
self.total_chunks = len(docs)
|
| 900 |
|
| 901 |
-
if not docs:
|
|
|
|
| 902 |
|
| 903 |
self.vector_store = FAISS.from_documents(docs, self.embeddings)
|
| 904 |
|
| 905 |
-
return f"β
Indexed {self.total_chunks} chunks."
|
| 906 |
except Exception as e:
|
| 907 |
return f"Error: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 908 |
|
| 909 |
-
def process_query(self, question, student_answer, max_marks):
|
| 910 |
-
|
| 911 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 912 |
|
| 913 |
-
#
|
| 914 |
-
|
| 915 |
-
initial_docs = self.vector_store.similarity_search(question, k=15)
|
| 916 |
|
| 917 |
-
|
| 918 |
-
|
|
|
|
|
|
|
|
|
|
| 919 |
top_docs = self.reranker.rank(question, initial_docs, top_k=3)
|
| 920 |
-
|
| 921 |
-
# Step C: Construct Context
|
| 922 |
-
# We merge the top 3 specific chunks
|
| 923 |
expanded_context = "\n\n---\n\n".join([d.page_content for d in top_docs])
|
| 924 |
|
| 925 |
-
evidence_display = f"### π
|
| 926 |
-
evidence_display += f"> {expanded_context}
|
| 927 |
|
| 928 |
-
|
| 929 |
-
if student_answer:
|
| 930 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 931 |
|
| 932 |
return evidence_display, llm_feedback
|
| 933 |
|
| 934 |
-
system = VectorSystem()
|
| 935 |
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 939 |
|
| 940 |
with gr.Row():
|
| 941 |
with gr.Column(scale=1):
|
| 942 |
-
gr.Markdown("### Source
|
| 943 |
-
pdf_input = gr.File(label="Option A: Upload
|
| 944 |
gr.Markdown("**OR**")
|
| 945 |
-
text_input = gr.Textbox(label="Option B: Paste
|
| 946 |
|
| 947 |
-
upload_btn = gr.Button("Index Content", variant="primary")
|
| 948 |
status_msg = gr.Textbox(label="Status", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 949 |
|
| 950 |
with gr.Column(scale=2):
|
|
|
|
|
|
|
| 951 |
with gr.Row():
|
| 952 |
q_input = gr.Textbox(label="Question", scale=2)
|
| 953 |
max_marks = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max Marks")
|
| 954 |
|
| 955 |
-
a_input = gr.TextArea(label="Student Answer")
|
| 956 |
-
run_btn = gr.Button("Retrieve & Grade", variant="secondary")
|
| 957 |
|
| 958 |
with gr.Row():
|
| 959 |
-
|
| 960 |
-
|
| 961 |
-
|
| 962 |
-
|
| 963 |
-
|
| 964 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 965 |
|
| 966 |
if __name__ == "__main__":
|
| 967 |
demo.launch()
|
|
|
|
| 288 |
|
| 289 |
|
| 290 |
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
|
| 297 |
|
| 298 |
|
| 299 |
|
|
|
|
| 640 |
|
| 641 |
|
| 642 |
|
| 643 |
+
|
| 644 |
+
|
| 645 |
+
|
| 646 |
+
|
| 647 |
+
|
| 648 |
+
|
| 649 |
+
|
| 650 |
+
|
| 651 |
+
|
| 652 |
+
|
| 653 |
+
|
| 654 |
+
|
| 655 |
+
|
| 656 |
+
|
| 657 |
+
|
| 658 |
+
|
| 659 |
+
|
| 660 |
+
|
| 661 |
+
|
| 662 |
+
# import gradio as gr
|
| 663 |
+
# import fitz # PyMuPDF
|
| 664 |
+
# import torch
|
| 665 |
+
# import os
|
| 666 |
+
# import numpy as np
|
| 667 |
+
|
| 668 |
+
# # --- IMPORT SESSION OPTIONS ---
|
| 669 |
+
# from onnxruntime import SessionOptions, GraphOptimizationLevel
|
| 670 |
+
|
| 671 |
+
# # --- LANGCHAIN & RAG IMPORTS ---
|
| 672 |
+
# from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 673 |
+
# from langchain_community.vectorstores import FAISS
|
| 674 |
+
# from langchain_core.embeddings import Embeddings
|
| 675 |
+
# from langchain_core.documents import Document
|
| 676 |
+
|
| 677 |
+
# # --- ONNX & MODEL IMPORTS ---
|
| 678 |
+
# from transformers import AutoTokenizer
|
| 679 |
+
# from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForCausalLM, ORTModelForSequenceClassification
|
| 680 |
+
# from huggingface_hub import snapshot_download
|
| 681 |
+
|
| 682 |
+
# # Force CPU Provider
|
| 683 |
+
# PROVIDERS = ["CPUExecutionProvider"]
|
| 684 |
+
# print(f"β‘ Running on: {PROVIDERS}")
|
| 685 |
+
|
| 686 |
+
# # ---------------------------------------------------------
|
| 687 |
+
# # 1. OPTIMIZED EMBEDDINGS (BGE-SMALL)
|
| 688 |
+
# # ---------------------------------------------------------
|
| 689 |
+
# class OnnxBgeEmbeddings(Embeddings):
|
| 690 |
+
# def __init__(self):
|
| 691 |
+
# model_name = "Xenova/bge-small-en-v1.5"
|
| 692 |
+
# print(f"π Loading Embeddings: {model_name}...")
|
| 693 |
+
# self.tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 694 |
+
# self.model = ORTModelForFeatureExtraction.from_pretrained(
|
| 695 |
+
# model_name,
|
| 696 |
+
# export=False,
|
| 697 |
+
# provider=PROVIDERS[0]
|
| 698 |
+
# )
|
| 699 |
+
|
| 700 |
+
# def _process_batch(self, texts):
|
| 701 |
+
# inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
|
| 702 |
+
# with torch.no_grad():
|
| 703 |
+
# outputs = self.model(**inputs)
|
| 704 |
+
# embeddings = outputs.last_hidden_state[:, 0]
|
| 705 |
+
# embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
|
| 706 |
+
# return embeddings.numpy().tolist()
|
| 707 |
+
|
| 708 |
+
# def embed_documents(self, texts):
|
| 709 |
+
# return self._process_batch(texts)
|
| 710 |
+
|
| 711 |
+
# def embed_query(self, text):
|
| 712 |
+
# return self._process_batch(["Represent this sentence for searching relevant passages: " + text])[0]
|
| 713 |
+
|
| 714 |
+
|
| 715 |
+
# # ---------------------------------------------------------
|
| 716 |
+
# # 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B) - STRICT GRADING
|
| 717 |
+
# # ---------------------------------------------------------
|
| 718 |
+
# class LLMEvaluator:
|
| 719 |
+
# def __init__(self):
|
| 720 |
+
# # Qwen 2.5 0.5B is fast but needs "Few-Shot" examples to be strict.
|
| 721 |
+
# self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
|
| 722 |
+
# self.local_dir = "onnx_qwen_local"
|
| 723 |
+
|
| 724 |
+
# print(f"π Preparing CPU LLM: {self.repo_id}...")
|
| 725 |
+
|
| 726 |
+
# if not os.path.exists(self.local_dir):
|
| 727 |
+
# print(f"π₯ Downloading FP16 model to {self.local_dir}...")
|
| 728 |
+
# snapshot_download(
|
| 729 |
+
# repo_id=self.repo_id,
|
| 730 |
+
# local_dir=self.local_dir,
|
| 731 |
+
# allow_patterns=["config.json", "generation_config.json", "tokenizer*", "special_tokens_map.json", "*.jinja", "onnx/model_fp16.onnx*"]
|
| 732 |
+
# )
|
| 733 |
+
# print("β
Download complete.")
|
| 734 |
+
|
| 735 |
+
# self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
|
| 736 |
+
|
| 737 |
+
# sess_options = SessionOptions()
|
| 738 |
+
# sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
|
| 739 |
+
|
| 740 |
+
# self.model = ORTModelForCausalLM.from_pretrained(
|
| 741 |
+
# self.local_dir,
|
| 742 |
+
# subfolder="onnx",
|
| 743 |
+
# file_name="model_fp16.onnx",
|
| 744 |
+
# use_cache=True,
|
| 745 |
+
# use_io_binding=False,
|
| 746 |
+
# provider=PROVIDERS[0],
|
| 747 |
+
# session_options=sess_options
|
| 748 |
+
# )
|
| 749 |
+
|
| 750 |
+
# def evaluate(self, context, question, student_answer, max_marks):
|
| 751 |
+
# # --- IMPROVED PROMPT STRATEGY ---
|
| 752 |
+
# system_prompt = f"""You are a strict Logic Validator. You are NOT a helpful assistant.
|
| 753 |
+
# Your job is to check if the Student Answer is FACTUALLY present in the Context.
|
| 754 |
+
|
| 755 |
+
# GRADING ALGORITHM:
|
| 756 |
+
# 1. IF the Student Answer mentions things NOT in the Context -> PENALTY (-50% of the marks).
|
| 757 |
+
# 2. IF the Student Answer interprets the text opposite to its meaning -> PENALTY (-100% of the marks).
|
| 758 |
+
# 3. IF the Student Answer is generic fluff -> SCORE: 0.
|
| 759 |
+
|
| 760 |
+
# --- EXAMPLE 1 (HALLUCINATION) ---
|
| 761 |
+
# Context: The sky is blue due to Rayleigh scattering.
|
| 762 |
+
# Question: Why is the sky blue?
|
| 763 |
+
# Student Answer: Because the ocean reflects the water into the sky.
|
| 764 |
+
# Analysis: The Context mentions 'Rayleigh scattering'. The student mentions 'ocean reflection'. These are different. The student is hallucinating outside facts.
|
| 765 |
+
# Score: 0/{max_marks}
|
| 766 |
+
|
| 767 |
+
# --- EXAMPLE 2 (CONTRADICTION) ---
|
| 768 |
+
# Context: One must efface one's own personality. Good prose is like a windowpane.
|
| 769 |
+
# Question: What does the author mean?
|
| 770 |
+
# Student Answer: It means we should see the author's personality clearly.
|
| 771 |
+
# Analysis: The text says 'efface' (remove) personality. The student says 'see' personality. This is a direct contradiction.
|
| 772 |
+
# Score: 0/{max_marks}
|
| 773 |
+
|
| 774 |
+
# --- EXAMPLE 3 (CORRECT) ---
|
| 775 |
+
# Context: Mitochondria is the powerhouse of the cell.
|
| 776 |
+
# Question: What is mitochondria?
|
| 777 |
+
# Student Answer: It is the cell's powerhouse.
|
| 778 |
+
# Analysis: Matches the text meaning exactly.
|
| 779 |
+
# Score: {max_marks}/{max_marks}
|
| 780 |
+
# """
|
| 781 |
+
|
| 782 |
+
# user_prompt = f"""
|
| 783 |
+
# --- YOUR TASK ---
|
| 784 |
+
# Context:
|
| 785 |
+
# {context}
|
| 786 |
+
|
| 787 |
+
# Question:
|
| 788 |
+
# {question}
|
| 789 |
+
|
| 790 |
+
# Student Answer:
|
| 791 |
+
# {student_answer}
|
| 792 |
+
|
| 793 |
+
# OUTPUT FORMAT:
|
| 794 |
+
# Analysis: [Compare Student Answer vs Context. List any hallucinations or contradictions.]
|
| 795 |
+
# Score: [X]/{max_marks}
|
| 796 |
+
# """
|
| 797 |
+
|
| 798 |
+
# messages = [
|
| 799 |
+
# {"role": "system", "content": system_prompt},
|
| 800 |
+
# {"role": "user", "content": user_prompt}
|
| 801 |
+
# ]
|
| 802 |
+
|
| 803 |
+
# input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 804 |
+
# inputs = self.tokenizer(input_text, return_tensors="pt")
|
| 805 |
+
|
| 806 |
+
# # Lower temperature for strictness
|
| 807 |
+
# with torch.no_grad():
|
| 808 |
+
# outputs = self.model.generate(
|
| 809 |
+
# **inputs,
|
| 810 |
+
# max_new_tokens=150,
|
| 811 |
+
# temperature=0.1, # Strict logic, no creativity
|
| 812 |
+
# top_p=0.2, # Cut off unlikely tokens
|
| 813 |
+
# do_sample=True,
|
| 814 |
+
# repetition_penalty=1.2 # Penalize repetition
|
| 815 |
+
# )
|
| 816 |
+
|
| 817 |
+
# input_length = inputs['input_ids'].shape[1]
|
| 818 |
+
# response = self.tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
|
| 819 |
+
# return response
|
| 820 |
+
|
| 821 |
+
|
| 822 |
+
# # ---------------------------------------------------------
|
| 823 |
+
# # 3. NEW: ONNX RERANKER (Cross-Encoder)
|
| 824 |
+
# # Uses existing 'optimum' & 'transformers' libs (No new deps)
|
| 825 |
+
# # ---------------------------------------------------------
|
| 826 |
+
# class OnnxReranker:
|
| 827 |
+
# def __init__(self):
|
| 828 |
+
# # TinyBERT is ~17MB and very fast on CPU
|
| 829 |
+
# self.model_name = "Xenova/ms-marco-TinyBERT-L-2-v2"
|
| 830 |
+
# print(f"π Loading Reranker: {self.model_name}...")
|
| 831 |
+
# self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
| 832 |
+
# self.model = ORTModelForSequenceClassification.from_pretrained(
|
| 833 |
+
# self.model_name,
|
| 834 |
+
# export=False,
|
| 835 |
+
# provider=PROVIDERS[0]
|
| 836 |
+
# )
|
| 837 |
+
|
| 838 |
+
# def rank(self, query, docs, top_k=3):
|
| 839 |
+
# if not docs:
|
| 840 |
+
# return []
|
| 841 |
+
|
| 842 |
+
# # Prepare pairs: [query, doc_text]
|
| 843 |
+
# pairs = [[query, doc.page_content] for doc in docs]
|
| 844 |
+
|
| 845 |
+
# inputs = self.tokenizer(
|
| 846 |
+
# pairs,
|
| 847 |
+
# padding=True,
|
| 848 |
+
# truncation=True,
|
| 849 |
+
# max_length=512,
|
| 850 |
+
# return_tensors="pt"
|
| 851 |
+
# )
|
| 852 |
+
|
| 853 |
+
# with torch.no_grad():
|
| 854 |
+
# outputs = self.model(**inputs)
|
| 855 |
+
|
| 856 |
+
# # Get logits (Relevance scores)
|
| 857 |
+
# # MS-Marco models typically output a single logit or [irrelevant, relevant]
|
| 858 |
+
# logits = outputs.logits
|
| 859 |
+
# if logits.shape[1] == 2:
|
| 860 |
+
# scores = logits[:, 1] # Take the "relevant" class score
|
| 861 |
+
# else:
|
| 862 |
+
# scores = logits.flatten()
|
| 863 |
+
|
| 864 |
+
# # Sort docs by score (descending)
|
| 865 |
+
# scores = scores.numpy().tolist()
|
| 866 |
+
# doc_score_pairs = list(zip(docs, scores))
|
| 867 |
+
# doc_score_pairs.sort(key=lambda x: x[1], reverse=True)
|
| 868 |
+
|
| 869 |
+
# # Return top K docs
|
| 870 |
+
# return [doc for doc, score in doc_score_pairs[:top_k]]
|
| 871 |
+
|
| 872 |
+
|
| 873 |
+
# # ---------------------------------------------------------
|
| 874 |
+
# # 4. Main Application Logic
|
| 875 |
+
# # ---------------------------------------------------------
|
| 876 |
+
# class VectorSystem:
|
| 877 |
+
# def __init__(self):
|
| 878 |
+
# self.vector_store = None
|
| 879 |
+
# self.embeddings = OnnxBgeEmbeddings()
|
| 880 |
+
# self.llm = LLMEvaluator()
|
| 881 |
+
# self.reranker = OnnxReranker() # Initialize Reranker
|
| 882 |
+
# self.all_chunks = []
|
| 883 |
+
# self.total_chunks = 0
|
| 884 |
+
|
| 885 |
+
# def process_content(self, file_obj, raw_text):
|
| 886 |
+
# has_file = file_obj is not None
|
| 887 |
+
# has_text = raw_text is not None and len(raw_text.strip()) > 0
|
| 888 |
+
|
| 889 |
+
# if has_file and has_text:
|
| 890 |
+
# return "β Error: Please provide EITHER a file OR paste text, not both at the same time."
|
| 891 |
+
|
| 892 |
+
# if not has_file and not has_text:
|
| 893 |
+
# return "β οΈ No content provided. Please upload a file or paste text."
|
| 894 |
+
|
| 895 |
+
# try:
|
| 896 |
+
# text = ""
|
| 897 |
+
# if has_file:
|
| 898 |
+
# if file_obj.name.endswith('.pdf'):
|
| 899 |
+
# doc = fitz.open(file_obj.name)
|
| 900 |
+
# for page in doc: text += page.get_text()
|
| 901 |
+
# elif file_obj.name.endswith('.txt'):
|
| 902 |
+
# with open(file_obj.name, 'r', encoding='utf-8') as f: text = f.read()
|
| 903 |
+
# else:
|
| 904 |
+
# return "β Error: Only .pdf and .txt supported."
|
| 905 |
+
# else:
|
| 906 |
+
# text = raw_text
|
| 907 |
+
|
| 908 |
+
# # Smaller chunks for Reranking precision (500 chars)
|
| 909 |
+
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
|
| 910 |
+
# texts = text_splitter.split_text(text)
|
| 911 |
+
# self.all_chunks = texts # Keep plain text list for reference
|
| 912 |
+
|
| 913 |
+
# # Create Document objects with metadata
|
| 914 |
+
# docs = [Document(page_content=t, metadata={"id": i}) for i, t in enumerate(texts)]
|
| 915 |
+
# self.total_chunks = len(docs)
|
| 916 |
+
|
| 917 |
+
# if not docs: return "Content empty."
|
| 918 |
+
|
| 919 |
+
# self.vector_store = FAISS.from_documents(docs, self.embeddings)
|
| 920 |
+
|
| 921 |
+
# return f"β
Indexed {self.total_chunks} chunks."
|
| 922 |
+
# except Exception as e:
|
| 923 |
+
# return f"Error: {str(e)}"
|
| 924 |
+
|
| 925 |
+
# def process_query(self, question, student_answer, max_marks):
|
| 926 |
+
# if not self.vector_store: return "β οΈ Please upload a file or paste text first.", ""
|
| 927 |
+
# if not question: return "β οΈ Enter a question.", ""
|
| 928 |
+
|
| 929 |
+
# # Step A: Wide Net Retrieval (Get top 15 candidates)
|
| 930 |
+
# # We fetch more than we need to ensure the answer is in the candidate pool
|
| 931 |
+
# initial_docs = self.vector_store.similarity_search(question, k=15)
|
| 932 |
+
|
| 933 |
+
# # Step B: Rerank (Get top 3 best matches)
|
| 934 |
+
# # The Cross-Encoder strictly judges relevance
|
| 935 |
+
# top_docs = self.reranker.rank(question, initial_docs, top_k=3)
|
| 936 |
+
|
| 937 |
+
# # Step C: Construct Context
|
| 938 |
+
# # We merge the top 3 specific chunks
|
| 939 |
+
# expanded_context = "\n\n---\n\n".join([d.page_content for d in top_docs])
|
| 940 |
+
|
| 941 |
+
# evidence_display = f"### π Optimized Context (Top {len(top_docs)} chunks after Reranking):\n"
|
| 942 |
+
# evidence_display += f"> {expanded_context} ..."
|
| 943 |
+
|
| 944 |
+
# llm_feedback = "Please enter a student answer to grade."
|
| 945 |
+
# if student_answer:
|
| 946 |
+
# llm_feedback = self.llm.evaluate(expanded_context, question, student_answer, max_marks)
|
| 947 |
+
|
| 948 |
+
# return evidence_display, llm_feedback
|
| 949 |
+
|
| 950 |
+
# system = VectorSystem()
|
| 951 |
+
|
| 952 |
+
# with gr.Blocks(title="EduGenius AI Grader") as demo:
|
| 953 |
+
# gr.Markdown("# β‘ EduGenius: CPU Optimized RAG")
|
| 954 |
+
# gr.Markdown("Powered by **Qwen-2.5-0.5B**, **BGE-Small** & **TinyBERT Reranker**")
|
| 955 |
+
|
| 956 |
+
# with gr.Row():
|
| 957 |
+
# with gr.Column(scale=1):
|
| 958 |
+
# gr.Markdown("### Source Input (Choose One)")
|
| 959 |
+
# pdf_input = gr.File(label="Option A: Upload Chapter (PDF/TXT)")
|
| 960 |
+
# gr.Markdown("**OR**")
|
| 961 |
+
# text_input = gr.Textbox(label="Option B: Paste Context", placeholder="Paste text here if you don't have a file...", lines=5)
|
| 962 |
+
|
| 963 |
+
# upload_btn = gr.Button("Index Content", variant="primary")
|
| 964 |
+
# status_msg = gr.Textbox(label="Status", interactive=False)
|
| 965 |
+
|
| 966 |
+
# with gr.Column(scale=2):
|
| 967 |
+
# with gr.Row():
|
| 968 |
+
# q_input = gr.Textbox(label="Question", scale=2)
|
| 969 |
+
# max_marks = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max Marks")
|
| 970 |
+
|
| 971 |
+
# a_input = gr.TextArea(label="Student Answer")
|
| 972 |
+
# run_btn = gr.Button("Retrieve & Grade", variant="secondary")
|
| 973 |
+
|
| 974 |
+
# with gr.Row():
|
| 975 |
+
# evidence_box = gr.Markdown(label="Context Used")
|
| 976 |
+
# grade_box = gr.Markdown(label="Grading Result")
|
| 977 |
+
|
| 978 |
+
# # Pass both inputs to the process_content function
|
| 979 |
+
# upload_btn.click(system.process_content, inputs=[pdf_input, text_input], outputs=[status_msg])
|
| 980 |
+
# run_btn.click(system.process_query, inputs=[q_input, a_input, max_marks], outputs=[evidence_box, grade_box])
|
| 981 |
+
|
| 982 |
+
# if __name__ == "__main__":
|
| 983 |
+
# demo.launch()
|
| 984 |
+
|
| 985 |
+
|
| 986 |
+
|
| 987 |
+
|
| 988 |
+
|
| 989 |
+
|
| 990 |
+
|
| 991 |
+
|
| 992 |
+
|
| 993 |
+
|
| 994 |
+
|
| 995 |
+
|
| 996 |
+
|
| 997 |
+
|
| 998 |
+
|
| 999 |
+
|
| 1000 |
+
|
| 1001 |
+
|
| 1002 |
+
|
| 1003 |
+
|
| 1004 |
|
| 1005 |
|
| 1006 |
|
|
|
|
| 1015 |
import torch
|
| 1016 |
import os
|
| 1017 |
import numpy as np
|
| 1018 |
+
import re
|
| 1019 |
+
from typing import List, Dict, Tuple, Optional
|
| 1020 |
|
| 1021 |
# --- IMPORT SESSION OPTIONS ---
|
| 1022 |
from onnxruntime import SessionOptions, GraphOptimizationLevel
|
|
|
|
| 1037 |
print(f"β‘ Running on: {PROVIDERS}")
|
| 1038 |
|
| 1039 |
# ---------------------------------------------------------
|
| 1040 |
+
# 1. OPTIMIZED EMBEDDINGS (BGE-SMALL) - UNCHANGED
|
| 1041 |
# ---------------------------------------------------------
|
| 1042 |
class OnnxBgeEmbeddings(Embeddings):
|
| 1043 |
def __init__(self):
|
|
|
|
| 1066 |
|
| 1067 |
|
| 1068 |
# ---------------------------------------------------------
|
| 1069 |
+
# 2. NEW: ANSWER PRESENCE CHECKER
|
| 1070 |
+
# Paper insight: Prevent grading blank/missing answers
|
| 1071 |
+
# ---------------------------------------------------------
|
| 1072 |
+
class AnswerPresenceChecker:
|
| 1073 |
+
"""Checks if a student answer actually exists and contains substance."""
|
| 1074 |
+
|
| 1075 |
+
def __init__(self):
|
| 1076 |
+
self.min_length = 10 # Minimum characters for valid answer
|
| 1077 |
+
self.min_words = 3 # Minimum words for valid answer
|
| 1078 |
+
|
| 1079 |
+
def check_presence(self, student_answer: str) -> Tuple[bool, str]:
|
| 1080 |
+
"""
|
| 1081 |
+
Returns: (is_present, reason)
|
| 1082 |
+
"""
|
| 1083 |
+
if not student_answer or len(student_answer.strip()) == 0:
|
| 1084 |
+
return False, "Answer is empty"
|
| 1085 |
+
|
| 1086 |
+
answer = student_answer.strip()
|
| 1087 |
+
|
| 1088 |
+
# Check minimum length
|
| 1089 |
+
if len(answer) < self.min_length:
|
| 1090 |
+
return False, f"Answer too short ({len(answer)} chars, need {self.min_length})"
|
| 1091 |
+
|
| 1092 |
+
# Check minimum word count
|
| 1093 |
+
words = answer.split()
|
| 1094 |
+
if len(words) < self.min_words:
|
| 1095 |
+
return False, f"Answer too brief ({len(words)} words, need {self.min_words})"
|
| 1096 |
+
|
| 1097 |
+
# Check for placeholder text
|
| 1098 |
+
placeholder_patterns = [
|
| 1099 |
+
r'^[.\s]*$', # Only dots/spaces
|
| 1100 |
+
r'^[?]+$', # Only question marks
|
| 1101 |
+
r'^(n/?a|na|idk|dunno)\s*$', # Common non-answers
|
| 1102 |
+
]
|
| 1103 |
+
|
| 1104 |
+
for pattern in placeholder_patterns:
|
| 1105 |
+
if re.match(pattern, answer.lower()):
|
| 1106 |
+
return False, "Answer appears to be placeholder text"
|
| 1107 |
+
|
| 1108 |
+
return True, "Answer present and valid"
|
| 1109 |
+
|
| 1110 |
+
|
| 1111 |
+
# ---------------------------------------------------------
|
| 1112 |
+
# 3. ENHANCED LLM EVALUATOR WITH ENSEMBLE SUPPORT
|
| 1113 |
+
# Paper insights: Structured prompting, reference grounding, ensemble grading
|
| 1114 |
# ---------------------------------------------------------
|
| 1115 |
class LLMEvaluator:
|
| 1116 |
def __init__(self):
|
|
|
|
| 1117 |
self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
|
| 1118 |
self.local_dir = "onnx_qwen_local"
|
| 1119 |
|
|
|
|
| 1143 |
session_options=sess_options
|
| 1144 |
)
|
| 1145 |
|
| 1146 |
+
def evaluate_single(self, context: str, question: str, student_answer: str,
|
| 1147 |
+
max_marks: int, grader_id: int = 1,
|
| 1148 |
+
reference_summary: Optional[str] = None) -> Dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1149 |
"""
|
| 1150 |
+
Single grader evaluation with structured output.
|
| 1151 |
+
Paper insight: Use rigid templates with deterministic validation.
|
| 1152 |
+
|
| 1153 |
+
Returns structured dict with:
|
| 1154 |
+
- analysis: str
|
| 1155 |
+
- score: int
|
| 1156 |
+
- raw_response: str
|
| 1157 |
+
"""
|
| 1158 |
+
|
| 1159 |
+
# Enhanced system prompt with reference grounding
|
| 1160 |
+
system_prompt = f"""You are Grader #{grader_id}, a strict Logic Validator for educational assessment.
|
| 1161 |
+
|
| 1162 |
+
YOUR GRADING ALGORITHM:
|
| 1163 |
+
1. Compare Student Answer ONLY against the provided Context
|
| 1164 |
+
2. IF Student Answer mentions facts NOT in Context β PENALTY (-50% of marks)
|
| 1165 |
+
3. IF Student Answer contradicts the Context β PENALTY (-100% of marks)
|
| 1166 |
+
4. IF Student Answer is vague/generic without specific facts β SCORE: 0-20%
|
| 1167 |
+
5. IF Student Answer accurately reflects Context β SCORE: 80-100%
|
| 1168 |
+
|
| 1169 |
+
CRITICAL RULES:
|
| 1170 |
+
[R1] Grade ONLY based on Context provided, not general knowledge
|
| 1171 |
+
[R2] Penalize hallucinations (facts not in Context) heavily
|
| 1172 |
+
[R3] Penalize contradictions (opposite meaning) completely
|
| 1173 |
+
[R4] Reward specific, accurate paraphrasing from Context
|
| 1174 |
+
[R5] Partial credit for partially correct answers
|
| 1175 |
+
|
| 1176 |
+
OUTPUT FORMAT (MANDATORY):
|
| 1177 |
+
You MUST output in this exact format:
|
| 1178 |
+
|
| 1179 |
+
## Analysis
|
| 1180 |
+
[Your detailed comparison of Student Answer vs Context]
|
| 1181 |
+
|
| 1182 |
+
## Score
|
| 1183 |
+
[X]/{max_marks}
|
| 1184 |
+
|
| 1185 |
+
Do NOT deviate from this format."""
|
| 1186 |
+
|
| 1187 |
+
# Add reference summary if provided (paper's key insight)
|
| 1188 |
+
reference_section = ""
|
| 1189 |
+
if reference_summary:
|
| 1190 |
+
reference_section = f"""
|
| 1191 |
+
|
| 1192 |
+
### REFERENCE SOLUTION (Perfect Answer Example):
|
| 1193 |
+
{reference_summary}
|
| 1194 |
+
|
| 1195 |
+
Use this as calibration for what a 100% answer looks like."""
|
| 1196 |
|
| 1197 |
user_prompt = f"""
|
| 1198 |
+
### Context (Retrieved from Source):
|
| 1199 |
+
{context}
|
| 1200 |
+
{reference_section}
|
| 1201 |
|
| 1202 |
+
### Question:
|
| 1203 |
+
{question}
|
| 1204 |
|
| 1205 |
+
### Student Answer:
|
| 1206 |
+
{student_answer}
|
| 1207 |
|
| 1208 |
+
### Maximum Marks: {max_marks}
|
| 1209 |
+
|
| 1210 |
+
Provide your grading following the mandatory output format.
|
| 1211 |
+
"""
|
| 1212 |
|
| 1213 |
messages = [
|
| 1214 |
{"role": "system", "content": system_prompt},
|
|
|
|
| 1218 |
input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 1219 |
inputs = self.tokenizer(input_text, return_tensors="pt")
|
| 1220 |
|
| 1221 |
+
# Strict sampling for consistency
|
| 1222 |
with torch.no_grad():
|
| 1223 |
outputs = self.model.generate(
|
| 1224 |
**inputs,
|
| 1225 |
+
max_new_tokens=200, # Increased for structured output
|
| 1226 |
+
temperature=0.1, # Very strict
|
| 1227 |
+
top_p=0.2,
|
| 1228 |
do_sample=True,
|
| 1229 |
+
repetition_penalty=1.2
|
| 1230 |
)
|
| 1231 |
|
| 1232 |
input_length = inputs['input_ids'].shape[1]
|
| 1233 |
response = self.tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
|
| 1234 |
+
|
| 1235 |
+
# Parse structured output
|
| 1236 |
+
analysis, score = self._parse_response(response, max_marks)
|
| 1237 |
+
|
| 1238 |
+
return {
|
| 1239 |
+
"grader_id": grader_id,
|
| 1240 |
+
"analysis": analysis,
|
| 1241 |
+
"score": score,
|
| 1242 |
+
"raw_response": response
|
| 1243 |
+
}
|
| 1244 |
+
|
| 1245 |
+
def _parse_response(self, response: str, max_marks: int) -> Tuple[str, int]:
|
| 1246 |
+
"""
|
| 1247 |
+
Parse structured response to extract analysis and score.
|
| 1248 |
+
Paper insight: Deterministic parsing of rigid templates.
|
| 1249 |
+
"""
|
| 1250 |
+
# Extract score using regex
|
| 1251 |
+
score_pattern = r'##\s*Score\s*\n\s*\[?(\d+)\]?/\d+'
|
| 1252 |
+
score_match = re.search(score_pattern, response, re.IGNORECASE)
|
| 1253 |
+
|
| 1254 |
+
if score_match:
|
| 1255 |
+
score = int(score_match.group(1))
|
| 1256 |
+
score = min(score, max_marks) # Cap at max
|
| 1257 |
+
else:
|
| 1258 |
+
# Fallback: look for any number/max pattern
|
| 1259 |
+
fallback_pattern = r'(\d+)\s*/\s*\d+'
|
| 1260 |
+
fallback_match = re.search(fallback_pattern, response)
|
| 1261 |
+
if fallback_match:
|
| 1262 |
+
score = min(int(fallback_match.group(1)), max_marks)
|
| 1263 |
+
else:
|
| 1264 |
+
score = 0 # Default if parsing fails
|
| 1265 |
+
|
| 1266 |
+
# Extract analysis
|
| 1267 |
+
analysis_pattern = r'##\s*Analysis\s*\n(.*?)(?=##\s*Score|$)'
|
| 1268 |
+
analysis_match = re.search(analysis_pattern, response, re.DOTALL | re.IGNORECASE)
|
| 1269 |
+
|
| 1270 |
+
if analysis_match:
|
| 1271 |
+
analysis = analysis_match.group(1).strip()
|
| 1272 |
+
else:
|
| 1273 |
+
# Fallback: use everything before score section
|
| 1274 |
+
analysis = response.split('##')[0].strip() if '##' in response else response
|
| 1275 |
+
|
| 1276 |
+
return analysis, score
|
| 1277 |
|
| 1278 |
|
| 1279 |
# ---------------------------------------------------------
|
| 1280 |
+
# 4. NEW: SUPERVISOR AGGREGATOR
|
| 1281 |
+
# Paper insight: Merge ensemble outputs into final decision
|
| 1282 |
+
# ---------------------------------------------------------
|
| 1283 |
+
class SupervisorAggregator:
    """
    Aggregates multiple grader outputs into a final consensus grade.

    The reference paper uses another LLM call as supervisor; we use
    statistical aggregation (median + disagreement spread) for CPU
    efficiency.
    """

    def aggregate(self, grader_results: List[Dict], max_marks: int) -> Dict:
        """
        Aggregate K grader results into a final score.

        Args:
            grader_results: per-grader dicts, each with at least
                'score' (int) and 'analysis' (str).
            max_marks: maximum attainable score for the question.

        Returns dict with:
            - final_score: int (median of ensemble)
            - individual_scores: List[int]
            - disagreement: int (max - min score)
            - needs_review: bool (disagreement >= 40% of max_marks)
            - consensus_analysis: str (merged per-grader analyses)
            - grader_details: the raw grader_results
        """
        # Robustness fix: an empty ensemble previously crashed
        # (np.median([]) -> NaN -> int() raises; max()/min() on empty raise).
        if not grader_results:
            return {
                "final_score": 0,
                "individual_scores": [],
                "disagreement": 0,
                "needs_review": True,  # no graders ran -> a human must look
                "consensus_analysis": "No grader results to aggregate.",
                "grader_details": [],
            }

        scores = [r['score'] for r in grader_results]

        # Median is robust to a single outlier grader (paper uses a
        # supervisor LLM call instead). int() truncates a .5 median.
        final_score = int(np.median(scores))

        # Spread between the harshest and the most lenient grader.
        disagreement = max(scores) - min(scores)

        # Flag for manual review if disagreement is too high.
        # Paper uses Dmax thresholds; we use 40% of max marks.
        needs_review = disagreement >= (0.4 * max_marks)

        consensus_analysis = self._merge_analyses(grader_results, final_score, disagreement)

        return {
            "final_score": final_score,
            "individual_scores": scores,
            "disagreement": disagreement,
            "needs_review": needs_review,
            "consensus_analysis": consensus_analysis,
            "grader_details": grader_results,
        }

    def _merge_analyses(self, results: List[Dict], final_score: int, disagreement: int) -> str:
        """Concatenate per-grader analyses into one consensus report."""

        output = f"**Ensemble Grading Results** (Final: {final_score}, Disagreement: ±{disagreement})\n\n"

        for i, result in enumerate(results, 1):
            output += f"**Grader {i} ({result['score']} points):**\n{result['analysis']}\n\n"

        if disagreement > 0:
            output += f"\n⚠️ **Note:** Graders disagreed by {disagreement} points. "
            # NOTE(review): this fixed threshold (5 points) is independent of
            # the 40%-of-max needs_review flag above -- confirm intent.
            if disagreement >= 5:
                output += "Consider manual review."

        return output
| 1337 |
+
|
| 1338 |
+
|
| 1339 |
+
# ---------------------------------------------------------
|
| 1340 |
+
# 5. ONNX RERANKER - UNCHANGED
|
| 1341 |
# ---------------------------------------------------------
|
| 1342 |
class OnnxReranker:
|
| 1343 |
def __init__(self):
|
|
|
|
| 1344 |
self.model_name = "Xenova/ms-marco-TinyBERT-L-2-v2"
|
| 1345 |
print(f"π Loading Reranker: {self.model_name}...")
|
| 1346 |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
|
|
|
| 1354 |
if not docs:
|
| 1355 |
return []
|
| 1356 |
|
|
|
|
| 1357 |
pairs = [[query, doc.page_content] for doc in docs]
|
| 1358 |
|
| 1359 |
inputs = self.tokenizer(
|
|
|
|
| 1367 |
with torch.no_grad():
|
| 1368 |
outputs = self.model(**inputs)
|
| 1369 |
|
|
|
|
|
|
|
| 1370 |
logits = outputs.logits
|
| 1371 |
if logits.shape[1] == 2:
|
| 1372 |
+
scores = logits[:, 1]
|
| 1373 |
else:
|
| 1374 |
scores = logits.flatten()
|
| 1375 |
|
|
|
|
| 1376 |
scores = scores.numpy().tolist()
|
| 1377 |
doc_score_pairs = list(zip(docs, scores))
|
| 1378 |
doc_score_pairs.sort(key=lambda x: x[1], reverse=True)
|
| 1379 |
|
|
|
|
| 1380 |
return [doc for doc, score in doc_score_pairs[:top_k]]
|
| 1381 |
|
| 1382 |
|
| 1383 |
# ---------------------------------------------------------
|
| 1384 |
+
# 6. ENHANCED MAIN SYSTEM WITH MULTI-STAGE PIPELINE
|
| 1385 |
# ---------------------------------------------------------
|
| 1386 |
+
class EnhancedVectorSystem:
|
| 1387 |
def __init__(self):
    """Wire up all pipeline stages; model components load on construction."""
    # FAISS index; None until process_content() has indexed some text.
    self.vector_store = None
    # Pipeline components (each constructor loads its own model weights).
    self.embeddings = OnnxBgeEmbeddings()
    self.llm = LLMEvaluator()
    self.reranker = OnnxReranker()
    self.presence_checker = AnswerPresenceChecker()
    self.supervisor = SupervisorAggregator()
    # Raw split chunks of the indexed source and their count.
    self.all_chunks = []
    self.total_chunks = 0
    self.reference_summary = None  # Store reference answer summary
| 1397 |
|
| 1398 |
def process_content(self, file_obj, raw_text):
|
| 1399 |
has_file = file_obj is not None
|
|
|
|
| 1410 |
if has_file:
|
| 1411 |
if file_obj.name.endswith('.pdf'):
|
| 1412 |
doc = fitz.open(file_obj.name)
|
| 1413 |
+
for page in doc:
|
| 1414 |
+
text += page.get_text()
|
| 1415 |
elif file_obj.name.endswith('.txt'):
|
| 1416 |
+
with open(file_obj.name, 'r', encoding='utf-8') as f:
|
| 1417 |
+
text = f.read()
|
| 1418 |
else:
|
| 1419 |
return "β Error: Only .pdf and .txt supported."
|
| 1420 |
else:
|
| 1421 |
text = raw_text
|
| 1422 |
|
| 1423 |
+
# Smaller chunks for precision
|
| 1424 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
|
| 1425 |
texts = text_splitter.split_text(text)
|
| 1426 |
+
self.all_chunks = texts
|
| 1427 |
|
|
|
|
| 1428 |
docs = [Document(page_content=t, metadata={"id": i}) for i, t in enumerate(texts)]
|
| 1429 |
self.total_chunks = len(docs)
|
| 1430 |
|
| 1431 |
+
if not docs:
|
| 1432 |
+
return "Content empty."
|
| 1433 |
|
| 1434 |
self.vector_store = FAISS.from_documents(docs, self.embeddings)
|
| 1435 |
|
| 1436 |
+
return f"β
Indexed {self.total_chunks} chunks. Ready for grading."
|
| 1437 |
except Exception as e:
|
| 1438 |
return f"Error: {str(e)}"
|
| 1439 |
+
|
| 1440 |
+
def set_reference_answer(self, reference_text: str) -> str:
    """Store (or clear) the reference answer used to calibrate grading.

    Paper insight: reference grounding prevents over-grading.

    Returns a short status message for the UI.
    """
    # Normalize: None / empty / whitespace-only all clear the reference.
    cleaned = (reference_text or "").strip()
    if not cleaned:
        self.reference_summary = None
        return "ℹ️ Reference answer cleared."

    self.reference_summary = cleaned
    return f"✅ Reference answer set ({len(self.reference_summary)} chars). Will be used to calibrate grading."
| 1451 |
|
| 1452 |
+
def process_query(self, question, student_answer, max_marks, enable_ensemble=True):
    """
    Multi-stage grading pipeline.

    Stages: (1) answer-presence gate, (2) retrieval + reranking,
    (3) single or K=3 ensemble LLM grading, (4) supervisor aggregation.

    Args:
        question: the exam question to grade against.
        student_answer: the student's free-text answer.
        max_marks: maximum marks for this question.
        enable_ensemble: run 3 independent graders and aggregate when
            True; a single grader otherwise.

    Returns:
        (evidence_markdown, feedback_markdown) tuple for the UI.
    """
    # Input guards
    if not self.vector_store:
        return "⚠️ Please upload a file or paste text first.", ""
    if not question:
        return "⚠️ Enter a question.", ""

    # Stage 1: Presence check (paper insight). This also rejects
    # empty/whitespace answers, so the later "is the answer empty?"
    # re-check the original carried was unreachable and is removed.
    is_present, presence_reason = self.presence_checker.check_presence(student_answer)
    if not is_present:
        return (f"⚠️ **No valid answer detected:** {presence_reason}",
                f"**Score: 0/{max_marks}**\n\nNo answer to grade.")

    # Stage 2: Retrieval + reranking (wide recall, then precise top-3).
    initial_docs = self.vector_store.similarity_search(question, k=15)
    top_docs = self.reranker.rank(question, initial_docs, top_k=3)
    expanded_context = "\n\n---\n\n".join(d.page_content for d in top_docs)

    evidence_display = f"### 📚 Retrieved Context (Top {len(top_docs)} chunks):\n"
    evidence_display += f"> {expanded_context[:500]}..."

    # Stages 3 + 4: grading (ensemble is the paper's key innovation).
    if enable_ensemble:
        llm_feedback = self._grade_ensemble(expanded_context, question, student_answer, max_marks)
    else:
        llm_feedback = self._grade_single(expanded_context, question, student_answer, max_marks)

    return evidence_display, llm_feedback

def _grade_ensemble(self, context, question, student_answer, max_marks):
    """Run K=3 independent graders, then aggregate via the supervisor."""
    grader_results = []
    for grader_id in range(1, 4):  # K=3 ensemble
        grader_results.append(self.llm.evaluate_single(
            context=context,
            question=question,
            student_answer=student_answer,
            max_marks=max_marks,
            grader_id=grader_id,
            reference_summary=self.reference_summary,
        ))

    final_result = self.supervisor.aggregate(grader_results, max_marks)

    feedback = f"# 📊 Final Grade: {final_result['final_score']}/{max_marks}\n\n"
    if final_result['needs_review']:
        feedback += "⚠️ **Manual Review Recommended** (High grader disagreement)\n\n"
    feedback += final_result['consensus_analysis']

    # Append ensemble statistics for transparency.
    feedback += f"\n\n---\n**Grading Statistics:**\n"
    feedback += f"- Individual Scores: {final_result['individual_scores']}\n"
    feedback += f"- Score Range: {min(final_result['individual_scores'])}-{max(final_result['individual_scores'])}\n"
    feedback += f"- Disagreement: ±{final_result['disagreement']} points\n"
    return feedback

def _grade_single(self, context, question, student_answer, max_marks):
    """Single-grader mode (kept for comparison with the ensemble)."""
    result = self.llm.evaluate_single(
        context=context,
        question=question,
        student_answer=student_answer,
        max_marks=max_marks,
        grader_id=1,
        reference_summary=self.reference_summary,
    )
    return f"# 📊 Grade: {result['score']}/{max_marks}\n\n{result['analysis']}"
|
| 1523 |
|
|
|
|
| 1524 |
|
| 1525 |
+
# ---------------------------------------------------------
# 7. GRADIO INTERFACE
# ---------------------------------------------------------
# Single shared pipeline instance backing every UI event.
system = EnhancedVectorSystem()

with gr.Blocks(title="EduGenius AI Grader - Enhanced", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ EduGenius: Enhanced RAG-Based Grader")
    gr.Markdown("Powered by **Ensemble Grading**, **Reference Grounding** & **Presence Checking**")
    gr.Markdown("*Implements multi-stage pipeline from research: arXiv:2601.00730*")

    with gr.Row():
        # Left panel: source-content ingestion + optional reference answer.
        with gr.Column(scale=1):
            gr.Markdown("### 📄 Source Content")
            pdf_input = gr.File(label="Option A: Upload Document (PDF/TXT)")
            gr.Markdown("**OR**")
            text_input = gr.Textbox(label="Option B: Paste Text", placeholder="Paste context here...", lines=5)

            upload_btn = gr.Button("📥 Index Content", variant="primary")
            status_msg = gr.Textbox(label="Status", interactive=False)

            gr.Markdown("---")
            gr.Markdown("### 🎯 Reference Answer (Optional)")
            gr.Markdown("*Providing a reference answer improves grading accuracy*")
            reference_input = gr.Textbox(
                label="Perfect Answer Example",
                placeholder="What would a 100% answer look like?",
                lines=3
            )
            ref_btn = gr.Button("Set Reference", variant="secondary")
            ref_status = gr.Textbox(label="Reference Status", interactive=False)

        # Right panel: question/answer entry, grading controls, results.
        with gr.Column(scale=2):
            gr.Markdown("### ❓ Grading Interface")

            with gr.Row():
                q_input = gr.Textbox(label="Question", scale=2)
                max_marks = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max Marks")

            a_input = gr.TextArea(label="Student Answer", lines=4)

            with gr.Row():
                ensemble_check = gr.Checkbox(label="Enable Ensemble Grading (K=3)", value=True)
                run_btn = gr.Button("🚀 Grade Answer", variant="primary", scale=2)

            gr.Markdown("---")

            with gr.Row():
                with gr.Column():
                    evidence_box = gr.Markdown(label="📚 Retrieved Context")
                with gr.Column():
                    grade_box = gr.Markdown(label="📊 Grading Result")

    # Wire UI events to the pipeline methods.
    upload_btn.click(
        system.process_content,
        inputs=[pdf_input, text_input],
        outputs=[status_msg]
    )

    ref_btn.click(
        system.set_reference_answer,
        inputs=[reference_input],
        outputs=[ref_status]
    )

    run_btn.click(
        system.process_query,
        inputs=[q_input, a_input, max_marks, ensemble_check],
        outputs=[evidence_box, grade_box]
    )

if __name__ == "__main__":
    demo.launch()