File size: 8,592 Bytes
317a0d7
2c19d14
 
317a0d7
 
2c19d14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317a0d7
 
2c19d14
 
 
317a0d7
 
2c19d14
 
 
 
 
 
 
 
 
 
 
 
 
317a0d7
 
2c19d14
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import gradio as gr
import fitz  # PyMuPDF
import torch
import os

# --- LANGCHAIN & RAG IMPORTS ---
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings

# --- ONNX & MODEL IMPORTS ---
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForCausalLM
from huggingface_hub import snapshot_download
import onnxruntime as ort

# Check available hardware accelerators.
# PROVIDERS holds the execution providers reported by this onnxruntime install;
# both model loaders in this file pass PROVIDERS[0] as their `provider`.
# NOTE(review): the list is presumably ordered best-first (e.g. CUDA before
# CPU) -- confirm against the onnxruntime documentation.
PROVIDERS = ort.get_available_providers()
print(f"⚑ Hardware Acceleration Providers: {PROVIDERS}")

# ---------------------------------------------------------
# 1. OPTIMIZED EMBEDDINGS (BGE-SMALL)
# ---------------------------------------------------------
class OnnxBgeEmbeddings(Embeddings):
    """LangChain ``Embeddings`` implementation backed by an ONNX BGE model.

    Each text is tokenized (truncated to 512 tokens), run through the ONNX
    model, and represented by the L2-normalised hidden state of its first
    token. Documents are embedded in fixed-size batches so that memory use
    does not grow with the number of chunks in an uploaded file.

    Args:
        model_name: Hugging Face repo id of the embedding model. The repo must
            already contain an ONNX export (``export=False``).
        batch_size: Maximum number of texts sent through the model at once.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    # Prefix added to queries only (never to documents) before embedding.
    QUERY_INSTRUCTION = "Represent this sentence for searching relevant passages: "

    # CHANGE 1: Switched to 'bge-small' (3x faster than large, similar accuracy)
    def __init__(self, model_name="BAAI/bge-small-en-v1.5", batch_size=32):
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.batch_size = batch_size

        print(f"πŸ”„ Loading Faster Embeddings: {model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.model = ORTModelForFeatureExtraction.from_pretrained(
            model_name,
            export=False,
            provider=PROVIDERS[0],  # First provider reported by onnxruntime
        )

    def _process_batch(self, texts):
        """Embed one batch of texts and return a list of float vectors.

        Returns an empty list for empty input instead of handing an empty
        batch to the tokenizer.
        """
        if not texts:
            return []

        encoded = self.tokenizer(
            texts, padding=True, truncation=True, max_length=512, return_tensors="pt"
        )

        # Move inputs to the same device as the model (mostly handled by Optimum)
        device = self.model.device
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            outputs = self.model(**encoded)

        # Use the first-token hidden state as the sentence vector, then
        # L2-normalise each row.
        cls_vectors = outputs.last_hidden_state[:, 0]
        cls_vectors = torch.nn.functional.normalize(cls_vectors, p=2, dim=1)
        return cls_vectors.cpu().numpy().tolist()

    def embed_documents(self, texts):
        """Embed ``texts`` in chunks of ``self.batch_size``, preserving order."""
        texts = list(texts)
        vectors = []
        for start in range(0, len(texts), self.batch_size):
            vectors.extend(self._process_batch(texts[start:start + self.batch_size]))
        return vectors

    def embed_query(self, text):
        """Embed a single search query, prefixed with the query instruction."""
        return self._process_batch([self.QUERY_INSTRUCTION + text])[0]

# ---------------------------------------------------------
# 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B)
# ---------------------------------------------------------
class LLMEvaluator:
    """Grades a student answer against retrieved context with a small ONNX LLM.

    On construction the model repository is downloaded into ``local_dir``
    (only when that directory does not exist yet) and then loaded through
    Optimum's ONNX Runtime causal-LM wrapper.
    """

    def __init__(self):
        # CHANGE 2: Switched to Qwen 2.5 0.5B (Half the size of Llama 1B, very smart)
        self.repo_id = "Xenova/Qwen2.5-0.5B-Instruct"
        self.local_dir = "onnx_qwen_local"

        print(f"πŸ”„ Preparing Ultra-Fast LLM: {self.repo_id}...")

        # NOTE(review): only the directory's existence is checked, so an
        # interrupted download will be treated as complete on the next start.
        if not os.path.exists(self.local_dir):
            print(f"πŸ“₯ Downloading Model to {self.local_dir}...")
            # Note: Xenova repos usually have the ONNX ready, no complex wildcard needed
            snapshot_download(repo_id=self.repo_id, local_dir=self.local_dir)
            print("βœ… Download complete.")

        self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)

        # CHANGE 3: Enabled IO Binding + Explicit Provider
        self.model = ORTModelForCausalLM.from_pretrained(
            self.local_dir,
            use_cache=True,
            use_io_binding=True,  # CHANGE: Major speedup on GPU
            provider=PROVIDERS[0],
        )

    def evaluate(self, context, question, student_answer, max_marks):
        """Ask the LLM to grade ``student_answer`` and return its raw reply.

        Args:
            context: Reference text the answer is checked against.
            question: The question that was asked.
            student_answer: The answer to grade.
            max_marks: Maximum score; interpolated into the prompt.

        Returns:
            The decoded text of the newly generated tokens only (the prompt
            is stripped), expected to follow 'Score: X/N / Feedback: ...'.
        """
        # The chat template is applied by the tokenizer itself.
        messages = [
            {"role": "system", "content": "You are a strict academic grader. Verify the student answer against the context. Be harsh. Do not halluncinate."},
            {"role": "user", "content": f"""
            CONTEXT: {context}
            QUESTION: {question}
            ANSWER: {student_answer}
            
            TASK: Grade out of {max_marks}.
            RULES:
            1. If wrong, 0 marks.
            2. Be strict.
            3. Format: 'Score: X/{max_marks} \n Feedback: ...'
            """}
        ]

        input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        encoded = self.tokenizer(input_text, return_tensors="pt")

        # Move inputs to the model's device. This produces a plain dict, so
        # everything below must use key access, not attribute access.
        device = self.model.device
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=75,  # CHANGE 4: Reduced tokens (we only need a short score/feedback)
                # Greedy decoding; a sampling temperature would be ignored
                # here and only triggers a generation-config warning.
                do_sample=False,
            )

        # Drop the echoed prompt tokens and decode only the completion.
        response = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        return response

# ---------------------------------------------------------
# 3. Main Application Logic (Unchanged but uses new classes)
# ---------------------------------------------------------
class VectorSystem:
    """Indexes an uploaded document and grades answers against it.

    Attributes are only replaced after a new file has been read, split and
    indexed successfully, so ``vector_store``, ``all_chunks`` and
    ``total_chunks`` always describe the same document.
    """

    def __init__(self):
        self.vector_store = None
        self.embeddings = OnnxBgeEmbeddings()  # Uses new BGE-Small
        self.llm = LLMEvaluator()  # Uses new Qwen 0.5B
        self.all_chunks = []
        self.total_chunks = 0

    @staticmethod
    def _read_text(path):
        """Return the text of a .pdf or .txt file, or None if unsupported.

        The extension check is case-insensitive. The PDF document is opened
        in a ``with`` block so it is closed even if text extraction fails.
        """
        lowered = path.lower()
        if lowered.endswith('.pdf'):
            with fitz.open(path) as doc:
                return "".join(page.get_text() for page in doc)
        if lowered.endswith('.txt'):
            with open(path, 'r', encoding='utf-8') as f:
                return f.read()
        return None

    def process_file(self, file_obj):
        """Read, split and index an uploaded file; return a status message.

        Args:
            file_obj: Either a filesystem path or an object whose ``name``
                attribute is the path of the uploaded file; ``None`` when
                nothing was uploaded.

        Returns:
            A human-readable status string. Errors are reported in the
            string rather than raised, because the result is shown directly
            in the UI status box.
        """
        if file_obj is None:
            return "No file uploaded."
        try:
            if isinstance(file_obj, (str, os.PathLike)):
                path = os.fspath(file_obj)
            else:
                path = file_obj.name

            text = self._read_text(path)
            if text is None:
                return "❌ Error: Only .pdf and .txt supported."

            text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
            chunks = text_splitter.split_text(text)
            if not chunks:
                return "File empty."

            # Each chunk's position is stored as metadata so neighbouring
            # chunks can be looked up at query time.
            metadatas = [{"id": i} for i in range(len(chunks))]
            store = FAISS.from_texts(chunks, self.embeddings, metadatas=metadatas)
        except Exception as e:
            # UI boundary: surface the problem as a status message.
            return f"Error: {str(e)}"

        # Commit only after everything succeeded, so a failed or empty upload
        # never leaves the previous index paired with a different chunk list.
        self.vector_store = store
        self.all_chunks = chunks
        self.total_chunks = len(chunks)
        return f"βœ… Indexed {self.total_chunks} chunks."

    def process_query(self, question, student_answer, max_marks):
        """Retrieve context for ``question`` and optionally grade an answer.

        The best-matching chunk is expanded with its immediate neighbours
        (one before, one after, clamped to the document bounds).

        Returns:
            A ``(evidence_markdown, feedback)`` tuple. ``feedback`` is the
            LLM's reply, a prompt to enter an answer, or ``""`` when the
            request could not be processed.
        """
        if self.vector_store is None:
            return "⚠️ Please upload a file first.", ""
        if not question or not question.strip():
            return "⚠️ Enter a question.", ""

        results = self.vector_store.similarity_search_with_score(question, k=1)
        top_doc, _ = results[0]

        center_id = top_doc.metadata['id']
        start_id = max(0, center_id - 1)
        end_id = min(self.total_chunks - 1, center_id + 1)

        expanded_context = "".join(
            chunk + "\n" for chunk in self.all_chunks[start_id:end_id + 1]
        )

        evidence_display = f"### πŸ“š Expanded Context (Chunks {start_id} to {end_id}):\n"
        evidence_display += f"> ... {expanded_context} ..."

        llm_feedback = "Please enter a student answer to grade."
        if student_answer:
            llm_feedback = self.llm.evaluate(expanded_context, question, student_answer, max_marks)

        return evidence_display, llm_feedback

# Single shared application state. Constructing it loads both models at
# import time, and every UI callback below operates on this one instance.
system = VectorSystem()

# UI layout: left column uploads and indexes a file, right column takes a
# question plus a student answer and shows the retrieved context and grade.
with gr.Blocks(title="EduGenius AI Grader") as demo:
    gr.Markdown("# ⚑ EduGenius: Ultra-Fast RAG")
    gr.Markdown("Powered by **Qwen-2.5-0.5B** and **BGE-Small** (ONNX Optimized)")
    
    with gr.Row():
        # Left column: document upload and indexing status.
        with gr.Column(scale=1):
            pdf_input = gr.File(label="1. Upload Chapter")
            upload_btn = gr.Button("Index Content", variant="primary")
            status_msg = gr.Textbox(label="Status", interactive=False)

        # Right column: question, marks, answer, and the two result panes.
        with gr.Column(scale=2):
            with gr.Row():
                q_input = gr.Textbox(label="Question", scale=2)
                max_marks = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max Marks")
            
            a_input = gr.TextArea(label="Student Answer")
            run_btn = gr.Button("Retrieve & Grade", variant="secondary")
            
            with gr.Row():
                evidence_box = gr.Markdown(label="Context Used")
                grade_box = gr.Markdown(label="Grading Result")

    # process_file returns one status string; process_query returns an
    # (evidence, feedback) pair that maps onto the two Markdown panes.
    upload_btn.click(system.process_file, inputs=[pdf_input], outputs=[status_msg])
    run_btn.click(system.process_query, inputs=[q_input, a_input, max_marks], outputs=[evidence_box, grade_box])

# Start the Gradio server only when run as a script.
if __name__ == "__main__":
    demo.launch()