import gradio as gr
import fitz  # PyMuPDF
import torch
import os

# --- LANGCHAIN & RAG IMPORTS ---
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings

# --- ONNX & MODEL IMPORTS ---
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForCausalLM
from huggingface_hub import snapshot_download
import onnxruntime as ort

# Check available hardware accelerators
PROVIDERS = ort.get_available_providers()
print(f"⚡ Hardware Acceleration Providers: {PROVIDERS}")

# ---------------------------------------------------------
# 1. OPTIMIZED EMBEDDINGS (BGE-SMALL)
# ---------------------------------------------------------
class OnnxBgeEmbeddings(Embeddings):
    # CHANGE 1: Switched to 'bge-small' (3x faster than large, similar accuracy)
    def __init__(self, model_name="BAAI/bge-small-en-v1.5"):
        print(f"🔄 Loading Faster Embeddings: {model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = ORTModelForFeatureExtraction.from_pretrained(
            model_name,
            export=False,
            provider=PROVIDERS[0]  # Auto-select best hardware (CUDA/CoreML)
        )

    def _process_batch(self, texts):
        inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
        # Move inputs to the same device as the model if needed (mostly handled by Optimum)
        device = self.model.device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Take the [CLS] token embedding, then L2-normalize (standard for BGE models)
        embeddings = outputs.last_hidden_state[:, 0]
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        # Detach from the graph before converting to numpy
        return embeddings.cpu().numpy().tolist()

    def embed_documents(self, texts):
        return self._process_batch(texts)

    def embed_query(self, text):
        # BGE models expect this instruction prefix on retrieval queries
        return self._process_batch(["Represent this sentence for searching relevant passages: " + text])[0]


# ---------------------------------------------------------
# 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B)
# ---------------------------------------------------------
class LLMEvaluator:
    def __init__(self):
        # CHANGE 2: Switched to Qwen 2.5 0.5B (half the size of Llama 1B, very capable)
        self.repo_id = "Xenova/Qwen2.5-0.5B-Instruct"
        self.local_dir = "onnx_qwen_local"
        print(f"🔄 Preparing Ultra-Fast LLM: {self.repo_id}...")
        if not os.path.exists(self.local_dir):
            print(f"📥 Downloading Model to {self.local_dir}...")
            # Note: Xenova repos usually ship ready-made ONNX weights, so no wildcard filtering is needed
            snapshot_download(repo_id=self.repo_id, local_dir=self.local_dir)
            print("✅ Download complete.")
        self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
        # CHANGE 3: Enabled IO Binding + explicit provider
        self.model = ORTModelForCausalLM.from_pretrained(
            self.local_dir,
            use_cache=True,
            use_io_binding=True,  # CHANGE: Major speedup on GPU
            provider=PROVIDERS[0]
        )

    def evaluate(self, context, question, student_answer, max_marks):
        # Qwen uses the ChatML format implicitly via the tokenizer's chat template
        messages = [
            {"role": "system", "content": "You are a strict academic grader. Verify the student answer against the context. Be harsh. Do not hallucinate."},
            {"role": "user", "content": f"""
CONTEXT: {context}
QUESTION: {question}
ANSWER: {student_answer}
TASK: Grade out of {max_marks}.
RULES:
1. If wrong, 0 marks.
2. Be strict.
3. Format: 'Score: X/{max_marks} \n Feedback: ...'
"""}
        ]
        input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = self.tokenizer(input_text, return_tensors="pt")
        # Move inputs for IO Binding
        device = self.model.device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=75,  # CHANGE 4: Reduced tokens (we only need a short score/feedback)
                do_sample=False  # Greedy decoding; temperature has no effect when sampling is off
            )
        # 'inputs' is a plain dict after the device move, so index it by key
        response = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        return response
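
# --- Optional helper (not part of the original flow): extract a numeric score ---
# A minimal sketch for parsing the 'Score: X/<max>' line that the grading prompt
# above asks the model to emit. The function name and regex are assumptions;
# it returns None if the model ignored the requested format.
import re

def parse_score(response, max_marks):
    match = re.search(rf"Score:\s*(\d+(?:\.\d+)?)\s*/\s*{max_marks}", response)
    return float(match.group(1)) if match else None
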
"""} ] input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) inputs = self.tokenizer(input_text, return_tensors="pt") # Move inputs for IO Binding device = self.model.device inputs = {k: v.to(device) for k, v in inputs.items()} with torch.no_grad(): outputs = self.model.generate( **inputs, max_new_tokens=75, # CHANGE 4: Reduced tokens (we only need a short score/feedback) temperature=0.1, do_sample=False ) response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) return response # --------------------------------------------------------- # 3. Main Application Logic (Unchanged but uses new classes) # --------------------------------------------------------- class VectorSystem: def __init__(self): self.vector_store = None self.embeddings = OnnxBgeEmbeddings() # Uses new BGE-Small self.llm = LLMEvaluator() # Uses new Qwen 0.5B self.all_chunks = [] self.total_chunks = 0 def process_file(self, file_obj): if file_obj is None: return "No file uploaded." try: text = "" if file_obj.name.endswith('.pdf'): doc = fitz.open(file_obj.name) for page in doc: text += page.get_text() elif file_obj.name.endswith('.txt'): with open(file_obj.name, 'r', encoding='utf-8') as f: text = f.read() else: return "❌ Error: Only .pdf and .txt supported." text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100) self.all_chunks = text_splitter.split_text(text) self.total_chunks = len(self.all_chunks) if not self.all_chunks: return "File empty." metadatas = [{"id": i} for i in range(self.total_chunks)] self.vector_store = FAISS.from_texts(self.all_chunks, self.embeddings, metadatas=metadatas) return f"✅ Indexed {self.total_chunks} chunks." except Exception as e: return f"Error: {str(e)}" def process_query(self, question, student_answer, max_marks): if not self.vector_store: return "⚠️ Please upload a file first.", "" if not question: return "⚠️ Enter a question.", "" results = self.vector_store.similarity_search_with_score(question, k=1) top_doc, score = results[0] center_id = top_doc.metadata['id'] start_id = max(0, center_id - 1) end_id = min(self.total_chunks - 1, center_id + 1) expanded_context = "" for i in range(start_id, end_id + 1): expanded_context += self.all_chunks[i] + "\n" evidence_display = f"### 📚 Expanded Context (Chunks {start_id} to {end_id}):\n" evidence_display += f"> ... {expanded_context} ..." llm_feedback = "Please enter a student answer to grade." if student_answer: llm_feedback = self.llm.evaluate(expanded_context, question, student_answer, max_marks) return evidence_display, llm_feedback system = VectorSystem() with gr.Blocks(title="EduGenius AI Grader") as demo: gr.Markdown("# ⚡ EduGenius: Ultra-Fast RAG") gr.Markdown("Powered by **Qwen-2.5-0.5B** and **BGE-Small** (ONNX Optimized)") with gr.Row(): with gr.Column(scale=1): pdf_input = gr.File(label="1. 
Upload Chapter") upload_btn = gr.Button("Index Content", variant="primary") status_msg = gr.Textbox(label="Status", interactive=False) with gr.Column(scale=2): with gr.Row(): q_input = gr.Textbox(label="Question", scale=2) max_marks = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max Marks") a_input = gr.TextArea(label="Student Answer") run_btn = gr.Button("Retrieve & Grade", variant="secondary") with gr.Row(): evidence_box = gr.Markdown(label="Context Used") grade_box = gr.Markdown(label="Grading Result") upload_btn.click(system.process_file, inputs=[pdf_input], outputs=[status_msg]) run_btn.click(system.process_query, inputs=[q_input, a_input, max_marks], outputs=[evidence_box, grade_box]) if __name__ == "__main__": demo.launch()