import gradio as gr
import fitz # PyMuPDF
import torch
import os
# --- LANGCHAIN & RAG IMPORTS ---
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings
# --- ONNX & MODEL IMPORTS ---
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForCausalLM
from huggingface_hub import snapshot_download
import onnxruntime as ort
# Ask ONNX Runtime which execution providers this installation exposes and log
# them at start-up.
# NOTE(review): the model loaders in this file pass PROVIDERS[0] as `provider`,
# so the first entry of this list decides where inference runs.
PROVIDERS = ort.get_available_providers()
print(f"β‘ Hardware Acceleration Providers: {PROVIDERS}")
# ---------------------------------------------------------
# 1. OPTIMIZED EMBEDDINGS (BGE-SMALL)
# ---------------------------------------------------------
class OnnxBgeEmbeddings(Embeddings):
    """LangChain ``Embeddings`` implementation backed by an ONNX BGE model.

    Each text is tokenised (truncated to 512 tokens), run through the ONNX
    feature-extraction model, and represented by the L2-normalised hidden
    state of its first token.
    """

    # Instruction prepended to queries (not to documents) before embedding.
    QUERY_INSTRUCTION = "Represent this sentence for searching relevant passages: "

    def __init__(self, model_name="BAAI/bge-small-en-v1.5", batch_size=32):
        """Load the tokenizer and ONNX model.

        Args:
            model_name: Hugging Face repo id of the embedding model.
            batch_size: Maximum number of texts sent through the model at
                once by ``embed_documents``; bounds peak memory when a large
                document is indexed.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        self.batch_size = batch_size
        print(f"π Loading Faster Embeddings: {model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = ORTModelForFeatureExtraction.from_pretrained(
            model_name,
            # Use the ONNX weights as published instead of converting on load.
            export=False,
            # First provider reported by ONNX Runtime.
            provider=PROVIDERS[0],
        )

    def _process_batch(self, texts):
        """Embed one batch of texts and return a list of float vectors."""
        # The tokenizer cannot pad an empty batch; there is nothing to embed.
        if not texts:
            return []
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        # Keep the inputs on the same device the model reports.
        device = self.model.device
        inputs = {key: tensor.to(device) for key, tensor in encoded.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        # First-token hidden state, scaled to unit length.
        first_token = outputs.last_hidden_state[:, 0]
        unit_vectors = torch.nn.functional.normalize(first_token, p=2, dim=1)
        return unit_vectors.cpu().numpy().tolist()

    def embed_documents(self, texts):
        """Embed ``texts`` in slices of ``batch_size``, preserving order."""
        texts = list(texts)
        vectors = []
        for start in range(0, len(texts), self.batch_size):
            vectors.extend(self._process_batch(texts[start:start + self.batch_size]))
        return vectors

    def embed_query(self, text):
        """Embed a single search query, prefixed with the query instruction."""
        return self._process_batch([self.QUERY_INSTRUCTION + text])[0]
# ---------------------------------------------------------
# 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B)
# ---------------------------------------------------------
class LLMEvaluator:
    """Grades a student answer against retrieved context with an ONNX LLM."""

    def __init__(self):
        """Download the model on first use, then load tokenizer and model."""
        self.repo_id = "Xenova/Qwen2.5-0.5B-Instruct"
        self.local_dir = "onnx_qwen_local"
        print(f"π Preparing Ultra-Fast LLM: {self.repo_id}...")
        # Only download when the local directory does not exist yet.
        if not os.path.exists(self.local_dir):
            print(f"π₯ Downloading Model to {self.local_dir}...")
            snapshot_download(repo_id=self.repo_id, local_dir=self.local_dir)
            # NOTE(review): this literal was split across two lines in the
            # reviewed copy; it is rejoined here with the check-mark glyph it
            # appears to have started with -- confirm against the original.
            print("✅ Download complete.")
        self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
        self.model = ORTModelForCausalLM.from_pretrained(
            self.local_dir,
            use_cache=True,
            use_io_binding=True,
            # First provider reported by ONNX Runtime.
            provider=PROVIDERS[0],
        )

    def evaluate(self, context, question, student_answer, max_marks):
        """Ask the model to grade ``student_answer`` and return its reply.

        Args:
            context: Reference text the answer is checked against.
            question: The question that was asked.
            student_answer: The answer to grade.
            max_marks: Maximum score, interpolated into the prompt.

        Returns:
            The decoded text generated after the prompt (at most 75 new
            tokens), with special tokens removed.
        """
        messages = [
            {"role": "system", "content": "You are a strict academic grader. Verify the student answer against the context. Be harsh. Do not halluncinate."},
            {"role": "user", "content": f"""
CONTEXT: {context}
QUESTION: {question}
ANSWER: {student_answer}
TASK: Grade out of {max_marks}.
RULES:
1. If wrong, 0 marks.
2. Be strict.
3. Format: 'Score: X/{max_marks} \n Feedback: ...'
"""}
        ]
        prompt = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        encoded = self.tokenizer(prompt, return_tensors="pt")
        # Keep the inputs on the same device the model reports.
        device = self.model.device
        inputs = {key: tensor.to(device) for key, tensor in encoded.items()}
        # `inputs` is a plain dict from here on, so the prompt length has to be
        # read by key; attribute access (`inputs.input_ids`) raises
        # AttributeError.
        prompt_len = inputs["input_ids"].shape[1]
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=75,  # a score plus short feedback is enough
                # Greedy decoding; `temperature` is not passed because it has
                # no effect when sampling is disabled.
                do_sample=False,
            )
        # Drop the echoed prompt tokens and decode only the generated part.
        return self.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
# ---------------------------------------------------------
# 3. Main Application Logic (Unchanged but uses new classes)
# ---------------------------------------------------------
class VectorSystem:
    """Indexes an uploaded document and grades answers against it.

    Attributes are only replaced after a document has been indexed
    successfully, so a failed or empty upload leaves the previous index usable.
    """

    def __init__(self):
        """Create the embedding model and the LLM grader; no index yet."""
        self.vector_store = None
        self.embeddings = OnnxBgeEmbeddings()
        self.llm = LLMEvaluator()
        self.all_chunks = []
        self.total_chunks = 0

    def process_file(self, file_obj):
        """Read a .pdf or .txt upload, chunk it and build the FAISS index.

        Args:
            file_obj: Either a filesystem path string or an object exposing
                the path as ``.name``; ``None`` means nothing was uploaded.

        Returns:
            A status message for display. Errors are reported in the message
            rather than raised.
        """
        if file_obj is None:
            return "No file uploaded."
        # Accept both a plain path and a file wrapper carrying `.name`.
        path = file_obj if isinstance(file_obj, str) else file_obj.name
        lowered = path.lower()  # extension match should not depend on case
        is_pdf = lowered.endswith(".pdf")
        if not is_pdf and not lowered.endswith(".txt"):
            return "β Error: Only .pdf and .txt supported."
        try:
            if is_pdf:
                # Context manager closes the document even if a page fails.
                with fitz.open(path) as doc:
                    text = "".join(page.get_text() for page in doc)
            else:
                with open(path, "r", encoding="utf-8") as f:
                    text = f.read()
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
            chunks = text_splitter.split_text(text)
            if not chunks:
                return "File empty."
            # Each chunk records its position so neighbours can be looked up.
            metadatas = [{"id": i} for i in range(len(chunks))]
            store = FAISS.from_texts(chunks, self.embeddings, metadatas=metadatas)
        except Exception as e:
            return f"Error: {str(e)}"
        # Commit all three pieces of state together so the chunk list and the
        # index can never describe different documents.
        self.all_chunks = chunks
        self.total_chunks = len(chunks)
        self.vector_store = store
        # NOTE(review): this literal was split across two lines in the
        # reviewed copy; rejoined with the check-mark glyph it appears to have
        # started with -- confirm against the original.
        return f"✅ Indexed {self.total_chunks} chunks."

    def process_query(self, question, student_answer, max_marks):
        """Retrieve context for ``question`` and optionally grade an answer.

        The best-matching chunk plus its immediate neighbours form the
        context. Grading only runs when ``student_answer`` is non-empty.

        Returns:
            A ``(evidence_markdown, feedback)`` tuple of strings.
        """
        if self.vector_store is None:
            return "β οΈ Please upload a file first.", ""
        if not question or not question.strip():
            return "β οΈ Enter a question.", ""
        results = self.vector_store.similarity_search_with_score(question, k=1)
        top_doc, _score = results[0]
        center_id = top_doc.metadata['id']
        # Widen the window by one chunk on each side, clamped to the document.
        start_id = max(0, center_id - 1)
        end_id = min(self.total_chunks - 1, center_id + 1)
        expanded_context = "".join(
            chunk + "\n" for chunk in self.all_chunks[start_id:end_id + 1]
        )
        evidence_display = (
            f"### π Expanded Context (Chunks {start_id} to {end_id}):\n"
            f"> ... {expanded_context} ..."
        )
        llm_feedback = "Please enter a student answer to grade."
        if student_answer:
            llm_feedback = self.llm.evaluate(expanded_context, question, student_answer, max_marks)
        return evidence_display, llm_feedback
# Build the shared application state at import time. VectorSystem() constructs
# the embedding model and the LLM grader in its __init__.
system = VectorSystem()
# NOTE(review): the indentation of the layout below was lost in this copy.
# Which components sit inside which Row/Column cannot be read off the flat
# text (e.g. whether the answer box and the result row belong to the second
# column) -- restore the nesting from the original file before running.
with gr.Blocks(title="EduGenius AI Grader") as demo:
gr.Markdown("# β‘ EduGenius: Ultra-Fast RAG")
gr.Markdown("Powered by **Qwen-2.5-0.5B** and **BGE-Small** (ONNX Optimized)")
with gr.Row():
# Upload side: file picker, index button and a read-only status line.
with gr.Column(scale=1):
pdf_input = gr.File(label="1. Upload Chapter")
upload_btn = gr.Button("Index Content", variant="primary")
status_msg = gr.Textbox(label="Status", interactive=False)
# Grading side: question, mark ceiling (1-20, default 5) and the answer.
with gr.Column(scale=2):
with gr.Row():
q_input = gr.Textbox(label="Question", scale=2)
max_marks = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max Marks")
a_input = gr.TextArea(label="Student Answer")
run_btn = gr.Button("Retrieve & Grade", variant="secondary")
# Output panes for the retrieved context and the grader's reply.
with gr.Row():
evidence_box = gr.Markdown(label="Context Used")
grade_box = gr.Markdown(label="Grading Result")
# Indexing writes its status message into the status box.
upload_btn.click(system.process_file, inputs=[pdf_input], outputs=[status_msg])
# Grading returns (evidence, feedback), mapped onto the two output panes.
run_btn.click(system.process_query, inputs=[q_input, a_input, max_marks], outputs=[evidence_box, grade_box])
# Start the Gradio server only when this file is executed as a script.
# (A stray trailing "|" left over from the page table was removed here.)
if __name__ == "__main__":
    demo.launch()