# try_answer / app.py
import gradio as gr
import fitz # PyMuPDF
import torch
import os
# --- LANGCHAIN & RAG IMPORTS ---
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings
# --- ONNX & MODEL IMPORTS ---
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForCausalLM
from huggingface_hub import snapshot_download
import onnxruntime as ort
# Check available hardware accelerators
PROVIDERS = ort.get_available_providers()
print(f"⚑ Hardware Acceleration Providers: {PROVIDERS}")
# ---------------------------------------------------------
# 1. OPTIMIZED EMBEDDINGS (BGE-SMALL)
# ---------------------------------------------------------
class OnnxBgeEmbeddings(Embeddings):
    # CHANGE 1: Switched to 'bge-small' (3x faster than large, similar accuracy)
    def __init__(self, model_name="BAAI/bge-small-en-v1.5"):
        print(f"🔄 Loading Faster Embeddings: {model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = ORTModelForFeatureExtraction.from_pretrained(
            model_name,
            export=False,
            provider=PROVIDERS[0]  # Auto-select best hardware (CUDA/CoreML)
        )

    def _process_batch(self, texts):
        inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
        # Move inputs to the same device as the model (mostly handled by Optimum)
        device = self.model.device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
            # CLS-token pooling followed by L2 normalisation, the standard
            # recipe for BGE retrieval embeddings
            embeddings = outputs.last_hidden_state[:, 0]
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        # Detach from graph before converting to numpy
        return embeddings.cpu().numpy().tolist()

    def embed_documents(self, texts):
        return self._process_batch(texts)

    def embed_query(self, text):
        # BGE models expect this instruction prefix on queries (not on documents)
        return self._process_batch(["Represent this sentence for searching relevant passages: " + text])[0]
# ---------------------------------------------------------
# 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B)
# ---------------------------------------------------------
class LLMEvaluator:
    def __init__(self):
        # CHANGE 2: Switched to Qwen 2.5 0.5B (half the size of Llama 1B, very capable)
        self.repo_id = "Xenova/Qwen2.5-0.5B-Instruct"
        self.local_dir = "onnx_qwen_local"
        print(f"🔄 Preparing Ultra-Fast LLM: {self.repo_id}...")
        if not os.path.exists(self.local_dir):
            print(f"📥 Downloading Model to {self.local_dir}...")
            # Note: Xenova repos ship ready-made ONNX weights, so no conversion
            # or complex wildcard filtering is needed
            snapshot_download(repo_id=self.repo_id, local_dir=self.local_dir)
            print("✅ Download complete.")
        self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
        # CHANGE 3: Enabled IO Binding + explicit provider
        self.model = ORTModelForCausalLM.from_pretrained(
            self.local_dir,
            use_cache=True,
            use_io_binding=True,  # CHANGE: major speedup on GPU
            provider=PROVIDERS[0]
        )

    def evaluate(self, context, question, student_answer, max_marks):
        # Qwen uses the ChatML format, applied implicitly via the tokenizer
        messages = [
            {"role": "system", "content": "You are a strict academic grader. Verify the student answer against the context. Be harsh. Do not hallucinate."},
            {"role": "user", "content": f"""
CONTEXT: {context}
QUESTION: {question}
ANSWER: {student_answer}
TASK: Grade out of {max_marks}.
RULES:
1. If wrong, 0 marks.
2. Be strict.
3. Format: 'Score: X/{max_marks} \n Feedback: ...'
"""}
        ]
        input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = self.tokenizer(input_text, return_tensors="pt")
        # Move inputs to the model's device for IO Binding
        device = self.model.device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=75,  # CHANGE 4: reduced budget; only a short score + feedback is needed
                do_sample=False  # greedy decoding; temperature has no effect with sampling off
            )
        # Slice off the prompt tokens so only the generated reply is decoded
        response = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        return response
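
# Hedged usage sketch for the grader (not called by the app; the context,
# question and answer strings are made up for illustration):
def _evaluator_sketch():
    llm = LLMEvaluator()
    feedback = llm.evaluate(
        context="The mitochondrion is the site of aerobic respiration.",
        question="Where does aerobic respiration occur?",
        student_answer="In the mitochondria.",
        max_marks=5,
    )
    print(feedback)  # expected shape: 'Score: X/5 \n Feedback: ...'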
# ---------------------------------------------------------
# 3. Main Application Logic (Unchanged but uses new classes)
# ---------------------------------------------------------
class VectorSystem:
    def __init__(self):
        self.vector_store = None
        self.embeddings = OnnxBgeEmbeddings()  # uses the new BGE-Small
        self.llm = LLMEvaluator()  # uses the new Qwen 0.5B
        self.all_chunks = []
        self.total_chunks = 0

    # Indexing: split the document into overlapping chunks and embed them into
    # FAISS (see the headless sketch after this class for end-to-end usage)
    def process_file(self, file_obj):
        if file_obj is None:
            return "No file uploaded."
        try:
            text = ""
            if file_obj.name.endswith('.pdf'):
                doc = fitz.open(file_obj.name)
                for page in doc:
                    text += page.get_text()
                doc.close()
            elif file_obj.name.endswith('.txt'):
                with open(file_obj.name, 'r', encoding='utf-8') as f:
                    text = f.read()
            else:
                return "❌ Error: Only .pdf and .txt supported."
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
            self.all_chunks = text_splitter.split_text(text)
            self.total_chunks = len(self.all_chunks)
            if not self.all_chunks:
                return "File empty."
            # Store each chunk's position so neighbours can be re-assembled later
            metadatas = [{"id": i} for i in range(self.total_chunks)]
            self.vector_store = FAISS.from_texts(self.all_chunks, self.embeddings, metadatas=metadatas)
            return f"✅ Indexed {self.total_chunks} chunks."
        except Exception as e:
            return f"Error: {str(e)}"
    def process_query(self, question, student_answer, max_marks):
        if not self.vector_store:
            return "⚠️ Please upload a file first.", ""
        if not question:
            return "⚠️ Enter a question.", ""
        results = self.vector_store.similarity_search_with_score(question, k=1)
        top_doc, score = results[0]
        # Expand the top hit to a window of one neighbouring chunk on each
        # side, so the LLM sees sentences the 800-char split may have cut off
        center_id = top_doc.metadata['id']
        start_id = max(0, center_id - 1)
        end_id = min(self.total_chunks - 1, center_id + 1)
        expanded_context = ""
        for i in range(start_id, end_id + 1):
            expanded_context += self.all_chunks[i] + "\n"
        evidence_display = f"### 📚 Expanded Context (Chunks {start_id} to {end_id}):\n"
        evidence_display += f"> ... {expanded_context} ..."
        llm_feedback = "Please enter a student answer to grade."
        if student_answer:
            llm_feedback = self.llm.evaluate(expanded_context, question, student_answer, max_marks)
        return evidence_display, llm_feedback
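
# Headless end-to-end sketch of the pipeline above (not called by the app).
# 'chapter.txt' is a hypothetical file, the question/answer pair is made up,
# and the stub class merely mimics the .name attribute Gradio's upload
# object exposes:
def _pipeline_sketch():
    vs = VectorSystem()

    class _FakeUpload:  # stand-in for Gradio's file object
        name = "chapter.txt"

    print(vs.process_file(_FakeUpload()))
    evidence, grade = vs.process_query(
        question="Define osmosis.",
        student_answer="Movement of water across a semi-permeable membrane.",
        max_marks=5,
    )
    print(evidence)
    print(grade)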
system = VectorSystem()
with gr.Blocks(title="EduGenius AI Grader") as demo:
    gr.Markdown("# ⚡ EduGenius: Ultra-Fast RAG")
    gr.Markdown("Powered by **Qwen-2.5-0.5B** and **BGE-Small** (ONNX Optimized)")
    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(label="1. Upload Chapter")
            upload_btn = gr.Button("Index Content", variant="primary")
            status_msg = gr.Textbox(label="Status", interactive=False)
        with gr.Column(scale=2):
            with gr.Row():
                q_input = gr.Textbox(label="Question", scale=2)
                max_marks = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max Marks")
            a_input = gr.TextArea(label="Student Answer")
            run_btn = gr.Button("Retrieve & Grade", variant="secondary")
    with gr.Row():
        evidence_box = gr.Markdown(label="Context Used")
        grade_box = gr.Markdown(label="Grading Result")

    upload_btn.click(system.process_file, inputs=[pdf_input], outputs=[status_msg])
    run_btn.click(system.process_query, inputs=[q_input, a_input, max_marks], outputs=[evidence_box, grade_box])

if __name__ == "__main__":
    demo.launch()