Spaces:

heerjtdev
/

try_answer

Running

App Files Community

heerjtdev committed on Feb 3

Commit

196a2a0

verified ·

1 Parent(s): eab649a

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -20

app.py CHANGED Viewed

@@ -17,12 +17,12 @@ from transformers import AutoTokenizer
 from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForCausalLM
 from huggingface_hub import snapshot_download
-# Check available hardware accelerators
-PROVIDERS = ort.get_available_providers()
-print(f"⚡ Hardware Acceleration Providers: {PROVIDERS}")
 # ---------------------------------------------------------
-# 1. OPTIMIZED EMBEDDINGS (BGE-SMALL) - [KEEP THIS SAME]
 # ---------------------------------------------------------
 class OnnxBgeEmbeddings(Embeddings):
     def __init__(self):
@@ -37,13 +37,12 @@ class OnnxBgeEmbeddings(Embeddings):
     def _process_batch(self, texts):
         inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
-        device = self.model.device
-        inputs = {k: v.to(device) for k, v in inputs.items()}
         with torch.no_grad():
             outputs = self.model(**inputs)
         embeddings = outputs.last_hidden_state[:, 0]
         embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
-        return embeddings.cpu().numpy().tolist()
     def embed_documents(self, texts):
         return self._process_batch(texts)
@@ -52,18 +51,17 @@ class OnnxBgeEmbeddings(Embeddings):
         return self._process_batch(["Represent this sentence for searching relevant passages: " + text])[0]
 # ---------------------------------------------------------
-# 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B) - [FIXED]
 # ---------------------------------------------------------
 class LLMEvaluator:
     def __init__(self):
         self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
         self.local_dir = "onnx_qwen_local"
-        print(f"🔄 Preparing Ultra-Fast LLM: {self.repo_id}...")
-        # Download (same as before)
         if not os.path.exists(self.local_dir):
-            print(f"📥 Downloading FP16 model + data to {self.local_dir}...")
             snapshot_download(
                 repo_id=self.repo_id,
                 local_dir=self.local_dir,
@@ -73,8 +71,7 @@ class LLMEvaluator:
         self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
-        # --- CRITICAL FIX: DISABLE GRAPH OPTIMIZATIONS ---
-        # The model is already optimized. Re-optimizing it at runtime causes the crash.
         sess_options = SessionOptions()
         sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
@@ -83,9 +80,9 @@ class LLMEvaluator:
             subfolder="onnx",
             file_name="model_fp16.onnx",
             use_cache=True,
-            use_io_binding=True,
             provider=PROVIDERS[0],
-            session_options=sess_options  # <--- PASS THIS HERE
         )
     def evaluate(self, context, question, student_answer, max_marks):
@@ -107,9 +104,6 @@ class LLMEvaluator:
         input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = self.tokenizer(input_text, return_tensors="pt")
-        device = self.model.device
-        inputs = {k: v.to(device) for k, v in inputs.items()}
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
@@ -118,7 +112,9 @@ class LLMEvaluator:
                 do_sample=False
             )
-        response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
         return response
 # ---------------------------------------------------------
@@ -184,7 +180,7 @@ class VectorSystem:
 system = VectorSystem()
 with gr.Blocks(title="EduGenius AI Grader") as demo:
-    gr.Markdown("# ⚡ EduGenius: Ultra-Fast RAG")
     gr.Markdown("Powered by **Qwen-2.5-0.5B** and **BGE-Small** (ONNX Optimized)")
     with gr.Row():

 from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForCausalLM
 from huggingface_hub import snapshot_download
+# Force CPU Provider
+PROVIDERS = ["CPUExecutionProvider"]
+print(f"⚡ Running on: {PROVIDERS}")
 # ---------------------------------------------------------
+# 1. OPTIMIZED EMBEDDINGS (BGE-SMALL)
 # ---------------------------------------------------------
 class OnnxBgeEmbeddings(Embeddings):
     def __init__(self):
     def _process_batch(self, texts):
         inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
+        # On CPU the tokenizer output is already on the right device, so no manual .to(device) transfer is needed
         with torch.no_grad():
             outputs = self.model(**inputs)
         embeddings = outputs.last_hidden_state[:, 0]
         embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+        return embeddings.numpy().tolist()
     def embed_documents(self, texts):
         return self._process_batch(texts)
         return self._process_batch(["Represent this sentence for searching relevant passages: " + text])[0]
 # ---------------------------------------------------------
+# 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B) - CPU VERSION
 # ---------------------------------------------------------
 class LLMEvaluator:
     def __init__(self):
         self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
         self.local_dir = "onnx_qwen_local"
+        print(f"🔄 Preparing CPU LLM: {self.repo_id}...")
         if not os.path.exists(self.local_dir):
+            print(f"📥 Downloading FP16 model to {self.local_dir}...")
             snapshot_download(
                 repo_id=self.repo_id,
                 local_dir=self.local_dir,
         self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
+        # CRITICAL: Disable Graph Optimizations to prevent crash
         sess_options = SessionOptions()
         sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
             subfolder="onnx",
             file_name="model_fp16.onnx",
             use_cache=True,
+            use_io_binding=False, # DISABLED FOR CPU
             provider=PROVIDERS[0],
+            session_options=sess_options
         )
     def evaluate(self, context, question, student_answer, max_marks):
         input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = self.tokenizer(input_text, return_tensors="pt")
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
                 do_sample=False
             )
+        # FIX: Access input_ids correctly
+        input_length = inputs['input_ids'].shape[1]
+        response = self.tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
         return response
 # ---------------------------------------------------------
 system = VectorSystem()
 with gr.Blocks(title="EduGenius AI Grader") as demo:
+    gr.Markdown("# ⚡ EduGenius: CPU Optimized RAG")
     gr.Markdown("Powered by **Qwen-2.5-0.5B** and **BGE-Small** (ONNX Optimized)")
     with gr.Row():