heerjtdev committed on
Commit
196a2a0
·
verified ·
1 Parent(s): eab649a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -20
app.py CHANGED
@@ -17,12 +17,12 @@ from transformers import AutoTokenizer
17
  from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForCausalLM
18
  from huggingface_hub import snapshot_download
19
 
20
- # Check available hardware accelerators
21
- PROVIDERS = ort.get_available_providers()
22
- print(f"⚡ Hardware Acceleration Providers: {PROVIDERS}")
23
 
24
  # ---------------------------------------------------------
25
- # 1. OPTIMIZED EMBEDDINGS (BGE-SMALL) - [KEEP THIS SAME]
26
  # ---------------------------------------------------------
27
  class OnnxBgeEmbeddings(Embeddings):
28
  def __init__(self):
@@ -37,13 +37,12 @@ class OnnxBgeEmbeddings(Embeddings):
37
 
38
  def _process_batch(self, texts):
39
  inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
40
- device = self.model.device
41
- inputs = {k: v.to(device) for k, v in inputs.items()}
42
  with torch.no_grad():
43
  outputs = self.model(**inputs)
44
  embeddings = outputs.last_hidden_state[:, 0]
45
  embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
46
- return embeddings.cpu().numpy().tolist()
47
 
48
  def embed_documents(self, texts):
49
  return self._process_batch(texts)
@@ -52,18 +51,17 @@ class OnnxBgeEmbeddings(Embeddings):
52
  return self._process_batch(["Represent this sentence for searching relevant passages: " + text])[0]
53
 
54
  # ---------------------------------------------------------
55
- # 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B) - [FIXED]
56
  # ---------------------------------------------------------
57
  class LLMEvaluator:
58
  def __init__(self):
59
  self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
60
  self.local_dir = "onnx_qwen_local"
61
 
62
- print(f"🔄 Preparing Ultra-Fast LLM: {self.repo_id}...")
63
 
64
- # Download (same as before)
65
  if not os.path.exists(self.local_dir):
66
- print(f"📥 Downloading FP16 model + data to {self.local_dir}...")
67
  snapshot_download(
68
  repo_id=self.repo_id,
69
  local_dir=self.local_dir,
@@ -73,8 +71,7 @@ class LLMEvaluator:
73
 
74
  self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
75
 
76
- # --- CRITICAL FIX: DISABLE GRAPH OPTIMIZATIONS ---
77
- # The model is already optimized. Re-optimizing it at runtime causes the crash.
78
  sess_options = SessionOptions()
79
  sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
80
 
@@ -83,9 +80,9 @@ class LLMEvaluator:
83
  subfolder="onnx",
84
  file_name="model_fp16.onnx",
85
  use_cache=True,
86
- use_io_binding=True,
87
  provider=PROVIDERS[0],
88
- session_options=sess_options # <--- PASS THIS HERE
89
  )
90
 
91
  def evaluate(self, context, question, student_answer, max_marks):
@@ -107,9 +104,6 @@ class LLMEvaluator:
107
  input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
108
  inputs = self.tokenizer(input_text, return_tensors="pt")
109
 
110
- device = self.model.device
111
- inputs = {k: v.to(device) for k, v in inputs.items()}
112
-
113
  with torch.no_grad():
114
  outputs = self.model.generate(
115
  **inputs,
@@ -118,7 +112,9 @@ class LLMEvaluator:
118
  do_sample=False
119
  )
120
 
121
- response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
 
122
  return response
123
 
124
  # ---------------------------------------------------------
@@ -184,7 +180,7 @@ class VectorSystem:
184
  system = VectorSystem()
185
 
186
  with gr.Blocks(title="EduGenius AI Grader") as demo:
187
- gr.Markdown("# ⚡ EduGenius: Ultra-Fast RAG")
188
  gr.Markdown("Powered by **Qwen-2.5-0.5B** and **BGE-Small** (ONNX Optimized)")
189
 
190
  with gr.Row():
 
17
  from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForCausalLM
18
  from huggingface_hub import snapshot_download
19
 
20
+ # Force CPU Provider
21
+ PROVIDERS = ["CPUExecutionProvider"]
22
+ print(f"⚡ Running on: {PROVIDERS}")
23
 
24
  # ---------------------------------------------------------
25
+ # 1. OPTIMIZED EMBEDDINGS (BGE-SMALL)
26
  # ---------------------------------------------------------
27
  class OnnxBgeEmbeddings(Embeddings):
28
  def __init__(self):
 
37
 
38
  def _process_batch(self, texts):
39
  inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
40
+ # On CPU, we don't need to manually move to device, but it's good practice
 
41
  with torch.no_grad():
42
  outputs = self.model(**inputs)
43
  embeddings = outputs.last_hidden_state[:, 0]
44
  embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
45
+ return embeddings.numpy().tolist()
46
 
47
  def embed_documents(self, texts):
48
  return self._process_batch(texts)
 
51
  return self._process_batch(["Represent this sentence for searching relevant passages: " + text])[0]
52
 
53
  # ---------------------------------------------------------
54
+ # 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B) - CPU VERSION
55
  # ---------------------------------------------------------
56
  class LLMEvaluator:
57
  def __init__(self):
58
  self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
59
  self.local_dir = "onnx_qwen_local"
60
 
61
+ print(f"🔄 Preparing CPU LLM: {self.repo_id}...")
62
 
 
63
  if not os.path.exists(self.local_dir):
64
+ print(f"📥 Downloading FP16 model to {self.local_dir}...")
65
  snapshot_download(
66
  repo_id=self.repo_id,
67
  local_dir=self.local_dir,
 
71
 
72
  self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
73
 
74
+ # CRITICAL: Disable Graph Optimizations to prevent crash
 
75
  sess_options = SessionOptions()
76
  sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
77
 
 
80
  subfolder="onnx",
81
  file_name="model_fp16.onnx",
82
  use_cache=True,
83
+ use_io_binding=False, # DISABLED FOR CPU
84
  provider=PROVIDERS[0],
85
+ session_options=sess_options
86
  )
87
 
88
  def evaluate(self, context, question, student_answer, max_marks):
 
104
  input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
105
  inputs = self.tokenizer(input_text, return_tensors="pt")
106
 
 
 
 
107
  with torch.no_grad():
108
  outputs = self.model.generate(
109
  **inputs,
 
112
  do_sample=False
113
  )
114
 
115
+ # FIX: Access input_ids correctly
116
+ input_length = inputs['input_ids'].shape[1]
117
+ response = self.tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
118
  return response
119
 
120
  # ---------------------------------------------------------
 
180
  system = VectorSystem()
181
 
182
  with gr.Blocks(title="EduGenius AI Grader") as demo:
183
+ gr.Markdown("# ⚡ EduGenius: CPU Optimized RAG")
184
  gr.Markdown("Powered by **Qwen-2.5-0.5B** and **BGE-Small** (ONNX Optimized)")
185
 
186
  with gr.Row():