heerjtdev committed on
Commit
6662485
·
verified ·
1 Parent(s): bdf342a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -30
app.py CHANGED
@@ -138,13 +138,12 @@ class OnnxBgeEmbeddings(Embeddings):
138
 
139
 
140
 
141
-
142
  # ---------------------------------------------------------
143
  # 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B) - STRICT GRADING
144
  # ---------------------------------------------------------
145
  class LLMEvaluator:
146
  def __init__(self):
147
- # Qwen 0.5B is great for speed, but needs VERY specific prompts to be strict.
148
  self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
149
  self.local_dir = "onnx_qwen_local"
150
 
@@ -175,33 +174,42 @@ class LLMEvaluator:
175
  )
176
 
177
  def evaluate(self, context, question, student_answer, max_marks):
178
- # --- STRATEGY: FEW-SHOT PROMPTING & CHAIN OF THOUGHT ---
179
- # Small models (0.5B) need examples to understand "Strictness".
180
-
181
- system_prompt = """You are a strict automated grader. You grade ONLY based on the provided Context.
182
 
183
- RULES:
184
- 1. If the Student Answer contains facts NOT found in the Context, Score is 0.
185
- 2. If the Student Answer contradicts the Context, Score is 0.
186
- 3. Do not use outside knowledge. If it's not in the text, it's wrong.
187
-
188
- --- EXAMPLE 1 (WRONG ANSWER) ---
189
- Context: The sky is blue because of Rayleigh scattering.
 
 
 
190
  Question: Why is the sky blue?
191
- Student Answer: Because the ocean reflects into it.
192
- Analysis: The context mentions Rayleigh scattering. The student mentioned ocean reflection. These do not match.
193
  Score: 0/{max_marks}
194
-
195
- --- EXAMPLE 2 (CORRECT ANSWER) ---
 
 
 
 
 
 
 
196
  Context: Mitochondria is the powerhouse of the cell.
197
- Question: What is the mitochondria?
198
- Student Answer: It is the powerhouse of the cell.
199
- Analysis: The student answer matches the context text exactly.
200
  Score: {max_marks}/{max_marks}
201
  """
202
 
203
  user_prompt = f"""
204
- --- NOW GRADE THIS ---
205
  Context:
206
  {context}
207
 
@@ -211,12 +219,8 @@ class LLMEvaluator:
211
  Student Answer:
212
  {student_answer}
213
 
214
- Task:
215
- 1. Analyze if the specific keywords in Student Answer exist in Context.
216
- 2. Assign a Score.
217
-
218
- Output format:
219
- Analysis: [Analysis here]
220
  Score: [X]/{max_marks}
221
  """
222
 
@@ -228,14 +232,15 @@ class LLMEvaluator:
228
  input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
229
  inputs = self.tokenizer(input_text, return_tensors="pt")
230
 
 
231
  with torch.no_grad():
232
  outputs = self.model.generate(
233
  **inputs,
234
  max_new_tokens=150,
235
- temperature=0.1, # Low temperature for facts
236
- top_p=0.1, # Reduce creativity
237
  do_sample=True,
238
- repetition_penalty=1.1
239
  )
240
 
241
  input_length = inputs['input_ids'].shape[1]
@@ -243,6 +248,110 @@ class LLMEvaluator:
243
  return response
244
 
245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  # ---------------------------------------------------------
247
  # 3. Main Application Logic
248
  # ---------------------------------------------------------
 
138
 
139
 
140
 
 
141
  # ---------------------------------------------------------
142
  # 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B) - STRICT GRADING
143
  # ---------------------------------------------------------
144
  class LLMEvaluator:
145
  def __init__(self):
146
+ # Qwen 2.5 0.5B is fast but needs "Few-Shot" examples to be strict.
147
  self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
148
  self.local_dir = "onnx_qwen_local"
149
 
 
174
  )
175
 
176
  def evaluate(self, context, question, student_answer, max_marks):
177
+ # --- IMPROVED PROMPT STRATEGY ---
178
+ # 1. Role: We set the persona to a "Strict Logical Validator" not a "Teacher".
179
+ # 2. Few-Shot: We give examples of HALLUCINATIONS getting 0 marks.
 
180
 
181
+ system_prompt = f"""You are a strict Logic Validator. You are NOT a helpful assistant.
182
+ Your job is to check if the Student Answer is FACTUALLY present in the Context.
183
+
184
+ GRADING ALGORITHM:
185
+ 1. IF the Student Answer mentions things NOT in the Context -> PENALTY (-100%).
186
+ 2. IF the Student Answer interprets the text opposite to its meaning -> PENALTY (-100%).
187
+ 3. IF the Student Answer is generic fluff -> SCORE: 0.
188
+
189
+ --- EXAMPLE 1 (HALLUCINATION) ---
190
+ Context: The sky is blue due to Rayleigh scattering.
191
  Question: Why is the sky blue?
192
+ Student Answer: Because the ocean reflects the water into the sky.
193
+ Analysis: The Context mentions 'Rayleigh scattering'. The student mentions 'ocean reflection'. These are different. The student is hallucinating outside facts.
194
  Score: 0/{max_marks}
195
+
196
+ --- EXAMPLE 2 (CONTRADICTION) ---
197
+ Context: One must efface one's own personality. Good prose is like a windowpane.
198
+ Question: What does the author mean?
199
+ Student Answer: It means we should see the author's personality clearly.
200
+ Analysis: The text says 'efface' (remove) personality. The student says 'see' personality. This is a direct contradiction.
201
+ Score: 0/{max_marks}
202
+
203
+ --- EXAMPLE 3 (CORRECT) ---
204
  Context: Mitochondria is the powerhouse of the cell.
205
+ Question: What is mitochondria?
206
+ Student Answer: It is the cell's powerhouse.
207
+ Analysis: Matches the text meaning exactly.
208
  Score: {max_marks}/{max_marks}
209
  """
210
 
211
  user_prompt = f"""
212
+ --- YOUR TASK ---
213
  Context:
214
  {context}
215
 
 
219
  Student Answer:
220
  {student_answer}
221
 
222
+ OUTPUT FORMAT:
223
+ Analysis: [Compare Student Answer vs Context. List any hallucinations or contradictions.]
 
 
 
 
224
  Score: [X]/{max_marks}
225
  """
226
 
 
232
  input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
233
  inputs = self.tokenizer(input_text, return_tensors="pt")
234
 
235
+ # Lower temperature for strictness
236
  with torch.no_grad():
237
  outputs = self.model.generate(
238
  **inputs,
239
  max_new_tokens=150,
240
+ temperature=0.1, # Strict logic, no creativity
241
+ top_p=0.2, # Cut off unlikely tokens
242
  do_sample=True,
243
+ repetition_penalty=1.2 # Penalize repetition
244
  )
245
 
246
  input_length = inputs['input_ids'].shape[1]
 
248
  return response
249
 
250
 
251
+ # # ---------------------------------------------------------
252
+ # # 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B) - STRICT GRADING
253
+ # # ---------------------------------------------------------
254
+ # class LLMEvaluator:
255
+ # def __init__(self):
256
+ # # Qwen 0.5B is great for speed, but needs VERY specific prompts to be strict.
257
+ # self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
258
+ # self.local_dir = "onnx_qwen_local"
259
+
260
+ # print(f"🔄 Preparing CPU LLM: {self.repo_id}...")
261
+
262
+ # if not os.path.exists(self.local_dir):
263
+ # print(f"📥 Downloading FP16 model to {self.local_dir}...")
264
+ # snapshot_download(
265
+ # repo_id=self.repo_id,
266
+ # local_dir=self.local_dir,
267
+ # allow_patterns=["config.json", "generation_config.json", "tokenizer*", "special_tokens_map.json", "*.jinja", "onnx/model_fp16.onnx*"]
268
+ # )
269
+ # print("✅ Download complete.")
270
+
271
+ # self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
272
+
273
+ # sess_options = SessionOptions()
274
+ # sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
275
+
276
+ # self.model = ORTModelForCausalLM.from_pretrained(
277
+ # self.local_dir,
278
+ # subfolder="onnx",
279
+ # file_name="model_fp16.onnx",
280
+ # use_cache=True,
281
+ # use_io_binding=False,
282
+ # provider=PROVIDERS[0],
283
+ # session_options=sess_options
284
+ # )
285
+
286
+ # def evaluate(self, context, question, student_answer, max_marks):
287
+ # # --- STRATEGY: FEW-SHOT PROMPTING & CHAIN OF THOUGHT ---
288
+ # # Small models (0.5B) need examples to understand "Strictness".
289
+
290
+ # system_prompt = """You are a strict automated grader. You grade ONLY based on the provided Context.
291
+
292
+ # RULES:
293
+ # 1. If the Student Answer contains facts NOT found in the Context, Score is 0.
294
+ # 2. If the Student Answer contradicts the Context, Score is 0.
295
+ # 3. Do not use outside knowledge. If it's not in the text, it's wrong.
296
+
297
+ # --- EXAMPLE 1 (WRONG ANSWER) ---
298
+ # Context: The sky is blue because of Rayleigh scattering.
299
+ # Question: Why is the sky blue?
300
+ # Student Answer: Because the ocean reflects into it.
301
+ # Analysis: The context mentions Rayleigh scattering. The student mentioned ocean reflection. These do not match.
302
+ # Score: 0/{max_marks}
303
+
304
+ # --- EXAMPLE 2 (CORRECT ANSWER) ---
305
+ # Context: Mitochondria is the powerhouse of the cell.
306
+ # Question: What is the mitochondria?
307
+ # Student Answer: It is the powerhouse of the cell.
308
+ # Analysis: The student answer matches the context text exactly.
309
+ # Score: {max_marks}/{max_marks}
310
+ # """
311
+
312
+ # user_prompt = f"""
313
+ # --- NOW GRADE THIS ---
314
+ # Context:
315
+ # {context}
316
+
317
+ # Question:
318
+ # {question}
319
+
320
+ # Student Answer:
321
+ # {student_answer}
322
+
323
+ # Task:
324
+ # 1. Analyze if the specific keywords in Student Answer exist in Context.
325
+ # 2. Assign a Score.
326
+
327
+ # Output format:
328
+ # Analysis: [Analysis here]
329
+ # Score: [X]/{max_marks}
330
+ # """
331
+
332
+ # messages = [
333
+ # {"role": "system", "content": system_prompt},
334
+ # {"role": "user", "content": user_prompt}
335
+ # ]
336
+
337
+ # input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
338
+ # inputs = self.tokenizer(input_text, return_tensors="pt")
339
+
340
+ # with torch.no_grad():
341
+ # outputs = self.model.generate(
342
+ # **inputs,
343
+ # max_new_tokens=150,
344
+ # temperature=0.1, # Low temperature for facts
345
+ # top_p=0.1, # Reduce creativity
346
+ # do_sample=True,
347
+ # repetition_penalty=1.1
348
+ # )
349
+
350
+ # input_length = inputs['input_ids'].shape[1]
351
+ # response = self.tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
352
+ # return response
353
+
354
+
355
  # ---------------------------------------------------------
356
  # 3. Main Application Logic
357
  # ---------------------------------------------------------