nathanael-fijalkow committed on
Commit
429074d
·
1 Parent(s): 116756e

added debug output

Browse files
Files changed (1) hide show
  1. app.py +34 -10
app.py CHANGED
@@ -168,6 +168,12 @@ def evaluate_submission(file_obj, debug=False):
168
  # 2. ISOLATED LOADING
169
  # We use a unique name for each import to avoid namespace collisions
170
  file_path = file_obj if isinstance(file_obj, str) else file_obj.name
 
 
 
 
 
 
171
  spec = importlib.util.spec_from_file_location("student_module", file_path)
172
  student_module = importlib.util.module_from_spec(spec)
173
  spec.loader.exec_module(student_module)
@@ -179,9 +185,13 @@ def evaluate_submission(file_obj, debug=False):
179
  ex1_timeout = False
180
  ex1_outputs = []
181
  try:
 
182
  ex1_instance = student_module.LaDisparition(model, tokenizer)
183
  for i, prompt in enumerate(TEST_CASES["exercise_1"]):
184
  try:
 
 
 
185
  # We limit tokens to keep evaluation fast
186
  output = run_with_timeout(
187
  ex1_instance,
@@ -191,21 +201,26 @@ def evaluate_submission(file_obj, debug=False):
191
  )
192
  # Remove prompt from output to only validate generated text
193
  cleaned_output = strip_prompt_from_output(output, prompt)
194
- passed = 'e' not in cleaned_output.lower() and len(cleaned_output.strip()) > 10
 
 
 
 
195
  if passed:
196
  ex1_passed += 1
197
- ex1_outputs.append({"prompt": prompt, "output": cleaned_output, "passed": passed})
198
  if debug:
199
  print(f"Ex1 Test {i+1}: {'✓' if passed else '✗'}")
200
  print(f" Prompt: {prompt}")
201
- print(f" Output: {cleaned_output}")
202
  print()
203
  except TimeoutException:
204
  ex1_timeout = True
205
  ex1_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False})
206
- if debug:
207
- print(f"Ex1 Test {i+1}: ✗ TIMEOUT")
208
  break
 
 
209
  if ex1_timeout:
210
  report.append(f" **Ex 1 (No 'e'):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
211
  else:
@@ -223,9 +238,13 @@ def evaluate_submission(file_obj, debug=False):
223
  ex2_timeout = False
224
  ex2_outputs = []
225
  try:
 
226
  ex2_instance = student_module.ToulouseSequence(model, tokenizer)
227
  for i, prompt in enumerate(TEST_CASES["exercise_2"]):
228
  try:
 
 
 
229
  output = run_with_timeout(
230
  ex2_instance,
231
  args=(prompt,),
@@ -234,21 +253,26 @@ def evaluate_submission(file_obj, debug=False):
234
  )
235
  # Remove prompt from output to only validate generated text
236
  cleaned_output = strip_prompt_from_output(output, prompt)
237
- passed = "toulouse" not in cleaned_output.lower() and len(cleaned_output.strip()) > 10
 
 
 
 
238
  if passed:
239
  ex2_passed += 1
240
- ex2_outputs.append({"prompt": prompt, "output": cleaned_output, "passed": passed})
241
  if debug:
242
  print(f"Ex2 Test {i+1}: {'✓' if passed else '✗'}")
243
  print(f" Prompt: {prompt}")
244
- print(f" Output: {cleaned_output}")
245
  print()
246
  except TimeoutException:
247
  ex2_timeout = True
248
  ex2_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False})
249
- if debug:
250
- print(f"Ex2 Test {i+1}: ✗ TIMEOUT")
251
  break
 
 
252
  if ex2_timeout:
253
  report.append(f" **Ex 2 (No Toulouse):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
254
  else:
 
168
  # 2. ISOLATED LOADING
169
  # We use a unique name for each import to avoid namespace collisions
170
  file_path = file_obj if isinstance(file_obj, str) else file_obj.name
171
+
172
+ # Always print who is being evaluated
173
+ print(f"\n{'='*60}")
174
+ print(f"EVALUATING: {file_path}")
175
+ print(f"{'='*60}\n")
176
+
177
  spec = importlib.util.spec_from_file_location("student_module", file_path)
178
  student_module = importlib.util.module_from_spec(spec)
179
  spec.loader.exec_module(student_module)
 
185
  ex1_timeout = False
186
  ex1_outputs = []
187
  try:
188
+ print("### EXERCISE 1 - La Disparition (No 'e')")
189
  ex1_instance = student_module.LaDisparition(model, tokenizer)
190
  for i, prompt in enumerate(TEST_CASES["exercise_1"]):
191
  try:
192
+ print(f"\nTest {i+1}/{len(TEST_CASES['exercise_1'])}")
193
+ print(f"Prompt: {prompt}")
194
+
195
  # We limit tokens to keep evaluation fast
196
  output = run_with_timeout(
197
  ex1_instance,
 
201
  )
202
  # Remove prompt from output to only validate generated text
203
  cleaned_output = strip_prompt_from_output(output, prompt)
204
+ assistant_response = extract_assistant_response(cleaned_output)
205
+
206
+ print(f"Response: {assistant_response}")
207
+
208
+ passed = 'e' not in assistant_response.lower() and len(assistant_response.strip()) > 10
209
  if passed:
210
  ex1_passed += 1
211
+ ex1_outputs.append({"prompt": prompt, "output": assistant_response, "passed": passed})
212
  if debug:
213
  print(f"Ex1 Test {i+1}: {'✓' if passed else '✗'}")
214
  print(f" Prompt: {prompt}")
215
+ print(f" Output: {assistant_response}")
216
  print()
217
  except TimeoutException:
218
  ex1_timeout = True
219
  ex1_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False})
220
+ print(f"Result: ✗ TIMEOUT")
 
221
  break
222
+
223
+ print(f"\nExercise 1 Score: {ex1_passed}/5")
224
  if ex1_timeout:
225
  report.append(f" **Ex 1 (No 'e'):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
226
  else:
 
238
  ex2_timeout = False
239
  ex2_outputs = []
240
  try:
241
+ print("\n### EXERCISE 2 - Toulouse Sequence (No 'Toulouse')")
242
  ex2_instance = student_module.ToulouseSequence(model, tokenizer)
243
  for i, prompt in enumerate(TEST_CASES["exercise_2"]):
244
  try:
245
+ print(f"\nTest {i+1}/{len(TEST_CASES['exercise_2'])}")
246
+ print(f"Prompt: {prompt}")
247
+
248
  output = run_with_timeout(
249
  ex2_instance,
250
  args=(prompt,),
 
253
  )
254
  # Remove prompt from output to only validate generated text
255
  cleaned_output = strip_prompt_from_output(output, prompt)
256
+ assistant_response = extract_assistant_response(cleaned_output)
257
+
258
+ print(f"Response: {assistant_response}")
259
+
260
+ passed = "toulouse" not in assistant_response.lower() and len(assistant_response.strip()) > 10
261
  if passed:
262
  ex2_passed += 1
263
+ ex2_outputs.append({"prompt": prompt, "output": assistant_response, "passed": passed})
264
  if debug:
265
  print(f"Ex2 Test {i+1}: {'✓' if passed else '✗'}")
266
  print(f" Prompt: {prompt}")
267
+ print(f" Output: {assistant_response}")
268
  print()
269
  except TimeoutException:
270
  ex2_timeout = True
271
  ex2_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False})
272
+ print(f"Result: ✗ TIMEOUT")
 
273
  break
274
+
275
+ print(f"\nExercise 2 Score: {ex2_passed}/5")
276
  if ex2_timeout:
277
  report.append(f" **Ex 2 (No Toulouse):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
278
  else: