nathanael-fijalkow committed on
Commit
4d8bbd9
·
1 Parent(s): 1d7752e

Improved logprob-based scoring

Browse files
Files changed (7) hide show
  1. app.py +157 -12
  2. calibrate_logprobs.py +236 -0
  3. forbidden_solution.py +14 -6
  4. greedy.py +127 -0
  5. reference_scores.csv +11 -0
  6. solution.py +7 -6
  7. test_cases.json +4 -4
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
2
  import importlib.util
3
  import json
4
  import torch
 
5
  import gc
6
  from transformers import AutoModelForCausalLM, AutoTokenizer
7
  import threading
@@ -26,6 +27,120 @@ model = AutoModelForCausalLM.from_pretrained(
26
  with open("test_cases.json", "r") as f:
27
  TEST_CASES = json.load(f)
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  class TimeoutException(Exception):
30
  pass
31
 
@@ -201,6 +316,7 @@ def evaluate_submission(file_obj, debug=False):
201
  ex1_passed = 0
202
  ex1_timeout = False
203
  ex1_outputs = []
 
204
  try:
205
  print("### EXERCISE 1 - La Disparition (No 'e')")
206
  ex1_instance = student_module.LaDisparition(model, tokenizer)
@@ -218,35 +334,49 @@ def evaluate_submission(file_obj, debug=False):
218
  )
219
  # Remove prompt from output to only validate generated text
220
  cleaned_output = strip_prompt_from_output(output, prompt)
221
- # assistant_response = extract_assistant_response(cleaned_output)
222
 
223
  print(f"Response: {cleaned_output}")
224
 
225
  passed = 'e' not in cleaned_output.lower() and len(cleaned_output.strip()) > 3
 
 
 
 
 
 
 
 
226
  if passed:
227
  ex1_passed += 1
228
- ex1_outputs.append({"prompt": prompt, "output": cleaned_output, "passed": passed})
 
 
 
229
  if debug:
230
  print(f"Ex1 Test {i+1}: {'✓' if passed else '✗'}")
231
  print(f" Prompt: {prompt}")
232
  print(f" Output: {output}")
 
233
  print()
234
  except TimeoutException:
235
  ex1_timeout = True
236
- ex1_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False})
 
237
  print(f"Result: ✗ TIMEOUT")
238
  break
239
 
240
- print(f"\nExercise 1 Score: {ex1_passed}/5")
 
241
  if ex1_timeout:
242
  report.append(f" **Ex 1 (No 'e'):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
243
  else:
244
- report.append(f" **Ex 1 (No 'e'):** {ex1_passed}/5 correct")
245
 
246
  if debug:
247
  report.append("\n### Ex 1 Outputs:")
248
  for i, out in enumerate(ex1_outputs):
249
- report.append(f"{i+1}. {'✓' if out['passed'] else '✗'} `{out['output']}`")
 
250
  except Exception as e:
251
  report.append(f" **Ex 1 Error:** {str(e)}")
252
 
@@ -254,6 +384,7 @@ def evaluate_submission(file_obj, debug=False):
254
  ex2_passed = 0
255
  ex2_timeout = False
256
  ex2_outputs = []
 
257
  try:
258
  print("\n### EXERCISE 2 - Toulouse Sequence (No 'Toulouse')")
259
  ex2_instance = student_module.ToulouseSequence(model, tokenizer)
@@ -270,35 +401,49 @@ def evaluate_submission(file_obj, debug=False):
270
  )
271
  # Remove prompt from output to only validate generated text
272
  cleaned_output = strip_prompt_from_output(output, prompt)
273
- # assistant_response = extract_assistant_response(cleaned_output)
274
 
275
  print(f"Response: {cleaned_output}")
276
 
277
  passed = "toulouse" not in cleaned_output.lower() and len(cleaned_output.strip()) > 3
 
 
 
 
 
 
 
 
278
  if passed:
279
  ex2_passed += 1
280
- ex2_outputs.append({"prompt": prompt, "output": output, "passed": passed})
 
 
 
281
  if debug:
282
  print(f"Ex2 Test {i+1}: {'✓' if passed else '✗'}")
283
  print(f" Prompt: {prompt}")
284
  print(f" Output: {output}")
 
285
  print()
286
  except TimeoutException:
287
  ex2_timeout = True
288
- ex2_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False})
 
289
  print(f"Result: ✗ TIMEOUT")
290
  break
291
 
292
- print(f"\nExercise 2 Score: {ex2_passed}/5")
 
293
  if ex2_timeout:
294
  report.append(f" **Ex 2 (No Toulouse):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
295
  else:
296
- report.append(f" **Ex 2 (No Toulouse):** {ex2_passed}/5 correct")
297
 
298
  if debug:
299
  report.append("\n### Ex 2 Outputs:")
300
  for i, out in enumerate(ex2_outputs):
301
- report.append(f"{i+1}. {'✓' if out['passed'] else '✗'} `{out['output']}`")
 
302
  except Exception as e:
303
  report.append(f" **Ex 2 Error:** {str(e)}")
304
 
 
2
  import importlib.util
3
  import json
4
  import torch
5
+ import torch.nn.functional as F
6
  import gc
7
  from transformers import AutoModelForCausalLM, AutoTokenizer
8
  import threading
 
27
  with open("test_cases.json", "r") as f:
28
  TEST_CASES = json.load(f)
29
 
30
+ # --- PER-PROMPT REFERENCE SCORING ---
31
+ # Load reference scores from CSV (generated by calibrate_logprobs.py from solution.py).
32
+ # Each prompt has: unconstrained_logprob (baseline) and reference_delta (solution.py delta).
33
+ # Quality = 1 if student is as good or better than solution.py, decreasing for worse.
34
+ import csv
35
+
36
+ REFERENCE_SCORES = {} # key: (exercise, prompt_index) → dict
37
+
38
+ with open("reference_scores.csv", "r") as csvfile:
39
+ reader = csv.DictReader(csvfile)
40
+ for row in reader:
41
+ key = (row["exercise"], int(row["prompt_index"]))
42
+ REFERENCE_SCORES[key] = {
43
+ "prompt": row["prompt"],
44
+ "unconstrained_logprob": float(row["unconstrained_logprob"]),
45
+ "reference_logprob": float(row["reference_logprob"]),
46
+ "reference_delta": float(row["reference_delta"]),
47
+ }
48
+
49
+
50
+ def compute_mean_logprob(prompt_text, generated_text):
51
+ """
52
+ Compute the mean log-probability per token of `generated_text`
53
+ conditioned on `prompt_text`, under the unconstrained model.
54
+
55
+ Uses chat template since the evaluation model is an instruct model.
56
+ This measures how "natural" the generated text is: a well-constrained
57
+ generator still produces coherent text (high logprob), while a bad one
58
+ produces gibberish (low logprob).
59
+
60
+ Returns: (mean_logprob, n_tokens)
61
+ """
62
+ if not generated_text or not generated_text.strip():
63
+ return -float('inf'), 0
64
+
65
+ # Always use chat template: the model is an instruct model, so
66
+ # logprobs are meaningful only in the chat context.
67
+ message = [{"role": "user", "content": prompt_text}]
68
+ prompt_ids = tokenizer.apply_chat_template(
69
+ message, add_generation_prompt=True, return_tensors="pt"
70
+ ).to(model.device)
71
+ gen_ids = tokenizer.encode(
72
+ generated_text, add_special_tokens=False, return_tensors="pt"
73
+ ).to(model.device)
74
+ full_ids = torch.cat([prompt_ids, gen_ids], dim=1)
75
+ prompt_len = prompt_ids.shape[1]
76
+
77
+ if full_ids.shape[1] <= prompt_len:
78
+ return -float('inf'), 0
79
+
80
+ with torch.no_grad():
81
+ outputs = model(full_ids)
82
+ logits = outputs.logits
83
+
84
+ log_probs = F.log_softmax(logits, dim=-1)
85
+
86
+ total_logprob = 0.0
87
+ n_tokens = 0
88
+ for i in range(prompt_len, full_ids.shape[1]):
89
+ token_id = full_ids[0, i].item()
90
+ token_logprob = log_probs[0, i - 1, token_id].item()
91
+ total_logprob += token_logprob
92
+ n_tokens += 1
93
+
94
+ mean_logprob = total_logprob / n_tokens if n_tokens > 0 else -float('inf')
95
+ return mean_logprob, n_tokens
96
+
97
+
98
+ def compute_quality_score(mean_logprob, exercise_key, prompt_index):
99
+ """
100
+ Per-prompt quality score in [0, 1] using reference deltas from solution.py.
101
+
102
+ Logic:
103
+ - Compute student_delta = student_logprob - unconstrained_logprob
104
+ - Both student_delta and reference_delta are negative (constrained is worse).
105
+ - Quality = 1.0 if student_delta >= reference_delta (student as good or better).
106
+ - If the student is worse, quality decays linearly from 1.0 (at
107
+ reference_delta) down to 0.0 (at 3x reference_delta), i.e. a linear
108
+ interpolation between the two; anything beyond 3x is clamped to 0.
109
+ - A generous margin (3x reference delta) maps to quality = 0.
110
+ """
111
+ key = (exercise_key, prompt_index)
112
+ if key not in REFERENCE_SCORES:
113
+ # Fallback: if no reference data, return 1 for any non-terrible logprob
114
+ return 1.0 if mean_logprob > -5.0 else 0.0
115
+
116
+ ref = REFERENCE_SCORES[key]
117
+ unconstrained_lp = ref["unconstrained_logprob"]
118
+ ref_delta = ref["reference_delta"] # negative value
119
+
120
+ student_delta = mean_logprob - unconstrained_lp # negative value
121
+
122
+ if student_delta >= ref_delta:
123
+ # Student is as good or better than reference → quality = 1
124
+ return 1.0
125
+
126
+ if ref_delta == 0:
127
+ return 0.0
128
+
129
+ # Student is worse than reference.
130
+ # Linear decay: interpolate between ref_delta (quality=1) and worst_delta = 3*ref_delta (quality=0)
131
+ # When student_delta == ref_delta → 1.0
132
+ # When student_delta is much worse → approaches 0
133
+ # Cap at 3x reference delta for quality = 0
134
+ worst_delta = 3.0 * ref_delta # e.g., ref=-0.9 → worst=-2.7
135
+
136
+ if student_delta <= worst_delta:
137
+ return 0.0
138
+
139
+ # Linear interpolation between ref_delta (quality=1) and worst_delta (quality=0)
140
+ quality = (student_delta - worst_delta) / (ref_delta - worst_delta)
141
+ return max(0.0, min(1.0, quality))
142
+
143
+
144
  class TimeoutException(Exception):
145
  pass
146
 
 
316
  ex1_passed = 0
317
  ex1_timeout = False
318
  ex1_outputs = []
319
+ ex1_quality_scores = []
320
  try:
321
  print("### EXERCISE 1 - La Disparition (No 'e')")
322
  ex1_instance = student_module.LaDisparition(model, tokenizer)
 
334
  )
335
  # Remove prompt from output to only validate generated text
336
  cleaned_output = strip_prompt_from_output(output, prompt)
 
337
 
338
  print(f"Response: {cleaned_output}")
339
 
340
  passed = 'e' not in cleaned_output.lower() and len(cleaned_output.strip()) > 3
341
+
342
+ # Compute logprob quality score
343
+ mean_lp, n_tok = compute_mean_logprob(prompt, cleaned_output)
344
+ quality = compute_quality_score(mean_lp, "exercise_1", i) if passed else 0.0
345
+ ex1_quality_scores.append(quality)
346
+
347
+ print(f" Constraint passed: {passed} | mean_logprob: {mean_lp:.3f} | quality: {quality:.2f}")
348
+
349
  if passed:
350
  ex1_passed += 1
351
+ ex1_outputs.append({
352
+ "prompt": prompt, "output": cleaned_output, "passed": passed,
353
+ "mean_logprob": mean_lp, "quality": quality
354
+ })
355
  if debug:
356
  print(f"Ex1 Test {i+1}: {'✓' if passed else '✗'}")
357
  print(f" Prompt: {prompt}")
358
  print(f" Output: {output}")
359
+ print(f" mean_logprob={mean_lp:.4f}, quality={quality:.2f}")
360
  print()
361
  except TimeoutException:
362
  ex1_timeout = True
363
+ ex1_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False, "mean_logprob": float('-inf'), "quality": 0.0})
364
+ ex1_quality_scores.append(0.0)
365
  print(f"Result: ✗ TIMEOUT")
366
  break
367
 
368
+ ex1_avg_quality = sum(ex1_quality_scores) / len(ex1_quality_scores) if ex1_quality_scores else 0.0
369
+ print(f"\nExercise 1 Score: {ex1_passed}/5 | Avg quality: {ex1_avg_quality:.2f}")
370
  if ex1_timeout:
371
  report.append(f" **Ex 1 (No 'e'):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
372
  else:
373
+ report.append(f" **Ex 1 (No 'e'):** {ex1_passed}/5 correct | Quality: {ex1_avg_quality:.0%}")
374
 
375
  if debug:
376
  report.append("\n### Ex 1 Outputs:")
377
  for i, out in enumerate(ex1_outputs):
378
+ lp_str = f"logprob={out['mean_logprob']:.2f}" if out['mean_logprob'] != float('-inf') else "logprob=N/A"
379
+ report.append(f"{i+1}. {'✓' if out['passed'] else '✗'} [{lp_str}, q={out['quality']:.2f}] `{out['output']}`")
380
  except Exception as e:
381
  report.append(f" **Ex 1 Error:** {str(e)}")
382
 
 
384
  ex2_passed = 0
385
  ex2_timeout = False
386
  ex2_outputs = []
387
+ ex2_quality_scores = []
388
  try:
389
  print("\n### EXERCISE 2 - Toulouse Sequence (No 'Toulouse')")
390
  ex2_instance = student_module.ToulouseSequence(model, tokenizer)
 
401
  )
402
  # Remove prompt from output to only validate generated text
403
  cleaned_output = strip_prompt_from_output(output, prompt)
 
404
 
405
  print(f"Response: {cleaned_output}")
406
 
407
  passed = "toulouse" not in cleaned_output.lower() and len(cleaned_output.strip()) > 3
408
+
409
+ # Compute logprob quality score
410
+ mean_lp, n_tok = compute_mean_logprob(prompt, cleaned_output)
411
+ quality = compute_quality_score(mean_lp, "exercise_2", i) if passed else 0.0
412
+ ex2_quality_scores.append(quality)
413
+
414
+ print(f" Constraint passed: {passed} | mean_logprob: {mean_lp:.3f} | quality: {quality:.2f}")
415
+
416
  if passed:
417
  ex2_passed += 1
418
+ ex2_outputs.append({
419
+ "prompt": prompt, "output": cleaned_output, "passed": passed,
420
+ "mean_logprob": mean_lp, "quality": quality
421
+ })
422
  if debug:
423
  print(f"Ex2 Test {i+1}: {'✓' if passed else '✗'}")
424
  print(f" Prompt: {prompt}")
425
  print(f" Output: {output}")
426
+ print(f" mean_logprob={mean_lp:.4f}, quality={quality:.2f}")
427
  print()
428
  except TimeoutException:
429
  ex2_timeout = True
430
+ ex2_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False, "mean_logprob": float('-inf'), "quality": 0.0})
431
+ ex2_quality_scores.append(0.0)
432
  print(f"Result: ✗ TIMEOUT")
433
  break
434
 
435
+ ex2_avg_quality = sum(ex2_quality_scores) / len(ex2_quality_scores) if ex2_quality_scores else 0.0
436
+ print(f"\nExercise 2 Score: {ex2_passed}/5 | Avg quality: {ex2_avg_quality:.2f}")
437
  if ex2_timeout:
438
  report.append(f" **Ex 2 (No Toulouse):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
439
  else:
440
+ report.append(f" **Ex 2 (No Toulouse):** {ex2_passed}/5 correct | Quality: {ex2_avg_quality:.0%}")
441
 
442
  if debug:
443
  report.append("\n### Ex 2 Outputs:")
444
  for i, out in enumerate(ex2_outputs):
445
+ lp_str = f"logprob={out['mean_logprob']:.2f}" if out['mean_logprob'] != float('-inf') else "logprob=N/A"
446
+ report.append(f"{i+1}. {'✓' if out['passed'] else '✗'} [{lp_str}, q={out['quality']:.2f}] `{out['output']}`")
447
  except Exception as e:
448
  report.append(f" **Ex 2 Error:** {str(e)}")
449
 
calibrate_logprobs.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Calibration script: compute logprobs for reference solution outputs
3
+ vs unconstrained model outputs to design a scoring function.
4
+ """
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import json
8
+ from transformers import AutoModelForCausalLM, AutoTokenizer
9
+
10
+ EVAL_MODEL = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
11
+
12
+ print("Loading model...")
13
+ tokenizer = AutoTokenizer.from_pretrained(EVAL_MODEL)
14
+ if tokenizer.pad_token is None:
15
+ tokenizer.pad_token = tokenizer.eos_token
16
+ model = AutoModelForCausalLM.from_pretrained(EVAL_MODEL, dtype=torch.float16, device_map="auto")
17
+
18
+ with open("test_cases.json", "r") as f:
19
+ TEST_CASES = json.load(f)
20
+
21
+
22
+ def compute_chat_logprobs(model, tokenizer, prompt, generated_text):
23
+ """
24
+ Compute logprobs using chat template (works for both exercises).
25
+ The prompt is formatted as a chat message, generated_text is the response.
26
+
27
+ Returns:
28
+ mean_logprob: mean log-prob per generated token
29
+ total_logprob: sum of log-probs
30
+ n_tokens: number of generated tokens
31
+ per_token: list of (token_str, logprob) pairs
32
+ """
33
+ if not generated_text or not generated_text.strip():
34
+ return -float('inf'), 0.0, 0, []
35
+
36
+ message = [{"role": "user", "content": prompt}]
37
+ prompt_ids = tokenizer.apply_chat_template(
38
+ message, add_generation_prompt=True, return_tensors="pt"
39
+ ).to(model.device)
40
+ prompt_len = prompt_ids.shape[1]
41
+
42
+ gen_ids = tokenizer.encode(
43
+ generated_text, add_special_tokens=False, return_tensors="pt"
44
+ ).to(model.device)
45
+
46
+ full_ids = torch.cat([prompt_ids, gen_ids], dim=1)
47
+
48
+ if full_ids.shape[1] <= prompt_len:
49
+ return -float('inf'), 0.0, 0, []
50
+
51
+ with torch.no_grad():
52
+ outputs = model(full_ids)
53
+ logits = outputs.logits
54
+
55
+ log_probs = F.log_softmax(logits, dim=-1)
56
+
57
+ per_token = []
58
+ total_logprob = 0.0
59
+ n_tokens = 0
60
+
61
+ for i in range(prompt_len, full_ids.shape[1]):
62
+ token_id = full_ids[0, i].item()
63
+ token_logprob = log_probs[0, i - 1, token_id].item()
64
+ token_str = tokenizer.decode([token_id])
65
+ per_token.append((token_str, token_logprob))
66
+ total_logprob += token_logprob
67
+ n_tokens += 1
68
+
69
+ mean_logprob = total_logprob / n_tokens if n_tokens > 0 else -float('inf')
70
+ return mean_logprob, total_logprob, n_tokens, per_token
71
+
72
+
73
+ def generate_unconstrained_chat(model, tokenizer, prompt, max_tokens=20):
74
+ """Generate unconstrained text using chat template (for both exercises)."""
75
+ message = [{"role": "user", "content": prompt}]
76
+ inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt").to(model.device)
77
+ attention_mask = torch.ones_like(inputs)
78
+ prompt_length = inputs.shape[1]
79
+
80
+ with torch.no_grad():
81
+ output = model.generate(
82
+ inputs,
83
+ attention_mask=attention_mask,
84
+ max_new_tokens=max_tokens,
85
+ do_sample=False,
86
+ pad_token_id=tokenizer.pad_token_id
87
+ )
88
+ generated_tokens = output[0][prompt_length:]
89
+ return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
90
+
91
+
92
+ # ---- Load and run the reference solution ----
93
+ import importlib.util
94
+ import sys
95
+ import time
96
+
97
+ module_name = f"solution_module_{int(time.time())}"
98
+ spec = importlib.util.spec_from_file_location(module_name, "solution.py")
99
+ solution = importlib.util.module_from_spec(spec)
100
+ sys.modules[module_name] = solution
101
+ spec.loader.exec_module(solution)
102
+
103
+ print("\n" + "="*80)
104
+ print("EXERCISE 1: La Disparition (no 'e')")
105
+ print("="*80)
106
+
107
+ ex1_instance = solution.LaDisparition(model, tokenizer)
108
+
109
+ ex1_results = []
110
+ for i, prompt in enumerate(TEST_CASES["exercise_1"]):
111
+ # Generate constrained output
112
+ constrained_output = ex1_instance(prompt, max_tokens=20)
113
+ # Strip prompt from output
114
+ if constrained_output.startswith(prompt):
115
+ constrained_gen = constrained_output[len(prompt):].strip()
116
+ else:
117
+ constrained_gen = constrained_output.strip()
118
+
119
+ # Generate unconstrained output (chat template for instruct model)
120
+ unconstrained_gen = generate_unconstrained_chat(model, tokenizer, prompt, max_tokens=20)
121
+
122
+ # Compute logprobs using chat template (matches how the model should be used)
123
+ c_mean, c_total, c_ntok, c_per = compute_chat_logprobs(model, tokenizer, prompt, constrained_gen)
124
+
125
+ # Compute logprobs for unconstrained output
126
+ u_mean, u_total, u_ntok, u_per = compute_chat_logprobs(model, tokenizer, prompt, unconstrained_gen)
127
+
128
+ delta = c_mean - u_mean # will be negative (constrained is worse)
129
+
130
+ print(f"\nTest {i+1}: {prompt}")
131
+ print(f" Unconstrained: {unconstrained_gen}")
132
+ print(f" mean_logprob={u_mean:.4f}, n_tokens={u_ntok}")
133
+ print(f" Constrained: {constrained_gen}")
134
+ print(f" mean_logprob={c_mean:.4f}, n_tokens={c_ntok}")
135
+ print(f" Delta (constrained - unconstrained): {delta:.4f}")
136
+
137
+ ex1_results.append({
138
+ "prompt": prompt,
139
+ "constrained_gen": constrained_gen,
140
+ "unconstrained_gen": unconstrained_gen,
141
+ "c_mean_logprob": c_mean,
142
+ "u_mean_logprob": u_mean,
143
+ "delta_mean_logprob": delta,
144
+ })
145
+
146
+ print(f"\n--- Exercise 1 Summary ---")
147
+ deltas_1 = [r["delta_mean_logprob"] for r in ex1_results]
148
+ c_means_1 = [r["c_mean_logprob"] for r in ex1_results]
149
+ u_means_1 = [r["u_mean_logprob"] for r in ex1_results]
150
+ print(f" Unconstrained mean logprobs: {[f'{x:.3f}' for x in u_means_1]}")
151
+ print(f" Constrained mean logprobs: {[f'{x:.3f}' for x in c_means_1]}")
152
+ print(f" Deltas: {[f'{x:.3f}' for x in deltas_1]}")
153
+ print(f" Mean delta: {sum(deltas_1)/len(deltas_1):.4f}")
154
+ print(f" Worst delta: {min(deltas_1):.4f}")
155
+
156
+
157
+ print("\n" + "="*80)
158
+ print("EXERCISE 2: Toulouse Sequence (no 'Toulouse')")
159
+ print("="*80)
160
+
161
+ ex2_instance = solution.ToulouseSequence(model, tokenizer)
162
+
163
+ ex2_results = []
164
+ for i, prompt in enumerate(TEST_CASES["exercise_2"]):
165
+ # Generate constrained output
166
+ constrained_gen = ex2_instance(prompt, max_tokens=20)
167
+
168
+ # Generate unconstrained output (chat format)
169
+ unconstrained_gen = generate_unconstrained_chat(model, tokenizer, prompt, max_tokens=20)
170
+
171
+ # Compute logprobs (chat format)
172
+ c_mean, c_total, c_ntok, c_per = compute_chat_logprobs(model, tokenizer, prompt, constrained_gen)
173
+ u_mean, u_total, u_ntok, u_per = compute_chat_logprobs(model, tokenizer, prompt, unconstrained_gen)
174
+
175
+ delta = c_mean - u_mean
176
+
177
+ print(f"\nTest {i+1}: {prompt}")
178
+ print(f" Unconstrained: {unconstrained_gen}")
179
+ print(f" mean_logprob={u_mean:.4f}, n_tokens={u_ntok}")
180
+ print(f" Constrained: {constrained_gen}")
181
+ print(f" mean_logprob={c_mean:.4f}, n_tokens={c_ntok}")
182
+ print(f" Delta (constrained - unconstrained): {delta:.4f}")
183
+
184
+ ex2_results.append({
185
+ "prompt": prompt,
186
+ "constrained_gen": constrained_gen,
187
+ "unconstrained_gen": unconstrained_gen,
188
+ "c_mean_logprob": c_mean,
189
+ "u_mean_logprob": u_mean,
190
+ "delta_mean_logprob": delta,
191
+ })
192
+
193
+ print(f"\n--- Exercise 2 Summary ---")
194
+ deltas_2 = [r["delta_mean_logprob"] for r in ex2_results]
195
+ c_means_2 = [r["c_mean_logprob"] for r in ex2_results]
196
+ u_means_2 = [r["u_mean_logprob"] for r in ex2_results]
197
+ print(f" Unconstrained mean logprobs: {[f'{x:.3f}' for x in u_means_2]}")
198
+ print(f" Constrained mean logprobs: {[f'{x:.3f}' for x in c_means_2]}")
199
+ print(f" Deltas: {[f'{x:.3f}' for x in deltas_2]}")
200
+ print(f" Mean delta: {sum(deltas_2)/len(deltas_2):.4f}")
201
+ print(f" Worst delta: {min(deltas_2):.4f}")
202
+
203
+ print("\n" + "="*80)
204
+ print("OVERALL RECOMMENDATION")
205
+ print("="*80)
206
+ all_deltas = deltas_1 + deltas_2
207
+ print(f"All deltas: {[f'{x:.3f}' for x in all_deltas]}")
208
+ print(f"Global mean delta: {sum(all_deltas)/len(all_deltas):.4f}")
209
+ print(f"Global worst delta: {min(all_deltas):.4f}")
210
+
211
+ # ---- Save reference scores to CSV ----
212
+ import csv
213
+
214
+ csv_path = "reference_scores.csv"
215
+ with open(csv_path, "w", newline="") as csvfile:
216
+ writer = csv.writer(csvfile)
217
+ writer.writerow([
218
+ "exercise", "prompt_index", "prompt",
219
+ "unconstrained_logprob", "reference_logprob", "reference_delta"
220
+ ])
221
+ for i, r in enumerate(ex1_results):
222
+ writer.writerow([
223
+ "exercise_1", i, r["prompt"],
224
+ f"{r['u_mean_logprob']:.6f}",
225
+ f"{r['c_mean_logprob']:.6f}",
226
+ f"{r['delta_mean_logprob']:.6f}",
227
+ ])
228
+ for i, r in enumerate(ex2_results):
229
+ writer.writerow([
230
+ "exercise_2", i, r["prompt"],
231
+ f"{r['u_mean_logprob']:.6f}",
232
+ f"{r['c_mean_logprob']:.6f}",
233
+ f"{r['delta_mean_logprob']:.6f}",
234
+ ])
235
+
236
+ print(f"\nReference scores saved to {csv_path}")
forbidden_solution.py CHANGED
@@ -30,18 +30,26 @@ class LaDisparition:
30
  self.processor = ForbidTokensLogitsProcessor(self.forbidden_token_ids)
31
 
32
  def __call__(self, prompt, max_tokens=30, beam_width=5):
33
- # Option 1: we use self.tokenizer to tokenize the prompt
34
- inputs = self.tokenizer(prompt, return_tensors="pt", return_attention_mask=True).to(self.model.device)
35
-
 
 
 
 
 
36
  outputs = self.model.generate(
37
- **inputs,
 
38
  max_new_tokens=max_tokens,
39
  num_beams=beam_width,
40
  logits_processor=[self.processor],
41
  do_sample=False
42
  )
43
 
44
- return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
45
 
46
 
47
  # --- EXERCISE 2: The Toulouse Sequence ---
@@ -133,7 +141,7 @@ class ToulouseSequence:
133
  if __name__ == "__main__":
134
  # NOTE: This block is for testing only. The evaluation server provides model and tokenizer.
135
  # SETUP
136
- MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
137
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
138
  model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32, device_map="auto")
139
  la_disparition_generator = LaDisparition(model, tokenizer)
 
30
  self.processor = ForbidTokensLogitsProcessor(self.forbidden_token_ids)
31
 
32
  def __call__(self, prompt, max_tokens=30, beam_width=5):
33
+ # Option 2: we use self.tokenizer.apply_chat_template to tokenize the prompt
34
+ message = [{"role": "user", "content": prompt}]
35
+ inputs = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt").to(self.model.device)
36
+
37
+ # Create an attention mask for the inputs
38
+ attention_mask = torch.ones_like(inputs)
39
+ prompt_length = inputs.shape[1]
40
+
41
  outputs = self.model.generate(
42
+ inputs,
43
+ attention_mask=attention_mask,
44
  max_new_tokens=max_tokens,
45
  num_beams=beam_width,
46
  logits_processor=[self.processor],
47
  do_sample=False
48
  )
49
 
50
+ # Return only the generated part
51
+ generated_tokens = outputs[0][prompt_length:]
52
+ return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
53
 
54
 
55
  # --- EXERCISE 2: The Toulouse Sequence ---
 
141
  if __name__ == "__main__":
142
  # NOTE: This block is for testing only. The evaluation server provides model and tokenizer.
143
  # SETUP
144
+ MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
145
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
146
  model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32, device_map="auto")
147
  la_disparition_generator = LaDisparition(model, tokenizer)
greedy.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Greedy/naive solution for comparison.
3
+ - Exercise 1: greedy decoding with token-level 'e' masking (same idea, simpler than beam search)
4
+ - Exercise 2: naive approach — forbid the first token of "Toulouse" and " Toulouse"
5
+ (tokens 'T' and ' T'), which is very aggressive and blocks ALL T-starting words.
6
+ """
7
+ from typing import List
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer
11
+
12
+
13
+ # --- EXERCISE 1: La disparition (No 'e' or 'E') ---
14
+ class LaDisparition:
15
+ """Greedy constrained generation: forbid tokens containing 'e', pick argmax."""
16
+ def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer):
17
+ self.model = model
18
+ self.tokenizer = tokenizer
19
+ self.forbidden_token_ids = set()
20
+ vocab = self.tokenizer.get_vocab()
21
+ for token_id in range(len(vocab)):
22
+ decoded = self.tokenizer.decode([token_id])
23
+ if 'e' in decoded.lower() or not all(ord(c) < 128 for c in decoded):
24
+ self.forbidden_token_ids.add(token_id)
25
+
26
+ def __call__(self, prompt, max_tokens=20):
27
+ message = [{"role": "user", "content": prompt}]
28
+ input_ids = self.tokenizer.apply_chat_template(
29
+ message, add_generation_prompt=True, return_tensors="pt"
30
+ ).to(self.model.device)
31
+ prompt_len = input_ids.shape[1]
32
+
33
+ seq = input_ids[0].tolist()
34
+ forbidden_list = list(self.forbidden_token_ids)
35
+
36
+ for step in range(max_tokens):
37
+ input_tensor = torch.tensor([seq], device=self.model.device)
38
+ with torch.no_grad():
39
+ outputs = self.model(input_tensor)
40
+ logits = outputs.logits[0, -1, :].clone()
41
+
42
+ # Mask forbidden tokens
43
+ logits[forbidden_list] = -float('inf')
44
+
45
+ next_token = torch.argmax(logits).item()
46
+ if next_token == self.tokenizer.eos_token_id:
47
+ break
48
+ seq.append(next_token)
49
+
50
+ generated_tokens = seq[prompt_len:]
51
+ return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
52
+
53
+
54
+ # --- EXERCISE 2: The Toulouse Sequence (naive approach) ---
55
+ class ToulouseSequence:
56
+ """
57
+ Naive approach: forbid the first token of "Toulouse" and " Toulouse".
58
+
59
+ "Toulouse" tokenizes as [T(68)][oul(9226)][ouse(1368)]
60
+ " Toulouse" tokenizes as [ T(312)][oul(9226)][ouse(1368)]
61
+
62
+ By forbidding tokens 68 ('T') and 312 (' T'), we block the model from
63
+ ever starting the word "Toulouse". This is very aggressive: it also blocks
64
+ ALL words starting with 'T' (e.g., "The", "This", "That", "They", ...).
65
+ """
66
+ def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer):
67
+ self.model = model
68
+ self.tokenizer = tokenizer
69
+ # Find the first token of "Toulouse" and " Toulouse"
70
+ toulouse_ids = self.tokenizer.encode("Toulouse", add_special_tokens=False)
71
+ space_toulouse_ids = self.tokenizer.encode(" Toulouse", add_special_tokens=False)
72
+ self.forbidden_token_ids = {toulouse_ids[0], space_toulouse_ids[0]}
73
+ print(f"[ToulouseSequence naive] Forbidden first tokens: {self.forbidden_token_ids}")
74
+
75
+ def __call__(self, prompt, max_tokens=20):
76
+ message = [{"role": "user", "content": prompt}]
77
+ inputs = self.tokenizer.apply_chat_template(
78
+ message, add_generation_prompt=True, return_tensors="pt"
79
+ ).to(self.model.device)
80
+ prompt_length = inputs.shape[1]
81
+
82
+ seq = inputs[0].tolist()
83
+ forbidden_list = list(self.forbidden_token_ids)
84
+
85
+ for step in range(max_tokens):
86
+ input_tensor = torch.tensor([seq], device=self.model.device)
87
+ with torch.no_grad():
88
+ outputs = self.model(input_tensor)
89
+ logits = outputs.logits[0, -1, :].clone()
90
+
91
+ # Mask forbidden tokens
92
+ logits[forbidden_list] = -float('inf')
93
+
94
+ next_token = torch.argmax(logits).item()
95
+ if next_token == self.tokenizer.eos_token_id:
96
+ break
97
+ seq.append(next_token)
98
+
99
+ generated_tokens = seq[prompt_length:]
100
+ return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
101
+
102
+
103
+ if __name__ == "__main__":
104
+ MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
105
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
106
+ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float16, device_map="auto")
107
+
108
+ print("=== Exercise 1: La Disparition (no 'e') ===")
109
+ ex1 = LaDisparition(model, tokenizer)
110
+ for prompt in ["Who is the king of the jungle?", "Name a fruit that is red."]:
111
+ result = ex1(prompt)
112
+ has_e = 'e' in result.lower()
113
+ print(f" Q: {prompt}")
114
+ print(f" A: {result}")
115
+ print(f" {'✗ FAIL' if has_e else '✓ PASS'}\n")
116
+
117
+ print("=== Exercise 2: No Toulouse (naive) ===")
118
+ ex2 = ToulouseSequence(model, tokenizer)
119
+ for prompt in [
120
+ "Where is the headquarters of Airbus located?",
121
+ "In which French city can you find the Place du Capitole?",
122
+ ]:
123
+ result = ex2(prompt)
124
+ has_toulouse = 'toulouse' in result.lower()
125
+ print(f" Q: {prompt}")
126
+ print(f" A: {result}")
127
+ print(f" {'✗ FAIL' if has_toulouse else '✓ PASS'}\n")
reference_scores.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exercise,prompt_index,prompt,unconstrained_logprob,reference_logprob,reference_delta
2
+ exercise_1,0,Who is the king of the jungle?,-0.438915,-1.160570,-0.721655
3
+ exercise_1,1,Complete this: Once upon a...,-0.333118,-1.340559,-1.007441
4
+ exercise_1,2,What is the opposite of 'always'?,-0.204553,-1.405256,-1.200703
5
+ exercise_1,3,Name a fruit that is red.,-0.428213,-1.045869,-0.617657
6
+ exercise_1,4,What do you use to see things?,-0.485363,-1.388507,-0.903145
7
+ exercise_2,0,Where is the headquarters of Airbus located?,-0.372244,-0.689966,-0.317723
8
+ exercise_2,1,Complete this sentence: The Airbus A380 is assembled in the city of,-0.145879,-0.765407,-0.619529
9
+ exercise_2,2,Which city in southern France hosts the Cité de l'Espace space museum?,-0.169176,-0.571546,-0.402370
10
+ exercise_2,3,In which French city can you find the Place du Capitole?,-0.145984,-0.815522,-0.669538
11
+ exercise_2,4,Which French city is home to both Airbus and the Space Centre?,-0.213124,-1.147179,-0.934054
solution.py CHANGED
@@ -27,9 +27,9 @@ class LaDisparition:
27
  self.forbidden_token_ids.add(token_id)
28
 
29
  def __call__(self, prompt, max_tokens=20, beam_width=5):
30
- # Option 1: we use self.tokenizer to tokenize the prompt
31
- inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
32
- input_ids = inputs["input_ids"]
33
  prompt_len = input_ids.shape[1]
34
 
35
  # Beam search: maintain multiple hypotheses
@@ -86,9 +86,10 @@ class LaDisparition:
86
  decoded = self.tokenizer.decode(seq, skip_special_tokens=True)
87
  print(f" Beam {i}: log_prob={log_prob:.4f} | {decoded}")
88
 
89
- # Return the best hypothesis
90
  best_seq = beams[0][0]
91
- return self.tokenizer.decode(best_seq, skip_special_tokens=True)
 
92
 
93
 
94
  # --- EXERCISE 2: The Toulouse Sequence ---
@@ -191,7 +192,7 @@ class ToulouseSequence:
191
  if __name__ == "__main__":
192
  # NOTE: This block is for testing only. The evaluation server provides model and tokenizer.
193
  # SETUP
194
- MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
195
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
196
  model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
197
  la_disparition_generator = LaDisparition(model, tokenizer)
 
27
  self.forbidden_token_ids.add(token_id)
28
 
29
  def __call__(self, prompt, max_tokens=20, beam_width=5):
30
+ # Option 2: we use self.tokenizer.apply_chat_template to tokenize the prompt
31
+ message = [{"role": "user", "content": prompt}]
32
+ input_ids = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt").to(self.model.device)
33
  prompt_len = input_ids.shape[1]
34
 
35
  # Beam search: maintain multiple hypotheses
 
86
  decoded = self.tokenizer.decode(seq, skip_special_tokens=True)
87
  print(f" Beam {i}: log_prob={log_prob:.4f} | {decoded}")
88
 
89
+ # Return the best hypothesis (only the generated part)
90
  best_seq = beams[0][0]
91
+ generated_tokens = best_seq[prompt_len:]
92
+ return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
93
 
94
 
95
  # --- EXERCISE 2: The Toulouse Sequence ---
 
192
  if __name__ == "__main__":
193
  # NOTE: This block is for testing only. The evaluation server provides model and tokenizer.
194
  # SETUP
195
+ MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
196
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
197
  model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
198
  la_disparition_generator = LaDisparition(model, tokenizer)
test_cases.json CHANGED
@@ -7,10 +7,10 @@
7
  "What do you use to see things?"
8
  ],
9
  "exercise_2": [
10
- "Which French city is known as the 'Ville Rose'?",
11
  "Where is the headquarters of Airbus located?",
12
- "Name a major city in the Occitanie region crossed by the Garonne River.",
13
- "What French city is famous for its aerospace industry and has a historic basilica called Saint-Sernin?",
14
- "If you are at the Cité de l'Espace, which city are you in?"
 
15
  ]
16
  }
 
7
  "What do you use to see things?"
8
  ],
9
  "exercise_2": [
 
10
  "Where is the headquarters of Airbus located?",
11
+ "Complete this sentence: The Airbus A380 is assembled in the city of",
12
+ "Which city in southern France hosts the Cité de l'Espace space museum?",
13
+ "In which French city can you find the Place du Capitole?",
14
+ "Which French city is home to both Airbus and the Space Centre?"
15
  ]
16
  }