Spaces:
Running
Running
Commit ·
4d8bbd9
1
Parent(s): 1d7752e
Improved logprob-based scoring
Browse files- app.py +157 -12
- calibrate_logprobs.py +236 -0
- forbidden_solution.py +14 -6
- greedy.py +127 -0
- reference_scores.csv +11 -0
- solution.py +7 -6
- test_cases.json +4 -4
app.py
CHANGED
|
@@ -2,6 +2,7 @@ import gradio as gr
|
|
| 2 |
import importlib.util
|
| 3 |
import json
|
| 4 |
import torch
|
|
|
|
| 5 |
import gc
|
| 6 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 7 |
import threading
|
|
@@ -26,6 +27,120 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
| 26 |
with open("test_cases.json", "r") as f:
|
| 27 |
TEST_CASES = json.load(f)
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
class TimeoutException(Exception):
|
| 30 |
pass
|
| 31 |
|
|
@@ -201,6 +316,7 @@ def evaluate_submission(file_obj, debug=False):
|
|
| 201 |
ex1_passed = 0
|
| 202 |
ex1_timeout = False
|
| 203 |
ex1_outputs = []
|
|
|
|
| 204 |
try:
|
| 205 |
print("### EXERCISE 1 - La Disparition (No 'e')")
|
| 206 |
ex1_instance = student_module.LaDisparition(model, tokenizer)
|
|
@@ -218,35 +334,49 @@ def evaluate_submission(file_obj, debug=False):
|
|
| 218 |
)
|
| 219 |
# Remove prompt from output to only validate generated text
|
| 220 |
cleaned_output = strip_prompt_from_output(output, prompt)
|
| 221 |
-
# assistant_response = extract_assistant_response(cleaned_output)
|
| 222 |
|
| 223 |
print(f"Response: {cleaned_output}")
|
| 224 |
|
| 225 |
passed = 'e' not in cleaned_output.lower() and len(cleaned_output.strip()) > 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
if passed:
|
| 227 |
ex1_passed += 1
|
| 228 |
-
ex1_outputs.append({
|
|
|
|
|
|
|
|
|
|
| 229 |
if debug:
|
| 230 |
print(f"Ex1 Test {i+1}: {'✓' if passed else '✗'}")
|
| 231 |
print(f" Prompt: {prompt}")
|
| 232 |
print(f" Output: {output}")
|
|
|
|
| 233 |
print()
|
| 234 |
except TimeoutException:
|
| 235 |
ex1_timeout = True
|
| 236 |
-
ex1_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False})
|
|
|
|
| 237 |
print(f"Result: ✗ TIMEOUT")
|
| 238 |
break
|
| 239 |
|
| 240 |
-
|
|
|
|
| 241 |
if ex1_timeout:
|
| 242 |
report.append(f" **Ex 1 (No 'e'):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
|
| 243 |
else:
|
| 244 |
-
report.append(f" **Ex 1 (No 'e'):** {ex1_passed}/5 correct")
|
| 245 |
|
| 246 |
if debug:
|
| 247 |
report.append("\n### Ex 1 Outputs:")
|
| 248 |
for i, out in enumerate(ex1_outputs):
|
| 249 |
-
|
|
|
|
| 250 |
except Exception as e:
|
| 251 |
report.append(f" **Ex 1 Error:** {str(e)}")
|
| 252 |
|
|
@@ -254,6 +384,7 @@ def evaluate_submission(file_obj, debug=False):
|
|
| 254 |
ex2_passed = 0
|
| 255 |
ex2_timeout = False
|
| 256 |
ex2_outputs = []
|
|
|
|
| 257 |
try:
|
| 258 |
print("\n### EXERCISE 2 - Toulouse Sequence (No 'Toulouse')")
|
| 259 |
ex2_instance = student_module.ToulouseSequence(model, tokenizer)
|
|
@@ -270,35 +401,49 @@ def evaluate_submission(file_obj, debug=False):
|
|
| 270 |
)
|
| 271 |
# Remove prompt from output to only validate generated text
|
| 272 |
cleaned_output = strip_prompt_from_output(output, prompt)
|
| 273 |
-
# assistant_response = extract_assistant_response(cleaned_output)
|
| 274 |
|
| 275 |
print(f"Response: {cleaned_output}")
|
| 276 |
|
| 277 |
passed = "toulouse" not in cleaned_output.lower() and len(cleaned_output.strip()) > 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
if passed:
|
| 279 |
ex2_passed += 1
|
| 280 |
-
ex2_outputs.append({
|
|
|
|
|
|
|
|
|
|
| 281 |
if debug:
|
| 282 |
print(f"Ex2 Test {i+1}: {'✓' if passed else '✗'}")
|
| 283 |
print(f" Prompt: {prompt}")
|
| 284 |
print(f" Output: {output}")
|
|
|
|
| 285 |
print()
|
| 286 |
except TimeoutException:
|
| 287 |
ex2_timeout = True
|
| 288 |
-
ex2_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False})
|
|
|
|
| 289 |
print(f"Result: ✗ TIMEOUT")
|
| 290 |
break
|
| 291 |
|
| 292 |
-
|
|
|
|
| 293 |
if ex2_timeout:
|
| 294 |
report.append(f" **Ex 2 (No Toulouse):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
|
| 295 |
else:
|
| 296 |
-
report.append(f" **Ex 2 (No Toulouse):** {ex2_passed}/5 correct")
|
| 297 |
|
| 298 |
if debug:
|
| 299 |
report.append("\n### Ex 2 Outputs:")
|
| 300 |
for i, out in enumerate(ex2_outputs):
|
| 301 |
-
|
|
|
|
| 302 |
except Exception as e:
|
| 303 |
report.append(f" **Ex 2 Error:** {str(e)}")
|
| 304 |
|
|
|
|
| 2 |
import importlib.util
|
| 3 |
import json
|
| 4 |
import torch
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
import gc
|
| 7 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 8 |
import threading
|
|
|
|
| 27 |
with open("test_cases.json", "r") as f:
|
| 28 |
TEST_CASES = json.load(f)
|
| 29 |
|
| 30 |
+
# --- PER-PROMPT REFERENCE SCORING ---
# Load reference scores from CSV (generated by calibrate_logprobs.py from solution.py).
# Each prompt has: unconstrained_logprob (baseline) and reference_delta (solution.py delta).
# Quality = 1 if student is as good or better than solution.py, decreasing for worse.
import csv

REFERENCE_SCORES = {}  # key: (exercise, prompt_index) → dict

with open("reference_scores.csv", "r") as csvfile:
    for record in csv.DictReader(csvfile):
        REFERENCE_SCORES[(record["exercise"], int(record["prompt_index"]))] = {
            "prompt": record["prompt"],
            "unconstrained_logprob": float(record["unconstrained_logprob"]),
            "reference_logprob": float(record["reference_logprob"]),
            "reference_delta": float(record["reference_delta"]),
        }
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def compute_mean_logprob(prompt_text, generated_text):
    """
    Compute the mean log-probability per token of `generated_text`
    conditioned on `prompt_text`, under the unconstrained model.

    Uses the chat template since the evaluation model is an instruct model.
    This measures how "natural" the generated text is: a well-constrained
    generator still produces coherent text (high logprob), while a bad one
    produces gibberish (low logprob).

    Args:
        prompt_text: the user prompt (wrapped in the chat template).
        generated_text: the candidate continuation to score.

    Returns:
        (mean_logprob, n_tokens); mean_logprob is -inf when there is
        nothing to score (empty or whitespace-only generation).
    """
    if not generated_text or not generated_text.strip():
        return -float('inf'), 0

    # Always use chat template: the model is an instruct model, so
    # logprobs are meaningful only in the chat context.
    message = [{"role": "user", "content": prompt_text}]
    prompt_ids = tokenizer.apply_chat_template(
        message, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    gen_ids = tokenizer.encode(
        generated_text, add_special_tokens=False, return_tensors="pt"
    ).to(model.device)
    full_ids = torch.cat([prompt_ids, gen_ids], dim=1)
    prompt_len = prompt_ids.shape[1]

    if full_ids.shape[1] <= prompt_len:
        return -float('inf'), 0

    with torch.no_grad():
        logits = model(full_ids).logits

    log_probs = F.log_softmax(logits, dim=-1)

    # Vectorized scoring: the logits at position i-1 predict token i, so the
    # scores of the generated tokens live at positions [prompt_len-1, T-1).
    # This replaces the per-token Python loop with one gather on the GPU.
    targets = full_ids[0, prompt_len:]
    token_logprobs = log_probs[0, prompt_len - 1:-1].gather(
        1, targets.unsqueeze(1)
    ).squeeze(1)

    n_tokens = targets.shape[0]
    # Accumulate in float32 for a stable mean even with a fp16 model.
    mean_logprob = (token_logprobs.float().sum() / n_tokens).item()
    return mean_logprob, n_tokens
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def compute_quality_score(mean_logprob, exercise_key, prompt_index):
    """
    Per-prompt quality score in [0, 1] using reference deltas from solution.py.

    The student's delta (student logprob minus the unconstrained baseline
    logprob) is compared against the reference delta from solution.py; both
    deltas are normally negative (constrained generation is less likely).

    - student delta >= reference delta  -> 1.0 (as good as or better)
    - student delta <= 3x reference     -> 0.0 (generous floor)
    - otherwise: linear interpolation between those two endpoints.

    If no reference entry exists for (exercise_key, prompt_index), fall back
    to a coarse threshold on the raw mean logprob.
    """
    entry = REFERENCE_SCORES.get((exercise_key, prompt_index))
    if entry is None:
        # Fallback: without reference data, accept any non-terrible logprob.
        return 1.0 if mean_logprob > -5.0 else 0.0

    baseline = entry["unconstrained_logprob"]
    ref_delta = entry["reference_delta"]  # negative value

    delta = mean_logprob - baseline  # negative value

    if delta >= ref_delta:
        # As good as or better than the reference solution.
        return 1.0

    if ref_delta == 0:
        return 0.0

    # Student is worse than the reference: quality decays linearly from 1.0
    # at ref_delta down to 0.0 at the floor of 3x ref_delta
    # (e.g. ref=-0.9 -> floor=-2.7).
    floor = 3.0 * ref_delta
    if delta <= floor:
        return 0.0

    score = (delta - floor) / (ref_delta - floor)
    return max(0.0, min(1.0, score))
|
| 142 |
+
|
| 143 |
+
|
| 144 |
class TimeoutException(Exception):
|
| 145 |
pass
|
| 146 |
|
|
|
|
| 316 |
ex1_passed = 0
|
| 317 |
ex1_timeout = False
|
| 318 |
ex1_outputs = []
|
| 319 |
+
ex1_quality_scores = []
|
| 320 |
try:
|
| 321 |
print("### EXERCISE 1 - La Disparition (No 'e')")
|
| 322 |
ex1_instance = student_module.LaDisparition(model, tokenizer)
|
|
|
|
| 334 |
)
|
| 335 |
# Remove prompt from output to only validate generated text
|
| 336 |
cleaned_output = strip_prompt_from_output(output, prompt)
|
|
|
|
| 337 |
|
| 338 |
print(f"Response: {cleaned_output}")
|
| 339 |
|
| 340 |
passed = 'e' not in cleaned_output.lower() and len(cleaned_output.strip()) > 3
|
| 341 |
+
|
| 342 |
+
# Compute logprob quality score
|
| 343 |
+
mean_lp, n_tok = compute_mean_logprob(prompt, cleaned_output)
|
| 344 |
+
quality = compute_quality_score(mean_lp, "exercise_1", i) if passed else 0.0
|
| 345 |
+
ex1_quality_scores.append(quality)
|
| 346 |
+
|
| 347 |
+
print(f" Constraint passed: {passed} | mean_logprob: {mean_lp:.3f} | quality: {quality:.2f}")
|
| 348 |
+
|
| 349 |
if passed:
|
| 350 |
ex1_passed += 1
|
| 351 |
+
ex1_outputs.append({
|
| 352 |
+
"prompt": prompt, "output": cleaned_output, "passed": passed,
|
| 353 |
+
"mean_logprob": mean_lp, "quality": quality
|
| 354 |
+
})
|
| 355 |
if debug:
|
| 356 |
print(f"Ex1 Test {i+1}: {'✓' if passed else '✗'}")
|
| 357 |
print(f" Prompt: {prompt}")
|
| 358 |
print(f" Output: {output}")
|
| 359 |
+
print(f" mean_logprob={mean_lp:.4f}, quality={quality:.2f}")
|
| 360 |
print()
|
| 361 |
except TimeoutException:
|
| 362 |
ex1_timeout = True
|
| 363 |
+
ex1_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False, "mean_logprob": float('-inf'), "quality": 0.0})
|
| 364 |
+
ex1_quality_scores.append(0.0)
|
| 365 |
print(f"Result: ✗ TIMEOUT")
|
| 366 |
break
|
| 367 |
|
| 368 |
+
ex1_avg_quality = sum(ex1_quality_scores) / len(ex1_quality_scores) if ex1_quality_scores else 0.0
|
| 369 |
+
print(f"\nExercise 1 Score: {ex1_passed}/5 | Avg quality: {ex1_avg_quality:.2f}")
|
| 370 |
if ex1_timeout:
|
| 371 |
report.append(f" **Ex 1 (No 'e'):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
|
| 372 |
else:
|
| 373 |
+
report.append(f" **Ex 1 (No 'e'):** {ex1_passed}/5 correct | Quality: {ex1_avg_quality:.0%}")
|
| 374 |
|
| 375 |
if debug:
|
| 376 |
report.append("\n### Ex 1 Outputs:")
|
| 377 |
for i, out in enumerate(ex1_outputs):
|
| 378 |
+
lp_str = f"logprob={out['mean_logprob']:.2f}" if out['mean_logprob'] != float('-inf') else "logprob=N/A"
|
| 379 |
+
report.append(f"{i+1}. {'✓' if out['passed'] else '✗'} [{lp_str}, q={out['quality']:.2f}] `{out['output']}`")
|
| 380 |
except Exception as e:
|
| 381 |
report.append(f" **Ex 1 Error:** {str(e)}")
|
| 382 |
|
|
|
|
| 384 |
ex2_passed = 0
|
| 385 |
ex2_timeout = False
|
| 386 |
ex2_outputs = []
|
| 387 |
+
ex2_quality_scores = []
|
| 388 |
try:
|
| 389 |
print("\n### EXERCISE 2 - Toulouse Sequence (No 'Toulouse')")
|
| 390 |
ex2_instance = student_module.ToulouseSequence(model, tokenizer)
|
|
|
|
| 401 |
)
|
| 402 |
# Remove prompt from output to only validate generated text
|
| 403 |
cleaned_output = strip_prompt_from_output(output, prompt)
|
|
|
|
| 404 |
|
| 405 |
print(f"Response: {cleaned_output}")
|
| 406 |
|
| 407 |
passed = "toulouse" not in cleaned_output.lower() and len(cleaned_output.strip()) > 3
|
| 408 |
+
|
| 409 |
+
# Compute logprob quality score
|
| 410 |
+
mean_lp, n_tok = compute_mean_logprob(prompt, cleaned_output)
|
| 411 |
+
quality = compute_quality_score(mean_lp, "exercise_2", i) if passed else 0.0
|
| 412 |
+
ex2_quality_scores.append(quality)
|
| 413 |
+
|
| 414 |
+
print(f" Constraint passed: {passed} | mean_logprob: {mean_lp:.3f} | quality: {quality:.2f}")
|
| 415 |
+
|
| 416 |
if passed:
|
| 417 |
ex2_passed += 1
|
| 418 |
+
ex2_outputs.append({
|
| 419 |
+
"prompt": prompt, "output": cleaned_output, "passed": passed,
|
| 420 |
+
"mean_logprob": mean_lp, "quality": quality
|
| 421 |
+
})
|
| 422 |
if debug:
|
| 423 |
print(f"Ex2 Test {i+1}: {'✓' if passed else '✗'}")
|
| 424 |
print(f" Prompt: {prompt}")
|
| 425 |
print(f" Output: {output}")
|
| 426 |
+
print(f" mean_logprob={mean_lp:.4f}, quality={quality:.2f}")
|
| 427 |
print()
|
| 428 |
except TimeoutException:
|
| 429 |
ex2_timeout = True
|
| 430 |
+
ex2_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False, "mean_logprob": float('-inf'), "quality": 0.0})
|
| 431 |
+
ex2_quality_scores.append(0.0)
|
| 432 |
print(f"Result: ✗ TIMEOUT")
|
| 433 |
break
|
| 434 |
|
| 435 |
+
ex2_avg_quality = sum(ex2_quality_scores) / len(ex2_quality_scores) if ex2_quality_scores else 0.0
|
| 436 |
+
print(f"\nExercise 2 Score: {ex2_passed}/5 | Avg quality: {ex2_avg_quality:.2f}")
|
| 437 |
if ex2_timeout:
|
| 438 |
report.append(f" **Ex 2 (No Toulouse):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
|
| 439 |
else:
|
| 440 |
+
report.append(f" **Ex 2 (No Toulouse):** {ex2_passed}/5 correct | Quality: {ex2_avg_quality:.0%}")
|
| 441 |
|
| 442 |
if debug:
|
| 443 |
report.append("\n### Ex 2 Outputs:")
|
| 444 |
for i, out in enumerate(ex2_outputs):
|
| 445 |
+
lp_str = f"logprob={out['mean_logprob']:.2f}" if out['mean_logprob'] != float('-inf') else "logprob=N/A"
|
| 446 |
+
report.append(f"{i+1}. {'✓' if out['passed'] else '✗'} [{lp_str}, q={out['quality']:.2f}] `{out['output']}`")
|
| 447 |
except Exception as e:
|
| 448 |
report.append(f" **Ex 2 Error:** {str(e)}")
|
| 449 |
|
calibrate_logprobs.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Calibration script: compute logprobs for reference solution outputs
|
| 3 |
+
vs unconstrained model outputs to design a scoring function.
|
| 4 |
+
"""
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
import json
|
| 8 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 9 |
+
|
| 10 |
+
EVAL_MODEL = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
|
| 11 |
+
|
| 12 |
+
print("Loading model...")
|
| 13 |
+
tokenizer = AutoTokenizer.from_pretrained(EVAL_MODEL)
|
| 14 |
+
if tokenizer.pad_token is None:
|
| 15 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 16 |
+
model = AutoModelForCausalLM.from_pretrained(EVAL_MODEL, dtype=torch.float16, device_map="auto")
|
| 17 |
+
|
| 18 |
+
with open("test_cases.json", "r") as f:
|
| 19 |
+
TEST_CASES = json.load(f)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def compute_chat_logprobs(model, tokenizer, prompt, generated_text):
    """
    Compute logprobs using chat template (works for both exercises).
    The prompt is formatted as a chat message, generated_text is the response.

    Returns:
        mean_logprob: mean log-prob per generated token
        total_logprob: sum of log-probs
        n_tokens: number of generated tokens
        per_token: list of (token_str, logprob) pairs
    """
    if not generated_text or not generated_text.strip():
        return -float('inf'), 0.0, 0, []

    message = [{"role": "user", "content": prompt}]
    prompt_ids = tokenizer.apply_chat_template(
        message, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    prompt_len = prompt_ids.shape[1]

    gen_ids = tokenizer.encode(
        generated_text, add_special_tokens=False, return_tensors="pt"
    ).to(model.device)

    full_ids = torch.cat([prompt_ids, gen_ids], dim=1)

    if full_ids.shape[1] <= prompt_len:
        return -float('inf'), 0.0, 0, []

    with torch.no_grad():
        logits = model(full_ids).logits

    log_probs = F.log_softmax(logits, dim=-1)

    # Vectorized scoring: logits at position i-1 predict token i, so the
    # generated tokens' scores sit at positions [prompt_len-1, T-1).
    # One gather replaces the per-token Python indexing loop.
    token_ids = full_ids[0, prompt_len:]
    token_logprobs = log_probs[0, prompt_len - 1:-1].gather(
        1, token_ids.unsqueeze(1)
    ).squeeze(1)

    per_token = [
        (tokenizer.decode([tid]), lp)
        for tid, lp in zip(token_ids.tolist(), token_logprobs.tolist())
    ]
    n_tokens = len(per_token)
    total_logprob = sum(lp for _, lp in per_token)

    mean_logprob = total_logprob / n_tokens if n_tokens > 0 else -float('inf')
    return mean_logprob, total_logprob, n_tokens, per_token
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def generate_unconstrained_chat(model, tokenizer, prompt, max_tokens=20):
    """Greedy, unconstrained generation through the chat template.

    Works for both exercises: wraps the prompt as a user chat message and
    returns only the newly generated text (prompt stripped), decoded without
    special tokens and trimmed of surrounding whitespace.
    """
    chat = [{"role": "user", "content": prompt}]
    input_ids = tokenizer.apply_chat_template(
        chat, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    n_prompt = input_ids.shape[1]

    with torch.no_grad():
        result = model.generate(
            input_ids,
            attention_mask=torch.ones_like(input_ids),
            max_new_tokens=max_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )

    new_tokens = result[0][n_prompt:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# ---- Load and run the reference solution ----
|
| 93 |
+
import importlib.util
|
| 94 |
+
import sys
|
| 95 |
+
import time
|
| 96 |
+
|
| 97 |
+
module_name = f"solution_module_{int(time.time())}"
|
| 98 |
+
spec = importlib.util.spec_from_file_location(module_name, "solution.py")
|
| 99 |
+
solution = importlib.util.module_from_spec(spec)
|
| 100 |
+
sys.modules[module_name] = solution
|
| 101 |
+
spec.loader.exec_module(solution)
|
| 102 |
+
|
| 103 |
+
print("\n" + "="*80)
|
| 104 |
+
print("EXERCISE 1: La Disparition (no 'e')")
|
| 105 |
+
print("="*80)
|
| 106 |
+
|
| 107 |
+
ex1_instance = solution.LaDisparition(model, tokenizer)
|
| 108 |
+
|
| 109 |
+
ex1_results = []
|
| 110 |
+
for i, prompt in enumerate(TEST_CASES["exercise_1"]):
|
| 111 |
+
# Generate constrained output
|
| 112 |
+
constrained_output = ex1_instance(prompt, max_tokens=20)
|
| 113 |
+
# Strip prompt from output
|
| 114 |
+
if constrained_output.startswith(prompt):
|
| 115 |
+
constrained_gen = constrained_output[len(prompt):].strip()
|
| 116 |
+
else:
|
| 117 |
+
constrained_gen = constrained_output.strip()
|
| 118 |
+
|
| 119 |
+
# Generate unconstrained output (chat template for instruct model)
|
| 120 |
+
unconstrained_gen = generate_unconstrained_chat(model, tokenizer, prompt, max_tokens=20)
|
| 121 |
+
|
| 122 |
+
# Compute logprobs using chat template (matches how the model should be used)
|
| 123 |
+
c_mean, c_total, c_ntok, c_per = compute_chat_logprobs(model, tokenizer, prompt, constrained_gen)
|
| 124 |
+
|
| 125 |
+
# Compute logprobs for unconstrained output
|
| 126 |
+
u_mean, u_total, u_ntok, u_per = compute_chat_logprobs(model, tokenizer, prompt, unconstrained_gen)
|
| 127 |
+
|
| 128 |
+
delta = c_mean - u_mean # will be negative (constrained is worse)
|
| 129 |
+
|
| 130 |
+
print(f"\nTest {i+1}: {prompt}")
|
| 131 |
+
print(f" Unconstrained: {unconstrained_gen}")
|
| 132 |
+
print(f" mean_logprob={u_mean:.4f}, n_tokens={u_ntok}")
|
| 133 |
+
print(f" Constrained: {constrained_gen}")
|
| 134 |
+
print(f" mean_logprob={c_mean:.4f}, n_tokens={c_ntok}")
|
| 135 |
+
print(f" Delta (constrained - unconstrained): {delta:.4f}")
|
| 136 |
+
|
| 137 |
+
ex1_results.append({
|
| 138 |
+
"prompt": prompt,
|
| 139 |
+
"constrained_gen": constrained_gen,
|
| 140 |
+
"unconstrained_gen": unconstrained_gen,
|
| 141 |
+
"c_mean_logprob": c_mean,
|
| 142 |
+
"u_mean_logprob": u_mean,
|
| 143 |
+
"delta_mean_logprob": delta,
|
| 144 |
+
})
|
| 145 |
+
|
| 146 |
+
print(f"\n--- Exercise 1 Summary ---")
|
| 147 |
+
deltas_1 = [r["delta_mean_logprob"] for r in ex1_results]
|
| 148 |
+
c_means_1 = [r["c_mean_logprob"] for r in ex1_results]
|
| 149 |
+
u_means_1 = [r["u_mean_logprob"] for r in ex1_results]
|
| 150 |
+
print(f" Unconstrained mean logprobs: {[f'{x:.3f}' for x in u_means_1]}")
|
| 151 |
+
print(f" Constrained mean logprobs: {[f'{x:.3f}' for x in c_means_1]}")
|
| 152 |
+
print(f" Deltas: {[f'{x:.3f}' for x in deltas_1]}")
|
| 153 |
+
print(f" Mean delta: {sum(deltas_1)/len(deltas_1):.4f}")
|
| 154 |
+
print(f" Worst delta: {min(deltas_1):.4f}")
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
print("\n" + "="*80)
|
| 158 |
+
print("EXERCISE 2: Toulouse Sequence (no 'Toulouse')")
|
| 159 |
+
print("="*80)
|
| 160 |
+
|
| 161 |
+
ex2_instance = solution.ToulouseSequence(model, tokenizer)
|
| 162 |
+
|
| 163 |
+
ex2_results = []
|
| 164 |
+
for i, prompt in enumerate(TEST_CASES["exercise_2"]):
|
| 165 |
+
# Generate constrained output
|
| 166 |
+
constrained_gen = ex2_instance(prompt, max_tokens=20)
|
| 167 |
+
|
| 168 |
+
# Generate unconstrained output (chat format)
|
| 169 |
+
unconstrained_gen = generate_unconstrained_chat(model, tokenizer, prompt, max_tokens=20)
|
| 170 |
+
|
| 171 |
+
# Compute logprobs (chat format)
|
| 172 |
+
c_mean, c_total, c_ntok, c_per = compute_chat_logprobs(model, tokenizer, prompt, constrained_gen)
|
| 173 |
+
u_mean, u_total, u_ntok, u_per = compute_chat_logprobs(model, tokenizer, prompt, unconstrained_gen)
|
| 174 |
+
|
| 175 |
+
delta = c_mean - u_mean
|
| 176 |
+
|
| 177 |
+
print(f"\nTest {i+1}: {prompt}")
|
| 178 |
+
print(f" Unconstrained: {unconstrained_gen}")
|
| 179 |
+
print(f" mean_logprob={u_mean:.4f}, n_tokens={u_ntok}")
|
| 180 |
+
print(f" Constrained: {constrained_gen}")
|
| 181 |
+
print(f" mean_logprob={c_mean:.4f}, n_tokens={c_ntok}")
|
| 182 |
+
print(f" Delta (constrained - unconstrained): {delta:.4f}")
|
| 183 |
+
|
| 184 |
+
ex2_results.append({
|
| 185 |
+
"prompt": prompt,
|
| 186 |
+
"constrained_gen": constrained_gen,
|
| 187 |
+
"unconstrained_gen": unconstrained_gen,
|
| 188 |
+
"c_mean_logprob": c_mean,
|
| 189 |
+
"u_mean_logprob": u_mean,
|
| 190 |
+
"delta_mean_logprob": delta,
|
| 191 |
+
})
|
| 192 |
+
|
| 193 |
+
print(f"\n--- Exercise 2 Summary ---")
|
| 194 |
+
deltas_2 = [r["delta_mean_logprob"] for r in ex2_results]
|
| 195 |
+
c_means_2 = [r["c_mean_logprob"] for r in ex2_results]
|
| 196 |
+
u_means_2 = [r["u_mean_logprob"] for r in ex2_results]
|
| 197 |
+
print(f" Unconstrained mean logprobs: {[f'{x:.3f}' for x in u_means_2]}")
|
| 198 |
+
print(f" Constrained mean logprobs: {[f'{x:.3f}' for x in c_means_2]}")
|
| 199 |
+
print(f" Deltas: {[f'{x:.3f}' for x in deltas_2]}")
|
| 200 |
+
print(f" Mean delta: {sum(deltas_2)/len(deltas_2):.4f}")
|
| 201 |
+
print(f" Worst delta: {min(deltas_2):.4f}")
|
| 202 |
+
|
| 203 |
+
print("\n" + "="*80)
|
| 204 |
+
print("OVERALL RECOMMENDATION")
|
| 205 |
+
print("="*80)
|
| 206 |
+
all_deltas = deltas_1 + deltas_2
|
| 207 |
+
print(f"All deltas: {[f'{x:.3f}' for x in all_deltas]}")
|
| 208 |
+
print(f"Global mean delta: {sum(all_deltas)/len(all_deltas):.4f}")
|
| 209 |
+
print(f"Global worst delta: {min(all_deltas):.4f}")
|
| 210 |
+
|
| 211 |
+
# ---- Save reference scores to CSV ----
|
| 212 |
+
import csv
|
| 213 |
+
|
| 214 |
+
csv_path = "reference_scores.csv"
|
| 215 |
+
with open(csv_path, "w", newline="") as csvfile:
|
| 216 |
+
writer = csv.writer(csvfile)
|
| 217 |
+
writer.writerow([
|
| 218 |
+
"exercise", "prompt_index", "prompt",
|
| 219 |
+
"unconstrained_logprob", "reference_logprob", "reference_delta"
|
| 220 |
+
])
|
| 221 |
+
for i, r in enumerate(ex1_results):
|
| 222 |
+
writer.writerow([
|
| 223 |
+
"exercise_1", i, r["prompt"],
|
| 224 |
+
f"{r['u_mean_logprob']:.6f}",
|
| 225 |
+
f"{r['c_mean_logprob']:.6f}",
|
| 226 |
+
f"{r['delta_mean_logprob']:.6f}",
|
| 227 |
+
])
|
| 228 |
+
for i, r in enumerate(ex2_results):
|
| 229 |
+
writer.writerow([
|
| 230 |
+
"exercise_2", i, r["prompt"],
|
| 231 |
+
f"{r['u_mean_logprob']:.6f}",
|
| 232 |
+
f"{r['c_mean_logprob']:.6f}",
|
| 233 |
+
f"{r['delta_mean_logprob']:.6f}",
|
| 234 |
+
])
|
| 235 |
+
|
| 236 |
+
print(f"\nReference scores saved to {csv_path}")
|
forbidden_solution.py
CHANGED
|
@@ -30,18 +30,26 @@ class LaDisparition:
|
|
| 30 |
self.processor = ForbidTokensLogitsProcessor(self.forbidden_token_ids)
|
| 31 |
|
| 32 |
def __call__(self, prompt, max_tokens=30, beam_width=5):
|
| 33 |
-
# Option
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
outputs = self.model.generate(
|
| 37 |
-
|
|
|
|
| 38 |
max_new_tokens=max_tokens,
|
| 39 |
num_beams=beam_width,
|
| 40 |
logits_processor=[self.processor],
|
| 41 |
do_sample=False
|
| 42 |
)
|
| 43 |
|
| 44 |
-
|
|
|
|
|
|
|
| 45 |
|
| 46 |
|
| 47 |
# --- EXERCISE 2: The Toulouse Sequence ---
|
|
@@ -133,7 +141,7 @@ class ToulouseSequence:
|
|
| 133 |
if __name__ == "__main__":
|
| 134 |
# NOTE: This block is for testing only. The evaluation server provides model and tokenizer.
|
| 135 |
# SETUP
|
| 136 |
-
MODEL_NAME = "
|
| 137 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 138 |
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32, device_map="auto")
|
| 139 |
la_disparition_generator = LaDisparition(model, tokenizer)
|
|
|
|
| 30 |
self.processor = ForbidTokensLogitsProcessor(self.forbidden_token_ids)
|
| 31 |
|
| 32 |
def __call__(self, prompt, max_tokens=30, beam_width=5):
    """Beam-search generation with forbidden tokens masked by the processor.

    The prompt is wrapped with the tokenizer's chat template
    (add_generation_prompt=True); only the newly generated text is
    returned, decoded without special tokens and stripped.
    """
    # Option 2: we use self.tokenizer.apply_chat_template to tokenize the prompt
    chat = [{"role": "user", "content": prompt}]
    input_ids = self.tokenizer.apply_chat_template(
        chat, add_generation_prompt=True, return_tensors="pt"
    ).to(self.model.device)
    n_prompt = input_ids.shape[1]

    outputs = self.model.generate(
        input_ids,
        # Explicit all-ones mask for the chat-templated prompt.
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=max_tokens,
        num_beams=beam_width,
        logits_processor=[self.processor],
        do_sample=False
    )

    # Return only the generated part
    new_tokens = outputs[0][n_prompt:]
    return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
|
| 53 |
|
| 54 |
|
| 55 |
# --- EXERCISE 2: The Toulouse Sequence ---
|
|
|
|
| 141 |
if __name__ == "__main__":
|
| 142 |
# NOTE: This block is for testing only. The evaluation server provides model and tokenizer.
|
| 143 |
# SETUP
|
| 144 |
+
MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
|
| 145 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 146 |
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32, device_map="auto")
|
| 147 |
la_disparition_generator = LaDisparition(model, tokenizer)
|
greedy.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Greedy/naive solution for comparison.
|
| 3 |
+
- Exercise 1: greedy decoding with token-level 'e' masking (same idea, simpler than beam search)
|
| 4 |
+
- Exercise 2: naive approach — forbid the first token of "Toulouse" and " Toulouse"
|
| 5 |
+
(tokens 'T' and ' T'), which is very aggressive and blocks ALL T-starting words.
|
| 6 |
+
"""
|
| 7 |
+
from typing import List
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# --- EXERCISE 1: La disparition (No 'e' or 'E') ---
|
| 14 |
+
class LaDisparition:
    """Greedy constrained generation: forbid tokens containing 'e', pick argmax.

    At every decoding step the logits of all forbidden token ids are set to
    -inf before taking the argmax, so the generated text can never contain
    'e'/'E' (nor non-ASCII characters, which are banned as a precaution).
    """

    def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer):
        self.model = model
        self.tokenizer = tokenizer
        # BUGFIX: special tokens (eos, chat-template markers, ...) must stay
        # allowed.  Their decoded form usually contains an 'e' (e.g.
        # "<|endoftext|>", "<|im_end|>"), so the old code masked EOS to -inf
        # and the early-stop branch below could never fire — every call burned
        # the full max_tokens budget.  Special tokens are stripped from the
        # final output by skip_special_tokens=True, so the no-'e' guarantee
        # on the returned text is unaffected.
        special_ids = set(self.tokenizer.all_special_ids)
        self.forbidden_token_ids = set()
        vocab = self.tokenizer.get_vocab()
        for token_id in range(len(vocab)):
            if token_id in special_ids:
                continue
            decoded = self.tokenizer.decode([token_id])
            if 'e' in decoded.lower() or not all(ord(c) < 128 for c in decoded):
                self.forbidden_token_ids.add(token_id)

    def __call__(self, prompt, max_tokens=20):
        """Answer `prompt` with up to `max_tokens` greedily-decoded tokens, no 'e'."""
        message = [{"role": "user", "content": prompt}]
        input_ids = self.tokenizer.apply_chat_template(
            message, add_generation_prompt=True, return_tensors="pt"
        ).to(self.model.device)
        prompt_len = input_ids.shape[1]

        seq = input_ids[0].tolist()
        # Hoisted out of the loop: the index list never changes between steps.
        forbidden_list = list(self.forbidden_token_ids)

        for _ in range(max_tokens):
            input_tensor = torch.tensor([seq], device=self.model.device)
            with torch.no_grad():
                outputs = self.model(input_tensor)
            logits = outputs.logits[0, -1, :].clone()

            # Mask forbidden tokens so argmax can only pick e-free candidates.
            logits[forbidden_list] = -float('inf')

            next_token = int(torch.argmax(logits))
            if next_token == self.tokenizer.eos_token_id:
                break  # model chose to stop — now reachable since EOS is not masked
            seq.append(next_token)

        generated_tokens = seq[prompt_len:]
        return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# --- EXERCISE 2: The Toulouse Sequence (naive approach) ---
|
| 55 |
+
class ToulouseSequence:
    """
    Naive approach: forbid the first token of "Toulouse" and " Toulouse".

    "Toulouse" tokenizes as [T(68)][oul(9226)][ouse(1368)]
    " Toulouse" tokenizes as [ T(312)][oul(9226)][ouse(1368)]

    Masking those two leading tokens (68 'T' and 312 ' T') makes the word
    unreachable, but the ban is very aggressive: every other T-starting word
    ("The", "This", "That", "They", ...) is blocked as well.
    """

    def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer):
        self.model = model
        self.tokenizer = tokenizer
        # Collect the first sub-token of each surface form of the city name.
        self.forbidden_token_ids = {
            self.tokenizer.encode(form, add_special_tokens=False)[0]
            for form in ("Toulouse", " Toulouse")
        }
        print(f"[ToulouseSequence naive] Forbidden first tokens: {self.forbidden_token_ids}")

    def __call__(self, prompt, max_tokens=20):
        """Greedy decoding with the leading-'T' tokens masked out at every step."""
        chat = [{"role": "user", "content": prompt}]
        encoded = self.tokenizer.apply_chat_template(
            chat, add_generation_prompt=True, return_tensors="pt"
        ).to(self.model.device)
        n_prompt = encoded.shape[1]

        token_ids = encoded[0].tolist()
        banned = list(self.forbidden_token_ids)

        for _ in range(max_tokens):
            context = torch.tensor([token_ids], device=self.model.device)
            with torch.no_grad():
                step_out = self.model(context)
            step_logits = step_out.logits[0, -1, :].clone()

            # Rule out the banned first tokens before picking the argmax.
            step_logits[banned] = -float('inf')

            choice = int(torch.argmax(step_logits))
            if choice == self.tokenizer.eos_token_id:
                break
            token_ids.append(choice)

        completion = token_ids[n_prompt:]
        return self.tokenizer.decode(completion, skip_special_tokens=True).strip()
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
if __name__ == "__main__":
    MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float16, device_map="auto")

    def _report(question, answer, failed):
        # Shared pretty-printer for both smoke tests below.
        print(f" Q: {question}")
        print(f" A: {answer}")
        print(f" {'✗ FAIL' if failed else '✓ PASS'}\n")

    print("=== Exercise 1: La Disparition (no 'e') ===")
    no_e_generator = LaDisparition(model, tokenizer)
    for question in ("Who is the king of the jungle?", "Name a fruit that is red."):
        answer = no_e_generator(question)
        _report(question, answer, 'e' in answer.lower())

    print("=== Exercise 2: No Toulouse (naive) ===")
    no_toulouse_generator = ToulouseSequence(model, tokenizer)
    for question in (
        "Where is the headquarters of Airbus located?",
        "In which French city can you find the Place du Capitole?",
    ):
        answer = no_toulouse_generator(question)
        _report(question, answer, 'toulouse' in answer.lower())
|
reference_scores.csv
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
exercise,prompt_index,prompt,unconstrained_logprob,reference_logprob,reference_delta
|
| 2 |
+
exercise_1,0,Who is the king of the jungle?,-0.438915,-1.160570,-0.721655
|
| 3 |
+
exercise_1,1,Complete this: Once upon a...,-0.333118,-1.340559,-1.007441
|
| 4 |
+
exercise_1,2,What is the opposite of 'always'?,-0.204553,-1.405256,-1.200703
|
| 5 |
+
exercise_1,3,Name a fruit that is red.,-0.428213,-1.045869,-0.617657
|
| 6 |
+
exercise_1,4,What do you use to see things?,-0.485363,-1.388507,-0.903145
|
| 7 |
+
exercise_2,0,Where is the headquarters of Airbus located?,-0.372244,-0.689966,-0.317723
|
| 8 |
+
exercise_2,1,Complete this sentence: The Airbus A380 is assembled in the city of,-0.145879,-0.765407,-0.619529
|
| 9 |
+
exercise_2,2,Which city in southern France hosts the Cité de l'Espace space museum?,-0.169176,-0.571546,-0.402370
|
| 10 |
+
exercise_2,3,In which French city can you find the Place du Capitole?,-0.145984,-0.815522,-0.669538
|
| 11 |
+
exercise_2,4,Which French city is home to both Airbus and the Space Centre?,-0.213124,-1.147179,-0.934054
|
solution.py
CHANGED
|
@@ -27,9 +27,9 @@ class LaDisparition:
|
|
| 27 |
self.forbidden_token_ids.add(token_id)
|
| 28 |
|
| 29 |
def __call__(self, prompt, max_tokens=20, beam_width=5):
|
| 30 |
-
# Option
|
| 31 |
-
|
| 32 |
-
input_ids =
|
| 33 |
prompt_len = input_ids.shape[1]
|
| 34 |
|
| 35 |
# Beam search: maintain multiple hypotheses
|
|
@@ -86,9 +86,10 @@ class LaDisparition:
|
|
| 86 |
decoded = self.tokenizer.decode(seq, skip_special_tokens=True)
|
| 87 |
print(f" Beam {i}: log_prob={log_prob:.4f} | {decoded}")
|
| 88 |
|
| 89 |
-
# Return the best hypothesis
|
| 90 |
best_seq = beams[0][0]
|
| 91 |
-
|
|
|
|
| 92 |
|
| 93 |
|
| 94 |
# --- EXERCISE 2: The Toulouse Sequence ---
|
|
@@ -191,7 +192,7 @@ class ToulouseSequence:
|
|
| 191 |
if __name__ == "__main__":
|
| 192 |
# NOTE: This block is for testing only. The evaluation server provides model and tokenizer.
|
| 193 |
# SETUP
|
| 194 |
-
MODEL_NAME = "
|
| 195 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 196 |
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
|
| 197 |
la_disparition_generator = LaDisparition(model, tokenizer)
|
|
|
|
| 27 |
self.forbidden_token_ids.add(token_id)
|
| 28 |
|
| 29 |
def __call__(self, prompt, max_tokens=20, beam_width=5):
|
| 30 |
+
# Option 2: we use self.tokenizer.apply_chat_template to tokenize the prompt
|
| 31 |
+
message = [{"role": "user", "content": prompt}]
|
| 32 |
+
input_ids = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt").to(self.model.device)
|
| 33 |
prompt_len = input_ids.shape[1]
|
| 34 |
|
| 35 |
# Beam search: maintain multiple hypotheses
|
|
|
|
| 86 |
decoded = self.tokenizer.decode(seq, skip_special_tokens=True)
|
| 87 |
print(f" Beam {i}: log_prob={log_prob:.4f} | {decoded}")
|
| 88 |
|
| 89 |
+
# Return the best hypothesis (only the generated part)
|
| 90 |
best_seq = beams[0][0]
|
| 91 |
+
generated_tokens = best_seq[prompt_len:]
|
| 92 |
+
return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
|
| 93 |
|
| 94 |
|
| 95 |
# --- EXERCISE 2: The Toulouse Sequence ---
|
|
|
|
| 192 |
if __name__ == "__main__":
|
| 193 |
# NOTE: This block is for testing only. The evaluation server provides model and tokenizer.
|
| 194 |
# SETUP
|
| 195 |
+
MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
|
| 196 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 197 |
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
|
| 198 |
la_disparition_generator = LaDisparition(model, tokenizer)
|
test_cases.json
CHANGED
|
@@ -7,10 +7,10 @@
|
|
| 7 |
"What do you use to see things?"
|
| 8 |
],
|
| 9 |
"exercise_2": [
|
| 10 |
-
"Which French city is known as the 'Ville Rose'?",
|
| 11 |
"Where is the headquarters of Airbus located?",
|
| 12 |
-
"
|
| 13 |
-
"
|
| 14 |
-
"
|
|
|
|
| 15 |
]
|
| 16 |
}
|
|
|
|
| 7 |
"What do you use to see things?"
|
| 8 |
],
|
| 9 |
"exercise_2": [
|
|
|
|
| 10 |
"Where is the headquarters of Airbus located?",
|
| 11 |
+
"Complete this sentence: The Airbus A380 is assembled in the city of",
|
| 12 |
+
"Which city in southern France hosts the Cité de l'Espace space museum?",
|
| 13 |
+
"In which French city can you find the Place du Capitole?",
|
| 14 |
+
"Which French city is home to both Airbus and the Space Centre?"
|
| 15 |
]
|
| 16 |
}
|