nathanael-fijalkow committed on
Commit
4d8bbd9
·
1 Parent(s): 1d7752e

Improved logprob-based scoring

Browse files
Files changed (7) hide show
  1. app.py +157 -12
  2. calibrate_logprobs.py +236 -0
  3. forbidden_solution.py +14 -6
  4. greedy.py +127 -0
  5. reference_scores.csv +11 -0
  6. solution.py +7 -6
  7. test_cases.json +4 -4
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
2
  import importlib.util
3
  import json
4
  import torch
 
5
  import gc
6
  from transformers import AutoModelForCausalLM, AutoTokenizer
7
  import threading
@@ -26,6 +27,120 @@ model = AutoModelForCausalLM.from_pretrained(
26
  with open("test_cases.json", "r") as f:
27
  TEST_CASES = json.load(f)
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  class TimeoutException(Exception):
30
  pass
31
 
@@ -201,6 +316,7 @@ def evaluate_submission(file_obj, debug=False):
201
  ex1_passed = 0
202
  ex1_timeout = False
203
  ex1_outputs = []
 
204
  try:
205
  print("### EXERCISE 1 - La Disparition (No 'e')")
206
  ex1_instance = student_module.LaDisparition(model, tokenizer)
@@ -218,35 +334,49 @@ def evaluate_submission(file_obj, debug=False):
218
  )
219
  # Remove prompt from output to only validate generated text
220
  cleaned_output = strip_prompt_from_output(output, prompt)
221
- # assistant_response = extract_assistant_response(cleaned_output)
222
 
223
  print(f"Response: {cleaned_output}")
224
 
225
  passed = 'e' not in cleaned_output.lower() and len(cleaned_output.strip()) > 3
 
 
 
 
 
 
 
 
226
  if passed:
227
  ex1_passed += 1
228
- ex1_outputs.append({"prompt": prompt, "output": cleaned_output, "passed": passed})
 
 
 
229
  if debug:
230
  print(f"Ex1 Test {i+1}: {'✓' if passed else '✗'}")
231
  print(f" Prompt: {prompt}")
232
  print(f" Output: {output}")
 
233
  print()
234
  except TimeoutException:
235
  ex1_timeout = True
236
- ex1_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False})
 
237
  print(f"Result: ✗ TIMEOUT")
238
  break
239
 
240
- print(f"\nExercise 1 Score: {ex1_passed}/5")
 
241
  if ex1_timeout:
242
  report.append(f" **Ex 1 (No 'e'):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
243
  else:
244
- report.append(f" **Ex 1 (No 'e'):** {ex1_passed}/5 correct")
245
 
246
  if debug:
247
  report.append("\n### Ex 1 Outputs:")
248
  for i, out in enumerate(ex1_outputs):
249
- report.append(f"{i+1}. {'✓' if out['passed'] else '✗'} `{out['output']}`")
 
250
  except Exception as e:
251
  report.append(f" **Ex 1 Error:** {str(e)}")
252
 
@@ -254,6 +384,7 @@ def evaluate_submission(file_obj, debug=False):
254
  ex2_passed = 0
255
  ex2_timeout = False
256
  ex2_outputs = []
 
257
  try:
258
  print("\n### EXERCISE 2 - Toulouse Sequence (No 'Toulouse')")
259
  ex2_instance = student_module.ToulouseSequence(model, tokenizer)
@@ -270,35 +401,49 @@ def evaluate_submission(file_obj, debug=False):
270
  )
271
  # Remove prompt from output to only validate generated text
272
  cleaned_output = strip_prompt_from_output(output, prompt)
273
- # assistant_response = extract_assistant_response(cleaned_output)
274
 
275
  print(f"Response: {cleaned_output}")
276
 
277
  passed = "toulouse" not in cleaned_output.lower() and len(cleaned_output.strip()) > 3
 
 
 
 
 
 
 
 
278
  if passed:
279
  ex2_passed += 1
280
- ex2_outputs.append({"prompt": prompt, "output": output, "passed": passed})
 
 
 
281
  if debug:
282
  print(f"Ex2 Test {i+1}: {'✓' if passed else '✗'}")
283
  print(f" Prompt: {prompt}")
284
  print(f" Output: {output}")
 
285
  print()
286
  except TimeoutException:
287
  ex2_timeout = True
288
- ex2_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False})
 
289
  print(f"Result: ✗ TIMEOUT")
290
  break
291
 
292
- print(f"\nExercise 2 Score: {ex2_passed}/5")
 
293
  if ex2_timeout:
294
  report.append(f" **Ex 2 (No Toulouse):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
295
  else:
296
- report.append(f" **Ex 2 (No Toulouse):** {ex2_passed}/5 correct")
297
 
298
  if debug:
299
  report.append("\n### Ex 2 Outputs:")
300
  for i, out in enumerate(ex2_outputs):
301
- report.append(f"{i+1}. {'✓' if out['passed'] else '✗'} `{out['output']}`")
 
302
  except Exception as e:
303
  report.append(f" **Ex 2 Error:** {str(e)}")
304
 
 
2
  import importlib.util
3
  import json
4
  import torch
5
+ import torch.nn.functional as F
6
  import gc
7
  from transformers import AutoModelForCausalLM, AutoTokenizer
8
  import threading
 
27
  with open("test_cases.json", "r") as f:
28
  TEST_CASES = json.load(f)
29
 
30
+ # --- PER-PROMPT REFERENCE SCORING ---
31
+ # Load reference scores from CSV (generated by calibrate_logprobs.py from solution.py).
32
+ # Each prompt has: unconstrained_logprob (baseline) and reference_delta (solution.py delta).
33
+ # Quality = 1 if student is as good or better than solution.py, decreasing for worse.
34
+ import csv
35
+
36
+ REFERENCE_SCORES = {} # key: (exercise, prompt_index) → dict
37
+
38
+ with open("reference_scores.csv", "r") as csvfile:
39
+ reader = csv.DictReader(csvfile)
40
+ for row in reader:
41
+ key = (row["exercise"], int(row["prompt_index"]))
42
+ REFERENCE_SCORES[key] = {
43
+ "prompt": row["prompt"],
44
+ "unconstrained_logprob": float(row["unconstrained_logprob"]),
45
+ "reference_logprob": float(row["reference_logprob"]),
46
+ "reference_delta": float(row["reference_delta"]),
47
+ }
48
+
49
+
50
+ def compute_mean_logprob(prompt_text, generated_text):
51
+ """
52
+ Compute the mean log-probability per token of `generated_text`
53
+ conditioned on `prompt_text`, under the unconstrained model.
54
+
55
+ Uses chat template since the evaluation model is an instruct model.
56
+ This measures how "natural" the generated text is: a well-constrained
57
+ generator still produces coherent text (high logprob), while a bad one
58
+ produces gibberish (low logprob).
59
+
60
+ Returns: (mean_logprob, n_tokens)
61
+ """
62
+ if not generated_text or not generated_text.strip():
63
+ return -float('inf'), 0
64
+
65
+ # Always use chat template: the model is an instruct model, so
66
+ # logprobs are meaningful only in the chat context.
67
+ message = [{"role": "user", "content": prompt_text}]
68
+ prompt_ids = tokenizer.apply_chat_template(
69
+ message, add_generation_prompt=True, return_tensors="pt"
70
+ ).to(model.device)
71
+ gen_ids = tokenizer.encode(
72
+ generated_text, add_special_tokens=False, return_tensors="pt"
73
+ ).to(model.device)
74
+ full_ids = torch.cat([prompt_ids, gen_ids], dim=1)
75
+ prompt_len = prompt_ids.shape[1]
76
+
77
+ if full_ids.shape[1] <= prompt_len:
78
+ return -float('inf'), 0
79
+
80
+ with torch.no_grad():
81
+ outputs = model(full_ids)
82
+ logits = outputs.logits
83
+
84
+ log_probs = F.log_softmax(logits, dim=-1)
85
+
86
+ total_logprob = 0.0
87
+ n_tokens = 0
88
+ for i in range(prompt_len, full_ids.shape[1]):
89
+ token_id = full_ids[0, i].item()
90
+ token_logprob = log_probs[0, i - 1, token_id].item()
91
+ total_logprob += token_logprob
92
+ n_tokens += 1
93
+
94
+ mean_logprob = total_logprob / n_tokens if n_tokens > 0 else -float('inf')
95
+ return mean_logprob, n_tokens
96
+
97
+
98
+ def compute_quality_score(mean_logprob, exercise_key, prompt_index):
99
+ """
100
+ Per-prompt quality score in [0, 1] using reference deltas from solution.py.
101
+
102
+ Logic:
103
+ - Compute student_delta = student_logprob - unconstrained_logprob
104
+ - Both student_delta and reference_delta are negative (constrained is worse).
105
+ - Quality = 1.0 if student_delta >= reference_delta (student as good or better).
106
+ - If the student is worse, quality decays linearly from 1.0 (at
107
+ reference_delta) down to 0.0 (at 3x reference_delta), i.e. a linear
108
+ interpolation between the two; anything beyond 3x is clamped to 0.
109
+ - A generous margin (3x reference delta) maps to quality = 0.
110
+ """
111
+ key = (exercise_key, prompt_index)
112
+ if key not in REFERENCE_SCORES:
113
+ # Fallback: if no reference data, return 1 for any non-terrible logprob
114
+ return 1.0 if mean_logprob > -5.0 else 0.0
115
+
116
+ ref = REFERENCE_SCORES[key]
117
+ unconstrained_lp = ref["unconstrained_logprob"]
118
+ ref_delta = ref["reference_delta"] # negative value
119
+
120
+ student_delta = mean_logprob - unconstrained_lp # negative value
121
+
122
+ if student_delta >= ref_delta:
123
+ # Student is as good or better than reference → quality = 1
124
+ return 1.0
125
+
126
+ if ref_delta == 0:
127
+ return 0.0
128
+
129
+ # Student is worse than reference.
130
+ # Linear decay: interpolate between ref_delta (quality=1) and worst_delta = 3*ref_delta (quality=0)
131
+ # When student_delta == ref_delta → 1.0
132
+ # When student_delta is much worse → approaches 0
133
+ # Cap at 3x reference delta for quality = 0
134
+ worst_delta = 3.0 * ref_delta # e.g., ref=-0.9 → worst=-2.7
135
+
136
+ if student_delta <= worst_delta:
137
+ return 0.0
138
+
139
+ # Linear interpolation between ref_delta (quality=1) and worst_delta (quality=0)
140
+ quality = (student_delta - worst_delta) / (ref_delta - worst_delta)
141
+ return max(0.0, min(1.0, quality))
142
+
143
+
144
  class TimeoutException(Exception):
145
  pass
146
 
 
316
  ex1_passed = 0
317
  ex1_timeout = False
318
  ex1_outputs = []
319
+ ex1_quality_scores = []
320
  try:
321
  print("### EXERCISE 1 - La Disparition (No 'e')")
322
  ex1_instance = student_module.LaDisparition(model, tokenizer)
 
334
  )
335
  # Remove prompt from output to only validate generated text
336
  cleaned_output = strip_prompt_from_output(output, prompt)
 
337
 
338
  print(f"Response: {cleaned_output}")
339
 
340
  passed = 'e' not in cleaned_output.lower() and len(cleaned_output.strip()) > 3
341
+
342
+ # Compute logprob quality score
343
+ mean_lp, n_tok = compute_mean_logprob(prompt, cleaned_output)
344
+ quality = compute_quality_score(mean_lp, "exercise_1", i) if passed else 0.0
345
+ ex1_quality_scores.append(quality)
346
+
347
+ print(f" Constraint passed: {passed} | mean_logprob: {mean_lp:.3f} | quality: {quality:.2f}")
348
+
349
  if passed:
350
  ex1_passed += 1
351
+ ex1_outputs.append({
352
+ "prompt": prompt, "output": cleaned_output, "passed": passed,
353
+ "mean_logprob": mean_lp, "quality": quality
354
+ })
355
  if debug:
356
  print(f"Ex1 Test {i+1}: {'✓' if passed else '✗'}")
357
  print(f" Prompt: {prompt}")
358
  print(f" Output: {output}")
359
+ print(f" mean_logprob={mean_lp:.4f}, quality={quality:.2f}")
360
  print()
361
  except TimeoutException:
362
  ex1_timeout = True
363
+ ex1_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False, "mean_logprob": float('-inf'), "quality": 0.0})
364
+ ex1_quality_scores.append(0.0)
365
  print(f"Result: ✗ TIMEOUT")
366
  break
367
 
368
+ ex1_avg_quality = sum(ex1_quality_scores) / len(ex1_quality_scores) if ex1_quality_scores else 0.0
369
+ print(f"\nExercise 1 Score: {ex1_passed}/5 | Avg quality: {ex1_avg_quality:.2f}")
370
  if ex1_timeout:
371
  report.append(f" **Ex 1 (No 'e'):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
372
  else:
373
+ report.append(f" **Ex 1 (No 'e'):** {ex1_passed}/5 correct | Quality: {ex1_avg_quality:.0%}")
374
 
375
  if debug:
376
  report.append("\n### Ex 1 Outputs:")
377
  for i, out in enumerate(ex1_outputs):
378
+ lp_str = f"logprob={out['mean_logprob']:.2f}" if out['mean_logprob'] != float('-inf') else "logprob=N/A"
379
+ report.append(f"{i+1}. {'✓' if out['passed'] else '✗'} [{lp_str}, q={out['quality']:.2f}] `{out['output']}`")
380
  except Exception as e:
381
  report.append(f" **Ex 1 Error:** {str(e)}")
382
 
 
384
  ex2_passed = 0
385
  ex2_timeout = False
386
  ex2_outputs = []
387
+ ex2_quality_scores = []
388
  try:
389
  print("\n### EXERCISE 2 - Toulouse Sequence (No 'Toulouse')")
390
  ex2_instance = student_module.ToulouseSequence(model, tokenizer)
 
401
  )
402
  # Remove prompt from output to only validate generated text
403
  cleaned_output = strip_prompt_from_output(output, prompt)
 
404
 
405
  print(f"Response: {cleaned_output}")
406
 
407
  passed = "toulouse" not in cleaned_output.lower() and len(cleaned_output.strip()) > 3
408
+
409
+ # Compute logprob quality score
410
+ mean_lp, n_tok = compute_mean_logprob(prompt, cleaned_output)
411
+ quality = compute_quality_score(mean_lp, "exercise_2", i) if passed else 0.0
412
+ ex2_quality_scores.append(quality)
413
+
414
+ print(f" Constraint passed: {passed} | mean_logprob: {mean_lp:.3f} | quality: {quality:.2f}")
415
+
416
  if passed:
417
  ex2_passed += 1
418
+ ex2_outputs.append({
419
+ "prompt": prompt, "output": cleaned_output, "passed": passed,
420
+ "mean_logprob": mean_lp, "quality": quality
421
+ })
422
  if debug:
423
  print(f"Ex2 Test {i+1}: {'✓' if passed else '✗'}")
424
  print(f" Prompt: {prompt}")
425
  print(f" Output: {output}")
426
+ print(f" mean_logprob={mean_lp:.4f}, quality={quality:.2f}")
427
  print()
428
  except TimeoutException:
429
  ex2_timeout = True
430
+ ex2_outputs.append({"prompt": prompt, "output": "TIMEOUT", "passed": False, "mean_logprob": float('-inf'), "quality": 0.0})
431
+ ex2_quality_scores.append(0.0)
432
  print(f"Result: ✗ TIMEOUT")
433
  break
434
 
435
+ ex2_avg_quality = sum(ex2_quality_scores) / len(ex2_quality_scores) if ex2_quality_scores else 0.0
436
+ print(f"\nExercise 2 Score: {ex2_passed}/5 | Avg quality: {ex2_avg_quality:.2f}")
437
  if ex2_timeout:
438
  report.append(f" **Ex 2 (No Toulouse):** TIMEOUT - evaluation exceeded {TIMEOUT_SECONDS}s limit")
439
  else:
440
+ report.append(f" **Ex 2 (No Toulouse):** {ex2_passed}/5 correct | Quality: {ex2_avg_quality:.0%}")
441
 
442
  if debug:
443
  report.append("\n### Ex 2 Outputs:")
444
  for i, out in enumerate(ex2_outputs):
445
+ lp_str = f"logprob={out['mean_logprob']:.2f}" if out['mean_logprob'] != float('-inf') else "logprob=N/A"
446
+ report.append(f"{i+1}. {'✓' if out['passed'] else '✗'} [{lp_str}, q={out['quality']:.2f}] `{out['output']}`")
447
  except Exception as e:
448
  report.append(f" **Ex 2 Error:** {str(e)}")
449
 
calibrate_logprobs.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Calibration script: compute logprobs for reference solution outputs
3
+ vs unconstrained model outputs to design a scoring function.
4
+ """
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import json
8
+ from transformers import AutoModelForCausalLM, AutoTokenizer
9
+
10
+ EVAL_MODEL = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
11
+
12
+ print("Loading model...")
13
+ tokenizer = AutoTokenizer.from_pretrained(EVAL_MODEL)
14
+ if tokenizer.pad_token is None:
15
+ tokenizer.pad_token = tokenizer.eos_token
16
+ model = AutoModelForCausalLM.from_pretrained(EVAL_MODEL, dtype=torch.float16, device_map="auto")
17
+
18
+ with open("test_cases.json", "r") as f:
19
+ TEST_CASES = json.load(f)
20
+
21
+
22
+ def compute_chat_logprobs(model, tokenizer, prompt, generated_text):
23
+ """
24
+ Compute logprobs using chat template (works for both exercises).
25
+ The prompt is formatted as a chat message, generated_text is the response.
26
+
27
+ Returns:
28
+ mean_logprob: mean log-prob per generated token
29
+ total_logprob: sum of log-probs
30
+ n_tokens: number of generated tokens
31
+ per_token: list of (token_str, logprob) pairs
32
+ """
33
+ if not generated_text or not generated_text.strip():
34
+ return -float('inf'), 0.0, 0, []
35
+
36
+ message = [{"role": "user", "content": prompt}]
37
+ prompt_ids = tokenizer.apply_chat_template(
38
+ message, add_generation_prompt=True, return_tensors="pt"
39
+ ).to(model.device)
40
+ prompt_len = prompt_ids.shape[1]
41
+
42
+ gen_ids = tokenizer.encode(
43
+ generated_text, add_special_tokens=False, return_tensors="pt"
44
+ ).to(model.device)
45
+
46
+ full_ids = torch.cat([prompt_ids, gen_ids], dim=1)
47
+
48
+ if full_ids.shape[1] <= prompt_len:
49
+ return -float('inf'), 0.0, 0, []
50
+
51
+ with torch.no_grad():
52
+ outputs = model(full_ids)
53
+ logits = outputs.logits
54
+
55
+ log_probs = F.log_softmax(logits, dim=-1)
56
+
57
+ per_token = []
58
+ total_logprob = 0.0
59
+ n_tokens = 0
60
+
61
+ for i in range(prompt_len, full_ids.shape[1]):
62
+ token_id = full_ids[0, i].item()
63
+ token_logprob = log_probs[0, i - 1, token_id].item()
64
+ token_str = tokenizer.decode([token_id])
65
+ per_token.append((token_str, token_logprob))
66
+ total_logprob += token_logprob
67
+ n_tokens += 1
68
+
69
+ mean_logprob = total_logprob / n_tokens if n_tokens > 0 else -float('inf')
70
+ return mean_logprob, total_logprob, n_tokens, per_token
71
+
72
+
73
+ def generate_unconstrained_chat(model, tokenizer, prompt, max_tokens=20):
74
+ """Generate unconstrained text using chat template (for both exercises)."""
75
+ message = [{"role": "user", "content": prompt}]
76
+ inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt").to(model.device)
77
+ attention_mask = torch.ones_like(inputs)
78
+ prompt_length = inputs.shape[1]
79
+
80
+ with torch.no_grad():
81
+ output = model.generate(
82
+ inputs,
83
+ attention_mask=attention_mask,
84
+ max_new_tokens=max_tokens,
85
+ do_sample=False,
86
+ pad_token_id=tokenizer.pad_token_id
87
+ )
88
+ generated_tokens = output[0][prompt_length:]
89
+ return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
90
+
91
+
92
+ # ---- Load and run the reference solution ----
93
+ import importlib.util
94
+ import sys
95
+ import time
96
+
97
+ module_name = f"solution_module_{int(time.time())}"
98
+ spec = importlib.util.spec_from_file_location(module_name, "solution.py")
99
+ solution = importlib.util.module_from_spec(spec)
100
+ sys.modules[module_name] = solution
101
+ spec.loader.exec_module(solution)
102
+
103
+ print("\n" + "="*80)
104
+ print("EXERCISE 1: La Disparition (no 'e')")
105
+ print("="*80)
106
+
107
+ ex1_instance = solution.LaDisparition(model, tokenizer)
108
+
109
+ ex1_results = []
110
+ for i, prompt in enumerate(TEST_CASES["exercise_1"]):
111
+ # Generate constrained output
112
+ constrained_output = ex1_instance(prompt, max_tokens=20)
113
+ # Strip prompt from output
114
+ if constrained_output.startswith(prompt):
115
+ constrained_gen = constrained_output[len(prompt):].strip()
116
+ else:
117
+ constrained_gen = constrained_output.strip()
118
+
119
+ # Generate unconstrained output (chat template for instruct model)
120
+ unconstrained_gen = generate_unconstrained_chat(model, tokenizer, prompt, max_tokens=20)
121
+
122
+ # Compute logprobs using chat template (matches how the model should be used)
123
+ c_mean, c_total, c_ntok, c_per = compute_chat_logprobs(model, tokenizer, prompt, constrained_gen)
124
+
125
+ # Compute logprobs for unconstrained output
126
+ u_mean, u_total, u_ntok, u_per = compute_chat_logprobs(model, tokenizer, prompt, unconstrained_gen)
127
+
128
+ delta = c_mean - u_mean # will be negative (constrained is worse)
129
+
130
+ print(f"\nTest {i+1}: {prompt}")
131
+ print(f" Unconstrained: {unconstrained_gen}")
132
+ print(f" mean_logprob={u_mean:.4f}, n_tokens={u_ntok}")
133
+ print(f" Constrained: {constrained_gen}")
134
+ print(f" mean_logprob={c_mean:.4f}, n_tokens={c_ntok}")
135
+ print(f" Delta (constrained - unconstrained): {delta:.4f}")
136
+
137
+ ex1_results.append({
138
+ "prompt": prompt,
139
+ "constrained_gen": constrained_gen,
140
+ "unconstrained_gen": unconstrained_gen,
141
+ "c_mean_logprob": c_mean,
142
+ "u_mean_logprob": u_mean,
143
+ "delta_mean_logprob": delta,
144
+ })
145
+
146
+ print(f"\n--- Exercise 1 Summary ---")
147
+ deltas_1 = [r["delta_mean_logprob"] for r in ex1_results]
148
+ c_means_1 = [r["c_mean_logprob"] for r in ex1_results]
149
+ u_means_1 = [r["u_mean_logprob"] for r in ex1_results]
150
+ print(f" Unconstrained mean logprobs: {[f'{x:.3f}' for x in u_means_1]}")
151
+ print(f" Constrained mean logprobs: {[f'{x:.3f}' for x in c_means_1]}")
152
+ print(f" Deltas: {[f'{x:.3f}' for x in deltas_1]}")
153
+ print(f" Mean delta: {sum(deltas_1)/len(deltas_1):.4f}")
154
+ print(f" Worst delta: {min(deltas_1):.4f}")
155
+
156
+
157
+ print("\n" + "="*80)
158
+ print("EXERCISE 2: Toulouse Sequence (no 'Toulouse')")
159
+ print("="*80)
160
+
161
+ ex2_instance = solution.ToulouseSequence(model, tokenizer)
162
+
163
+ ex2_results = []
164
+ for i, prompt in enumerate(TEST_CASES["exercise_2"]):
165
+ # Generate constrained output
166
+ constrained_gen = ex2_instance(prompt, max_tokens=20)
167
+
168
+ # Generate unconstrained output (chat format)
169
+ unconstrained_gen = generate_unconstrained_chat(model, tokenizer, prompt, max_tokens=20)
170
+
171
+ # Compute logprobs (chat format)
172
+ c_mean, c_total, c_ntok, c_per = compute_chat_logprobs(model, tokenizer, prompt, constrained_gen)
173
+ u_mean, u_total, u_ntok, u_per = compute_chat_logprobs(model, tokenizer, prompt, unconstrained_gen)
174
+
175
+ delta = c_mean - u_mean
176
+
177
+ print(f"\nTest {i+1}: {prompt}")
178
+ print(f" Unconstrained: {unconstrained_gen}")
179
+ print(f" mean_logprob={u_mean:.4f}, n_tokens={u_ntok}")
180
+ print(f" Constrained: {constrained_gen}")
181
+ print(f" mean_logprob={c_mean:.4f}, n_tokens={c_ntok}")
182
+ print(f" Delta (constrained - unconstrained): {delta:.4f}")
183
+
184
+ ex2_results.append({
185
+ "prompt": prompt,
186
+ "constrained_gen": constrained_gen,
187
+ "unconstrained_gen": unconstrained_gen,
188
+ "c_mean_logprob": c_mean,
189
+ "u_mean_logprob": u_mean,
190
+ "delta_mean_logprob": delta,
191
+ })
192
+
193
+ print(f"\n--- Exercise 2 Summary ---")
194
+ deltas_2 = [r["delta_mean_logprob"] for r in ex2_results]
195
+ c_means_2 = [r["c_mean_logprob"] for r in ex2_results]
196
+ u_means_2 = [r["u_mean_logprob"] for r in ex2_results]
197
+ print(f" Unconstrained mean logprobs: {[f'{x:.3f}' for x in u_means_2]}")
198
+ print(f" Constrained mean logprobs: {[f'{x:.3f}' for x in c_means_2]}")
199
+ print(f" Deltas: {[f'{x:.3f}' for x in deltas_2]}")
200
+ print(f" Mean delta: {sum(deltas_2)/len(deltas_2):.4f}")
201
+ print(f" Worst delta: {min(deltas_2):.4f}")
202
+
203
+ print("\n" + "="*80)
204
+ print("OVERALL RECOMMENDATION")
205
+ print("="*80)
206
+ all_deltas = deltas_1 + deltas_2
207
+ print(f"All deltas: {[f'{x:.3f}' for x in all_deltas]}")
208
+ print(f"Global mean delta: {sum(all_deltas)/len(all_deltas):.4f}")
209
+ print(f"Global worst delta: {min(all_deltas):.4f}")
210
+
211
+ # ---- Save reference scores to CSV ----
212
+ import csv
213
+
214
+ csv_path = "reference_scores.csv"
215
+ with open(csv_path, "w", newline="") as csvfile:
216
+ writer = csv.writer(csvfile)
217
+ writer.writerow([
218
+ "exercise", "prompt_index", "prompt",
219
+ "unconstrained_logprob", "reference_logprob", "reference_delta"
220
+ ])
221
+ for i, r in enumerate(ex1_results):
222
+ writer.writerow([
223
+ "exercise_1", i, r["prompt"],
224
+ f"{r['u_mean_logprob']:.6f}",
225
+ f"{r['c_mean_logprob']:.6f}",
226
+ f"{r['delta_mean_logprob']:.6f}",
227
+ ])
228
+ for i, r in enumerate(ex2_results):
229
+ writer.writerow([
230
+ "exercise_2", i, r["prompt"],
231
+ f"{r['u_mean_logprob']:.6f}",
232
+ f"{r['c_mean_logprob']:.6f}",
233
+ f"{r['delta_mean_logprob']:.6f}",
234
+ ])
235
+
236
+ print(f"\nReference scores saved to {csv_path}")
forbidden_solution.py CHANGED
@@ -30,18 +30,26 @@ class LaDisparition:
30
  self.processor = ForbidTokensLogitsProcessor(self.forbidden_token_ids)
31
 
32
  def __call__(self, prompt, max_tokens=30, beam_width=5):
33
- # Option 1: we use self.tokenizer to tokenize the prompt
34
- inputs = self.tokenizer(prompt, return_tensors="pt", return_attention_mask=True).to(self.model.device)
35
-
 
 
 
 
 
36
  outputs = self.model.generate(
37
- **inputs,
 
38
  max_new_tokens=max_tokens,
39
  num_beams=beam_width,
40
  logits_processor=[self.processor],
41
  do_sample=False
42
  )
43
 
44
- return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
45
 
46
 
47
  # --- EXERCISE 2: The Toulouse Sequence ---
@@ -133,7 +141,7 @@ class ToulouseSequence:
133
  if __name__ == "__main__":
134
  # NOTE: This block is for testing only. The evaluation server provides model and tokenizer.
135
  # SETUP
136
- MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
137
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
138
  model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32, device_map="auto")
139
  la_disparition_generator = LaDisparition(model, tokenizer)
 
30
  self.processor = ForbidTokensLogitsProcessor(self.forbidden_token_ids)
31
 
32
  def __call__(self, prompt, max_tokens=30, beam_width=5):
33
+ # Option 2: we use self.tokenizer.apply_chat_template to tokenize the prompt
34
+ message = [{"role": "user", "content": prompt}]
35
+ inputs = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt").to(self.model.device)
36
+
37
+ # Create an attention mask for the inputs
38
+ attention_mask = torch.ones_like(inputs)
39
+ prompt_length = inputs.shape[1]
40
+
41
  outputs = self.model.generate(
42
+ inputs,
43
+ attention_mask=attention_mask,
44
  max_new_tokens=max_tokens,
45
  num_beams=beam_width,
46
  logits_processor=[self.processor],
47
  do_sample=False
48
  )
49
 
50
+ # Return only the generated part
51
+ generated_tokens = outputs[0][prompt_length:]
52
+ return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
53
 
54
 
55
  # --- EXERCISE 2: The Toulouse Sequence ---
 
141
  if __name__ == "__main__":
142
  # NOTE: This block is for testing only. The evaluation server provides model and tokenizer.
143
  # SETUP
144
+ MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
145
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
146
  model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32, device_map="auto")
147
  la_disparition_generator = LaDisparition(model, tokenizer)
greedy.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Greedy/naive solution for comparison.
3
+ - Exercise 1: greedy decoding with token-level 'e' masking (same idea, simpler than beam search)
4
+ - Exercise 2: naive approach — forbid the first token of "Toulouse" and " Toulouse"
5
+ (tokens 'T' and ' T'), which is very aggressive and blocks ALL T-starting words.
6
+ """
7
+ from typing import List
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer
11
+
12
+
13
+ # --- EXERCISE 1: La disparition (No 'e' or 'E') ---
14
+ class LaDisparition:
15
+ """Greedy constrained generation: forbid tokens containing 'e', pick argmax."""
16
+ def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer):
17
+ self.model = model
18
+ self.tokenizer = tokenizer
19
+ self.forbidden_token_ids = set()
20
+ vocab = self.tokenizer.get_vocab()
21
+ for token_id in range(len(vocab)):
22
+ decoded = self.tokenizer.decode([token_id])
23
+ if 'e' in decoded.lower() or not all(ord(c) < 128 for c in decoded):
24
+ self.forbidden_token_ids.add(token_id)
25
+
26
+ def __call__(self, prompt, max_tokens=20):
27
+ message = [{"role": "user", "content": prompt}]
28
+ input_ids = self.tokenizer.apply_chat_template(
29
+ message, add_generation_prompt=True, return_tensors="pt"
30
+ ).to(self.model.device)
31
+ prompt_len = input_ids.shape[1]
32
+
33
+ seq = input_ids[0].tolist()
34
+ forbidden_list = list(self.forbidden_token_ids)
35
+
36
+ for step in range(max_tokens):
37
+ input_tensor = torch.tensor([seq], device=self.model.device)
38
+ with torch.no_grad():
39
+ outputs = self.model(input_tensor)
40
+ logits = outputs.logits[0, -1, :].clone()
41
+
42
+ # Mask forbidden tokens
43
+ logits[forbidden_list] = -float('inf')
44
+
45
+ next_token = torch.argmax(logits).item()
46
+ if next_token == self.tokenizer.eos_token_id:
47
+ break
48
+ seq.append(next_token)
49
+
50
+ generated_tokens = seq[prompt_len:]
51
+ return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
52
+
53
+
54
+ # --- EXERCISE 2: The Toulouse Sequence (naive approach) ---
55
+ class ToulouseSequence:
56
+ """
57
+ Naive approach: forbid the first token of "Toulouse" and " Toulouse".
58
+
59
+ "Toulouse" tokenizes as [T(68)][oul(9226)][ouse(1368)]
60
+ " Toulouse" tokenizes as [ T(312)][oul(9226)][ouse(1368)]
61
+
62
+ By forbidding tokens 68 ('T') and 312 (' T'), we block the model from
63
+ ever starting the word "Toulouse". This is very aggressive: it also blocks
64
+ ALL words starting with 'T' (e.g., "The", "This", "That", "They", ...).
65
+ """
66
+ def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer):
67
+ self.model = model
68
+ self.tokenizer = tokenizer
69
+ # Find the first token of "Toulouse" and " Toulouse"
70
+ toulouse_ids = self.tokenizer.encode("Toulouse", add_special_tokens=False)
71
+ space_toulouse_ids = self.tokenizer.encode(" Toulouse", add_special_tokens=False)
72
+ self.forbidden_token_ids = {toulouse_ids[0], space_toulouse_ids[0]}
73
+ print(f"[ToulouseSequence naive] Forbidden first tokens: {self.forbidden_token_ids}")
74
+
75
+ def __call__(self, prompt, max_tokens=20):
76
+ message = [{"role": "user", "content": prompt}]
77
+ inputs = self.tokenizer.apply_chat_template(
78
+ message, add_generation_prompt=True, return_tensors="pt"
79
+ ).to(self.model.device)
80
+ prompt_length = inputs.shape[1]
81
+
82
+ seq = inputs[0].tolist()
83
+ forbidden_list = list(self.forbidden_token_ids)
84
+
85
+ for step in range(max_tokens):
86
+ input_tensor = torch.tensor([seq], device=self.model.device)
87
+ with torch.no_grad():
88
+ outputs = self.model(input_tensor)
89
+ logits = outputs.logits[0, -1, :].clone()
90
+
91
+ # Mask forbidden tokens
92
+ logits[forbidden_list] = -float('inf')
93
+
94
+ next_token = torch.argmax(logits).item()
95
+ if next_token == self.tokenizer.eos_token_id:
96
+ break
97
+ seq.append(next_token)
98
+
99
+ generated_tokens = seq[prompt_length:]
100
+ return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
101
+
102
+
103
+ if __name__ == "__main__":
104
+ MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
105
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
106
+ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float16, device_map="auto")
107
+
108
+ print("=== Exercise 1: La Disparition (no 'e') ===")
109
+ ex1 = LaDisparition(model, tokenizer)
110
+ for prompt in ["Who is the king of the jungle?", "Name a fruit that is red."]:
111
+ result = ex1(prompt)
112
+ has_e = 'e' in result.lower()
113
+ print(f" Q: {prompt}")
114
+ print(f" A: {result}")
115
+ print(f" {'✗ FAIL' if has_e else '✓ PASS'}\n")
116
+
117
+ print("=== Exercise 2: No Toulouse (naive) ===")
118
+ ex2 = ToulouseSequence(model, tokenizer)
119
+ for prompt in [
120
+ "Where is the headquarters of Airbus located?",
121
+ "In which French city can you find the Place du Capitole?",
122
+ ]:
123
+ result = ex2(prompt)
124
+ has_toulouse = 'toulouse' in result.lower()
125
+ print(f" Q: {prompt}")
126
+ print(f" A: {result}")
127
+ print(f" {'✗ FAIL' if has_toulouse else '✓ PASS'}\n")
reference_scores.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exercise,prompt_index,prompt,unconstrained_logprob,reference_logprob,reference_delta
2
+ exercise_1,0,Who is the king of the jungle?,-0.438915,-1.160570,-0.721655
3
+ exercise_1,1,Complete this: Once upon a...,-0.333118,-1.340559,-1.007441
4
+ exercise_1,2,What is the opposite of 'always'?,-0.204553,-1.405256,-1.200703
5
+ exercise_1,3,Name a fruit that is red.,-0.428213,-1.045869,-0.617657
6
+ exercise_1,4,What do you use to see things?,-0.485363,-1.388507,-0.903145
7
+ exercise_2,0,Where is the headquarters of Airbus located?,-0.372244,-0.689966,-0.317723
8
+ exercise_2,1,Complete this sentence: The Airbus A380 is assembled in the city of,-0.145879,-0.765407,-0.619529
9
+ exercise_2,2,Which city in southern France hosts the Cité de l'Espace space museum?,-0.169176,-0.571546,-0.402370
10
+ exercise_2,3,In which French city can you find the Place du Capitole?,-0.145984,-0.815522,-0.669538
11
+ exercise_2,4,Which French city is home to both Airbus and the Space Centre?,-0.213124,-1.147179,-0.934054
solution.py CHANGED
@@ -27,9 +27,9 @@ class LaDisparition:
27
  self.forbidden_token_ids.add(token_id)
28
 
29
  def __call__(self, prompt, max_tokens=20, beam_width=5):
30
- # Option 1: we use self.tokenizer to tokenize the prompt
31
- inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
32
- input_ids = inputs["input_ids"]
33
  prompt_len = input_ids.shape[1]
34
 
35
  # Beam search: maintain multiple hypotheses
@@ -86,9 +86,10 @@ class LaDisparition:
86
  decoded = self.tokenizer.decode(seq, skip_special_tokens=True)
87
  print(f" Beam {i}: log_prob={log_prob:.4f} | {decoded}")
88
 
89
- # Return the best hypothesis
90
  best_seq = beams[0][0]
91
- return self.tokenizer.decode(best_seq, skip_special_tokens=True)
 
92
 
93
 
94
  # --- EXERCISE 2: The Toulouse Sequence ---
@@ -191,7 +192,7 @@ class ToulouseSequence:
191
  if __name__ == "__main__":
192
  # NOTE: This block is for testing only. The evaluation server provides model and tokenizer.
193
  # SETUP
194
- MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
195
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
196
  model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
197
  la_disparition_generator = LaDisparition(model, tokenizer)
 
27
  self.forbidden_token_ids.add(token_id)
28
 
29
  def __call__(self, prompt, max_tokens=20, beam_width=5):
30
+ # Option 2: we use self.tokenizer.apply_chat_template to tokenize the prompt
31
+ message = [{"role": "user", "content": prompt}]
32
+ input_ids = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt").to(self.model.device)
33
  prompt_len = input_ids.shape[1]
34
 
35
  # Beam search: maintain multiple hypotheses
 
86
  decoded = self.tokenizer.decode(seq, skip_special_tokens=True)
87
  print(f" Beam {i}: log_prob={log_prob:.4f} | {decoded}")
88
 
89
+ # Return the best hypothesis (only the generated part)
90
  best_seq = beams[0][0]
91
+ generated_tokens = best_seq[prompt_len:]
92
+ return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
93
 
94
 
95
  # --- EXERCISE 2: The Toulouse Sequence ---
 
192
  if __name__ == "__main__":
193
  # NOTE: This block is for testing only. The evaluation server provides model and tokenizer.
194
  # SETUP
195
+ MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
196
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
197
  model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
198
  la_disparition_generator = LaDisparition(model, tokenizer)
test_cases.json CHANGED
@@ -7,10 +7,10 @@
7
  "What do you use to see things?"
8
  ],
9
  "exercise_2": [
10
- "Which French city is known as the 'Ville Rose'?",
11
  "Where is the headquarters of Airbus located?",
12
- "Name a major city in the Occitanie region crossed by the Garonne River.",
13
- "What French city is famous for its aerospace industry and has a historic basilica called Saint-Sernin?",
14
- "If you are at the Cité de l'Espace, which city are you in?"
 
15
  ]
16
  }
 
7
  "What do you use to see things?"
8
  ],
9
  "exercise_2": [
 
10
  "Where is the headquarters of Airbus located?",
11
+ "Complete this sentence: The Airbus A380 is assembled in the city of",
12
+ "Which city in southern France hosts the Cité de l'Espace space museum?",
13
+ "In which French city can you find the Place du Capitole?",
14
+ "Which French city is home to both Airbus and the Space Centre?"
15
  ]
16
  }