Spaces:
Running
Running
| """ | |
| Calibration script: compute logprobs for reference solution outputs | |
| vs unconstrained model outputs to design a scoring function. | |
| """ | |
| import torch | |
| import torch.nn.functional as F | |
| import json | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| EVAL_MODEL = "HuggingFaceTB/SmolLM2-1.7B-Instruct" | |
| print("Loading model...") | |
| tokenizer = AutoTokenizer.from_pretrained(EVAL_MODEL) | |
| if tokenizer.pad_token is None: | |
| tokenizer.pad_token = tokenizer.eos_token | |
| model = AutoModelForCausalLM.from_pretrained(EVAL_MODEL, dtype=torch.float16, device_map="auto") | |
| with open("test_cases.json", "r") as f: | |
| TEST_CASES = json.load(f) | |
def compute_chat_logprobs(model, tokenizer, prompt, generated_text):
    """
    Compute logprobs using chat template (works for both exercises).
    The prompt is formatted as a chat message, generated_text is the response.
    Returns:
        mean_logprob: mean log-prob per generated token (-inf if nothing to score)
        total_logprob: sum of log-probs
        n_tokens: number of generated tokens
        per_token: list of (token_str, logprob) pairs
    """
    # Empty / whitespace-only generations cannot be scored.
    if not generated_text or not generated_text.strip():
        return -float('inf'), 0.0, 0, []
    message = [{"role": "user", "content": prompt}]
    prompt_ids = tokenizer.apply_chat_template(
        message, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    prompt_len = prompt_ids.shape[1]
    gen_ids = tokenizer.encode(
        generated_text, add_special_tokens=False, return_tensors="pt"
    ).to(model.device)
    full_ids = torch.cat([prompt_ids, gen_ids], dim=1)
    # Guard: a tokenizer could in principle return zero continuation tokens.
    if full_ids.shape[1] <= prompt_len:
        return -float('inf'), 0.0, 0, []
    with torch.no_grad():
        logits = model(full_ids).logits
    # Logits at position i-1 predict token i, so only the slice
    # [prompt_len-1, T-1) is ever read. Normalizing just that slice (instead
    # of log_softmax over the whole sequence, prompt included) gives the
    # identical per-position values — softmax is row-independent — while
    # skipping the wasted work on prompt positions.
    pred_logits = logits[0, prompt_len - 1:full_ids.shape[1] - 1, :]
    log_probs = F.log_softmax(pred_logits, dim=-1)
    targets = full_ids[0, prompt_len:]
    # Gather the log-prob of each realized continuation token in one shot.
    token_logprobs = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)
    per_token = []
    total_logprob = 0.0
    for token_id, lp in zip(targets.tolist(), token_logprobs.tolist()):
        per_token.append((tokenizer.decode([token_id]), lp))
        total_logprob += lp
    n_tokens = len(per_token)
    mean_logprob = total_logprob / n_tokens if n_tokens > 0 else -float('inf')
    return mean_logprob, total_logprob, n_tokens, per_token
def generate_unconstrained_chat(model, tokenizer, prompt, max_tokens=20):
    """Greedy-decode an unconstrained chat-formatted response to `prompt`.

    Returns only the decoded continuation (prompt stripped), whitespace-trimmed.
    """
    chat_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    n_prompt = chat_ids.shape[1]
    with torch.no_grad():
        out_ids = model.generate(
            chat_ids,
            attention_mask=torch.ones_like(chat_ids),
            max_new_tokens=max_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )
    continuation = out_ids[0][n_prompt:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()
# ---- Load and run the reference solution ----
import importlib.util
import sys
import time
# Import solution.py under a timestamped module name so repeated runs of this
# script never collide with a previously cached module in sys.modules.
module_name = f"solution_module_{int(time.time())}"
spec = importlib.util.spec_from_file_location(module_name, "solution.py")
solution = importlib.util.module_from_spec(spec)
sys.modules[module_name] = solution
# Executes solution.py's top-level code; any import-time side effects run here.
spec.loader.exec_module(solution)
| print("\n" + "="*80) | |
| print("EXERCISE 1: La Disparition (no 'e')") | |
| print("="*80) | |
| ex1_instance = solution.LaDisparition(model, tokenizer) | |
| ex1_results = [] | |
| for i, prompt in enumerate(TEST_CASES["exercise_1"]): | |
| # Generate constrained output | |
| constrained_output = ex1_instance(prompt, max_tokens=20) | |
| # Strip prompt from output | |
| if constrained_output.startswith(prompt): | |
| constrained_gen = constrained_output[len(prompt):].strip() | |
| else: | |
| constrained_gen = constrained_output.strip() | |
| # Generate unconstrained output (chat template for instruct model) | |
| unconstrained_gen = generate_unconstrained_chat(model, tokenizer, prompt, max_tokens=20) | |
| # Compute logprobs using chat template (matches how the model should be used) | |
| c_mean, c_total, c_ntok, c_per = compute_chat_logprobs(model, tokenizer, prompt, constrained_gen) | |
| # Compute logprobs for unconstrained output | |
| u_mean, u_total, u_ntok, u_per = compute_chat_logprobs(model, tokenizer, prompt, unconstrained_gen) | |
| delta = c_mean - u_mean # will be negative (constrained is worse) | |
| print(f"\nTest {i+1}: {prompt}") | |
| print(f" Unconstrained: {unconstrained_gen}") | |
| print(f" mean_logprob={u_mean:.4f}, n_tokens={u_ntok}") | |
| print(f" Constrained: {constrained_gen}") | |
| print(f" mean_logprob={c_mean:.4f}, n_tokens={c_ntok}") | |
| print(f" Delta (constrained - unconstrained): {delta:.4f}") | |
| ex1_results.append({ | |
| "prompt": prompt, | |
| "constrained_gen": constrained_gen, | |
| "unconstrained_gen": unconstrained_gen, | |
| "c_mean_logprob": c_mean, | |
| "u_mean_logprob": u_mean, | |
| "delta_mean_logprob": delta, | |
| }) | |
| print(f"\n--- Exercise 1 Summary ---") | |
| deltas_1 = [r["delta_mean_logprob"] for r in ex1_results] | |
| c_means_1 = [r["c_mean_logprob"] for r in ex1_results] | |
| u_means_1 = [r["u_mean_logprob"] for r in ex1_results] | |
| print(f" Unconstrained mean logprobs: {[f'{x:.3f}' for x in u_means_1]}") | |
| print(f" Constrained mean logprobs: {[f'{x:.3f}' for x in c_means_1]}") | |
| print(f" Deltas: {[f'{x:.3f}' for x in deltas_1]}") | |
| print(f" Mean delta: {sum(deltas_1)/len(deltas_1):.4f}") | |
| print(f" Worst delta: {min(deltas_1):.4f}") | |
| print("\n" + "="*80) | |
| print("EXERCISE 2: Toulouse Sequence (no 'Toulouse')") | |
| print("="*80) | |
| ex2_instance = solution.ToulouseSequence(model, tokenizer) | |
| ex2_results = [] | |
| for i, prompt in enumerate(TEST_CASES["exercise_2"]): | |
| # Generate constrained output | |
| constrained_gen = ex2_instance(prompt, max_tokens=20) | |
| # Generate unconstrained output (chat format) | |
| unconstrained_gen = generate_unconstrained_chat(model, tokenizer, prompt, max_tokens=20) | |
| # Compute logprobs (chat format) | |
| c_mean, c_total, c_ntok, c_per = compute_chat_logprobs(model, tokenizer, prompt, constrained_gen) | |
| u_mean, u_total, u_ntok, u_per = compute_chat_logprobs(model, tokenizer, prompt, unconstrained_gen) | |
| delta = c_mean - u_mean | |
| print(f"\nTest {i+1}: {prompt}") | |
| print(f" Unconstrained: {unconstrained_gen}") | |
| print(f" mean_logprob={u_mean:.4f}, n_tokens={u_ntok}") | |
| print(f" Constrained: {constrained_gen}") | |
| print(f" mean_logprob={c_mean:.4f}, n_tokens={c_ntok}") | |
| print(f" Delta (constrained - unconstrained): {delta:.4f}") | |
| ex2_results.append({ | |
| "prompt": prompt, | |
| "constrained_gen": constrained_gen, | |
| "unconstrained_gen": unconstrained_gen, | |
| "c_mean_logprob": c_mean, | |
| "u_mean_logprob": u_mean, | |
| "delta_mean_logprob": delta, | |
| }) | |
| print(f"\n--- Exercise 2 Summary ---") | |
| deltas_2 = [r["delta_mean_logprob"] for r in ex2_results] | |
| c_means_2 = [r["c_mean_logprob"] for r in ex2_results] | |
| u_means_2 = [r["u_mean_logprob"] for r in ex2_results] | |
| print(f" Unconstrained mean logprobs: {[f'{x:.3f}' for x in u_means_2]}") | |
| print(f" Constrained mean logprobs: {[f'{x:.3f}' for x in c_means_2]}") | |
| print(f" Deltas: {[f'{x:.3f}' for x in deltas_2]}") | |
| print(f" Mean delta: {sum(deltas_2)/len(deltas_2):.4f}") | |
| print(f" Worst delta: {min(deltas_2):.4f}") | |
| print("\n" + "="*80) | |
| print("OVERALL RECOMMENDATION") | |
| print("="*80) | |
| all_deltas = deltas_1 + deltas_2 | |
| print(f"All deltas: {[f'{x:.3f}' for x in all_deltas]}") | |
| print(f"Global mean delta: {sum(all_deltas)/len(all_deltas):.4f}") | |
| print(f"Global worst delta: {min(all_deltas):.4f}") | |
| # ---- Save reference scores to CSV ---- | |
| import csv | |
| csv_path = "reference_scores.csv" | |
| with open(csv_path, "w", newline="") as csvfile: | |
| writer = csv.writer(csvfile) | |
| writer.writerow([ | |
| "exercise", "prompt_index", "prompt", | |
| "unconstrained_logprob", "reference_logprob", "reference_delta" | |
| ]) | |
| for i, r in enumerate(ex1_results): | |
| writer.writerow([ | |
| "exercise_1", i, r["prompt"], | |
| f"{r['u_mean_logprob']:.6f}", | |
| f"{r['c_mean_logprob']:.6f}", | |
| f"{r['delta_mean_logprob']:.6f}", | |
| ]) | |
| for i, r in enumerate(ex2_results): | |
| writer.writerow([ | |
| "exercise_2", i, r["prompt"], | |
| f"{r['u_mean_logprob']:.6f}", | |
| f"{r['c_mean_logprob']:.6f}", | |
| f"{r['delta_mean_logprob']:.6f}", | |
| ]) | |
| print(f"\nReference scores saved to {csv_path}") | |