Commit ·
4b37626
1
Parent(s): c46e600
Updated to use logprob scores
Browse files
- app.py +17 -9
- challenge.py +85 -42
app.py
CHANGED
|
@@ -136,38 +136,46 @@ def submit_challenge(file, request: gr.Request):
|
|
| 136 |
# Parse the result from the evaluator
|
| 137 |
ex1_score = 0
|
| 138 |
ex2_score = 0
|
|
|
|
|
|
|
| 139 |
ex1_status = "Not evaluated"
|
| 140 |
ex2_status = "Not evaluated"
|
| 141 |
|
| 142 |
try:
|
| 143 |
import re
|
| 144 |
-
# Parse Ex 1 - look for pattern "**Ex 1 (No 'e'):** X/5 correct"
|
| 145 |
if "Ex 1" in result_text:
|
| 146 |
if "Ex 1" in result_text and "TIMEOUT" in result_text.split("Ex 2")[0]:
|
| 147 |
ex1_status = "TIMEOUT"
|
| 148 |
elif "Ex 1 Error" in result_text:
|
| 149 |
ex1_status = "ERROR"
|
| 150 |
else:
|
| 151 |
-
# Match format: **Ex 1 (No 'e'):** X/5 correct
|
| 152 |
-
ex1_match = re.search(r'Ex 1[^:]*:\*?\*?\s*(\d+)/5', result_text)
|
| 153 |
if ex1_match:
|
| 154 |
ex1_score = int(ex1_match.group(1))
|
| 155 |
-
|
|
|
|
| 156 |
|
| 157 |
-
# Parse Ex 2 - look for pattern "**Ex 2 (No Toulouse):** X/5 correct"
|
| 158 |
if "Ex 2" in result_text:
|
| 159 |
if "Ex 2" in result_text and "TIMEOUT" in result_text.split("Ex 2")[1]:
|
| 160 |
ex2_status = "TIMEOUT"
|
| 161 |
elif "Ex 2 Error" in result_text:
|
| 162 |
ex2_status = "ERROR"
|
| 163 |
else:
|
| 164 |
-
# Match format: **Ex 2 (No Toulouse):** X/5 correct
|
| 165 |
-
ex2_match = re.search(r'Ex 2[^:]*:\*?\*?\s*(\d+)/5', result_text)
|
| 166 |
if ex2_match:
|
| 167 |
ex2_score = int(ex2_match.group(1))
|
| 168 |
-
|
|
|
|
| 169 |
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
except Exception as e:
|
| 172 |
# If parsing fails, try to extract what we can from the text
|
| 173 |
total_score = 0
|
|
|
|
| 136 |
# Parse the result from the evaluator
|
| 137 |
ex1_score = 0
|
| 138 |
ex2_score = 0
|
| 139 |
+
ex1_quality = 0.0
|
| 140 |
+
ex2_quality = 0.0
|
| 141 |
ex1_status = "Not evaluated"
|
| 142 |
ex2_status = "Not evaluated"
|
| 143 |
|
| 144 |
try:
|
| 145 |
import re
|
| 146 |
+
# Parse Ex 1 - look for pattern "**Ex 1 (No 'e'):** X/5 correct | Quality: Y%" (X = correct count, Y = integer quality percentage)
|
| 147 |
if "Ex 1" in result_text:
|
| 148 |
if "Ex 1" in result_text and "TIMEOUT" in result_text.split("Ex 2")[0]:
|
| 149 |
ex1_status = "TIMEOUT"
|
| 150 |
elif "Ex 1 Error" in result_text:
|
| 151 |
ex1_status = "ERROR"
|
| 152 |
else:
|
| 153 |
+
# Match format: **Ex 1 (No 'e'):** X/5 correct | Quality: Y%
|
| 154 |
+
ex1_match = re.search(r'Ex 1[^:]*:\*?\*?\s*(\d+)/5\s*correct\s*\|\s*Quality:\s*(\d+)%', result_text)
|
| 155 |
if ex1_match:
|
| 156 |
ex1_score = int(ex1_match.group(1))
|
| 157 |
+
ex1_quality = int(ex1_match.group(2)) / 100.0
|
| 158 |
+
ex1_status = f"{ex1_score}/5 ({ex1_match.group(2)}%)"
|
| 159 |
|
| 160 |
+
# Parse Ex 2 - look for pattern "**Ex 2 (No Toulouse):** X/5 correct | Quality: Y%" (X = correct count, Y = integer quality percentage)
|
| 161 |
if "Ex 2" in result_text:
|
| 162 |
if "Ex 2" in result_text and "TIMEOUT" in result_text.split("Ex 2")[1]:
|
| 163 |
ex2_status = "TIMEOUT"
|
| 164 |
elif "Ex 2 Error" in result_text:
|
| 165 |
ex2_status = "ERROR"
|
| 166 |
else:
|
| 167 |
+
# Match format: **Ex 2 (No Toulouse):** X/5 correct | Quality: Y%
|
| 168 |
+
ex2_match = re.search(r'Ex 2[^:]*:\*?\*?\s*(\d+)/5\s*correct\s*\|\s*Quality:\s*(\d+)%', result_text)
|
| 169 |
if ex2_match:
|
| 170 |
ex2_score = int(ex2_match.group(1))
|
| 171 |
+
ex2_quality = int(ex2_match.group(2)) / 100.0
|
| 172 |
+
ex2_status = f"{ex2_score}/5 ({ex2_match.group(2)}%)"
|
| 173 |
|
| 174 |
+
# Total score: 50% correctness + 50% quality, out of 10
|
| 175 |
+
correctness_part = (ex1_score + ex2_score) / 2.0 # 0-5
|
| 176 |
+
avg_quality = (ex1_quality + ex2_quality) / 2.0
|
| 177 |
+
quality_part = avg_quality * 5 # 0-5
|
| 178 |
+
total_score = round(correctness_part + quality_part, 2) # 0-10
|
| 179 |
except Exception as e:
|
| 180 |
# If parsing fails, try to extract what we can from the text
|
| 181 |
total_score = 0
|
challenge.py
CHANGED
|
@@ -1,68 +1,111 @@
|
|
| 1 |
-
from typing import Any
|
| 2 |
import torch
|
| 3 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 4 |
|
| 5 |
-
# --- EXERCISE 1: La disparition (No 'e' or 'E) ---
|
| 6 |
class LaDisparition:
|
| 7 |
"""
|
| 8 |
Generate text without ever using the letter 'e' or 'E'.
|
| 9 |
-
|
|
|
|
| 10 |
You need to manually adjust the logits to forbid tokens containing 'e' or 'E'.
|
|
|
|
| 11 |
REQUIREMENT: Do NOT use model.generate().
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
"""
|
| 13 |
-
def __init__(self, model, tokenizer):
|
| 14 |
self.model = model
|
| 15 |
self.tokenizer = tokenizer
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
#
|
|
|
|
| 19 |
|
| 20 |
-
def __call__(self, prompt, max_tokens=
|
| 21 |
-
# Tokenize
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
#
|
| 25 |
-
#
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
pass
|
| 31 |
|
| 32 |
|
| 33 |
# --- EXERCISE 2: The Toulouse Sequence ---
|
| 34 |
class ToulouseSequence:
|
| 35 |
"""
|
| 36 |
-
Generate text without ever
|
| 37 |
-
|
| 38 |
-
You
|
| 39 |
-
|
| 40 |
REQUIREMENT: Do NOT use model.generate().
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
"""
|
| 42 |
-
def __init__(self, model, tokenizer):
|
| 43 |
self.model = model
|
| 44 |
self.tokenizer = tokenizer
|
| 45 |
-
|
| 46 |
-
#
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
def __call__(self, prompt, max_tokens=30):
|
| 50 |
-
# Tokenize input prompt:
|
| 51 |
-
# input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
|
| 52 |
|
| 53 |
-
# Generate tokens manually, one step at a time:
|
| 54 |
-
# (The bulk of the logic goes here)
|
| 55 |
-
# Hint: you need to track partial matches of the forbidden word
|
| 56 |
-
|
| 57 |
-
# Decode output tokens to string and return
|
| 58 |
-
# return tokenizer.decode(generated, skip_special_tokens=True)
|
| 59 |
-
pass
|
| 60 |
-
|
| 61 |
if __name__ == "__main__":
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
| 63 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 64 |
-
model = AutoModelForCausalLM.from_pretrained(
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import torch
|
| 2 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 3 |
|
| 4 |
+
# --- EXERCISE 1: La disparition (No 'e' or 'E') ---
|
| 5 |
class LaDisparition:
|
| 6 |
"""
|
| 7 |
Generate text without ever using the letter 'e' or 'E'.
|
| 8 |
+
|
| 9 |
+
You must use model() directly: model(input_ids) yields logits.
|
| 10 |
You need to manually adjust the logits to forbid tokens containing 'e' or 'E'.
|
| 11 |
+
|
| 12 |
REQUIREMENT: Do NOT use model.generate().
|
| 13 |
+
|
| 14 |
+
Hints:
|
| 15 |
+
- In __init__, pre-compute the set of forbidden token IDs by checking
|
| 16 |
+
which tokens in the vocabulary decode to strings containing 'e' or 'E'.
|
| 17 |
+
- In __call__, implement a token-by-token generation loop:
|
| 18 |
+
1. Feed the current sequence to the model to get logits
|
| 19 |
+
2. Mask out forbidden tokens (set their logits to -inf)
|
| 20 |
+
3. Pick the next token (greedy: argmax, or use beam search for better quality)
|
| 21 |
+
4. Append and repeat
|
| 22 |
+
- Return only the generated text (not the prompt).
|
| 23 |
"""
|
| 24 |
+
def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer):
|
| 25 |
self.model = model
|
| 26 |
self.tokenizer = tokenizer
|
| 27 |
+
|
| 28 |
+
# TODO: Pre-calculate forbidden token IDs
|
| 29 |
+
# Hint: also consider forbidding non-ASCII tokens, which might hide the letter 'e' (for example, accented forms such as 'é' or 'è').
|
| 30 |
+
# YOUR CODE HERE
|
| 31 |
|
| 32 |
+
def __call__(self, prompt, max_tokens=20):
|
| 33 |
+
# Tokenize the prompt using the chat template
|
| 34 |
+
message = [{"role": "user", "content": prompt}]
|
| 35 |
+
input_ids = self.tokenizer.apply_chat_template(
|
| 36 |
+
message, add_generation_prompt=True, return_tensors="pt"
|
| 37 |
+
).to(self.model.device)
|
| 38 |
+
prompt_len = input_ids.shape[1]
|
| 39 |
|
| 40 |
+
# TODO: Implement constrained generation loop
|
| 41 |
+
# Return only the generated text (after the prompt).
|
| 42 |
+
|
| 43 |
+
# YOUR CODE HERE
|
| 44 |
+
|
| 45 |
+
raise NotImplementedError("Implement constrained generation without 'e'")
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
# --- EXERCISE 2: The Toulouse Sequence ---
|
| 49 |
class ToulouseSequence:
|
| 50 |
"""
|
| 51 |
+
Generate text without ever producing the word 'Toulouse'.
|
| 52 |
+
|
| 53 |
+
You must use model() directly: model(input_ids) yields logits.
|
| 54 |
+
|
| 55 |
REQUIREMENT: Do NOT use model.generate().
|
| 56 |
+
|
| 57 |
+
This is harder than Exercise 1 because 'Toulouse' spans multiple tokens.
|
| 58 |
+
You need to track what has been generated so far and forbid any token
|
| 59 |
+
that would create a prefix of 'Toulouse' of length >= 4 (shorter prefixes such as "To" or "Tou" remain allowed).
|
| 60 |
+
|
| 61 |
+
Hints:
|
| 62 |
+
- Track the current partial word, i.e. the trailing run of characters in the
|
| 63 |
+
generated text that follows the last non-alphabetical character.
|
| 64 |
+
- For each candidate next token, check if appending it would create a
|
| 65 |
+
string that is a prefix of 'Toulouse' (case-insensitive) with length >= 4.
|
| 66 |
+
- If so, mask that token out.
|
| 67 |
"""
|
| 68 |
+
def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer):
|
| 69 |
self.model = model
|
| 70 |
self.tokenizer = tokenizer
|
| 71 |
+
self.forbidden_word = "Toulouse"
|
| 72 |
+
self.min_prefix_len = 4 # Only start blocking at 4+ chars (to allow "To", "Tou")
|
| 73 |
+
|
| 74 |
+
def __call__(self, prompt, max_tokens=20):
|
| 75 |
+
# Tokenize the prompt using the chat template
|
| 76 |
+
message = [{"role": "user", "content": prompt}]
|
| 77 |
+
inputs = self.tokenizer.apply_chat_template(
|
| 78 |
+
message, add_generation_prompt=True, return_tensors="pt"
|
| 79 |
+
).to(self.model.device)
|
| 80 |
+
prompt_length = inputs.shape[1]
|
| 81 |
+
|
| 82 |
+
# TODO: Implement constrained generation loop
|
| 83 |
+
# Return only the generated text (after the prompt).
|
| 84 |
+
# YOUR CODE HERE
|
| 85 |
+
|
| 86 |
+
raise NotImplementedError("Implement constrained generation without 'Toulouse'")
|
| 87 |
|
|
|
|
|
|
|
|
|
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
if __name__ == "__main__":
|
| 90 |
+
# NOTE: This block is for local testing only.
|
| 91 |
+
# The evaluation server provides model and tokenizer.
|
| 92 |
+
# You can use any small model for testing, e.g.:
|
| 93 |
+
MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
|
| 94 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 95 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 96 |
+
MODEL_NAME, dtype=torch.float16, device_map="auto"
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
print("=== Exercise 1: La Disparition (no 'e') ===")
|
| 100 |
+
ex1 = LaDisparition(model, tokenizer)
|
| 101 |
+
result = ex1("Who is the king of the jungle?")
|
| 102 |
+
print(f"Result: {result}")
|
| 103 |
+
has_e = 'e' in result.lower()
|
| 104 |
+
print(f"Contains 'e': {has_e} {'✗ FAIL' if has_e else '✓ PASS'}")
|
| 105 |
+
|
| 106 |
+
print("\n=== Exercise 2: No Toulouse ===")
|
| 107 |
+
ex2 = ToulouseSequence(model, tokenizer)
|
| 108 |
+
result = ex2("Where is the headquarters of Airbus located?")
|
| 109 |
+
print(f"Result: {result}")
|
| 110 |
+
has_toulouse = 'toulouse' in result.lower()
|
| 111 |
+
print(f"Contains 'Toulouse': {has_toulouse} {'✗ FAIL' if has_toulouse else '✓ PASS'}")
|