nathanael-fijalkow committed on
Commit
4b37626
·
1 Parent(s): c46e600

Updated to use logprob scores

Browse files
Files changed (2) hide show
  1. app.py +17 -9
  2. challenge.py +85 -42
app.py CHANGED
@@ -136,38 +136,46 @@ def submit_challenge(file, request: gr.Request):
136
  # Parse the result from the evaluator
137
  ex1_score = 0
138
  ex2_score = 0
 
 
139
  ex1_status = "Not evaluated"
140
  ex2_status = "Not evaluated"
141
 
142
  try:
143
  import re
144
- # Parse Ex 1 - look for pattern "**Ex 1 (No 'e'):** X/5 correct"
145
  if "Ex 1" in result_text:
146
  if "Ex 1" in result_text and "TIMEOUT" in result_text.split("Ex 2")[0]:
147
  ex1_status = "TIMEOUT"
148
  elif "Ex 1 Error" in result_text:
149
  ex1_status = "ERROR"
150
  else:
151
- # Match format: **Ex 1 (No 'e'):** X/5 correct
152
- ex1_match = re.search(r'Ex 1[^:]*:\*?\*?\s*(\d+)/5', result_text)
153
  if ex1_match:
154
  ex1_score = int(ex1_match.group(1))
155
- ex1_status = f"{ex1_score}/5"
 
156
 
157
- # Parse Ex 2 - look for pattern "**Ex 2 (No Toulouse):** X/5 correct"
158
  if "Ex 2" in result_text:
159
  if "Ex 2" in result_text and "TIMEOUT" in result_text.split("Ex 2")[1]:
160
  ex2_status = "TIMEOUT"
161
  elif "Ex 2 Error" in result_text:
162
  ex2_status = "ERROR"
163
  else:
164
- # Match format: **Ex 2 (No Toulouse):** X/5 correct
165
- ex2_match = re.search(r'Ex 2[^:]*:\*?\*?\s*(\d+)/5', result_text)
166
  if ex2_match:
167
  ex2_score = int(ex2_match.group(1))
168
- ex2_status = f"{ex2_score}/5"
 
169
 
170
- total_score = ex1_score + ex2_score # Out of 10
 
 
 
 
171
  except Exception as e:
172
  # If parsing fails, try to extract what we can from the text
173
  total_score = 0
 
136
  # Parse the result from the evaluator
137
  ex1_score = 0
138
  ex2_score = 0
139
+ ex1_quality = 0.0
140
+ ex2_quality = 0.0
141
  ex1_status = "Not evaluated"
142
  ex2_status = "Not evaluated"
143
 
144
  try:
145
  import re
146
+ # Parse Ex 1 - look for pattern "**Ex 1 (No 'e'):** X/5 correct | Quality: X%"
147
  if "Ex 1" in result_text:
148
  if "Ex 1" in result_text and "TIMEOUT" in result_text.split("Ex 2")[0]:
149
  ex1_status = "TIMEOUT"
150
  elif "Ex 1 Error" in result_text:
151
  ex1_status = "ERROR"
152
  else:
153
+ # Match format: **Ex 1 (No 'e'):** X/5 correct | Quality: X%
154
+ ex1_match = re.search(r'Ex 1[^:]*:\*?\*?\s*(\d+)/5\s*correct\s*\|\s*Quality:\s*(\d+)%', result_text)
155
  if ex1_match:
156
  ex1_score = int(ex1_match.group(1))
157
+ ex1_quality = int(ex1_match.group(2)) / 100.0
158
+ ex1_status = f"{ex1_score}/5 ({ex1_match.group(2)}%)"
159
 
160
+ # Parse Ex 2 - look for pattern "**Ex 2 (No Toulouse):** X/5 correct | Quality: X%"
161
  if "Ex 2" in result_text:
162
  if "Ex 2" in result_text and "TIMEOUT" in result_text.split("Ex 2")[1]:
163
  ex2_status = "TIMEOUT"
164
  elif "Ex 2 Error" in result_text:
165
  ex2_status = "ERROR"
166
  else:
167
+ # Match format: **Ex 2 (No Toulouse):** X/5 correct | Quality: X%
168
+ ex2_match = re.search(r'Ex 2[^:]*:\*?\*?\s*(\d+)/5\s*correct\s*\|\s*Quality:\s*(\d+)%', result_text)
169
  if ex2_match:
170
  ex2_score = int(ex2_match.group(1))
171
+ ex2_quality = int(ex2_match.group(2)) / 100.0
172
+ ex2_status = f"{ex2_score}/5 ({ex2_match.group(2)}%)"
173
 
174
+ # Total score: 50% correctness + 50% quality, out of 10
175
+ correctness_part = (ex1_score + ex2_score) / 2.0 # 0-5
176
+ avg_quality = (ex1_quality + ex2_quality) / 2.0
177
+ quality_part = avg_quality * 5 # 0-5
178
+ total_score = round(correctness_part + quality_part, 2) # 0-10
179
  except Exception as e:
180
  # If parsing fails, try to extract what we can from the text
181
  total_score = 0
challenge.py CHANGED
@@ -1,68 +1,111 @@
1
- from typing import Any
2
  import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer
4
 
5
- # --- EXERCISE 1: La disparition (No 'e' or 'E') ---
6
  class LaDisparition:
7
  """
8
  Generate text without ever using the letter 'e' or 'E'.
9
- For this, you must use model() directly: model(input_ids) yields logits.
 
10
  You need to manually adjust the logits to forbid tokens containing 'e' or 'E'.
 
11
  REQUIREMENT: Do NOT use model.generate().
 
 
 
 
 
 
 
 
 
 
12
  """
13
- def __init__(self, model, tokenizer):
14
  self.model = model
15
  self.tokenizer = tokenizer
16
- # Here you want to pre-calculate forbidden token IDs
17
-
18
- # Warning: The evaluation server uses a different model and tokenizer than the template. Do not hard-code Token IDs. Use self.tokenizer.get_vocab() or self.tokenizer.encode() to find the IDs relevant to the current model.
 
19
 
20
- def __call__(self, prompt, max_tokens=30):
21
- # Tokenize input prompt:
22
- # input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
 
 
 
 
23
 
24
- # Generate tokens manually, one step at a time:
25
- # (The bulk of the logic goes here)
26
- # Hint: generating a single answer may not be enough!
27
-
28
- # Decode output tokens to string and return
29
- # return tokenizer.decode(generated, skip_special_tokens=True)
30
- pass
31
 
32
 
33
  # --- EXERCISE 2: The Toulouse Sequence ---
34
  class ToulouseSequence:
35
  """
36
- Generate text without ever using the word 'Toulouse'.
37
- For this, you must use model() directly: model(input_ids) yields logits.
38
- You need to manually adjust the logits. It is more difficult here because
39
- 'Toulouse' is a multi-token word.
40
  REQUIREMENT: Do NOT use model.generate().
 
 
 
 
 
 
 
 
 
 
 
41
  """
42
- def __init__(self, model, tokenizer):
43
  self.model = model
44
  self.tokenizer = tokenizer
45
- # Here you want to pre-calculate forbidden token IDs
46
- # Hint:
47
- # print(tokenizer.encode("Toulouse", add_special_tokens=False))
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
- def __call__(self, prompt, max_tokens=30):
50
- # Tokenize input prompt:
51
- # input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
52
 
53
- # Generate tokens manually, one step at a time:
54
- # (The bulk of the logic goes here)
55
- # Hint: you need to track partial matches of the forbidden word
56
-
57
- # Decode output tokens to string and return
58
- # return tokenizer.decode(generated, skip_special_tokens=True)
59
- pass
60
-
61
  if __name__ == "__main__":
62
- MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 
 
 
63
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
64
- model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float16, device_map="auto")
65
- la_disparition_generator = LaDisparition(model, tokenizer)
66
- print("Ex 1 (No 'e'):", la_disparition_generator("Describe a cat."))
67
- toulouse_sequence_generator = ToulouseSequence(model, tokenizer)
68
- print("Ex 2 (No 'Toulouse'):", toulouse_sequence_generator("The pink city in France is"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  from transformers import AutoModelForCausalLM, AutoTokenizer
3
 
4
# --- EXERCISE 1: La disparition (No 'e' or 'E') ---
class LaDisparition:
    """
    Generate text without ever using the letter 'e' or 'E'.

    You must use model() directly: model(input_ids) yields logits.
    You need to manually adjust the logits to forbid tokens containing 'e' or 'E'.

    REQUIREMENT: Do NOT use model.generate().

    Hints:
    - In __init__, pre-compute the set of forbidden token IDs by checking
      which tokens in the vocabulary decode to strings containing 'e' or 'E'.
    - In __call__, implement a token-by-token generation loop:
      1. Feed the current sequence to the model to get logits
      2. Mask out forbidden tokens (set their logits to -inf)
      3. Pick the next token (greedy: argmax, or use beam search for better quality)
      4. Append and repeat
    - Return only the generated text (not the prompt).
    """

    def __init__(self, model: "AutoModelForCausalLM", tokenizer: "AutoTokenizer"):
        # Keep references to the backing model/tokenizer for the generation loop.
        self.model = model
        self.tokenizer = tokenizer

        # TODO: Pre-calculate forbidden token IDs
        # Hint: also consider forbidding non-ASCII tokens that might hide the letter 'e'.
        # YOUR CODE HERE

    def __call__(self, prompt, max_tokens=20):
        # Wrap the raw prompt in the model's chat template so instruct models
        # see a properly formatted conversation turn.
        message = [{"role": "user", "content": prompt}]
        input_ids = self.tokenizer.apply_chat_template(
            message, add_generation_prompt=True, return_tensors="pt"
        ).to(self.model.device)
        # Remember where the prompt ends so only generated tokens are returned.
        prompt_len = input_ids.shape[1]

        # TODO: Implement constrained generation loop
        # return only the generated text (after the prompt).

        # YOUR CODE HERE

        raise NotImplementedError("Implement constrained generation without 'e'")
 
46
 
47
 
48
# --- EXERCISE 2: The Toulouse Sequence ---
class ToulouseSequence:
    """
    Generate text without ever producing the word 'Toulouse'.

    You must use model() directly: model(input_ids) yields logits.

    REQUIREMENT: Do NOT use model.generate().

    This is harder than Exercise 1 because 'Toulouse' spans multiple tokens.
    You need to track what has been generated so far and forbid any token
    that would create a prefix of 'Toulouse' (of length >= 4).

    Hints:
    - Track the current "word prefix" (the suffix of generated text since the
      last non-alphabetical character).
    - For each candidate next token, check if appending it would create a
      string that is a prefix of 'Toulouse' (case-insensitive) with length >= 4.
    - If so, mask that token out.
    """

    def __init__(self, model: "AutoModelForCausalLM", tokenizer: "AutoTokenizer"):
        self.model = model
        self.tokenizer = tokenizer
        # The constraint target and the threshold at which partial matches
        # start being blocked ("To"/"Tou" remain legal, "Toul"+ does not).
        self.forbidden_word = "Toulouse"
        self.min_prefix_len = 4  # Only start blocking at 4+ chars (to allow "To", "Tou")

    def __call__(self, prompt, max_tokens=20):
        # Format the prompt as a chat turn before tokenizing.
        message = [{"role": "user", "content": prompt}]
        inputs = self.tokenizer.apply_chat_template(
            message, add_generation_prompt=True, return_tensors="pt"
        ).to(self.model.device)
        # Length of the prompt portion — used to slice off the prompt later.
        prompt_length = inputs.shape[1]

        # TODO: Implement constrained generation loop
        # Return only the generated text (after the prompt).
        # YOUR CODE HERE

        raise NotImplementedError("Implement constrained generation without 'Toulouse'")
87
 
 
 
 
88
 
 
 
 
 
 
 
 
 
89
if __name__ == "__main__":
    # NOTE: This block is for local testing only.
    # The evaluation server provides model and tokenizer.
    # You can use any small model for testing, e.g.:
    MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, dtype=torch.float16, device_map="auto"
    )

    # Exercise 1: the generated text must contain no letter 'e' at all.
    print("=== Exercise 1: La Disparition (no 'e') ===")
    disparition = LaDisparition(model, tokenizer)
    output = disparition("Who is the king of the jungle?")
    print(f"Result: {output}")
    violated = 'e' in output.lower()
    print(f"Contains 'e': {violated} {'✗ FAIL' if violated else '✓ PASS'}")

    # Exercise 2: the generated text must never spell out 'Toulouse'.
    print("\n=== Exercise 2: No Toulouse ===")
    toulouse = ToulouseSequence(model, tokenizer)
    output = toulouse("Where is the headquarters of Airbus located?")
    print(f"Result: {output}")
    violated = 'toulouse' in output.lower()
    print(f"Contains 'Toulouse': {violated} {'✗ FAIL' if violated else '✓ PASS'}")