Spaces:

LLM-course
/

lipogram-challenge-submission

Sleeping

App Files Community

nathanael-fijalkow committed on Feb 17

Commit

4b37626

1 Parent(s): c46e600

Updated to use logprob scores

Browse files

Files changed (2) hide show

app.py +17 -9
challenge.py +85 -42

app.py CHANGED Viewed

@@ -136,38 +136,46 @@ def submit_challenge(file, request: gr.Request):
         # Parse the result from the evaluator
         ex1_score = 0
         ex2_score = 0
         ex1_status = "Not evaluated"
         ex2_status = "Not evaluated"
         try:
             import re
-            # Parse Ex 1 - look for pattern "**Ex 1 (No 'e'):** X/5 correct"
             if "Ex 1" in result_text:
                 if "Ex 1" in result_text and "TIMEOUT" in result_text.split("Ex 2")[0]:
                     ex1_status = "TIMEOUT"
                 elif "Ex 1 Error" in result_text:
                     ex1_status = "ERROR"
                 else:
-                    # Match format: **Ex 1 (No 'e'):** X/5 correct
-                    ex1_match = re.search(r'Ex 1[^:]*:\*?\*?\s*(\d+)/5', result_text)
                     if ex1_match:
                         ex1_score = int(ex1_match.group(1))
-                        ex1_status = f"{ex1_score}/5"
-            # Parse Ex 2 - look for pattern "**Ex 2 (No Toulouse):** X/5 correct"
             if "Ex 2" in result_text:
                 if "Ex 2" in result_text and "TIMEOUT" in result_text.split("Ex 2")[1]:
                     ex2_status = "TIMEOUT"
                 elif "Ex 2 Error" in result_text:
                     ex2_status = "ERROR"
                 else:
-                    # Match format: **Ex 2 (No Toulouse):** X/5 correct
-                    ex2_match = re.search(r'Ex 2[^:]*:\*?\*?\s*(\d+)/5', result_text)
                     if ex2_match:
                         ex2_score = int(ex2_match.group(1))
-                        ex2_status = f"{ex2_score}/5"
-            total_score = ex1_score + ex2_score  # Out of 10
         except Exception as e:
             # If parsing fails, try to extract what we can from the text
             total_score = 0

         # Parse the result from the evaluator
         ex1_score = 0
         ex2_score = 0
+        ex1_quality = 0.0
+        ex2_quality = 0.0
         ex1_status = "Not evaluated"
         ex2_status = "Not evaluated"
         try:
             import re
+            # Parse Ex 1 - look for pattern "**Ex 1 (No 'e'):** X/5 correct | Quality: X%"
             if "Ex 1" in result_text:
                 if "Ex 1" in result_text and "TIMEOUT" in result_text.split("Ex 2")[0]:
                     ex1_status = "TIMEOUT"
                 elif "Ex 1 Error" in result_text:
                     ex1_status = "ERROR"
                 else:
+                    # Match format: **Ex 1 (No 'e'):** X/5 correct | Quality: X%
+                    ex1_match = re.search(r'Ex 1[^:]*:\*?\*?\s*(\d+)/5\s*correct\s*\|\s*Quality:\s*(\d+)%', result_text)
                     if ex1_match:
                         ex1_score = int(ex1_match.group(1))
+                        ex1_quality = int(ex1_match.group(2)) / 100.0
+                        ex1_status = f"{ex1_score}/5 ({ex1_match.group(2)}%)"
+            # Parse Ex 2 - look for pattern "**Ex 2 (No Toulouse):** X/5 correct | Quality: X%"
             if "Ex 2" in result_text:
                 if "Ex 2" in result_text and "TIMEOUT" in result_text.split("Ex 2")[1]:
                     ex2_status = "TIMEOUT"
                 elif "Ex 2 Error" in result_text:
                     ex2_status = "ERROR"
                 else:
+                    # Match format: **Ex 2 (No Toulouse):** X/5 correct | Quality: X%
+                    ex2_match = re.search(r'Ex 2[^:]*:\*?\*?\s*(\d+)/5\s*correct\s*\|\s*Quality:\s*(\d+)%', result_text)
                     if ex2_match:
                         ex2_score = int(ex2_match.group(1))
+                        ex2_quality = int(ex2_match.group(2)) / 100.0
+                        ex2_status = f"{ex2_score}/5 ({ex2_match.group(2)}%)"
+            # Total score: 50% correctness + 50% quality, out of 10
+            correctness_part = (ex1_score + ex2_score) / 2.0  # 0-5
+            avg_quality = (ex1_quality + ex2_quality) / 2.0
+            quality_part = avg_quality * 5  # 0-5
+            total_score = round(correctness_part + quality_part, 2)  # 0-10
         except Exception as e:
             # If parsing fails, try to extract what we can from the text
             total_score = 0

challenge.py CHANGED Viewed

@@ -1,68 +1,111 @@
-from typing import Any
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
-# --- EXERCISE 1: La disparition (No 'e' or 'E) ---
 class LaDisparition:
     """
     Generate text without ever using the letter 'e' or 'E'.
-    For this, you must use model() directly: model(input_ids) yields logits.
     You need to manually adjust the logits to forbid tokens containing 'e' or 'E'.
     REQUIREMENT: Do NOT use model.generate().
     """
-    def __init__(self, model, tokenizer):
         self.model = model
         self.tokenizer = tokenizer
-        # Here you want to pre-calculate forbidden token IDs
-        # Warning: The evaluation server uses a different model and tokenizer than the template. Do not hard-code Token IDs. Use self.tokenizer.get_vocab() or self.tokenizer.encode() to find the IDs relevant to the current model.
-    def __call__(self, prompt, max_tokens=30):
-        # Tokenize input prompt:
-        # input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
-        # Generate tokens manually, one step at a time:
-        # (The bulk of the logic goes here)
-        # Hint: generating a single answer may not be enough!
-        # Decode output tokens to string and return
-        # return tokenizer.decode(generated, skip_special_tokens=True)
-        pass
 # --- EXERCISE 2: The Toulouse Sequence ---
 class ToulouseSequence:
     """
-    Generate text without ever using the word 'Toulouse'.
-    For this, you must use model() directly: model(input_ids) yields logits.
-    You need to manually adjust the logits. It is more difficult here because
-    'Toulouse' is a multi-token word.
     REQUIREMENT: Do NOT use model.generate().
     """
-    def __init__(self, model, tokenizer):
         self.model = model
         self.tokenizer = tokenizer
-        # Here you want to pre-calculate forbidden token IDs
-        # Hint:
-        # print(tokenizer.encode("Toulouse", add_special_tokens=False))
-    def __call__(self, prompt, max_tokens=30):
-        # Tokenize input prompt:
-        # input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
-        # Generate tokens manually, one step at a time:
-        # (The bulk of the logic goes here)
-        # Hint: you need to track partial matches of the forbidden word
-        # Decode output tokens to string and return
-        # return tokenizer.decode(generated, skip_special_tokens=True)
-        pass
 if __name__ == "__main__":
-    MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float16, device_map="auto")
-    la_disparition_generator = LaDisparition(model, tokenizer)
-    print("Ex 1 (No 'e'):", la_disparition_generator("Describe a cat."))
-    toulouse_sequence_generator = ToulouseSequence(model, tokenizer)
-    print("Ex 2 (No 'Toulouse'):", toulouse_sequence_generator("The pink city in France is"))

 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+# --- EXERCISE 1: La disparition (No 'e' or 'E') ---
 class LaDisparition:
     """
     Generate text without ever using the letter 'e' or 'E'.
+    You must use model() directly: model(input_ids) yields logits.
     You need to manually adjust the logits to forbid tokens containing 'e' or 'E'.
     REQUIREMENT: Do NOT use model.generate().
+    Hints:
+    - In __init__, pre-compute the set of forbidden token IDs by checking
+      which tokens in the vocabulary decode to strings containing 'e' or 'E'.
+    - In __call__, implement a token-by-token generation loop:
+      1. Feed the current sequence to the model to get logits
+      2. Mask out forbidden tokens (set their logits to -inf)
+      3. Pick the next token (greedy: argmax, or use beam search for better quality)
+      4. Append and repeat
+    - Return only the generated text (not the prompt).
     """
+    def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer):
         self.model = model
         self.tokenizer = tokenizer
+        # TODO: Pre-calculate forbidden token IDs
+        # Hint: also consider forbidding non-ASCII tokens that might hide the letter 'e'.
+        # YOUR CODE HERE
+    def __call__(self, prompt, max_tokens=20):
+        # Tokenize the prompt using the chat template
+        message = [{"role": "user", "content": prompt}]
+        input_ids = self.tokenizer.apply_chat_template(
+            message, add_generation_prompt=True, return_tensors="pt"
+        ).to(self.model.device)
+        prompt_len = input_ids.shape[1]
+        # TODO: Implement constrained generation loop
+        # return only the generated text (after the prompt).
+        # YOUR CODE HERE
+        raise NotImplementedError("Implement constrained generation without 'e'")
 # --- EXERCISE 2: The Toulouse Sequence ---
 class ToulouseSequence:
     """
+    Generate text without ever producing the word 'Toulouse'.
+    You must use model() directly: model(input_ids) yields logits.
     REQUIREMENT: Do NOT use model.generate().
+    This is harder than Exercise 1 because 'Toulouse' spans multiple tokens.
+    You need to track what has been generated so far and forbid any token
+    that would create a prefix of 'Toulouse' (of length >= 4).
+    Hints:
+    - Track the current "word prefix" (the suffix of generated text since the
+      last non-alphabetical character).
+    - For each candidate next token, check if appending it would create a
+      string that is a prefix of 'Toulouse' (case-insensitive) with length >= 4.
+    - If so, mask that token out.
     """
+    def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer):
         self.model = model
         self.tokenizer = tokenizer
+        self.forbidden_word = "Toulouse"
+        self.min_prefix_len = 4  # Only start blocking at 4+ chars (to allow "To", "Tou")
+    def __call__(self, prompt, max_tokens=20):
+        # Tokenize the prompt using the chat template
+        message = [{"role": "user", "content": prompt}]
+        inputs = self.tokenizer.apply_chat_template(
+            message, add_generation_prompt=True, return_tensors="pt"
+        ).to(self.model.device)
+        prompt_length = inputs.shape[1]
+        # TODO: Implement constrained generation loop
+        # Return only the generated text (after the prompt).
+        # YOUR CODE HERE
+        raise NotImplementedError("Implement constrained generation without 'Toulouse'")
 if __name__ == "__main__":
+    # NOTE: This block is for local testing only.
+    # The evaluation server provides model and tokenizer.
+    # You can use any small model for testing, e.g.:
+    MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME, dtype=torch.float16, device_map="auto"
+    )
+    print("=== Exercise 1: La Disparition (no 'e') ===")
+    ex1 = LaDisparition(model, tokenizer)
+    result = ex1("Who is the king of the jungle?")
+    print(f"Result: {result}")
+    has_e = 'e' in result.lower()
+    print(f"Contains 'e': {has_e} {'✗ FAIL' if has_e else '✓ PASS'}")
+    print("\n=== Exercise 2: No Toulouse ===")
+    ex2 = ToulouseSequence(model, tokenizer)
+    result = ex2("Where is the headquarters of Airbus located?")
+    print(f"Result: {result}")
+    has_toulouse = 'toulouse' in result.lower()
+    print(f"Contains 'Toulouse': {has_toulouse} {'✗ FAIL' if has_toulouse else '✓ PASS'}")