# Last commit: nathanael-fijalkow, "Updated to use logprob scores" (4b37626)
import base64
import json
import logging
import os
import re
import time
from datetime import datetime

import gradio as gr
import pandas as pd
from gradio_client import Client, handle_file
from huggingface_hub import hf_hub_download, upload_file
# --- CONFIGURATION ---
# Space that receives the uploaded file and returns the evaluation report.
PRIVATE_SPACE_ID = "LLM-course/lipogram_private"

# Dataset repository that stores leaderboard.csv.
DATASET_REPO_ID = "LLM-course/leaderboard-lipogram"

# Access token passed to the Space client and to the Hub download/upload calls;
# None when the HF_TOKEN environment variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# Local file the leaderboard is written to before it is uploaded.
LOCAL_CSV = "leaderboard.csv"
def extract_username_from_session(session_cookie):
    """Return the ``preferred_username`` stored in a session cookie, or None.

    The cookie is treated as dot-separated segments whose first segment is
    URL-safe base64 (padding optional) wrapping a JSON object of the form
    ``{"oauth_info": {"userinfo": {"preferred_username": ...}}}``.

    SECURITY NOTE(review): the payload is only decoded, not decrypted, and the
    remaining cookie segments are never checked here. A client can therefore
    craft a cookie carrying any username. Confirm that the value is validated
    elsewhere, or switch to a server-verified identity, before trusting it
    for anything beyond display.

    Args:
        session_cookie: Raw cookie value, or None when the cookie is absent.

    Returns:
        The username as a non-empty string, or None when the cookie is
        missing, malformed, or does not contain a string username.
    """
    if not session_cookie or not isinstance(session_cookie, str):
        return None

    # At least two dot-separated segments are required; only the first is read.
    payload, separator, _ = session_cookie.partition('.')
    if not separator:
        return None

    # Restore the '=' padding that may have been stripped from the payload.
    payload += '=' * (-len(payload) % 4)

    try:
        data = json.loads(base64.urlsafe_b64decode(payload))
        username = data['oauth_info']['userinfo']['preferred_username']
    except (ValueError, KeyError, TypeError, RecursionError):
        # ValueError covers bad base64, bad UTF-8 and bad JSON; KeyError and
        # TypeError cover an unexpected JSON shape; RecursionError covers
        # pathologically nested JSON. The cookie is client-controlled, so
        # none of these may escape.
        return None

    # Callers use the result as a dict key and in messages: only accept a
    # real, non-empty string.
    if isinstance(username, str) and username:
        return username
    return None
# --- DATABASE HELPERS ---
def sync_leaderboard():
    """Download leaderboard.csv from the dataset repo and return it sorted.

    Returns:
        A DataFrame sorted by "Score" in descending order. If anything fails
        (file not uploaded yet, download error, unreadable CSV, missing
        "Score" column), an empty DataFrame with the columns
        Timestamp / User / Score / Ex 1 / Ex 2 is returned instead and the
        failure is logged as a warning.
    """
    try:
        csv_path = hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename="leaderboard.csv",
            repo_type="dataset",
            token=HF_TOKEN,
        )
        board = pd.read_csv(csv_path)
        return board.sort_values(by="Score", ascending=False)
    except Exception:
        # Deliberately best-effort: callers always get a DataFrame. The
        # failure is logged so that a download/auth problem is not silently
        # mistaken for "no leaderboard yet".
        logging.getLogger(__name__).warning(
            "Could not load the leaderboard; using an empty one instead",
            exc_info=True,
        )
        return pd.DataFrame(columns=["Timestamp", "User", "Score", "Ex 1", "Ex 2"])
def save_score(user, score, ex1, ex2):
    """Record a result for *user*, keeping only that user's best score.

    The current leaderboard is fetched with sync_leaderboard(). A user who is
    already listed is updated only when *score* is strictly greater than the
    stored one; otherwise nothing is written or uploaded. A user who is not
    listed yet gets a new row. After a change the table is written to
    LOCAL_CSV and uploaded to the dataset repo as leaderboard.csv.

    NOTE(review): sync_leaderboard() returns an empty table on any failure,
    so a failed download followed by a successful upload here would replace
    the stored leaderboard with a single row. Confirm whether that is
    acceptable.

    Args:
        user: Username the row is keyed on.
        score: Total score of this submission.
        ex1: Status text stored in the "Ex 1" column.
        ex2: Status text stored in the "Ex 2" column.

    Returns:
        The leaderboard DataFrame after the (possible) change.
    """
    board = sync_leaderboard()
    is_user = board['User'] == user

    if is_user.any():
        # Compare against the first row recorded for this user.
        stored_score = board[is_user].iloc[0]['Score']
        if not score > stored_score:
            # Not an improvement: leave the stored leaderboard untouched.
            return board
        changes = {
            'Timestamp': datetime.now().strftime("%Y-%m-%d %H:%M"),
            'Score': score,
            'Ex 1': ex1,
            'Ex 2': ex2,
        }
        for column, value in changes.items():
            board.loc[is_user, column] = value
    else:
        # First submission by this user: append a new row.
        fresh_row = pd.DataFrame([{
            "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
            "User": user,
            "Score": score,
            "Ex 1": ex1,
            "Ex 2": ex2,
        }])
        board = pd.concat([board, fresh_row], ignore_index=True)

    # Persist locally, then push the file back to the dataset repo.
    board.to_csv(LOCAL_CSV, index=False)
    upload_file(
        path_or_fileobj=LOCAL_CSV,
        path_in_repo="leaderboard.csv",
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    return board
# --- MAIN LOGIC ---
# Minimum number of seconds between two recorded submissions by one user.
SUBMISSION_COOLDOWN_SECONDS = 600

# username -> time.time() of that user's last recorded submission.
# Held in process memory only, so it is empty again after a restart.
user_last_submission = {}


def _parse_exercise(result_text, number):
    """Extract one exercise's result from the evaluator report.

    Args:
        result_text: Full report text returned by the evaluator.
        number: Exercise number, 1 or 2.

    Returns:
        A ``(correct, quality, status)`` tuple: ``correct`` is the X of
        "X/5 correct", ``quality`` is the percentage divided by 100, and
        ``status`` is the text stored on the leaderboard ("Not evaluated",
        "TIMEOUT", "ERROR" or "X/5 (Y%)").
    """
    label = f"Ex {number}"
    if label not in result_text:
        return 0, 0.0, "Not evaluated"

    # TIMEOUT is looked for only in this exercise's part of the report:
    # everything before the first "Ex 2" for exercise 1, and the text between
    # the first and the next "Ex 2" for exercise 2.
    sections = result_text.split("Ex 2")
    timeout_scope = sections[0] if number == 1 else sections[1]
    if "TIMEOUT" in timeout_scope:
        return 0, 0.0, "TIMEOUT"
    if f"{label} Error" in result_text:
        return 0, 0.0, "ERROR"

    # Expected line: **Ex N (...):** X/5 correct | Quality: Y%
    # Y may be an integer or a decimal percentage.
    match = re.search(
        rf'{label}[^:]*:\*?\*?\s*(\d+)/5\s*correct\s*\|\s*Quality:\s*(\d+(?:\.\d+)?)%',
        result_text,
    )
    if match is None:
        return 0, 0.0, "Not evaluated"

    correct = int(match.group(1))
    quality = float(match.group(2)) / 100.0
    return correct, quality, f"{correct}/5 ({match.group(2)}%)"


def _score_report(result_text):
    """Return ``(total_score, ex1_status, ex2_status)`` for a report.

    The total is out of 10: half from correctness (mean of the two X/5
    counts) and half from quality (mean of the two quality ratios times 5).
    """
    ex1_correct, ex1_quality, ex1_status = _parse_exercise(result_text, 1)
    ex2_correct, ex2_quality, ex2_status = _parse_exercise(result_text, 2)
    correctness_part = (ex1_correct + ex2_correct) / 2.0  # 0-5
    avg_quality = (ex1_quality + ex2_quality) / 2.0
    quality_part = avg_quality * 5  # 0-5
    total_score = round(correctness_part + quality_part, 2)  # 0-10
    return total_score, ex1_status, ex2_status


def submit_challenge(file, request: gr.Request):
    """Evaluate an uploaded file and record the result on the leaderboard.

    The username is read from the 'session' cookie, the per-user cooldown is
    enforced, the file is sent to the evaluator Space, the returned report is
    parsed into a score, and the score is saved with save_score().

    SECURITY NOTE(review): the username comes from
    extract_username_from_session(), which decodes the cookie without
    verifying it. Confirm that this identity can be trusted for leaderboard
    entries.

    Args:
        file: Uploaded file object (its ``.name`` path is forwarded to the
            evaluator), or None when nothing was uploaded.
        request: The Gradio request; only its cookies are read.

    Returns:
        ``(report_text, leaderboard)`` where leaderboard is sorted by "Score"
        in descending order. If evaluation or saving fails unexpectedly,
        ``("Error: ...", sync_leaderboard())`` is returned instead.

    Raises:
        gr.Error: if the user is not signed in, is still within the
            cooldown, or did not upload a file.
    """
    session_cookie = request.cookies.get('session')
    user_name = extract_username_from_session(session_cookie)
    if not user_name:
        raise gr.Error("Please 'Sign in with Hugging Face' at the top of the page to submit.")

    # Rate limiting, keyed on the username.
    last_submission = user_last_submission.get(user_name)
    if last_submission is not None and (time.time() - last_submission) < SUBMISSION_COOLDOWN_SECONDS:
        raise gr.Error(f"One submission every {SUBMISSION_COOLDOWN_SECONDS // 60} minutes allowed.")

    if file is None:
        raise gr.Error("Please upload a file.")

    gr.Info(f"Hello {user_name}, sending your code to the evaluator...")
    try:
        client = Client(PRIVATE_SPACE_ID, token=HF_TOKEN)
        result_text = client.predict(file_obj=handle_file(file.name), api_name="/predict")

        try:
            total_score, ex1_status, ex2_status = _score_report(result_text)
        except Exception as e:
            # An unparseable report still counts as a submission, with score 0.
            total_score = 0
            ex1_status = f"Parse error: {str(e)}"
            ex2_status = "Parse error"

        updated_df = save_score(user_name, total_score, ex1_status, ex2_status)
        # The cooldown starts only once the result has been saved, so a
        # failed evaluation or upload can be retried immediately.
        user_last_submission[user_name] = time.time()
        return result_text, updated_df.sort_values(by="Score", ascending=False)
    except gr.Error:
        # Let Gradio errors through so they are displayed properly.
        raise
    except Exception as e:
        return f"Error: {str(e)}", sync_leaderboard()
# --- UPDATED UI ---
# Markdown shown on the "Download the Template" tab.
_INSTRUCTIONS_MD = (
    "## Exercise Instructions\n\n"
    "### Exercise 1: La disparition (No 'e' or 'E')\n"
    "Generate text without ever using the letter 'e' or 'E'. "
    "For this, you must use `model()` directly: `model(input_ids)` yields logits. "
    "You need to manually adjust the logits to forbid tokens containing 'e' or 'E'. "
    "**REQUIREMENT: Do NOT use model.generate().**\n\n"
    "### Exercise 2: The Toulouse Sequence\n"
    "Generate text without ever using the word 'Toulouse'. "
    "For this, you must use `model()` directly: `model(input_ids)` yields logits. "
    "You need to manually adjust the logits. "
    "It is more difficult here because 'Toulouse' is a multi-token word. "
    "**REQUIREMENT: Do NOT use model.generate().**\n\n"
    "Download the `challenge.py` template below to get started:\n"
)

with gr.Blocks() as demo:
    gr.Markdown("# LLM Lipogram Challenge Portal")

    # Shows a login button when the user is not authenticated.
    gr.LoginButton()

    # Hidden placeholder component declared above the tabs.
    progress_status = gr.Markdown("", visible=False)

    with gr.Tabs():
        with gr.TabItem("Download the Template"):
            gr.Markdown(_INSTRUCTIONS_MD)
            gr.File(value="challenge.py", label="Download Template", interactive=False)

        with gr.TabItem("Submit"):
            gr.Markdown("### 1. Sign in above\n### 2. Upload challenge.py below")
            file_input = gr.File(label="challenge.py")
            submit_btn = gr.Button("Evaluate My Code", variant="primary")
            output_text = gr.Markdown()

        with gr.TabItem("Leaderboard"):
            # A callable is passed as the value, so Gradio fetches the table itself.
            leaderboard_df = gr.DataFrame(value=sync_leaderboard, interactive=False)

    # A submission updates both the report text and the leaderboard table.
    submit_btn.click(
        fn=submit_challenge,
        inputs=file_input,
        outputs=[output_text, leaderboard_df],
        show_progress="hidden",
    )

demo.launch(theme=gr.themes.Soft())