Spaces:

LLM-course
/

lipogram-challenge-submission

Sleeping

App Files Files Community

lipogram-challenge-submission / app.py

nathanael-fijalkow

Updated to use logprob scores

4b37626 2 months ago

raw

history blame contribute delete

9.41 kB

	import json
	import base64
	import gradio as gr
	from gradio_client import Client, handle_file
	from huggingface_hub import hf_hub_download, upload_file
	import os
	import time
	import pandas as pd
	from datetime import datetime

	# --- CONFIGURATION ---
	# Space that runs the evaluator; submit_challenge calls it through gradio_client.
	PRIVATE_SPACE_ID = "LLM-course/lipogram_private"
	# Dataset repo holding leaderboard.csv (read by sync_leaderboard, written by save_score).
	DATASET_REPO_ID = "LLM-course/leaderboard-lipogram"
	# Hub access token taken from the environment; None when HF_TOKEN is not set.
	HF_TOKEN = os.environ.get("HF_TOKEN")
	# Local file the leaderboard is written to before save_score uploads it.
	LOCAL_CSV = "leaderboard.csv"

	def extract_username_from_session(session_cookie):
	    """Return the username stored in a session cookie, or None.

	    The cookie is split on '.'; its first segment is decoded as URL-safe
	    base64 JSON (missing '=' padding is restored first) and the value at
	    ``oauth_info -> userinfo -> preferred_username`` is returned.

	    Only the payload segment is decoded; the remaining segments are never
	    inspected here.

	    Returns None when the cookie is empty, has fewer than two segments,
	    cannot be decoded, or does not contain the expected keys.
	    """
	    try:
	        if not session_cookie:
	            return None

	        segments = session_cookie.split('.')
	        if len(segments) < 2:
	            return None

	        # base64 needs a length that is a multiple of 4; re-add stripped '='.
	        encoded = segments[0]
	        encoded += '=' * (-len(encoded) % 4)

	        # Walk the nested mapping one key at a time, bailing out on any gap.
	        node = json.loads(base64.urlsafe_b64decode(encoded))
	        for key in ('oauth_info', 'userinfo', 'preferred_username'):
	            if not isinstance(node, dict) or key not in node:
	                return None
	            node = node[key]
	        return node
	    except Exception:
	        # Any malformed input (bad type, bad base64, bad JSON) means "no user".
	        return None

	# --- DATABASE HELPERS ---
	def sync_leaderboard():
	    """Fetch leaderboard.csv from the dataset repo, best score first.

	    Returns:
	        A DataFrame sorted by "Score" in descending order. If anything in
	        the download, parse or sort step raises, an empty DataFrame with
	        the leaderboard columns is returned instead of propagating.
	    """
	    fallback_columns = ["Timestamp", "User", "Score", "Ex 1", "Ex 2"]
	    try:
	        download_args = dict(
	            repo_id=DATASET_REPO_ID,
	            filename="leaderboard.csv",
	            repo_type="dataset",
	            token=HF_TOKEN,
	        )
	        board = pd.read_csv(hf_hub_download(**download_args))
	        board = board.sort_values(by="Score", ascending=False)
	    except Exception:
	        # Covers "file doesn't exist yet" as well as every other failure.
	        board = pd.DataFrame(columns=fallback_columns)
	    return board

	def save_score(user, score, ex1, ex2):
	    """Record a submission, keeping only each user's best score.

	    The current leaderboard is downloaded, the user's row is updated (when
	    the new score is strictly better) or appended (new user), and the
	    result is written to LOCAL_CSV and uploaded back to DATASET_REPO_ID.

	    Args:
	        user: Username used as the leaderboard key.
	        score: Total score of this submission.
	        ex1: Status text stored in the "Ex 1" column.
	        ex2: Status text stored in the "Ex 2" column.

	    Returns:
	        The leaderboard DataFrame after the change. When the score is not
	        an improvement it is returned unchanged and nothing is uploaded.

	    Raises:
	        Any download, CSV-parsing or upload error other than "file not
	        found" / "file is empty" propagates to the caller.
	    """
	    # Local import keeps this block self-contained; huggingface_hub is
	    # already a dependency of this file.
	    from huggingface_hub.utils import EntryNotFoundError

	    # Load strictly instead of via sync_leaderboard(): that helper turns
	    # *every* failure into an empty table, and uploading "empty + one row"
	    # after a transient error would overwrite the whole remote leaderboard.
	    try:
	        path = hf_hub_download(
	            repo_id=DATASET_REPO_ID,
	            filename="leaderboard.csv",
	            repo_type="dataset",
	            token=HF_TOKEN
	        )
	        df = pd.read_csv(path).sort_values(by="Score", ascending=False)
	    except (EntryNotFoundError, pd.errors.EmptyDataError):
	        # No leaderboard yet (or a zero-byte file): nothing can be lost.
	        df = pd.DataFrame(columns=["Timestamp", "User", "Score", "Ex 1", "Ex 2"])

	    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
	    is_user = df['User'] == user

	    if is_user.any():
	        # Compare against the user's best recorded score.
	        best_score = df.loc[is_user, 'Score'].max()
	        if not (pd.isna(best_score) or score > best_score):
	            # Not strictly better: leave the leaderboard untouched.
	            return df
	        df.loc[is_user, 'Timestamp'] = timestamp
	        df.loc[is_user, 'Score'] = score
	        df.loc[is_user, 'Ex 1'] = ex1
	        df.loc[is_user, 'Ex 2'] = ex2
	    else:
	        # New user - add to leaderboard
	        new_entry = {
	            "Timestamp": timestamp,
	            "User": user,
	            "Score": score,
	            "Ex 1": ex1,
	            "Ex 2": ex2
	        }
	        df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)

	    df.to_csv(LOCAL_CSV, index=False)

	    # Push the updated file back to the dataset repo
	    upload_file(
	        path_or_fileobj=LOCAL_CSV,
	        path_in_repo="leaderboard.csv",
	        repo_id=DATASET_REPO_ID,
	        repo_type="dataset",
	        token=HF_TOKEN
	    )
	    return df

	# --- MAIN LOGIC ---
	# In-memory rate-limit state: username -> time.time() of that user's last
	# recorded submission (read and written by submit_challenge). As a plain
	# module-level dict it starts empty every time this module is loaded.
	user_last_submission = {}

	def _parse_exercise_report(result_text, exercise, section):
	    """Parse one exercise's result out of the evaluator report.

	    Args:
	        result_text: Full text returned by the evaluator.
	        exercise: Exercise number (1 or 2).
	        section: Slice of the report searched for the word "TIMEOUT".

	    Returns:
	        ``(correct, quality, status)`` where ``correct`` is the X of
	        "X/5 correct", ``quality`` is the percentage as a 0-1 fraction and
	        ``status`` is the text stored on the leaderboard.
	    """
	    import re

	    label = f"Ex {exercise}"
	    if label not in result_text:
	        return 0, 0.0, "Not evaluated"
	    if "TIMEOUT" in section:
	        return 0, 0.0, "TIMEOUT"
	    if f"{label} Error" in result_text:
	        return 0, 0.0, "ERROR"

	    # Expected line: "Ex N (<title>): X/5 correct | Quality: Y%".
	    # Tolerates markdown bold markers and a backslash-escaped pipe.
	    pattern = (
	        re.escape(label)
	        + r"[^:\n]*:[\s*]*(\d+)\s*/\s*5\s*correct\s*\\?\|\s*Quality:[\s*]*(\d+(?:\.\d+)?)\s*%"
	    )
	    match = re.search(pattern, result_text)
	    if match is None:
	        return 0, 0.0, "Not evaluated"

	    correct = int(match.group(1))
	    quality = float(match.group(2)) / 100.0
	    return correct, quality, f"{correct}/5 ({match.group(2)}%)"

	def submit_challenge(file, request: gr.Request):
	    """Evaluate an uploaded challenge file and record the score.

	    Identifies the user from the 'session' cookie, enforces one submission
	    per 10 minutes per user, sends the file to the evaluator Space, parses
	    the report into a 0-10 score (half correctness, half quality) and
	    saves it through save_score.

	    Args:
	        file: Uploaded file from the gr.File input (None if nothing uploaded).
	        request: Gradio request; only its cookies are read.

	    Returns:
	        ``(report_text, leaderboard_dataframe)``. If evaluation or saving
	        fails, the text is "Error: ..." and the leaderboard is re-fetched.

	    Raises:
	        gr.Error: Not signed in, rate-limited, or no file uploaded.
	    """
	    # NOTE(review): extract_username_from_session only base64-decodes the
	    # cookie payload and never checks a signature, so this identity looks
	    # client-controllable. Confirm, and consider Gradio's server-side OAuth
	    # profile instead.
	    session_cookie = request.cookies.get('session')
	    user_name = extract_username_from_session(session_cookie)

	    if not user_name:
	        raise gr.Error("Please 'Sign in with Hugging Face' at the top of the page to submit.")

	    user_key = user_name  # Use username as the unique key

	    # Rate Limiting (10 mins)
	    last_submission = user_last_submission.get(user_key)
	    if last_submission is not None and (time.time() - last_submission) < 600:
	        raise gr.Error("One submission every 10 minutes allowed.")

	    if file is None:
	        raise gr.Error("Please upload a file.")

	    gr.Info(f"Hello {user_name}, sending your code to the evaluator...")

	    try:
	        client = Client(PRIVATE_SPACE_ID, token=HF_TOKEN)
	        result_text = client.predict(file_obj=handle_file(file.name), api_name="/predict")

	        try:
	            # Ex 1's TIMEOUT is searched before the first "Ex 2" marker,
	            # Ex 2's right after it.
	            sections = result_text.split("Ex 2")
	            ex2_section = sections[1] if len(sections) > 1 else ""
	            ex1_score, ex1_quality, ex1_status = _parse_exercise_report(
	                result_text, 1, sections[0]
	            )
	            ex2_score, ex2_quality, ex2_status = _parse_exercise_report(
	                result_text, 2, ex2_section
	            )

	            # Total score: 50% correctness + 50% quality, out of 10
	            correctness_part = (ex1_score + ex2_score) / 2.0  # 0-5
	            quality_part = (ex1_quality + ex2_quality) / 2.0 * 5  # 0-5
	            total_score = round(correctness_part + quality_part, 2)  # 0-10
	        except Exception as e:
	            # Unparseable report: record a zero rather than losing the attempt.
	            total_score = 0
	            ex1_status = f"Parse error: {str(e)}"
	            ex2_status = "Parse error"

	        # Save to the dataset-backed leaderboard
	        updated_df = save_score(user_name, total_score, ex1_status, ex2_status)
	        # The cooldown starts only once a submission was actually recorded.
	        user_last_submission[user_key] = time.time()

	        return result_text, updated_df.sort_values(by="Score", ascending=False)

	    except gr.Error:
	        # Re-raise Gradio errors so they display properly
	        raise
	    except Exception as e:
	        return f"Error: {str(e)}", sync_leaderboard()

	# --- UPDATED UI ---
	with gr.Blocks() as demo:
	    gr.Markdown("# LLM Lipogram Challenge Portal")

	    # Login button; submit_challenge rejects users it cannot identify.
	    gr.LoginButton()

	    # Hidden Markdown placeholder; no event in this file updates it.
	    progress_status = gr.Markdown("", visible=False)

	    with gr.Tabs():
	        # Tab 1: instructions and the starter template.
	        with gr.TabItem("Download the Template"):
	            gr.Markdown("## Exercise Instructions\n\n### Exercise 1: La disparition (No 'e' or 'E')\nGenerate text without ever using the letter 'e' or 'E'. For this, you must use `model()` directly: `model(input_ids)` yields logits. You need to manually adjust the logits to forbid tokens containing 'e' or 'E'. REQUIREMENT: Do NOT use model.generate().\n\n### Exercise 2: The Toulouse Sequence\nGenerate text without ever using the word 'Toulouse'. For this, you must use `model()` directly: `model(input_ids)` yields logits. You need to manually adjust the logits. It is more difficult here because 'Toulouse' is a multi-token word. REQUIREMENT: Do NOT use model.generate().\n\nDownload the `challenge.py` template below to get started:\n")
	            gr.File(value="challenge.py", label="Download Template", interactive=False)

	        # Tab 2: upload and evaluate a solution.
	        with gr.TabItem("Submit"):
	            gr.Markdown("### 1. Sign in above\n### 2. Upload challenge.py below")
	            file_input = gr.File(label="challenge.py")
	            submit_btn = gr.Button("Evaluate My Code", variant="primary")
	            output_text = gr.Markdown()

	        # Tab 3: leaderboard; passing the function (not its result) lets
	        # Gradio call sync_leaderboard to obtain the initial value.
	        with gr.TabItem("Leaderboard"):
	            leaderboard_df = gr.DataFrame(value=sync_leaderboard, interactive=False)

	    # Event listeners must be registered inside the Blocks context.
	    submit_btn.click(
	        fn=submit_challenge,
	        inputs=file_input,
	        outputs=[output_text, leaderboard_df],
	        show_progress="hidden"
	    )

	# Launch only when run as a script, so importing this module (tests,
	# reload tooling) does not start a server as a side effect.
	if __name__ == "__main__":
	    demo.launch(theme=gr.themes.Soft())