Spaces:

roc-hci
/

Turing-Bench-Leaderboard

Running

App Files Files Community

Turing-Bench-Leaderboard / app.py

roc-hci

Update app.py

4a3c092 verified about 6 hours ago

raw

history blame contribute delete

13.2 kB

	import base64
	import html
	from pathlib import Path

	import pandas as pd
	import gradio as gr

	from about import TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_TEXT, DESCRIPTION_TEXT, LEADERBOARD_DETAIL_TEXT
	from theme import build_theme, CUSTOM_CSS
	from utils import load_results, submit_prediction, _format_inline, markdown_to_html, _format_accuracy

	GIT_CLONE_COMMAND = "git clone https://github.com/Masum06/Turing-Bench.git"

	# Read the sponsor logo that ships next to this file and keep it as base64
	# text, so it can be inlined into an HTML ``data:`` URI further down.
	img_path = Path(__file__).parent.joinpath("images", "locke-logo.png")
	b64 = base64.b64encode(img_path.read_bytes()).decode()

	def build_leaderboard_summary(df: pd.DataFrame) -> str:
	    """Render the three summary stat cards for the leaderboard as HTML.

	    The first row of ``df`` is reported as the current leader; no sorting is
	    done here, so the caller decides the ranking order.

	    Args:
	        df: Leaderboard results. The ``Model``, ``Accuracy`` and ``Time``
	            columns are each optional; a placeholder value is shown for any
	            column that is absent.

	    Returns:
	        An HTML fragment containing the submission count, the leader's
	        accuracy and (HTML-escaped) model name, and the most recent
	        parseable ``Time`` value.
	    """
	    if df.empty:
	        return """
	<div class="stats-grid">
	<div class="stat-card">
	<div class="stat-label">Total submissions</div>
	<div class="stat-value">0</div>
	<div class="stat-meta">No scored results yet</div>
	</div>
	<div class="stat-card">
	<div class="stat-label">Top accuracy</div>
	<div class="stat-value">N/A</div>
	<div class="stat-meta">Waiting for evaluations</div>
	</div>
	<div class="stat-card">
	<div class="stat-label">Latest activity</div>
	<div class="stat-value">N/A</div>
	<div class="stat-meta">Refresh once submissions land</div>
	</div>
	</div>
	"""

	    leader = df.iloc[0]

	    # The model name is presumably user-supplied (Submit tab), so escape it
	    # before placing it inside markup to avoid HTML injection.
	    best_model = html.escape(str(leader["Model"])) if "Model" in df.columns else "Unknown"
	    best_accuracy = _format_accuracy(leader["Accuracy"]) if "Accuracy" in df.columns else "N/A"

	    latest_time = "N/A"
	    if "Time" in df.columns:
	        # Unparseable timestamps become NaT and are dropped rather than raising.
	        timestamps = pd.to_datetime(df["Time"], errors="coerce").dropna()
	        if not timestamps.empty:
	            latest_time = timestamps.max().strftime("%b %d, %Y %H:%M")

	    return f"""
	<div class="stats-grid">
	<div class="stat-card">
	<div class="stat-label">Total submissions</div>
	<div class="stat-value">{len(df)}</div>
	<div class="stat-meta">Ranked on the live board</div>
	</div>
	<div class="stat-card">
	<div class="stat-label">Top accuracy</div>
	<div class="stat-value">{best_accuracy}</div>
	<div class="stat-meta">Current leader: {best_model}</div>
	</div>
	<div class="stat-card">
	<div class="stat-label">Latest activity</div>
	<div class="stat-value">{latest_time}</div>
	<div class="stat-meta">Higher accuracy ranks first</div>
	</div>
	</div>
	"""


	def refresh_leaderboard_view():
	    """Reload the results and return them with a freshly rendered summary.

	    Returns:
	        A ``(results, summary_html)`` pair: whatever ``load_results()``
	        returns, and the HTML produced from it by
	        ``build_leaderboard_summary``.
	    """
	    results = load_results()
	    summary_html = build_leaderboard_summary(results)
	    return results, summary_html

	def submit_prediction_html(model_name, predictions_file, profile: gr.OAuthProfile | None):
	    """Submit a predictions file and return the outcome as an HTML status block.

	    Args:
	        model_name: Value of the "Model name" textbox.
	        predictions_file: Value of the predictions file upload component.
	        profile: The logged-in user's OAuth profile, or ``None``. It is not
	            listed in the click handler's ``inputs``; presumably Gradio
	            supplies it based on this type annotation (confirm against the
	            Gradio OAuth docs), so the annotation must stay as written.

	    Returns:
	        The message returned by ``submit_prediction``, converted with
	        ``markdown_to_html`` using the ``html-block status-message`` classes.
	    """
	    message = submit_prediction(model_name, predictions_file, profile)
	    return markdown_to_html(message, "html-block status-message")


	def update_submit_ui(profile: gr.OAuthProfile | None):
	    """Show or hide the submission form according to the login state.

	    Args:
	        profile: The logged-in user's OAuth profile, or ``None`` when nobody
	            is logged in. Callers pass no explicit inputs; presumably Gradio
	            supplies this based on the type annotation (confirm against the
	            Gradio OAuth docs), so the annotation must stay as written.

	    Returns:
	        A pair of ``gr.update`` objects: the first sets the visibility of the
	        submission form, the second sets the (always visible) login status
	        message.
	    """
	    logged_in = profile is not None
	    if logged_in:
	        status_text = f"Logged in as {profile.name}"
	    else:
	        status_text = "Please log in with Hugging Face to submit."

	    return (
	        gr.update(visible=logged_in),
	        gr.update(
	            value=markdown_to_html(status_text, "html-block status-message"),
	            visible=True,
	        ),
	    )


	# Build the Gradio interface: a hero header, three tabs (Leaderboard, About,
	# Submit), the event wiring, and a sponsor credit, then launch the app.
	# NOTE(review): the indentation of this script was lost in this copy, so the
	# nesting of the `with` containers below cannot be read off the layout.
	# Confirm the container hierarchy against the original file before editing.
	with gr.Blocks(theme=build_theme(), css=CUSTOM_CSS, fill_width=True) as demo:
	with gr.Column(elem_classes=["app-shell"]):
	# Hero panel: static badges followed by the title and introduction text.
	with gr.Group(elem_classes=["hero-panel"]):
	gr.HTML(
	"""
	<div class="hero-badges">
	<span class="hero-badge">Live Benchmark</span>
	<span class="hero-badge">Human vs AI Dialogue Detection</span>
	<span class="hero-badge">Ranked by Accuracy</span>
	</div>
	"""
	)
	gr.HTML(markdown_to_html(TITLE, "html-block hero-title"))
	gr.HTML(markdown_to_html(INTRODUCTION_TEXT, "html-block hero-copy"))

	with gr.Tabs(elem_classes=["main-tabs"]):
	# --- Leaderboard tab: toolbar, refresh button, summary cards, table ---
	with gr.Tab("Leaderboard"):

	with gr.Group(elem_classes=["panel-card", "leaderboard-card"]):
	with gr.Row(elem_classes="centered-row"):
	with gr.Column(scale=6, elem_classes="centered-column"):
	gr.HTML(
	f"""
	<div class="table-toolbar">
	<div>
	<div class="table-title">Leaderboard table</div>
	<div class="table-subtitle">
	{LEADERBOARD_DETAIL_TEXT}
	</div>
	</div>
	</div>
	"""
	)
	with gr.Column(scale=2, min_width=180, elem_classes="centered-column"):
	refresh_btn = gr.Button(
	"Refresh standings",
	variant="secondary",
	elem_classes=["refresh-button"],
	)

	# Created empty; filled by refresh_leaderboard_view on page load and on
	# every click of the refresh button (see the handlers below).
	leaderboard_summary = gr.HTML()

	# The table gets an initial value from load_results() when the UI is built.
	leaderboard_df = gr.Dataframe(
	value=load_results(),
	elem_classes=["leaderboard-table"],
	interactive=False,
	)

	refresh_btn.click(
	fn=refresh_leaderboard_view,
	outputs=[leaderboard_df, leaderboard_summary],
	)

	# --- About tab: description, task facts, citation ---
	with gr.Tab("About"):
	with gr.Group(elem_classes=["panel-card"]):
	gr.HTML(markdown_to_html(DESCRIPTION_TEXT, "html-block rich-copy"))

	with gr.Group(elem_classes=["panel-card", "info-grid-card"]):
	gr.HTML(
	"""
	<div class="info-grid">
	<div class="info-item">
	<div class="info-label">Task format</div>
	<div class="info-value">Binary A/B decision</div>
	</div>
	<div class="info-item">
	<div class="info-label">Input</div>
	<div class="info-value">Paired dialogue transcripts</div>
	</div>
	<div class="info-item">
	<div class="info-label">Outcome</div>
	<div class="info-value">Human-human conversation identification</div>
	</div>
	</div>
	"""
	)

	with gr.Group(elem_classes=["panel-card"]):
	gr.HTML("<div class=\"section-heading\">Citation</div>")
	gr.HTML(markdown_to_html(CITATION_BUTTON_TEXT, "html-block rich-copy"))


	# --- Submit tab: workflow overview, login, clone command, upload form, dataset link ---
	with gr.Tab("Submit"):
	with gr.Group(elem_classes=["panel-card"]):
	gr.HTML(
	"""
	<div class="section-kicker">Submission workflow</div>
	<p>To submit your predictions, download the git repository and follow the instructions to run evaluation locally. If you want the dataset directly (no additional infrastructure), click the download link at the bottom of this page.</p>
	<div class="steps-row">
	<div class="step-chip">1. Log in</div>
	<div class="step-chip">2. Clone git repository and run evaluation locally</div>
	<div class="step-chip">3. Upload CSV</div>
	</div>
	"""
	)

	with gr.Row(equal_height=True):
	with gr.Column(scale=5):
	with gr.Group(elem_classes=["panel-card"]):
	gr.HTML(
	"""
	<div class="html-block">
	<h2>Account access</h2>
	<p>Authenticate with Hugging Face to unlock the submission form.</p>
	</div>
	"""
	)
	# Login button plus a status line that starts in the logged-out state;
	# update_submit_ui rewrites the status line later.
	login_btn = gr.LoginButton("Log in with Hugging Face")
	login_status = gr.HTML(
	value=markdown_to_html(
	"Please log in with Hugging Face to submit.",
	"html-block status-message",
	),
	visible=True,
	)

	with gr.Column(scale=4):
	with gr.Group(elem_classes=["panel-card"]):
	gr.HTML(
	"""
	<div class="html-block">
	<h2>Clone repository</h2>
	<p>Use this command to get the evaluation code/infrastructure locally before generating your predictions.</p>
	</div>
	"""
	)
	# Read-only textbox showing the clone command.
	git_command = gr.Textbox(
	label="Git command",
	value=GIT_CLONE_COMMAND,
	interactive=False,
	lines=1,
	elem_classes=["command-box"],
	)

	# The upload form starts hidden; update_submit_ui makes it visible when a
	# profile is present.
	with gr.Column(visible=False, elem_classes=["submission-panel"]) as submission_form:
	with gr.Group(elem_classes=["panel-card"]):
	gr.HTML(
	"""
	<div class="html-block">
	<h2>Upload predictions</h2>
	<p>Submit a CSV containing the required <code>who_is_human</code> column.</p>
	</div>
	"""
	)
	model_name = gr.Textbox(
	label="Model name",
	placeholder="e.g. GPT-4.5, Claude Opus, or your custom system",
	)
	predictions_file = gr.File(label="Predictions file (.csv)")
	submit_btn = gr.Button(
	"Submit predictions",
	variant="primary",
	elem_classes=["submit-button"],
	)
	# Receives the HTML returned by submit_prediction_html.
	submission_status = gr.HTML()

	with gr.Column(scale=3):
	with gr.Group(elem_classes=["panel-card"]):
	gr.HTML(
	"""
	<div class="html-block">
	<h2>Raw dataset</h2>
	<p>If you want to download the raw dataset (no evaluation infrastructure) click the button below.</p>
	</div>
	"""
	)
	gr.DownloadButton(
	label="Download raw dataset",
	value="https://huggingface.co/datasets/roc-hci/Turing-Bench-Questions/resolve/main/turing_bench_public_shuffled.csv",
	)


	# --- Event wiring ---
	# On page load: refresh the table and the summary cards.
	demo.load(
	fn=refresh_leaderboard_view,
	outputs=[leaderboard_df, leaderboard_summary],
	)
	# On page load and on login-button click: sync the form visibility and the
	# status line with the login state. No explicit inputs are passed; the
	# `profile` argument of update_submit_ui is presumably injected by Gradio
	# from its OAuthProfile annotation (TODO confirm).
	demo.load(
	fn=update_submit_ui,
	inputs=None,
	outputs=[submission_form, login_status],
	)
	login_btn.click(
	fn=update_submit_ui,
	inputs=None,
	outputs=[submission_form, login_status],
	)
	# Submit: pass the model name and the uploaded file; the result is shown in
	# submission_status.
	submit_btn.click(
	fn=submit_prediction_html,
	inputs=[model_name, predictions_file],
	outputs=submission_status,
	)

	# Sponsor credit; the logo is embedded as a base64 data URI built from `b64`.
	gr.HTML(
	f"""
	<div class="html-block">
	<p class="p-small">Thanks Locke (https://lockeidentity.com/) for sponsoring part of this research</p>
	<a href="https://lockeidentity.com/" target="_blank" rel="noopener noreferrer"><img class="logo-small" src="data:image/png;base64,{b64}"/></a>
	</div>
	"""
	)

	# Start the app when this file is executed.
	demo.launch()