# bugfix (commit b72feb2)
import asyncio
import traceback
import datetime
import json
import os
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi, upload_file
import requests
from content import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
INTRODUCTION_TEXT,
TITLE,
SUBMISSION_TEXT,
styled_error,
styled_warning,
model_hyperlink,
format_log,
)
from score import load_agent_output_dataset
from loguru import logger
from datasets import load_dataset, VerificationMode
from utils import parse_eval_dataset, parseaddr
from env import (
CONTACT_DATASET_FILE,
VERSION,
BENCHMARK_INTERNAL_EVALUATE_DATASET_FILE,
EVALUATE_RESULT_DATASET_FILE,
TOKEN,
SUBMISSION_DATASET,
INTERNAL_DATASET,
EVALUATE_RESULT_DATASET,
REPO_ID,
CONTACT_DATASET
)
# Hub credentials and API client used for all dataset/Space operations.
# NOTE(review): this re-assignment overrides the TOKEN imported from env above —
# confirm that reading HF_TOKEN here is intended.
TOKEN = os.getenv("HF_TOKEN")
API = HfApi(token=TOKEN)
# When True, network uploads are mocked (see add_new_eval / save_contact_info).
LOCAL_DEBUG=False
# Column datatypes for the gr.Dataframe leaderboard widget (order must match the table columns).
TYPES = ["markdown", "number", "number", "number", "number", "str", "str", "str"]
# Internal benchmark (ground truth) dataset, loaded once at startup.
benchmark_internal_evaluate_dataset = load_dataset(INTERNAL_DATASET, data_files=BENCHMARK_INTERNAL_EVALUATE_DATASET_FILE, token=TOKEN, verification_mode=VerificationMode.NO_CHECKS, download_mode="reuse_cache_if_exists",trust_remote_code=True)
print(EVALUATE_RESULT_DATASET_FILE)
# Published evaluation results shown on the leaderboard.
eval_results = load_dataset(EVALUATE_RESULT_DATASET, data_files=EVALUATE_RESULT_DATASET_FILE, token=TOKEN, verification_mode=VerificationMode.NO_CHECKS, download_mode="reuse_cache_if_exists",trust_remote_code=True)
# Parsed benchmark rows (objects exposing .task_id) used to validate submissions.
benchmark_dataset = parse_eval_dataset(benchmark_internal_evaluate_dataset) # type: ignore
def save_contact_info(contact_info):
    """Append *contact_info* to the contact dataset and upload it to the Hub.

    The existing contact list is re-downloaded from CONTACT_DATASET; if that
    fails (e.g. the file does not exist yet) we start from an empty list.
    The new entry is always appended before uploading.

    Args:
        contact_info: dict of submission contact details; must contain "model"
            and an "organisation"/"organization" key (used in the commit message).
    """
    import tempfile
    # Load the existing contact info list.
    try:
        contact_infos = load_dataset(
            CONTACT_DATASET, data_files=CONTACT_DATASET_FILE, token=TOKEN,
            download_mode="force_redownload",
            verification_mode=VerificationMode.NO_CHECKS,
            trust_remote_code=True
        )
        print(f"load {CONTACT_DATASET_FILE} success, {contact_infos}")
        contact_info_list = list(contact_infos['train'])
        print("contact_info_list:", contact_info_list)
    except Exception as e:
        print(f"Error loading contact info: {e}")
        contact_info_list = []
    # BUGFIX: append outside the try block so the new contact is recorded even
    # when the existing dataset could not be loaded (it was previously lost).
    contact_info_list.append(contact_info)
    # BUGFIX: the caller stores the key as "organisation"; accept either spelling
    # instead of raising KeyError on every real submission.
    org = contact_info.get("organization") or contact_info.get("organisation", "")
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as temp_file:
        json.dump(contact_info_list, temp_file, default=str, indent=4)
        temp_file_path = temp_file.name
    try:
        API.upload_file(
            path_or_fileobj=temp_file_path,
            path_in_repo=CONTACT_DATASET_FILE,
            repo_id=CONTACT_DATASET,
            repo_type='dataset',
            token=TOKEN,
            commit_message=f"Add new contact: {contact_info['model']} by {org}"
        )
        print(f"upload {temp_file_path} success")
    finally:
        # Always remove the temp file, even if the upload fails.
        os.unlink(temp_file_path)
# NOTE: an earlier commented-out version of get_dataframe_from_results
# (built around model/model_family/score_level{1,2,3} columns) was removed;
# see the current implementation below.
def get_dataframe_from_results(eval_results, split: str = 'train'):
    """Convert a `datasets` result object (or list of row dicts) into the leaderboard DataFrame.

    Turns model names into hyperlinks when both "url" and "model" columns are
    present, renames score columns to their display names, drops internal
    columns, sorts by average score (descending) and rounds score columns to
    two decimals.

    Args:
        eval_results: a DatasetDict-like object indexable by *split*, or a
            plain list of row dicts.
        split: split name to read when *eval_results* is indexable.

    Returns:
        A pandas DataFrame ready for display; empty DataFrame on unrecoverable error.
    """
    try:
        if hasattr(eval_results, "__getitem__"):
            local_df = eval_results[split]
            if hasattr(local_df, "to_pandas"):
                df = local_df.to_pandas()
            else:
                df = pd.DataFrame(local_df.data if hasattr(local_df, "data") else local_df)
        else:
            df = pd.DataFrame(eval_results if isinstance(eval_results, list) else [])
        print(df)
        try:
            if "url" in df.columns and "model" in df.columns:
                # BUGFIX: axis=1 is required so the lambda receives one row at a
                # time; without it the lambda got whole columns and always raised,
                # so the hyperlink was never applied.
                df["model"] = df.apply(lambda row: model_hyperlink(row["url"], row["model"]), axis=1)
        except Exception as e:
            print(f"Error applying model hyperlink: {e}")
        # Internal column name -> display column name.
        column_renames = {
            "agent_name": "Agent name",
            # "model_family": "Model family",
            "total_score": "Average score (%)",
            "answer_score": "Answer score (%)",
            "reasoning_score":"Reasoning score (%)",
            "tool_use_score": "Tool-use score(%)",
            "date": "Submission date",
        }
        rename_cols = {k: v for k, v in column_renames.items() if k in df.columns}
        if rename_cols:
            df = df.rename(columns=rename_cols)
        try:
            # Drop columns that should not be shown publicly.
            cols_to_drop = [col for col in ["system_prompt", "url"] if col in df.columns]
            if cols_to_drop:
                df = df.drop(columns=cols_to_drop)
        except Exception:
            pass
        try:
            if "Average score (%)" in df.columns:
                df = df.sort_values(by=["Average score (%)"], ascending=False)
        except Exception:
            pass
        try:
            numeric_cols = [c for c in df.columns if "score" in c.lower()]
            if numeric_cols:
                df[numeric_cols] = df[numeric_cols].round(decimals=2)
        except Exception as e:
            print(f"Error processing numeric columns: {e}")
        print(df)
        return df
    except Exception as e:
        print(f"Error in get_dataframe_from_results: {e}")
        return pd.DataFrame({})
# Build the initial leaderboard table once at startup; fall back to an empty
# DataFrame so the UI still renders when the results cannot be parsed.
try:
    eval_dataframe = get_dataframe_from_results(eval_results=eval_results)
except Exception as e:
    print(f"Error creating dataframes: {e}")
    eval_dataframe = pd.DataFrame({})
def refresh():
    """Re-download the published evaluation results and rebuild the leaderboard table.

    Returns:
        The refreshed display DataFrame, or an empty DataFrame if the
        download/parse fails (the error is logged).
    """
    logger.info('refreshing...')
    try:
        fresh_results = load_dataset(
            EVALUATE_RESULT_DATASET,
            data_files=EVALUATE_RESULT_DATASET_FILE,
            token=TOKEN,
            verification_mode=VerificationMode.NO_CHECKS,
            download_mode="force_redownload",
            trust_remote_code=True,
        )
        return get_dataframe_from_results(eval_results=fresh_results)
    except Exception:
        logger.error(f"Error in refresh: {traceback.format_exc()}")
        return pd.DataFrame({})
def restart_space():
    """Restart the Hugging Face Space; failures are printed and swallowed."""
    try:
        API.restart_space(repo_id=REPO_ID, token=TOKEN)
    except Exception as exc:
        print(f"Error restarting space: {exc}")
def add_new_eval(
    model: str,
    # model_family: str,
    # system_prompt: str,
    url: str,
    path_to_file,
    organisation: str,
    mail: str,
    profile: gr.OAuthProfile,
):
    """Validate and record a new leaderboard submission.

    Checks the uploaded file is valid JSON, enforces account-age and
    once-per-day limits, records contact information, validates the agent
    outputs against the benchmark task ids, and uploads the submission file.

    Args:
        model: agent/model display name.
        url: link to information about the agent.
        path_to_file: uploaded submission file (filepath string or file-like).
        organisation: submitting organisation name.
        mail: contact email (stored privately).
        profile: the logged-in Hugging Face user profile.

    Returns:
        An HTML-styled status string (error / warning / success log).
    """
    try:
        # The uploaded file must be parseable JSON before anything else runs.
        try:
            with open(path_to_file, 'r', encoding='utf-8') as f:
                json.load(f)
        except json.JSONDecodeError:
            return styled_error("Please upload a valid JSON file.")
        except Exception:
            return styled_error(f"File read error: {str(traceback.format_exc())}")
        if not LOCAL_DEBUG:
            # Was the profile created less than 10 days ago? (anti-abuse check)
            user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
            print(user_data.content)
            creation_date = json.loads(user_data.content)["createdAt"]
            if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=10):
                return styled_error("This account is not authorized to submit on CAIA.")
            contact_infos = load_dataset(CONTACT_DATASET, data_files=CONTACT_DATASET_FILE, token=TOKEN,
                                         download_mode="force_redownload",
                                         verification_mode=VerificationMode.NO_CHECKS,
                                         trust_remote_code=True)
            # Collect this user's previous submission dates to enforce the
            # one-submission-per-day limit.
            user_submission_dates = []
            try:
                contact_info_list = list(contact_infos['train'])  # type: ignore
                print("contact info:", contact_info_list)
                for row in contact_info_list:
                    if row.get("username") == profile.username:
                        user_submission_dates.append(row.get("date"))
            except Exception as e:
                print(f"Error getting user submission dates: {e}")
            user_submission_dates = sorted(user_submission_dates)
            user_submission_dates = [date.strftime('%Y-%m-%d') if isinstance(date, pd.Timestamp) else datetime.datetime.strptime(str(date), '%Y-%m-%d %H:%M:%S.%f').strftime('%Y-%m-%d') for date in user_submission_dates if date]
            print("submission_dates: ", user_submission_dates)
            if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
                return styled_error("You already submitted once today, please try again tomorrow.")
        # Very basic email parsing
        _, parsed_mail = parseaddr(mail)
        if "@" not in parsed_mail:
            return styled_warning("Please provide a valid email address.")
        print("Adding new eval")
        if not LOCAL_DEBUG:
            # Check if the combination model/org already exists and prints a warning message if yes
            model_exists = False
            try:
                # BUGFIX: iterate the rows of the split; iterating the DatasetDict
                # itself only yields split names, so the check never matched.
                eval_results_list = list(eval_results['train'])
                for row in eval_results_list:
                    if row.get("model", "").lower() == model.lower() and row.get("organisation", "").lower() == organisation.lower():
                        model_exists = True
                        break
            except Exception as e:
                print(f"Error checking model existence: {e}")
            if model_exists:
                return styled_warning("This model has been already submitted.")
        if path_to_file is None:
            return styled_warning("Please attach a file.")
        # gr.File may hand us either a filepath string or a file-like object.
        file_path = path_to_file
        if hasattr(path_to_file, 'name'):
            try:
                file_path = path_to_file.name
            except Exception:
                pass
        # SAVE CONTACT
        contact_info = {
            "model": model,
            "url": url,
            "organisation": organisation,
            "username": profile.username,
            "mail": mail,
            "date": pd.Timestamp(datetime.datetime.now())
        }
        if LOCAL_DEBUG:
            print("mock uploaded contact info")
        else:
            save_contact_info(contact_info)
        # SCORE SUBMISSION
        # BUGFIX: do not re-read `path_to_file.name` here — when gradio passes a
        # plain filepath string that attribute does not exist; reuse the
        # `file_path` computed above.
        print("Simulate submission...")
        agent_output = load_agent_output_dataset(dataset_path=file_path)
        agent_output_task_ids = set(output.task_id for output in agent_output)
        benchmark_task_ids = set(item.task_id for item in benchmark_dataset)
        if agent_output_task_ids != benchmark_task_ids:
            return styled_error("The task IDs in agent outputs do not match the task IDs in benchmark dataset.")
        # Upload the submission file to SUBMISSION_DATASET.
        if not LOCAL_DEBUG:
            try:
                API.upload_file(
                    path_or_fileobj=file_path,
                    path_in_repo=f"{VERSION}/<{str(model)}>_<{str(organisation)}>_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                    repo_id=SUBMISSION_DATASET,
                    repo_type='dataset',
                    token=TOKEN,
                    commit_message=f"Add submission: {model} by {organisation}"
                )
                print(f"Successfully uploaded submission file to {SUBMISSION_DATASET}")
            except Exception as e:
                print(f"Error uploading submission file: {e}")
                return styled_error(f"upload file failed: {str(e)}")
        else:
            print("mock uploaded submission file")
        return format_log(f"Model {model} submitted successfully by {organisation}.\nPlease wait a few hours and refresh the leaderboard to see your score.")
    except Exception as e:
        print(e)
        return styled_error(f"An error occurred, please open a discussion and indicate at what time you encountered the error.\n")
def new_submission():
    """Placeholder handler: submissions via this path are not supported yet."""
    message = "This feature is not available yet."
    return styled_error(message)
# Fix the datatype definition.
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    # Leaderboard tab: read-only results table plus a manual refresh button.
    with gr.Tab("Results: "):
        leaderboard_table = gr.Dataframe(
            value=eval_dataframe,
            interactive=False,
            column_widths=["20%"],
            datatype=TYPES
        )
        refresh_button = gr.Button("Refresh")
        refresh_button.click(
            refresh,
            inputs=[],
            outputs=[
                leaderboard_table,
            ],
        )
    # Submission section: metadata fields + file upload wired to add_new_eval.
    with gr.Accordion("Submission"):
        with gr.Row():
            gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
        with gr.Row():
            model_name_textbox = gr.Textbox(label="Agent name")
            # model_family_textbox = gr.Textbox(label="Model family")
            # system_prompt_textbox = gr.Textbox(label="System prompt example")
        with gr.Row():
            url_textbox = gr.Textbox(label="Url to assistant/agent information")
        with gr.Row():
            organisation = gr.Textbox(label="Organization")
        with gr.Row():
            mail = gr.Textbox(label="Contact email (will be stored privately & used if there is an issue with your submission)")
        with gr.Row():
            file_output = gr.File()
        with gr.Row():
            gr.LoginButton()
        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        # Input order must match add_new_eval's parameter order; the logged-in
        # OAuth profile is injected automatically by gradio as the last argument.
        submit_button.click(
            add_new_eval,
            [
                model_name_textbox,
                # model_family_textbox,
                # system_prompt_textbox,
                url_textbox,
                file_output,
                organisation,
                mail,
            ],
            submission_result,
        )
# Restart the Space every hour so the leaderboard picks up new results.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.launch(debug=True)