ICBCBench-Leaderboard

Running

App Files Files Community

ICBCBench-Leaderboard / tabs /data_viewer_tab.py

Leonnel1220

Upload folder using huggingface_hub

5148820 verified 3 days ago

Raw

History Blame Contribute Delete

8.63 kB

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	"""
	Data-Viewer tab for ICBCBench.
	"""

	import gradio as gr
	import random
	import re

	from tabs.shared_data import get_entry, get_index


	def make_user_task_markdown(item_id, prompt) -> str:
	    """Render the "User Task" card as Markdown.

	    Args:
	        item_id: Task identifier; interpolated with normal f-string
	            formatting, so any printable value is accepted.
	        prompt: Task description shown below the identifier.

	    Returns:
	        A Markdown string: a level-3 heading, then the task ID and the
	        description, each separated by one blank line.
	    """
	    # Built from explicit "\n" pieces so source-file indentation can never
	    # leak into the Markdown (an indented line would render as a code block).
	    return (
	        "### User Task 🎯\n\n"
	        f"Task ID: {item_id}\n\n"
	        f"Description: {prompt}"
	    )


	def make_article_markdown(article: str) -> str:
	    """Normalize a generated article and wrap it in the "Generated Article" card.

	    For a non-empty string the text is cleaned up so Markdown renders it
	    predictably:

	    * runs of two or more newlines collapse to exactly one blank line;
	    * pipe tables are set aside so the later rewrites cannot touch them and
	      are restored verbatim at the end;
	    * bold-labelled bullets (``* **Label**: text``) that were run together on
	      one line are split so that each bullet starts on its own line;
	    * a blank line is inserted between two adjacent non-empty lines unless
	      the current line is a bullet or heading, or the next line is a bullet.

	    Any other value is interpolated unchanged (``None`` becomes "").

	    Args:
	        article: Raw article text; may be empty or ``None``.

	    Returns:
	        Markdown made of the card heading, a blank line and the processed
	        article.
	    """
	    heading = "### Generated Article 📖"

	    if not (article and isinstance(article, str)):
	        fallback = article if article is not None else ""
	        return f"{heading}\n\n{fallback}"

	    text = re.sub(r'\n{2,}', '\n\n', article)

	    # A header row starting with "|", an optional separator row, then any
	    # number of further "|" rows. Each row must end with a newline.
	    table_pattern = r'(\|[^\n]*\n(?:[\|\s\-:]+\n)?(?:\|[^\n]*\n)*)'
	    tables = []

	    def stash_table(match):
	        """Remember the table and leave an indexed placeholder behind."""
	        tables.append(match.group(1))
	        return f'__TABLE_PLACEHOLDER_{len(tables) - 1}__'

	    text = re.sub(table_pattern, stash_table, text)

	    # NOTE(review): the literal asterisks in the three patterns below were
	    # reconstructed. The previous text contained `[^]+?`, which does not
	    # compile (unterminated character set). Confirm against upstream.
	    # 1) Start a new paragraph before any "* **Label**:" bullet that is not
	    #    already at the beginning of a line.
	    text = re.sub(r'(?<!\n)\*\s*\*\*([^*]+?)\*\*:', r'\n\n* **\1**:', text)
	    # 2) Tighten "bullet, text, next bullet" so consecutive bullets are
	    #    separated by a single newline and form one list.
	    text = re.sub(
	        r'\*\s*\*\*([^*]+?)\*\*:\s*([^*]*?)\s*\*\s*\*\*',
	        r'* **\1**: \2\n * **',
	        text,
	    )
	    # 3) A bullet glued directly to a "[12...]" reference starts a new
	    #    paragraph. NOTE(review): the replacement has no group reference, so
	    #    the bracketed reference itself is dropped; confirm this is intended.
	    text = re.sub(r'(?<!\n)\[\d+[^\]]*\]\*\s*\*\*', r'\n\n* **', text)

	    lines = text.split('\n')
	    last = len(lines) - 1
	    spaced = []
	    for idx, line in enumerate(lines):
	        spaced.append(line)
	        if idx == last:
	            continue
	        current = line.strip()
	        following = lines[idx + 1].strip()
	        # Markdown treats adjacent lines as one paragraph; force a break,
	        # but keep list items and headings tight.
	        if (current and following
	                and not current.startswith(('*', '#'))
	                and not following.startswith('*')):
	            spaced.append('')

	    text = '\n'.join(spaced)
	    for idx, table in enumerate(tables):
	        text = text.replace(f'__TABLE_PLACEHOLDER_{idx}__', table)

	    return f"{heading}\n\n{text}"


	def make_scores_html(entry: dict) -> str:
	    """Build score cards for ICBCBench data viewer.

	    The set of cards depends on ``entry["track"]`` (default "subjective"):

	    * "objective": overall, objective score, confidence and a Yes/No/N/A
	      card derived from ``entry["correct"]``;
	    * anything else: overall, subjective, expert, citation and source
	      quality. If ``subjective_score`` is missing but at least one legacy
	      DeepResearch Bench dimension is present, the legacy dimensions are
	      shown instead.

	    Args:
	        entry: Mapping holding the score fields read via ``entry.get``.

	    Returns:
	        An HTML fragment: one bordered container with a flex row of cards.
	    """

	    def fmt(val):
	        """Two-decimal string for numbers, "N/A" for None, else ``str(val)``."""
	        if val is None:
	            return "N/A"
	        try:
	            return f"{float(val):.2f}"
	        except (TypeError, ValueError):
	            return str(val)

	    def cards_for(fields):
	        """Turn (title, entry key) pairs into (title, formatted score) pairs."""
	        return [(title, fmt(entry.get(key))) for title, key in fields]

	    overall_field = ("Overall<br>Score", "overall_score")

	    if entry.get("track", "subjective") == "objective":
	        correct = entry.get("correct")
	        # Only real booleans count; any other value is reported as unknown.
	        if correct is True:
	            verdict = "Yes"
	        elif correct is False:
	            verdict = "No"
	        else:
	            verdict = "N/A"
	        scores_data = cards_for([
	            overall_field,
	            ("Objective<br>Score", "objective_score"),
	            ("Confidence", "confidence"),
	        ])
	        scores_data.append(("Correct", verdict))
	    else:
	        # Legacy DeepResearch Bench fields.
	        legacy_fields = [
	            ("Comprehen-<br>siveness", "comprehensiveness_score"),
	            ("Insight<br>Score", "insight_score"),
	            ("Instruction<br>Following", "instruction_following_score"),
	            ("Readability<br>Score", "readability_score"),
	        ]
	        has_legacy = any(entry.get(key) is not None for _, key in legacy_fields)
	        # NOTE(review): nesting reconstructed from flattened source; the
	        # legacy fallback is assumed to apply only to the non-objective
	        # track. Confirm.
	        if entry.get("subjective_score") is None and has_legacy:
	            scores_data = cards_for([overall_field] + legacy_fields)
	        else:
	            scores_data = cards_for([
	                overall_field,
	                ("Subjective<br>Score", "subjective_score"),
	                ("Expert<br>Score", "expert_score"),
	                ("Citation", "citation_score"),
	                ("Source<br>Quality", "source_quality_score"),
	            ])

	    html_items_str = "".join(
	        f"""
	        <div style="text-align: center; padding: 8px 5px; flex-grow: 1; flex-basis: 0;">
	        <h4 style="margin: 0 0 6px 0; font-size: 1.1em; color: #4a4a4a; font-weight: 600;">{title}</h4>
	        <p style="margin: 0; font-size: 1.2em; font-weight: bold; color: #333;">{score}</p>
	        </div>
	        """
	        for title, score in scores_data
	    )

	    return f"""
	    <div style="background:#fff; border:1px solid #e0e0e0; border-radius:8px; padding: 18px 15px; margin:18px 0; box-shadow:0 2px 4px rgba(0,0,0,.06);">
	    <div style="display: flex; justify-content: space-between; align-items: flex-start;">
	    {html_items_str}
	    </div>
	    </div>"""


	# ---------- Build the tab ----------
	def create_data_viewer_tab():
	    """Build the "Data Viewer" tab and return its loader and components.

	    The tab holds two dropdowns (model, task), a Markdown card for the task,
	    a Markdown card for the generated article and an HTML block for the
	    score cards. Changing either dropdown re-runs ``fetch``.

	    NOTE(review): presumably called inside the app's ``gr.Blocks`` context,
	    with the returned ``on_load`` wired to the page-load event by the
	    caller. Confirm.

	    Returns:
	        A tuple ``(on_load, outputs)``. ``on_load`` takes no arguments and
	        returns five values, in the same order as ``outputs``:
	        ``[model_dd, task_dd, user_md, article_md, scores_html]``.
	    """
	    # Shown whenever no (model, task) pair is selected yet.
	    select_prompt = "请选择模型和任务。"

	    with gr.Tab("🔍Data Viewer"):
	        gr.HTML(
	            """
	            <style>
	            .card{background:#fff;border:1px solid #e0e0e0;border-radius:8px;padding:22px 24px;margin:18px 0;box-shadow:0 2px 4px rgba(0,0,0,.06);}
	            .scrollable-sm{max-height:260px;overflow-y:auto;}
	            .scrollable-lg{max-height:700px;overflow-y:auto;}
	            .card p{color:#424242 !important;line-height:1.75;margin:0 0 14px 0;text-align:justify;}
	            .card ul,.card ol{margin:12px 0 12px 24px;color:#424242 !important;}
	            .card li{margin:4px 0;color:#424242 !important;}
	            .card blockquote{border-left:4px solid #3498db;margin:18px 0;padding:14px 18px;background:#f8f9fa;font-style:italic;color:#555 !important;}
	            .card pre{background:#f8f8f8;color:#333 !important;padding:18px;border-radius:6px;overflow-x:auto;border:1px solid #e0e0e0;}
	            .card strong,.card b{font-weight:700 !important;}
	            .card::-webkit-scrollbar{width:10px}
	            .card::-webkit-scrollbar-track{background:#f5f5f5;border-radius:5px}
	            .card::-webkit-scrollbar-thumb{background:#c0c0c0;border-radius:5px}
	            .card::-webkit-scrollbar-thumb:hover{background:#a0a0a0}
	            </style>
	            """
	        )

	        with gr.Row():
	            model_dd = gr.Dropdown(label="Select Model", choices=[], interactive=True)
	            task_dd = gr.Dropdown(label="Select Task", choices=[], interactive=True)

	        user_md = gr.Markdown(value="Loading data…", elem_classes=["card", "scrollable-sm"])
	        article_md = gr.Markdown(elem_classes=["card", "scrollable-lg"])
	        scores_html = gr.HTML()

	        def _build_task_choices(tasks):
	            """Return dropdown labels of the form "<id>. <prompt preview>"."""
	            limit = 60
	            choices = []
	            for task in tasks:
	                # Tolerate a missing or None prompt instead of failing on slicing.
	                prompt = task.get("prompt") or ""
	                suffix = "…" if len(prompt) > limit else ""
	                choices.append(f"{task['id']}. {prompt[:limit]}{suffix}")
	            return choices

	        def fetch(model, task_disp):
	            """Return (task markdown, article markdown, scores HTML) for a selection."""
	            if not model or not task_disp:
	                return (
	                    make_user_task_markdown("--", select_prompt),
	                    make_article_markdown(select_prompt),
	                    "",
	                )

	            # Labels are "<id>. <preview>"; everything before the first dot is the id.
	            item_id = task_disp.split(".", 1)[0].strip()
	            entry = get_entry(model, item_id)
	            if not entry:
	                err = f"未找到模型 {model} 对应任务 {item_id} 的内容或分数。"
	                return make_user_task_markdown(item_id, err), make_article_markdown(err), ""

	            return (
	                make_user_task_markdown(item_id, entry.get("prompt", "")),
	                make_article_markdown(entry.get("article", "")),
	                make_scores_html(entry),
	            )

	        def on_load():
	            """Populate both dropdowns with a random initial selection."""
	            index = get_index()
	            models = index.get("models", [])
	            tasks = index.get("tasks", [])
	            if not models or not tasks:
	                return (
	                    gr.update(choices=[], value=None),
	                    gr.update(choices=[], value=None),
	                    make_user_task_markdown("--", "No data"),
	                    make_article_markdown(""),
	                    "",
	                )

	            task_choices = _build_task_choices(tasks)
	            # Both lists are non-empty here, so random.choice cannot fail.
	            # The cards get placeholder text; presumably the dropdown value
	            # updates then fire the change handlers below, which load the
	            # real content. Confirm.
	            return (
	                gr.update(choices=models, value=random.choice(models)),
	                gr.update(choices=task_choices, value=random.choice(task_choices)),
	                make_user_task_markdown("--", select_prompt),
	                make_article_markdown(select_prompt),
	                "",
	            )

	        model_dd.change(fetch, inputs=[model_dd, task_dd], outputs=[user_md, article_md, scores_html])
	        task_dd.change(fetch, inputs=[model_dd, task_dd], outputs=[user_md, article_md, scores_html])

	    return on_load, [model_dd, task_dd, user_md, article_md, scores_html]