# Hugging Face Space page residue kept as a comment so the file stays valid Python:
# author: zhangyikai — "Update UI notes" (commit aba7b4a)
import gradio as gr
import pandas as pd
import json
import random
import os
from huggingface_hub import hf_hub_download, snapshot_download
# ==========================================
# V0 Model Import & Initialization
# ==========================================
MODEL_REPO_ID = "Now-Join-Us/Generalist-Value-Model-V0"
EMBEDDING_REPO_ID = "Qwen/Qwen3-Embedding-0.6B"
# Global model handle; stays None if any loading step below fails, and
# predict_performance() checks for that before running inference.
v0_model = None
print(">>> Starting V0 App...")
try:
    # Ensure v0_core is accessible.
    # If v0_core is in the root directory, this import works.
    from v0_core.models.v0 import V0
    print(f">>> Downloading models from Hugging Face Hub...")
    # 1. Download V0 Checkpoint
    print(f" - Fetching checkpoint from {MODEL_REPO_ID}...")
    checkpoint_path = hf_hub_download(
        repo_id=MODEL_REPO_ID,
        filename="v_0_for_router_demo.pt"
    )
    # 2. Download TabPFN Classifier
    print(f" - Fetching TabPFN head from {MODEL_REPO_ID}...")
    tabpfn_path = hf_hub_download(
        repo_id=MODEL_REPO_ID,
        filename="pretrained/tabpfn-v2.5-classifier-v2.5_default.ckpt"
    )
    # 3. Download Qwen Embedding Model (Snapshot because we need config, tokenizer, etc.)
    print(f" - Fetching Embedding model from {EMBEDDING_REPO_ID}...")
    embedding_path = snapshot_download(
        repo_id=EMBEDDING_REPO_ID
    )
    print(">>> All assets downloaded. Initializing V0 class...")
    # Determine device (Defaulting to CPU as per request, but CUDA is preferred if available)
    # device = "cuda" if torch.cuda.is_available() else "cpu"
    device = "cpu"
    print(f">>> Device selected: {device}")
    # Initialize the Model
    v0_model = V0.from_pretrained(
        checkpoint_path=checkpoint_path,
        embedding_model_path=embedding_path,
        tabpfn_head_path=tabpfn_path,
        device=device
    )
    # Set to eval mode if applicable (V0 usually handles this internally, but good practice)
    if hasattr(v0_model, 'eval'):
        v0_model.eval()
    print(f">>> V0 Model Loaded Successfully on {device}!")
except Exception as e:
    # Broad catch is deliberate at this top-level boundary: the app must still
    # start (degraded) when downloads or the v0_core import fail.
    print(f"Warning: Failed to load V0 model or dependencies. Prediction will fail. Error: {e}")
    v0_model = None
# ==========================================
# 0. Configuration & data-loading logic
# ==========================================
# Models selectable in the UI; names must match the per-model files under
# data/router_context_sampled/performance/ and the performance JSONL prefixes.
TARGET_MODELS = [
    'Qwen3-30B-A3B-Instruct-2507',
    'Qwen3-4B-Instruct-2507',
    'DeepSeek-R1-Distill-Qwen-1.5B',
    "Qwen3-32B",
    "DeepSeek-R1-Distill-Qwen-7B",
    "Qwen3-0.6B",
]
# Datasets selectable in the UI; each maps to a JSONL under
# data/router_val_prompt_and_gts/.
TARGET_DATASETS = [
    'aime_2024', 'aime_2025', 'amc23', 'gaokao_math_cloze', 'gpqa_diamond', 'olympiad'
]
# Global cache for the prompt dictionary (lazy-loaded by load_prompt_dict)
PROMPT_DICT_CACHE = None
# Global cache for performance data: { "Model_Dataset": { id_int: score_float } }
PERFORMANCE_DB = {}
def load_prompt_dict():
    """Load and memoize the prompt dictionary from disk.

    Subsequent calls return the cached dict. When the file is missing, the
    cache becomes an empty dict so later callers do not retry the read.
    """
    global PROMPT_DICT_CACHE
    if PROMPT_DICT_CACHE is None:
        path = "data/router_context_sampled/prompt_dict.json"
        try:
            with open(path, "r", encoding="utf-8") as f:
                PROMPT_DICT_CACHE = json.load(f)
            print(f"Loaded {len(PROMPT_DICT_CACHE)} prompts from {path}")
        except FileNotFoundError:
            print(f"Warning: {path} not found. Using empty dict.")
            PROMPT_DICT_CACHE = {}
    return PROMPT_DICT_CACHE
def load_performance_data(model_name, dataset_name):
    """Lazily load one model's per-question scores on one dataset.

    Reads data/router_val_rollouts_and_performance/{model}_{dataset}.jsonl and
    builds { id: normalized_score }, mapping "mean@10" from [-1, 1] to [0, 1]
    via (score + 1) / 2. Results are memoized in PERFORMANCE_DB under the key
    f"{model_name}_{dataset_name}"; a missing file caches an empty map.
    """
    cache_key = f"{model_name}_{dataset_name}"
    # Return the memoized map when this pair has been loaded before.
    if cache_key in PERFORMANCE_DB:
        return PERFORMANCE_DB[cache_key]
    file_path = f"data/router_val_rollouts_and_performance/{model_name}_{dataset_name}.jsonl"
    data_map = {}
    if os.path.exists(file_path):
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        item = json.loads(line)
                        idx = item.get("id")
                        score = item.get("mean@10")
                        if idx is not None and score is not None:
                            # Normalize mean@10 from [-1, 1] to [0, 1].
                            data_map[idx] = (float(score) + 1.0) / 2.0
                    except json.JSONDecodeError:
                        # Skip malformed lines rather than failing the file.
                        continue
            print(f"Loaded performance map for {cache_key}: {len(data_map)} entries.")
        except Exception as e:
            print(f"Error loading performance file {file_path}: {e}")
    # Cache even an empty map so a missing file is not re-checked every call.
    # (Removed a dead `else: pass` branch from the original.)
    PERFORMANCE_DB[cache_key] = data_map
    return data_map
def load_real_model_data(model_name):
    """Load a model's (prompt, score) history and sample a 16-item context.

    Reads data/router_context_sampled/performance/{model_name}.jsonl, joins
    each row with its prompt text via load_prompt_dict() (keyed
    "{dataset}_{id}"), normalizes "mean@10" from [-1, 1] to [0, 1], then
    randomly samples 16 entries. The sample is nudged so it is never purely
    positive or purely negative (threshold: normalized score > 0.6).
    """
    prompt_db = load_prompt_dict()
    file_path = f"data/router_context_sampled/performance/{model_name}.jsonl"
    # Temporary holder for every parsed row before sampling
    all_parsed_data = []
    if not os.path.exists(file_path):
        # Surface the missing file as a single pseudo-entry the UI can render.
        return [{
            "prompt": f"Error: File {file_path} not found.",
            "score": 0.0,
            "is_correct": False
        }]
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        # 1. Parse all lines first so label statistics are known before sampling
        for line in lines:
            try:
                item = json.loads(line)
                dataset = item.get("dataset", "")
                idx = item.get("id", "")
                key = f"{dataset}_{idx}"
                prompt_text = prompt_db.get(key, f"[Prompt missing for key: {key}]")
                raw_score = float(item.get("mean@10", -1.0))
                # Map mean@10 from [-1, 1] to [0, 1]
                normalized_score = (raw_score + 1.0) / 2.0
                is_correct = normalized_score > 0.6
                all_parsed_data.append({
                    "prompt": prompt_text,
                    "score": normalized_score,
                    "is_correct": is_correct
                })
            except json.JSONDecodeError:
                continue
        # 2. Sampling (target context size = 16)
        target_count = 16
        if len(all_parsed_data) <= target_count:
            return all_parsed_data
        # Initial uniform random sample of 16
        sampled_data = random.sample(all_parsed_data, target_count)
        # Inspect the positive/negative balance of the sample
        positives = [x for x in sampled_data if x['is_correct']]
        negatives = [x for x in sampled_data if not x['is_correct']]
        # Case A: all positives -> force one negative into the sample
        if len(negatives) == 0:
            # Collect every negative from the full pool
            all_negatives_pool = [x for x in all_parsed_data if not x['is_correct']]
            if all_negatives_pool:
                # Pick one negative at random
                replacement = random.choice(all_negatives_pool)
                # Overwrite a random slot in the current sample
                replace_idx = random.randint(0, target_count - 1)
                sampled_data[replace_idx] = replacement
        # Case B: all negatives -> force one positive into the sample
        elif len(positives) == 0:
            # Collect every positive from the full pool
            all_positives_pool = [x for x in all_parsed_data if x['is_correct']]
            if all_positives_pool:
                # Pick one positive at random
                replacement = random.choice(all_positives_pool)
                # Overwrite a random slot in the current sample
                replace_idx = random.randint(0, target_count - 1)
                sampled_data[replace_idx] = replacement
        return sampled_data
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return []
def load_dataset_batch(dataset_name):
    """Randomly sample 4 instructions (with their IDs) from a dataset file.

    Reads data/router_val_prompt_and_gts/{dataset_name}.jsonl and returns a
    6-tuple: (input1, input2, input3, input4, id_list, dataset_name).
    Missing slots are padded with "" and ID -1; a missing file yields four
    "File not found" placeholders. (Original docstring claimed a 3-tuple.)
    """
    file_path = f"data/router_val_prompt_and_gts/{dataset_name}.jsonl"
    inputs = []
    ids = []
    try:
        if os.path.exists(file_path):
            with open(file_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            # Sample 4 lines, keeping ID info so results can later be matched
            # against the per-model performance files.
            count = 4
            total_lines = len(lines)
            if total_lines > 0:
                sampled_indices = random.sample(range(total_lines), min(count, total_lines))
                for idx in sampled_indices:
                    line = lines[idx]
                    try:
                        item = json.loads(line)
                        inputs.append(item.get("input", ""))
                        # Prefer "id" from the JSON, fall back to "idx", then
                        # to the file line number.
                        # NOTE: this id must match the ids used in the
                        # performance files.
                        item_id = item.get("id", item.get("idx", idx))
                        ids.append(item_id)
                    except (json.JSONDecodeError, AttributeError):
                        # Narrowed from a bare `except:`: only parse failures
                        # (bad JSON, or a non-dict line) are recoverable here.
                        inputs.append("Error parsing JSON")
                        ids.append(-1)
        else:
            inputs = [f"File not found: {file_path}"] * 4
            ids = [-1] * 4
    except Exception as e:
        inputs = [f"Error loading dataset: {str(e)}"] * 4
        ids = [-1] * 4
    # Pad so exactly four textboxes are always filled.
    while len(inputs) < 4:
        inputs.append("")
        ids.append(-1)
    return inputs[0], inputs[1], inputs[2], inputs[3], ids, dataset_name
# ==========================================
# 1. Core logic
# ==========================================
def process_custom_file(file_obj):
    """Parse a user-uploaded JSONL file into a list of history entries.

    Each line must be a JSON object with a "prompt" plus one of (in priority
    order):
      - "score":      a float, used as-is;
      - "mean@10":    a float in [-1, 1], normalized to [0, 1];
      - "is_correct": a bool, mapped to 1.0 / 0.0.
    Blank, malformed, or prompt-less lines are skipped. Returns [] when
    file_obj is None or the file cannot be read at all.
    """
    if file_obj is None:
        return []
    data_list = []
    try:
        with open(file_obj.name, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    item = json.loads(line)
                    prompt = item.get('prompt')
                    if not prompt:
                        continue
                    # Accept several score formats, in priority order.
                    score = 0.0
                    if 'score' in item:
                        score = float(item['score'])
                    elif 'mean@10' in item:
                        # Normalize mean@10 from [-1, 1] to [0, 1].
                        score = (float(item['mean@10']) + 1.0) / 2.0
                    elif 'is_correct' in item:
                        score = 1.0 if item['is_correct'] else 0.0
                    is_correct = score > 0.6
                    data_list.append({
                        "prompt": prompt,
                        "score": score,
                        "is_correct": is_correct
                    })
                except (json.JSONDecodeError, TypeError, ValueError, AttributeError):
                    # Narrowed from a bare `except:` (which also swallowed
                    # KeyboardInterrupt/SystemExit): skip only lines with bad
                    # JSON, a non-dict payload, or a non-numeric score.
                    continue
    except Exception as e:
        print(f"Error parsing custom file: {e}")
        return []
    return data_list
def format_model_card(data_list, model_name, is_empty=False):
    """Render one model slot as an HTML card.

    An empty slot shows a placeholder; a populated slot previews up to three
    history rows (status icon, truncated prompt, score) followed by a
    "+N more items" line when the history is longer.
    """
    if is_empty or not data_list:
        return """
        <div class='model-card empty'>
            <div class='card-title'>Empty Slot</div>
            <div class='card-subtitle'>Select a model above to load</div>
        </div>
        """
    total = len(data_list)
    preview_limit = 3
    row_parts = []
    for entry in data_list[:preview_limit]:
        text = entry.get('prompt', '')
        if len(text) > 45:
            text = text[:45] + " ... "
        val = entry.get('score', 0.0)
        # Traffic-light status: >=0.8 green, >=0.4 yellow, else red.
        if val >= 0.8:
            status_class, icon = "status-green", "✔"
        elif val >= 0.4:
            status_class, icon = "status-yellow", "~"
        else:
            status_class, icon = "status-red", "✘"
        row_parts.append(f"""
        <div class='history-row'>
            <div class='status-box {status_class}'>{icon}</div>
            <div class='prompt-text'>{text}</div>
            <span style='opacity:0.9; font-size:0.9em; white-space: nowrap; margin-left: 8px;'>{val:.2f}</span>
        </div>
        """)
    hidden = total - preview_limit
    if hidden > 0:
        row_parts.append(f"<div class='history-more'>+ {hidden} more items</div>")
    rows_html = "".join(row_parts)
    return f"""
    <div class='model-card populated'>
        <div class='card-header'>
            <span class='model-name'>{model_name}</span>
            <span class='acc-badge'>Context Size: {total}</span>
        </div>
        <div class='card-body'>
            <div class='history-container'>
                {rows_html}
            </div>
        </div>
    </div>
    """
def select_model(btn_name, current_pointer, data_s1, name_s1, data_s2, name_s2):
    """Load btn_name's history into the slot indicated by current_pointer.

    Slots alternate round-robin (pointer 0 fills slot 1, pointer 1 fills
    slot 2). Re-selecting an already-loaded model leaves both slots as-is.
    Returns (html1, html2, next_pointer, data1, name1, data2, name2).
    """
    if btn_name in (name_s1, name_s2):
        # Already present in a slot: just re-render everything unchanged.
        return (
            format_model_card(data_s1, name_s1, is_empty=(name_s1 is None)),
            format_model_card(data_s2, name_s2, is_empty=(name_s2 is None)),
            current_pointer, data_s1, name_s1, data_s2, name_s2,
        )
    new_data = load_real_model_data(btn_name)
    if current_pointer == 0:
        # Fill slot 1; next selection goes to slot 2.
        return (
            format_model_card(new_data, btn_name),
            format_model_card(data_s2, name_s2, is_empty=(name_s2 is None)),
            1, new_data, btn_name, data_s2, name_s2,
        )
    # Fill slot 2; next selection goes to slot 1.
    return (
        format_model_card(data_s1, name_s1, is_empty=(name_s1 is None)),
        format_model_card(new_data, btn_name),
        0, data_s1, name_s1, new_data, btn_name,
    )
def handle_custom_upload(file_obj, current_pointer, data_s1, name_s1, data_s2, name_s2):
    """Parse an uploaded JSONL file and place it into the next model slot.

    Mirrors select_model's round-robin slot behavior. A failed parse shows an
    "Upload Failed" placeholder entry instead of real history.
    """
    def render(d, n):
        # Render an existing slot, treating a missing name as empty.
        return format_model_card(d, n, is_empty=(n is None))

    if file_obj is None:
        # Nothing uploaded: re-render both slots untouched.
        return (render(data_s1, name_s1), render(data_s2, name_s2),
                current_pointer, data_s1, name_s1, data_s2, name_s2)
    new_data = process_custom_file(file_obj)
    custom_name = f"Custom: {os.path.basename(file_obj.name)[:15]}"
    if not new_data:
        # File could not be parsed: show a placeholder card.
        custom_name = "Upload Failed"
        new_data = [{"prompt": "Invalid JSONL format", "score": 0.0, "is_correct": False}]
    if current_pointer == 0:
        return (format_model_card(new_data, custom_name), render(data_s2, name_s2),
                1, new_data, custom_name, data_s2, name_s2)
    return (render(data_s1, name_s1), format_model_card(new_data, custom_name),
            0, data_s1, name_s1, new_data, custom_name)
def invalidate_specific_id(current_ids, index):
    """Mark slot `index` as stale (-1) after its textbox was hand-edited.

    Returns a new list; pads with -1 up to `index` when the list is short,
    and falls back to four -1s when no ID list exists yet.
    """
    if current_ids is None:
        return [-1, -1, -1, -1]
    updated = list(current_ids)
    # Pad (if needed) so `index` is addressable, then invalidate that slot.
    updated.extend([-1] * (index + 1 - len(updated)))
    updated[index] = -1
    return updated
def predict_performance(data1, name1, data2, name2, t1, t2, t3, t4, current_ids, current_dataset):
    """Run V0 inference for every active model slot over the target instructions.

    Builds a binary-labeled context from each slot's history, calls
    v0_model.predict once per model (all targets share one context), and joins
    in ground-truth scores when the instruction IDs are still valid. Returns a
    pandas DataFrame with one row per (model, instruction) pair, or a
    single-row error frame when inputs are incomplete or the model is missing.
    """
    if v0_model is None:
        return pd.DataFrame([{"Error": "V0 Model not loaded successfully on server."}])
    # 1. Collect the target instructions (skip blank textboxes)
    inputs_list = [t1, t2, t3, t4]
    target_prompts = []
    target_ids = []
    target_indices = []  # original textbox indices, kept for mapping back
    for i, t in enumerate(inputs_list):
        if t.strip():
            target_prompts.append(t)
            target_indices.append(i)
            if current_ids and i < len(current_ids):
                target_ids.append(current_ids[i])
            else:
                # No ID known for this textbox -> no ground truth available
                target_ids.append(-1)
    if not target_prompts:
        return pd.DataFrame([{"Error": "Please enter at least one target instruction."}])
    # 2. Determine which model slots are populated
    active_models = []
    if name1 and data1: active_models.append((name1, data1))
    if name2 and data2: active_models.append((name2, data2))
    if not active_models:
        return pd.DataFrame([{"Error": "Please select at least one model above."}])
    results = []
    for m_name, m_history in active_models:
        # --- Prepare the context (history) ---
        # Extract prompt/score pairs; the model caps context length at 256.
        context_prompts = []
        context_labels = []
        # Drop malformed entries, then keep at most the first 256
        valid_items = [x for x in m_history if isinstance(x.get('score'), (int, float)) and x.get('prompt')]
        # m_history is assumed already randomly sampled (load_real_model_data)
        batch_context = valid_items[:256]
        for item in batch_context:
            context_prompts.append(item['prompt'])
            # Scores are already normalized to [0, 1] upstream; binarize at
            # 0.5 into 0/1 labels for the classifier head.
            score_val = item['score']
            label = 1 if score_val > 0.5 else 0
            context_labels.append(label)
        if not context_prompts:
            results.append({"Model": m_name, "Error": "No valid context data found."})
            continue
        # --- Fetch ground truth when available (never for custom uploads) ---
        real_perf_map = {}
        if current_dataset and "Custom:" not in m_name:
            real_perf_map = load_performance_data(m_name, current_dataset)
        # --- Run V0 prediction ---
        try:
            # V0 scores all targets in one call, sharing the same context
            pred_scores = v0_model.predict(
                context_prompts=context_prompts,
                context_labels=context_labels,
                target_prompts=target_prompts
            )
        except Exception as e:
            # Fall back to zeros so the results table still renders.
            print(f"Inference error for {m_name}: {e}")
            pred_scores = [0.0] * len(target_prompts)
        # --- Format the result rows ---
        for idx, t_text in enumerate(target_prompts):
            cur_id = target_ids[idx]
            # Actual score (only when the sampled ID is still valid and known)
            if cur_id != -1 and cur_id in real_perf_map:
                real_val = real_perf_map[cur_id]
                real_mean_10_str = f"{real_val:.2f}"
            else:
                real_mean_10_str = "N/A"
            # Predicted score and its binarized class at the 0.5 threshold
            final_score = float(pred_scores[idx])
            pred_str = "✔ Success" if final_score > 0.5 else "✘ Failure"
            results.append({
                "Model": m_name,
                "Instruction": t_text,
                "Actual mean@10": real_mean_10_str,
                "Predicted Score": f"{final_score:.4f}",
                "Predicted Class": pred_str
            })
    return pd.DataFrame(results)
# ==========================================
# 2. CSS Styles
# ==========================================
# NOTE: the string below is passed verbatim to gr.Blocks(css=...); its content
# (including the CSS comments inside it) is runtime data, not Python comments.
css = """
/* 全局变量 */
:root {
--primary: #10b981;
--primary-light: #ecfdf5;
--primary-dark: #047857;
--bg-card: #ffffff;
--border-sub: #e5e7eb;
--text-main: #1f2937;
--text-sub: #6b7280;
--success: #10b981;
--fail: #ef4444;
--warning: #f59e0b;
}
.dark {
--bg-card: #1f2937;
--border-sub: #374151;
--text-main: #f3f4f6;
--text-sub: #9ca3af;
--primary-light: #064e3b;
--primary-dark: #ecfdf5;
}
/* 顶部 Banner */
.concept-banner {
background: linear-gradient(135deg, rgba(16, 185, 129, 0.08) 0%, rgba(59, 130, 246, 0.05) 100%);
border: 1px solid var(--primary-light);
border-radius: 12px;
padding: 24px;
text-align: center;
margin-bottom: 20px;
}
.concept-title {
font-size: 1.8em; font-weight: 700; color: var(--text-main); margin-bottom: 8px;
}
.concept-subtitle { font-size: 1em; color: var(--text-sub); max-width: 600px; margin: 0 auto; line-height: 1.5; }
/* 步骤标题 */
.step-header {
display: flex; align-items: center; margin-bottom: 15px;
border-bottom: 2px solid var(--border-sub); padding-bottom: 10px; margin-top: 20px;
}
.step-num {
background: var(--primary); color: white; width: 28px; height: 28px;
border-radius: 50%; display: flex; align-items: center; justify-content: center;
font-weight: bold; margin-right: 10px; font-size: 0.9em;
}
.step-title { font-size: 1.2em; font-weight: 600; color: var(--text-main); }
.step-desc { font-size: 0.93em; color: var(--text-sub); margin-left: auto; font-style: italic;}
/* --- 模型选择按钮容器 --- */
.model-select-container {
display: flex !important;
justify-content: flex-start !important;
flex-wrap: wrap !important;
align-items: center !important;
gap: 10px !important;
width: 100% !important;
margin-bottom: 20px !important;
}
/* 按钮通用样式 */
button.model-btn {
background: linear-gradient(145deg, #ecfdf5 0%, #d1fae5 100%);
border: 1px solid #6ee7b7 !important;
color: #065f46 !important;
font-weight: 600 !important;
font-size: 0.9em !important;
border-radius: 12px !important;
box-shadow: 0 2px 4px rgba(16, 185, 129, 0.1);
transition: all 0.2s ease;
height: 46px !important;
min-height: 46px !important;
white-space: nowrap;
padding: 0 30px !important;
min-width: 220px !important;
text-align: center !important;
flex-grow: 0 !important;
width: auto !important;
}
.dark button.model-btn {
background: linear-gradient(145deg, #064e3b 0%, #065f46 100%);
border-color: #059669 !important;
color: #ecfdf5 !important;
}
button.model-btn:hover {
transform: translateY(-2px);
box-shadow: 0 4px 8px rgba(16, 185, 129, 0.25);
background: linear-gradient(145deg, #d1fae5 0%, #a7f3d0 100%);
}
.dark button.model-btn:hover {
background: linear-gradient(145deg, #059669 0%, #047857 100%) !important;
color: #ffffff !important;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.4);
}
/* --- 卡片样式 --- */
.model-card {
background: var(--bg-card); border: 1px solid var(--border-sub);
border-radius: 10px; padding: 16px; margin-bottom: 15px; height: 100%;
transition: all 0.2s; position: relative; overflow: hidden;
min-height: 200px;
}
.model-card.populated { border-left: 5px solid var(--primary); box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05); }
.model-card.empty {
border: 2px dashed var(--border-sub);
display: flex; flex-direction: column; justify-content: center; align-items: center;
opacity: 0.6;
}
.card-title { font-weight: bold; color: var(--text-sub); font-size: 1.1em; }
.card-subtitle { font-size: 0.85em; color: var(--text-sub); margin-top: 5px;}
.card-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 12px; }
.model-name { font-weight: bold; font-size: 1.05em; color: var(--text-main); }
.acc-badge { background: var(--primary-light); color: var(--primary-dark); font-size: 0.75em; padding: 3px 8px; border-radius: 12px; font-weight: 700; }
.history-container { display: flex; flex-direction: column; gap: 6px; }
.history-row { display: flex; align-items: center; background: rgba(0,0,0,0.03); padding: 5px 8px; border-radius: 6px; }
.status-box {
width: 20px; height: 20px; border-radius: 5px; display: flex; align-items: center; justify-content: center;
color: white; font-size: 0.7em; font-weight: bold; margin-right: 8px; flex-shrink: 0;
}
.status-green { background-color: var(--success); }
.status-red { background-color: var(--fail); }
.status-yellow { background-color: var(--warning); }
.prompt-text {
flex: 1;
min-width: 0;
font-size: 0.85em; color: var(--text-main);
white-space: nowrap; overflow: hidden; text-overflow: ellipsis;
}
.history-more { font-size: 0.8em; color: var(--text-sub); text-align: center; font-style: italic; margin-top: 4px; }
/* 按钮和链接 */
.custom-btn { font-weight: bold !important; font-size: 1.1em !important; }
.sample-btn {
border: 1px dashed rgba(16, 185, 129, 0.3) !important;
color: var(--primary-dark) !important;
background: rgba(16, 185, 129, 0.05) !important;
}
.sample-btn:hover {
background: rgba(16, 185, 129, 0.15) !important;
}
.paper-link {
font-size: 0.5em; vertical-align: middle; color: var(--primary);
text-decoration: none; border: 1px solid var(--primary); padding: 4px 10px;
border-radius: 15px; font-weight: normal; transition: all 0.2s; background: transparent;
}
.paper-link:hover { background: var(--primary); color: white; }
/* Custom Upload Styling */
.upload-container {
display: flex; align-items: center; gap: 10px; margin-bottom: 20px;
padding: 8px 12px; background: rgba(0,0,0,0.02); border-radius: 8px;
border: 1px dashed var(--border-sub);
}
.upload-label { font-size: 0.9em; font-weight: 600; color: var(--text-sub); white-space: nowrap;}
.code-inline {
font-family: monospace; background: rgba(0,0,0,0.05); padding: 2px 4px;
border-radius: 4px; font-size: 0.85em; color: var(--primary-dark);
}
"""
# ==========================================
# 3. UI Construction
# ==========================================
with gr.Blocks(theme=gr.themes.Soft(primary_hue="emerald"), css=css, title="V0 Predictor") as demo:
    # Preload a default model into the left slot and an initial instruction
    # batch from the first dataset.
    initial_model = TARGET_MODELS[2]  # DeepSeek-R1-Distill-Qwen-1.5B
    initial_data = load_real_model_data(initial_model)
    init_in1, init_in2, init_in3, init_in4, init_ids, init_ds_name = load_dataset_batch(TARGET_DATASETS[0])
    # Per-session state: pointer (which slot the next selection fills, 0/1),
    # two (data, name) slot pairs, the sampled instruction IDs, and the
    # dataset they came from.
    state_pointer = gr.State(value=1)
    state_data1 = gr.State(value=initial_data)
    state_name1 = gr.State(value=initial_model)
    state_data2 = gr.State(value=None)
    state_name2 = gr.State(value=None)
    state_current_ids = gr.State(value=init_ids)
    state_current_dataset = gr.State(value=init_ds_name)
    gr.HTML("""
    <div class="concept-banner">
        <div class="concept-title">
            Generalist Value Model V<sub>0</sub>
            <a href="https://arxiv.org/abs/2602.03584" class="paper-link">Paper ↗</a>
            <a href="https://github.com/Now-Join-Us/V0" class="paper-link">Code ↗</a>
        </div>
        <div class="concept-subtitle">
            <span style="color: var(--primary); font-weight: bold;">Function:</span> V<sub>0</sub> uses a model's historical performance to predict<br>
            how it will perform on unseen instructions without running the model itself.
        </div>
    </div>
    """)
    # STEP 1: MODEL SELECTION
    gr.HTML("""
    <div class="step-header">
        <div class="step-num">1</div>
        <div class="step-title">Select Model (with instruction-performance pairs)</div>
        <div class="step-desc">Click buttons below to load real pairs, or upload your own.</div>
    </div>
    """)
    with gr.Row(elem_classes=["model-select-container"]):
        model_buttons = []
        for m_name in TARGET_MODELS:
            btn = gr.Button(m_name, elem_classes=["model-btn"], scale=0)
            model_buttons.append((btn, m_name))
    with gr.Row():
        with gr.Column(variant="panel"):
            slot1_html = gr.HTML(format_model_card(initial_data, initial_model))
        with gr.Column(variant="panel"):
            slot2_html = gr.HTML(format_model_card(None, None, True))
    with gr.Row(elem_classes=["upload-container"]):
        gr.HTML(
            f"""<div class='upload-label'>
            [Optional] Upload Custom Model (.jsonl) &nbsp;|&nbsp;
            <span class='code-inline'>Format: {{"prompt": "...", "score": 0.8 }}\\n{{"prompt": "...", "score": 0.2}} ...</span>
            </div>"""
        )
        upload_btn = gr.File(
            file_count="single",
            file_types=[".jsonl"],
            label=None,
            show_label=False,
            container=False,
            height=30,
            scale=1
        )
    # STEP 2: INSTRUCTION INPUT
    gr.HTML("""
    <div class="step-header" style="margin-top: 40px;">
        <div class="step-num">2</div>
        <div class="step-title">Select Instructions (to predict how well the above models perform on them)</div>
        <div class="step-desc">Click to randomly sample 4 from the corresponding dataset.</div>
    </div>
    """)
    with gr.Row(elem_classes=["model-select-container"]):
        dataset_buttons = []
        for d_name in TARGET_DATASETS:
            d_btn = gr.Button(d_name, elem_classes=["model-btn"], scale=0)
            dataset_buttons.append((d_btn, d_name))
    with gr.Row():
        t1 = gr.Textbox(label="Instruction 1", value=init_in1, lines=2)
        t2 = gr.Textbox(label="Instruction 2", value=init_in2, lines=2)
    with gr.Row():
        t3 = gr.Textbox(label="Instruction 3", value=init_in3, lines=2)
        t4 = gr.Textbox(label="Instruction 4", value=init_in4, lines=2)
    gr.HTML("<div style='height: 8px;'></div>")
    # STEP 3: PREDICTION
    with gr.Row():
        with gr.Column():
            predict_btn = gr.Button("Run V₀ Prediction", variant="primary", size="lg", elem_classes=["custom-btn"])
    gr.HTML("""
    <div class="step-header" style="margin-top: 20px; border-bottom: none;">
        <div class="step-num">3</div>
        <div class="step-title">Results</div>
        <div class="step-desc">Limited to 16 context size (vs. standard 256) by HF CPU limits; GPU enables ms-level speeds.</div>
    </div>
    """)
    output_df = gr.Dataframe(
        headers=["Model", "Instruction", "Actual mean@10", "Predicted Score", "Predicted Class"],
        datatype=["str", "str", "str", "str", "str"],
        interactive=False,
        column_widths=["20%", "44%", "10%", "13%", "13%"]
    )
    # Event Binding
    def make_click_handler(model_name):
        # Bind model_name via closure so each button loads its own model
        # (avoids the classic late-binding-lambda-in-a-loop bug).
        return dict(
            fn=lambda ptr, d1, n1, d2, n2: select_model(model_name, ptr, d1, n1, d2, n2),
            inputs=[state_pointer, state_data1, state_name1, state_data2, state_name2],
            outputs=[slot1_html, slot2_html, state_pointer, state_data1, state_name1, state_data2, state_name2]
        )
    for btn, name in model_buttons:
        btn.click(**make_click_handler(name))
    upload_btn.upload(
        fn=handle_custom_upload,
        inputs=[upload_btn, state_pointer, state_data1, state_name1, state_data2, state_name2],
        outputs=[slot1_html, slot2_html, state_pointer, state_data1, state_name1, state_data2, state_name2]
    )
    def make_dataset_handler(ds_name):
        # Bind ds_name via closure; the handler re-samples 4 instructions
        # and refreshes the ID/dataset state alongside the textboxes.
        def handler():
            i1, i2, i3, i4, ids, d_name = load_dataset_batch(ds_name)
            return i1, i2, i3, i4, ids, d_name
        return dict(
            fn=handler,
            inputs=None,
            outputs=[t1, t2, t3, t4, state_current_ids, state_current_dataset],
            show_progress="hidden"
        )
    for d_btn, d_name in dataset_buttons:
        d_btn.click(**make_dataset_handler(d_name))
    # A manual edit invalidates that textbox's sampled ID so stale ground
    # truth is not shown for a hand-written instruction.
    t1.input(fn=lambda ids: invalidate_specific_id(ids, 0), inputs=[state_current_ids], outputs=[state_current_ids])
    t2.input(fn=lambda ids: invalidate_specific_id(ids, 1), inputs=[state_current_ids], outputs=[state_current_ids])
    t3.input(fn=lambda ids: invalidate_specific_id(ids, 2), inputs=[state_current_ids], outputs=[state_current_ids])
    t4.input(fn=lambda ids: invalidate_specific_id(ids, 3), inputs=[state_current_ids], outputs=[state_current_ids])
    predict_btn.click(
        fn=predict_performance,
        inputs=[state_data1, state_name1, state_data2, state_name2, t1, t2, t3, t4, state_current_ids, state_current_dataset],
        outputs=[output_df]
    )

if __name__ == "__main__":
    demo.launch()