import gradio as gr
import pandas as pd
# ==========================================
# Model type definitions
# ==========================================
CLOSED_SOURCE_MODELS = {"Claude Sonnet 4", "Gemini 2.5 Flash", "Gemini 2.5 Pro"}
OPEN_SOURCE_MODELS = {"Gemma3-4B", "Qwen3-VL-8B-Inst.", "Qwen3-VL-235B-Inst."}

# Inference date recorded for each model.
MODEL_DATES = {
    "Claude Sonnet 4": "2026-02-10",
    "Gemini 2.5 Flash": "2026-02-10",
    "Gemini 2.5 Pro": "2026-02-10",
    "Gemma3-4B": "2026-02-10",
    "Qwen3-VL-8B-Inst.": "2026-02-10",
    "Qwen3-VL-235B-Inst.": "2026-02-10",
}


def _build_records(task_columns, score_rows):
    """Expand compact ``(model, split, scores)`` rows into one dict per row.

    Args:
        task_columns: Ordered task column names; ``scores`` follow this order.
        score_rows: Iterable of ``(model, split, scores)`` tuples.

    Returns:
        A list of dicts keyed ``"Model"``, ``"Split"`` and then every entry
        of ``task_columns``, in that order.

    Raises:
        ValueError: If a row does not carry exactly one score per column,
            which would otherwise silently shift or drop values.
    """
    records = []
    for model, split, scores in score_rows:
        if len(scores) != len(task_columns):
            raise ValueError(
                f"{model} / {split}: expected {len(task_columns)} scores, "
                f"got {len(scores)}"
            )
        records.append(
            {"Model": model, "Split": split, **dict(zip(task_columns, scores))}
        )
    return records


# ==========================================
# 1. Data for the Image-Only tasks (Table 1)
# ==========================================
_IMAGE_TASK_COLUMNS = (
    "1/ Task-Aware VQA\n(a) UI State Rec. (IT)",
    "2/ UI Compr.\n(IT)",
    "3/ Action\n(IT)",
    "4/ State Transition\n(IT)",
    "4/ State Transition\n(II)",
    "5/ Verification\n(a) Planning (IT)",
    "5/ Verification\n(b) Expected State (IT)",
    "5/ Verification\n(b) Expected State (II)",
)

# Scores are listed in _IMAGE_TASK_COLUMNS order.
_IMAGE_SCORE_ROWS = (
    ("Claude Sonnet 4", "Open", (57.89, 62.16, 67.74, 68.42, 54.74, 73.33, 87.50, 43.75)),
    ("Claude Sonnet 4", "Held", (53.18, 55.81, 68.98, 77.83, 60.18, 72.73, 72.22, 45.45)),
    ("Claude Sonnet 4", "Overall", (54.59, 57.72, 68.61, 75.00, 58.54, 72.92, 76.92, 44.90)),
    ("Gemini 2.5 Flash", "Open", (68.42, 83.78, 75.27, 75.79, 44.21, 80.00, 62.50, 37.50)),
    ("Gemini 2.5 Flash", "Held", (68.91, 67.44, 72.22, 74.21, 39.82, 81.82, 80.56, 30.30)),
    ("Gemini 2.5 Flash", "Overall", (68.77, 72.36, 73.14, 74.68, 41.14, 81.25, 75.00, 32.65)),
    ("Gemini 2.5 Pro", "Open", (66.67, 86.49, 73.12, 72.63, 40.00, 60.00, 75.00, 18.75)),
    ("Gemini 2.5 Pro", "Held", (72.28, 69.77, 71.30, 74.66, 43.44, 78.79, 77.78, 45.45)),
    ("Gemini 2.5 Pro", "Overall", (70.60, 74.80, 71.84, 74.05, 42.41, 72.92, 76.92, 36.73)),
    ("Gemma3-4B", "Open", (46.49, 43.24, 50.54, 68.42, 22.11, 53.33, 43.75, 18.75)),
    ("Gemma3-4B", "Held", (38.20, 41.28, 50.46, 57.01, 28.51, 48.48, 58.33, 24.24)),
    ("Gemma3-4B", "Overall", (40.68, 41.87, 50.49, 60.44, 26.58, 50.00, 53.85, 22.45)),
    ("Qwen3-VL-8B-Inst.", "Open", (68.42, 78.38, 66.67, 68.42, 31.58, 66.67, 68.75, 25.00)),
    ("Qwen3-VL-8B-Inst.", "Held", (60.30, 59.88, 64.81, 68.78, 28.96, 57.58, 69.44, 12.12)),
    ("Qwen3-VL-8B-Inst.", "Overall", (62.73, 65.45, 65.37, 68.67, 29.75, 60.42, 69.23, 16.33)),
    ("Qwen3-VL-235B-Inst.", "Open", (70.18, 78.38, 65.59, 70.53, 30.53, 66.67, 81.25, 50.00)),
    ("Qwen3-VL-235B-Inst.", "Held", (71.16, 69.19, 68.98, 70.59, 33.94, 78.79, 80.56, 48.48)),
    ("Qwen3-VL-235B-Inst.", "Overall", (70.87, 71.95, 67.96, 70.57, 32.91, 75.00, 80.77, 48.98)),
)

image_only_data = _build_records(_IMAGE_TASK_COLUMNS, _IMAGE_SCORE_ROWS)

# ==========================================
# 2. Data for the Video-Included tasks (Table 2)
# ==========================================
_VIDEO_TASK_COLUMNS = (
    "1/ Task-Aware VQA\n(b) Goal Reasoning (VT)",
    "3/ Action\n(VT)",
    "3/ Action\n(IV)",
    "3/ Action\n(VV)",
    "4/ State Transition\n(VT)",
    "4/ State Transition\n(VI)",
    "5/ Verification\n(a) Planning (VT)",
    "5/ Verification\n(a) Planning (IV)",
    "5/ Verification\n(a) Planning (VV)",
    "5/ Verification\n(b) Expected State (VT)",
    "5/ Verification\n(b) Expected State (VI)",
)

# Scores are listed in _VIDEO_TASK_COLUMNS order.
_VIDEO_SCORE_ROWS = (
    ("Gemini 2.5 Flash", "Open",
     (73.17, 63.44, 54.84, 46.24, 64.21, 48.42, 66.67, 40.00, 46.67, 68.75, 43.75)),
    ("Gemini 2.5 Flash", "Held",
     (73.40, 67.77, 61.97, 43.48, 68.78, 51.13, 80.00, 60.61, 36.36, 69.44, 33.33)),
    ("Gemini 2.5 Flash", "Overall",
     (73.33, 66.45, 59.80, 44.33, 67.41, 50.32, 76.00, 54.17, 39.58, 69.23, 36.73)),
    ("Gemini 2.5 Pro", "Open",
     (70.73, 66.67, 62.37, 52.69, 63.16, 51.58, 66.67, 40.00, 60.00, 75.00, 62.50)),
    ("Gemini 2.5 Pro", "Held",
     (76.60, 70.14, 60.09, 46.38, 69.23, 55.20, 82.86, 54.55, 54.55, 72.22, 39.39)),
    ("Gemini 2.5 Pro", "Overall",
     (74.81, 69.08, 60.78, 48.33, 67.41, 54.11, 78.00, 50.00, 56.25, 73.08, 46.94)),
    ("Qwen3-VL-8B-Inst.", "Open",
     (48.78, 54.84, 41.94, 22.58, 60.00, 58.95, 66.67, 40.00, 26.67, 50.00, 43.75)),
    ("Qwen3-VL-8B-Inst.", "Held",
     (68.09, 56.40, 39.91, 16.91, 65.16, 48.42, 45.71, 30.30, 15.15, 75.00, 45.45)),
    ("Qwen3-VL-8B-Inst.", "Overall",
     (62.22, 55.92, 40.52, 18.67, 63.61, 51.58, 52.00, 33.33, 18.75, 67.31, 44.90)),
    ("Qwen3-VL-235B-Inst.", "Open",
     (56.10, 61.29, 48.39, 25.81, 65.26, 63.16, 53.33, 26.67, 26.67, 68.75, 56.25)),
    ("Qwen3-VL-235B-Inst.", "Held",
     (75.53, 59.72, 51.64, 22.22, 65.61, 57.92, 65.71, 33.33, 39.39, 77.78, 45.45)),
    ("Qwen3-VL-235B-Inst.", "Overall",
     (69.63, 60.20, 50.65, 23.33, 65.51, 59.49, 62.00, 31.25, 35.42, 75.00, 48.98)),
)

video_included_data = _build_records(_VIDEO_TASK_COLUMNS, _VIDEO_SCORE_ROWS)

df_image = pd.DataFrame(image_only_data)
df_video = pd.DataFrame(video_included_data)

# ==========================================
# 3. Shortened column names (display only)
# ==========================================
IMAGE_COL_RENAME = {
    "1/ Task-Aware VQA\n(a) UI State Rec. (IT)": "1.a UI State Rec.\n(IT)",
    "2/ UI Compr.\n(IT)": "2. UI Compr.\n(IT)",
    "3/ Action\n(IT)": "3. Action\n(IT)",
    "4/ State Transition\n(IT)": "4. State Trans.\n(IT)",
    "4/ State Transition\n(II)": "4. State Trans.\n(II)",
    "5/ Verification\n(a) Planning (IT)": "5.a Verif. Plan.\n(IT)",
    "5/ Verification\n(b) Expected State (IT)": "5.b Verif. State\n(IT)",
    "5/ Verification\n(b) Expected State (II)": "5.b Verif. State\n(II)",
}

VIDEO_COL_RENAME = {
    "1/ Task-Aware VQA\n(b) Goal Reasoning (VT)": "1.b Goal Reas.\n(VT)",
    "3/ Action\n(VT)": "3. Action\n(VT)",
    "3/ Action\n(IV)": "3. Action\n(IV)",
    "3/ Action\n(VV)": "3. Action\n(VV)",
    "4/ State Transition\n(VT)": "4. State Trans.\n(VT)",
    "4/ State Transition\n(VI)": "4. State Trans.\n(VI)",
    "5/ Verification\n(a) Planning (VT)": "5.a Verif. Plan.\n(VT)",
    "5/ Verification\n(a) Planning (IV)": "5.a Verif. Plan.\n(IV)",
    "5/ Verification\n(a) Planning (VV)": "5.a Verif. Plan.\n(VV)",
    "5/ Verification\n(b) Expected State (VT)": "5.b Verif. State\n(VT)",
    "5/ Verification\n(b) Expected State (VI)": "5.b Verif. State\n(VI)",
}
# ==========================================
# 4. Helper functions
# ==========================================
# Templates applied to the formatted best / second-best score of a column.
# NOTE(review): in the source as received, the top-1 and top-2 branches emit
# the bare number, identical to every other cell, although the accompanying
# comments mention gold/silver backgrounds. Presumably the markup was lost;
# confirm and restore it here if so. Both templates are identity for now so
# the rendered output is unchanged.
_TOP1_TEMPLATE = "{}"
_TOP2_TEMPLATE = "{}"
_MISSING_CELL = "-"
# Columns that are never treated as task scores.
_META_COLUMNS = ("Model", "Split", "Rank")
# Row order used for each model in "Show All" mode.
_SPLIT_DISPLAY_ORDER = ("Overall", "Held", "Open")


def get_model_label(model_name):
    """Return the model name decorated with a type icon and its date.

    Closed-source models get a lock icon, open-source models a globe icon,
    and unknown models no icon. When ``MODEL_DATES`` has an entry for the
    model, the date is appended in parentheses on a second line.
    """
    date_str = MODEL_DATES.get(model_name, "")
    date_line = f"\n({date_str})" if date_str else ""
    if model_name in CLOSED_SOURCE_MODELS:
        prefix = "🔒 "
    elif model_name in OPEN_SOURCE_MODELS:
        prefix = "🌐 "
    else:
        prefix = ""
    return f"{prefix}{model_name}{date_line}"


def _top_two(values):
    """Return the largest and second-largest distinct values (None if absent)."""
    distinct = sorted(set(values), reverse=True)
    best = distinct[0] if distinct else None
    runner_up = distinct[1] if len(distinct) > 1 else None
    return best, runner_up


def _format_score(value, best=None, runner_up=None):
    """Format one score with two decimals; missing values become ``"-"``."""
    if pd.isnull(value):
        return _MISSING_CELL
    text = f"{value:.2f}"
    if best is not None and value == best:
        return _TOP1_TEMPLATE.format(text)
    if runner_up is not None and value == runner_up:
        return _TOP2_TEMPLATE.format(text)
    return text


def _format_score_column(scores, ranked_flags=None):
    """Format a score column, ranking only the rows flagged in ``ranked_flags``.

    With ``ranked_flags=None`` every row takes part in the top-2 ranking.
    As in the original logic, ranking is applied only when at least two
    non-null ranked values exist.
    """
    flags = [True] * len(scores) if ranked_flags is None else list(ranked_flags)
    contenders = [v for v, ranked in zip(scores, flags) if ranked and pd.notnull(v)]
    best, runner_up = _top_two(contenders) if len(contenders) >= 2 else (None, None)
    cells = [
        _format_score(v, best, runner_up) if ranked else _format_score(v)
        for v, ranked in zip(scores, flags)
    ]
    return pd.Series(cells, index=scores.index, dtype=object)


def _build_show_all_table(df, score_cols):
    """Build the "Show All" table: models ranked by their Overall average.

    Each model contributes its Overall, Held and Open rows (those that
    exist), in that order. Only the Overall row carries a rank and average.
    """
    overall = df[df["Split"] == "Overall"].copy()
    overall["Avg"] = overall[score_cols].mean(axis=1).round(2)
    overall = overall.sort_values(by="Avg", ascending=False).reset_index(drop=True)
    overall["Rank"] = range(1, len(overall) + 1)

    avg_by_model = dict(zip(overall["Model"], overall["Avg"]))
    rank_by_model = dict(zip(overall["Model"], overall["Rank"]))

    rows = []
    for model in overall["Model"]:
        model_rows = df[df["Model"] == model]
        for split in _SPLIT_DISPLAY_ORDER:
            match = model_rows[model_rows["Split"] == split]
            if match.empty:
                continue
            row = match.iloc[0].to_dict()
            is_overall = split == "Overall"
            # Held and Open rows deliberately show neither average nor rank.
            row["Avg"] = avg_by_model[model] if is_overall else None
            row["Rank"] = rank_by_model[model] if is_overall else None
            rows.append(row)

    # Passing the columns explicitly keeps "Avg"/"Rank" present even when
    # there is no Overall row at all; the original raised KeyError('Avg').
    columns = list(dict.fromkeys([*df.columns, "Avg", "Rank"]))
    table = pd.DataFrame(rows, columns=columns)

    overall_flags = (table["Split"] == "Overall").tolist()
    for col in score_cols:
        table[col] = _format_score_column(table[col], overall_flags)

    table["Task Avg"] = table["Avg"].apply(
        lambda v: f"{v:.2f}" if pd.notnull(v) else ""
    )
    table["Rank"] = table["Rank"].apply(
        lambda v: f"{int(v)}" if pd.notnull(v) else ""
    )
    return table.drop(columns=["Avg"])


def _build_single_split_table(df, score_cols):
    """Build the regular table: rows sorted by their own average, descending."""
    table = df.copy()
    # The average is taken from the raw numbers, before any string formatting.
    table["Avg"] = df[score_cols].mean(axis=1).round(2)
    table = table.sort_values(by="Avg", ascending=False).reset_index(drop=True)
    for col in score_cols:
        table[col] = _format_score_column(table[col])
    table["Task Avg"] = table["Avg"].apply(lambda v: f"{v:.2f}")
    return table.drop(columns=["Avg"])


def format_dataframe_with_highlights(df, col_rename=None, show_all_mode=False):
    """Turn a raw score table into the display table shown on the leaderboard.

    Scores are formatted with two decimals and missing scores as ``"-"``.
    A ``"Task Avg"`` column (row mean of the score columns) is added, and
    model names are decorated via ``get_model_label``. The best and
    second-best value of every score column are passed through
    ``_TOP1_TEMPLATE`` / ``_TOP2_TEMPLATE``.

    Args:
        df: Input DataFrame with a ``"Model"`` column, an optional
            ``"Split"`` column and one column per task score. It is not
            modified.
        col_rename: Optional mapping of column names to display names,
            applied last.
        show_all_mode: When True and ``df`` has a ``"Split"`` column, models
            are ranked by the average of their Overall row and every model
            is expanded into Overall/Held/Open rows. Only Overall rows get a
            rank, an average and top-2 treatment.

    Returns:
        A new DataFrame of strings. Columns are ``Rank, Model, Split,
        Task Avg, <scores>`` in show-all mode and ``Model, [Split,]
        Task Avg, <scores>`` otherwise. In show-all mode an input without
        any Overall row yields an empty table with those columns.
    """
    score_cols = [c for c in df.columns if c not in _META_COLUMNS]

    if show_all_mode and "Split" in df.columns:
        table = _build_show_all_table(df, score_cols)
        leading = ["Rank", "Model", "Split", "Task Avg"]
    else:
        table = _build_single_split_table(df, score_cols)
        leading = ["Model"]
        if "Split" in table.columns:
            leading.append("Split")
        leading.append("Task Avg")

    table["Model"] = table["Model"].apply(get_model_label)
    table = table[leading + [c for c in table.columns if c not in leading]]

    if col_rename:
        table = table.rename(columns=col_rename)
    return table


def filter_data(split_choice):
    """Return the formatted (image, video) tables for the selected split.

    ``"Show All"`` yields the ranked, expanded view of every split. Any other
    value keeps only rows whose ``"Split"`` equals it and drops that column.
    """
    if split_choice == "Show All":
        return (
            format_dataframe_with_highlights(df_image, IMAGE_COL_RENAME, show_all_mode=True),
            format_dataframe_with_highlights(df_video, VIDEO_COL_RENAME, show_all_mode=True),
        )

    image_rows = df_image[df_image["Split"] == split_choice].drop(columns=["Split"])
    video_rows = df_video[df_video["Split"] == split_choice].drop(columns=["Split"])
    return (
        format_dataframe_with_highlights(image_rows, IMAGE_COL_RENAME),
        format_dataframe_with_highlights(video_rows, VIDEO_COL_RENAME),
    )
# ==========================================
# 5. Custom CSS - one plain, consistent look
# ==========================================
# Stylesheet text for the app. Only the whitespace layout was restored here;
# the whitespace-normalised text (including the original in-CSS comments) is
# the same as before.
custom_css = """
/* 全局样式 */
.gradio-container {
    max-width: 1400px !important;
    margin: auto !important;
    font-family: 'Segoe UI', system-ui, -apple-system, sans-serif !important;
}

/* 标题区域 */
.title-container {
    text-align: center;
    padding: 2rem 1rem 1rem 1rem;
    background: #1e293b;
    border-radius: 12px;
    margin-bottom: 1rem;
}
.title-container h1 {
    color: #f1f5f9 !important;
    font-size: 2rem !important;
    font-weight: 700 !important;
    margin: 0 !important;
    letter-spacing: -0.3px;
}

/* 简介文字 - 左对齐等宽 */
.intro-text {
    padding: 1rem 0;
    font-size: 0.95rem;
    color: #475569;
    line-height: 1.7;
}
.intro-text strong {
    color: #1e293b;
}

/* 表格注释 */
.table-note {
    font-size: 0.85rem;
    color: #64748b;
    margin-bottom: 0.5rem;
    padding: 0.5rem 0;
}

/* 表格容器 - 修复滚动问题 */
.table-container {
    overflow-x: auto;
    border-radius: 8px;
    border: 1px solid #e2e8f0;
}

/* 表格样式 */
.dataframe {
    font-size: 13px !important;
}
.dataframe table {
    border-collapse: collapse !important;
    width: 100% !important;
}
.dataframe thead th {
    background: #f8fafc !important;
    color: #1e293b !important;
    font-weight: 600 !important;
    font-size: 11px !important;
    padding: 12px 8px !important;
    text-align: center !important;
    white-space: pre-line !important;
    line-height: 1.4 !important;
    border-bottom: 2px solid #e2e8f0 !important;
    position: sticky;
    top: 0;
    z-index: 10;
}
.dataframe tbody td {
    padding: 10px 8px !important;
    text-align: center !important;
    border-bottom: 1px solid #f1f5f9 !important;
    font-size: 12px !important;
    color: #334155 !important;
}
.dataframe tbody tr:hover {
    background-color: #f8fafc !important;
}

/* 最后一行增加底部边距 */
.dataframe tbody tr:last-child td {
    padding-bottom: 16px !important;
}

/* 第一列(Model)特殊样式 */
.dataframe tbody td:first-child,
.dataframe thead th:first-child {
    font-weight: 600 !important;
    text-align: left !important;
    padding-left: 14px !important;
    white-space: nowrap !important;
    min-width: 200px !important;
    background: #fafbfc !important;
    position: sticky;
    left: 0;
    z-index: 5;
}
.dataframe thead th:first-child {
    z-index: 15;
}

/* 第二列样式 (Model in Show All / Task Avg in normal) */
.dataframe tbody td:nth-child(2),
.dataframe thead th:nth-child(2) {
    min-width: 120px !important;
    white-space: nowrap !important;
}

/* 第三列样式 (Split in Show All) */
.dataframe tbody td:nth-child(3),
.dataframe thead th:nth-child(3) {
    min-width: 110px !important;
    white-space: nowrap !important;
}

/* 第四列样式 (Task Avg in Show All) */
.dataframe tbody td:nth-child(4),
.dataframe thead th:nth-child(4) {
    min-width: 100px !important;
    white-space: nowrap !important;
}

/* Tab 样式 */
.tabs {
    margin-top: 0.5rem;
}
button.selected {
    background: #1e293b !important;
    color: white !important;
    font-weight: 600 !important;
}

/* 底部图例 */
.legend-box {
    background: #f8fafc;
    border-radius: 8px;
    padding: 1.25rem;
    margin-top: 1.5rem;
    border: 1px solid #e2e8f0;
}
.legend-box h3 {
    color: #1e293b;
    margin: 0 0 0.75rem 0;
    font-size: 0.95rem;
    font-weight: 600;
}
.legend-grid {
    display: flex;
    flex-wrap: wrap;
    gap: 1rem 2rem;
}
.legend-item {
    display: flex;
    align-items: center;
    gap: 0.4rem;
    font-size: 0.85rem;
    color: #64748b;
}
.legend-item code {
    background: #e2e8f0;
    padding: 2px 6px;
    border-radius: 4px;
    font-weight: 600;
    color: #334155;
    font-size: 0.8rem;
}

/* 页脚 */
.footer {
    text-align: center;
    padding: 1.5rem 0;
    color: #94a3b8;
    font-size: 0.85rem;
    border-top: 1px solid #e2e8f0;
    margin-top: 2rem;
}
"""
#334155; font-size: 0.8rem; } /* 页脚 */ .footer { text-align: center; padding: 1.5rem 0; color: #94a3b8; font-size: 0.85rem; border-top: 1px solid #e2e8f0; margin-top: 2rem; } """ # ========================================== # 6. 构建 Gradio 界面 # ========================================== with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo: # 标题区域 gr.HTML("""
IT Image→Text,
II Image→Image
VT Video→Text,
IV Image→Video,
VV Video→Video,
VI Video→Image
If you use SWITCH in your research, please cite:
@article{switch2025,
title={{SWITCH}: {B}enchmarking Modeling and Handling of Tangible Interfaces in Long-horizon Embodied Scenarios},
author={Jieru Lin and Zhiwei Yu and Börje F. Karlsson},
journal={arXiv preprint arXiv:2511.17649},
year={2025}
}
baai-agents at baai.ac.cn.