# Page banner captured together with this file: "Spaces: Running" (not part of the program).
import hashlib
import html
import json
import os

import pandas as pd
import plotly.graph_objects as go
import streamlit as st
import streamlit.components.v1 as components
# Page configuration: set wide layout
# NOTE(review): Streamlit documents set_page_config as needing to run before
# other Streamlit commands -- keep it above the CSS injection below.
st.set_page_config(page_title="VideoScience-Bench", layout="wide", initial_sidebar_state="collapsed")
# ===== CSS styling: compact layout and modernized UI =====
# One global <style> block. The classes .rating-container, .rank-badge / .rank-*,
# .rankings-section, .keyword-tag and .keywords-container are referenced by HTML
# emitted later in this file; .metric-card is not referenced in the visible part
# of the file.
st.markdown("""
<style>
/* 1. Reduce top spacing */
.block-container {
padding-top: 1.5rem;
padding-bottom: 2rem;
padding-left: 2rem;
padding-right: 2rem;
}
/* 2. Compress global component spacing */
div[data-testid="stVerticalBlock"] > div {
gap: 0.5rem !important;
}
div[data-testid="stHorizontalBlock"] {
gap: 0.5rem !important;
}
/* 3. Refined styling for Tabs */
.stTabs [data-baseweb="tab-list"] {
gap: 4px;
margin-bottom: 0.5rem;
}
.stTabs [data-baseweb="tab"] {
padding: 4px 12px;
font-size: 14px;
}
/* 4. Dropdown styling refinement for compact appearance */
div[data-baseweb="select"] > div {
min-height: 32px;
padding-top: 0;
padding-bottom: 0;
}
/* 5. Enhanced button styling for play controls */
div.stButton > button {
width: 100%;
border-radius: 6px;
padding: 0.25rem 0.5rem;
line-height: 1.2;
}
/* 6. Container styling for rating bars */
.rating-container {
background-color: #f8f9fa;
border-radius: 6px;
padding: 8px;
margin-top: 0px;
border: 1px solid #eee;
}
/* 7. Metric card styling used in leaderboard */
.metric-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 1rem;
border-radius: 8px;
color: white;
text-align: center;
margin: 0.5rem 0;
}
/* Rank badge styling */
.rank-badge {
display: inline-block;
width: 24px;
height: 24px;
line-height: 24px;
border-radius: 50%;
text-align: center;
font-weight: bold;
font-size: 12px;
margin-right: 6px;
}
.rank-1 { background: linear-gradient(135deg, #FFD700, #FFA500); color: #000; }
.rank-2 { background: linear-gradient(135deg, #C0C0C0, #808080); color: #000; }
.rank-3 { background: linear-gradient(135deg, #CD7F32, #8B4513); color: #fff; }
.rank-other { background: linear-gradient(135deg, #e0e0e0, #bdbdbd); color: #000; }
/* Styling for complete rankings expanders to align with Auto-Judge theme */
.rankings-section div[data-testid="stExpander"] {
background: linear-gradient(90deg, rgba(102, 126, 234, 0.15) 0%, rgba(118, 75, 162, 0.15) 100%) !important;
border: 1px solid rgba(118, 75, 162, 0.3) !important;
border-radius: 10px !important;
color: #f0f4ff !important;
}
/* Larger, centered text for expander headers */
.rankings-section div[data-testid="stExpander"] summary {
font-size: 20px !important;
font-weight: 600 !important;
padding: 16px 0 !important;
display: flex !important;
align-items: center !important;
justify-content: center !important;
text-align: center !important;
line-height: 1.6 !important;
min-height: 56px !important;
width: 100%;
color: #f0f4ff !important;
}
/* Ensure nested text elements in headers remain centered */
.rankings-section div[data-testid="stExpander"] summary p,
.rankings-section div[data-testid="stExpander"] summary span,
.rankings-section div[data-testid="stExpander"] summary div {
margin: 0 auto !important;
display: inline-flex !important;
align-items: center !important;
justify-content: center !important;
line-height: 1.6 !important;
text-align: center !important;
color: #f0f4ff !important;
}
/* Larger metric font inside expanders */
.rankings-section div[data-testid="stExpander"] [data-testid="stMetricValue"] {
font-size: 24px !important;
}
.rankings-section div[data-testid="stExpander"] [data-testid="stMetricLabel"] {
font-size: 16px !important;
}
/* Adjustments for light mode to match leaderboard color scheme */
body[data-theme="light"] .rankings-section div[data-testid="stExpander"] {
background: linear-gradient(90deg, rgba(102, 126, 234, 0.1) 0%, rgba(118, 75, 162, 0.1) 100%) !important;
border: 1px solid rgba(118, 75, 162, 0.25) !important;
color: #1f1f2d !important;
}
body[data-theme="light"] .rankings-section div[data-testid="stExpander"] summary,
body[data-theme="light"] .rankings-section div[data-testid="stExpander"] summary *,
body[data-theme="light"] .rankings-section div[data-testid="stExpander"] [data-testid="stMetricLabel"],
body[data-theme="light"] .rankings-section div[data-testid="stExpander"] [data-testid="stMetricValue"] {
color: #1f1f2d !important;
}
/* Keyword tag styling - Notion-like */
.keyword-tag {
display: inline-block;
padding: 3px 10px;
margin: 3px 4px;
border-radius: 4px;
font-size: 13px;
font-weight: 500;
white-space: nowrap;
transition: transform 0.2s ease;
}
.keyword-tag:hover {
transform: translateY(-1px);
}
.keywords-container {
display: inline-flex;
flex-wrap: wrap;
align-items: center;
justify-content: center;
margin-top: 8px;
gap: 2px;
}
@media (max-width: 768px) {
.keyword-tag {
font-size: 11px;
padding: 2px 8px;
margin: 2px 3px;
}
}
</style>
""", unsafe_allow_html=True)
# ===== Data loading section (actual logic) =====
# Input locations, given as relative paths.
RATINGS_FILE = "ratings.json"
VIDEO_BASE_DIR = "downloaded_videos"
CSV_FILE = "Examples.csv"

# Model identifier (also the per-model video sub-directory name) -> the key
# under which that model's scores are stored in the ratings JSON.
MODEL_NAME_MAP = {
    "bytedance-seedance-1-pro": "seed-dance",
    "kling-v2-5-turbo-pro": "klingv2.5",
    "minimax-hailuo-2.3": "hailuo2.3",
    "ray-2": "ray-2",
    "sora-2": "sora-2",
    "veo3-quality": "veo3",
    "wan2.5-t2v-preview": "wan2.5",
}

# Selectable models, in display order (dicts preserve insertion order).
MODELS = list(MODEL_NAME_MAP)

# (key in the ratings JSON, label shown in the UI) for each rated dimension.
RATING_DIMENSIONS = list({
    "prompt_consistency": "Prompt Consistency",
    "expected_phenomenon": "Phenomenon Congruency",
    "dynamism": "Correct Dynamism",
    "immutability": "Immutability",
    "coherence": "Spatio-Temporal Coherence",
}.items())
def generate_tag_color(keyword):
    """Pick a ``(background, text)`` hex colour pair for *keyword*.

    The pair is selected from a fixed palette by the keyword's MD5 digest,
    so the same keyword always receives the same colours.
    """
    # Notion-inspired palette: (background, text)
    palette = (
        ('#FEE2E2', '#991B1B'),  # Red
        ('#FFEDD5', '#9A3412'),  # Orange
        ('#FEF3C7', '#92400E'),  # Yellow
        ('#D1FAE5', '#065F46'),  # Green
        ('#DBEAFE', '#1E40AF'),  # Blue
        ('#E0E7FF', '#3730A3'),  # Indigo
        ('#F3E8FF', '#6B21A8'),  # Purple
        ('#FCE7F3', '#9F1239'),  # Pink
        ('#E5E7EB', '#374151'),  # Gray
        ('#D1F5FF', '#0369A1'),  # Cyan
    )
    digest = hashlib.md5(keyword.encode()).digest()
    # Big-endian value of the raw digest equals int(hexdigest, 16).
    slot = int.from_bytes(digest, "big") % len(palette)
    return palette[slot]
def render_keywords(keywords_str):
    """Render a comma-separated keyword string as Notion-style HTML tags.

    Args:
        keywords_str: Comma-separated keywords (e.g. ``"optics, refraction"``).
            ``None``, NaN, empty strings and strings containing only
            separators/whitespace are treated as "no keywords".

    Returns:
        An HTML fragment (a ``keywords-container`` div holding one
        ``keyword-tag`` span per keyword), or ``""`` when there is nothing
        to render.
    """
    if not keywords_str or pd.isna(keywords_str):
        return ""
    keywords = [kw.strip() for kw in str(keywords_str).split(',') if kw.strip()]
    if not keywords:
        return ""
    tag_spans = []
    for keyword in keywords:
        # Colour is derived from the raw keyword so it stays stable
        # regardless of escaping.
        bg_color, text_color = generate_tag_color(keyword)
        # The fragment is rendered with unsafe_allow_html=True, so the
        # keyword text (data-file content) must be HTML-escaped.
        tag_spans.append(
            f"""<span class='keyword-tag' style='background-color: {bg_color}; color: {text_color};'>
{html.escape(keyword)}
</span>"""
        )
    return "<div class='keywords-container'>" + "".join(tag_spans) + "</div>"
def load_ratings():
    """Read the ratings JSON stored at ``RATINGS_FILE``.

    Returns:
        The decoded JSON content, or ``{}`` when the file does not exist.
        Any exception raised while reading is reported through ``st.error``
        and also results in ``{}``.
    """
    try:
        if not os.path.exists(RATINGS_FILE):
            return {}
        with open(RATINGS_FILE, 'r', encoding='utf-8') as ratings_file:
            return json.load(ratings_file)
    except Exception as e:
        st.error(f"Error loading ratings: {e}")
        return {}
def _cell_or_default(row, column, default):
    """Return ``row[column]``, or *default* when the column is absent or the cell is empty (NaN)."""
    value = row.get(column, default)
    return default if pd.isna(value) else value
def load_csv_data():
    """Load per-example metadata from ``CSV_FILE``.

    Rows without a ``Unique ID`` are skipped. Empty cells fall back to the
    same defaults used when a column is missing, so NaN never leaks into the
    UI as the text "nan".

    Returns:
        Dict mapping ``int(Unique ID)`` to a dict with the keys ``prompt``,
        ``expected``, ``topic`` and ``keywords``. Returns ``{}`` when the file
        does not exist; any exception is reported through ``st.error`` and
        also results in ``{}``.
    """
    try:
        if not os.path.exists(CSV_FILE):
            return {}
        # utf-8-sig drops a leading BOM if the file has one.
        df = pd.read_csv(CSV_FILE, encoding='utf-8-sig')
        data_map = {}
        for _, row in df.iterrows():
            unique_id = row.get('Unique ID')
            if pd.isna(unique_id):
                continue
            video_id = int(unique_id)
            data_map[video_id] = {
                'prompt': _cell_or_default(row, 'Prompts', 'N/A'),
                'expected': _cell_or_default(row, 'Expected phenomenon', 'N/A'),
                # Use the integer id so the fallback title reads
                # "Example 113", not "Example 113.0".
                'topic': _cell_or_default(row, 'Example Title', f'Example {video_id}'),
                'keywords': _cell_or_default(row, 'Keywords', ''),
            }
        return data_map
    except Exception as e:
        st.error(f"Error loading CSV file: {e}")
        return {}
def get_rating(ratings_data, video_id, model_name, dimension, run_number):
    """Look up one rating in the nested ratings structure.

    The lookup path is ``ratings_data[str(video_id)][dimension][json_model]``,
    where ``json_model`` is ``MODEL_NAME_MAP[model_name]``; the result is the
    entry for the 1-based ``run_number`` in that list.

    Returns:
        The stored rating, or ``None`` when any level of the path is missing,
        the model has no mapping, or ``run_number`` is out of range.
    """
    video_key = str(video_id)
    if video_key not in ratings_data:
        return None
    by_dimension = ratings_data[video_key]
    if dimension not in by_dimension:
        return None
    by_model = by_dimension[dimension]

    json_model = MODEL_NAME_MAP.get(model_name)
    if not json_model or json_model not in by_model:
        return None

    runs = by_model[json_model]
    if not 1 <= run_number <= len(runs):
        return None
    return runs[run_number - 1]
def build_example(topic, prompt, expected, keywords, video_id, model_runs):
    """Assemble the dict describing one showcase example.

    For every model in ``MODELS`` the video path is
    ``VIDEO_BASE_DIR/<model>/vid_<video_id>_run_<run>.mp4``, where ``<run>``
    comes from ``model_runs`` and defaults to 1.

    Returns:
        Dict with keys ``topic``, ``prompt``, ``expected``, ``keywords``,
        ``video_id``, ``model_runs`` and ``videos`` (model -> path).
    """
    def clip_path(model):
        run = model_runs.get(model, 1)
        return os.path.join(VIDEO_BASE_DIR, model, f"vid_{video_id}_run_{run}.mp4")

    return dict(
        topic=topic,
        prompt=prompt,
        expected=expected,
        keywords=keywords,
        video_id=video_id,
        model_runs=model_runs,
        videos={model: clip_path(model) for model in MODELS},
    )
def build_examples(example_specs, csv_data):
    """Turn example specs into example dicts, enriched with CSV metadata.

    Args:
        example_specs: Iterable of dicts with ``video_id`` and ``model_runs``.
        csv_data: Mapping of video id -> metadata dict; entries or fields that
            are missing fall back to placeholder values.

    Returns:
        A list with one ``build_example`` result per spec, in input order.
    """
    def from_spec(spec):
        video_id = spec["video_id"]
        model_runs = spec["model_runs"]
        meta = csv_data.get(video_id, {})
        return build_example(
            meta.get('topic', f'Example {video_id}'),
            meta.get('prompt', 'N/A'),
            meta.get('expected', 'N/A'),
            meta.get('keywords', ''),
            video_id,
            model_runs,
        )

    return [from_spec(spec) for spec in example_specs]
# Load actual data
ratings_data = load_ratings()
csv_data = load_csv_data()
# Curated showcase: each spec names a video id and, per model, which run
# number is used for both the video file path and the rating lookup.
example_specs = [
    {"video_id": 113, "model_runs": {"bytedance-seedance-1-pro": 1, "kling-v2-5-turbo-pro": 1, "minimax-hailuo-2.3": 2, "ray-2": 1, "sora-2": 1, "veo3-quality": 1, "wan2.5-t2v-preview": 1}},
    {"video_id": 143, "model_runs": {"bytedance-seedance-1-pro": 2, "kling-v2-5-turbo-pro": 2, "minimax-hailuo-2.3": 3, "ray-2": 2, "sora-2": 2, "veo3-quality": 2, "wan2.5-t2v-preview": 2}},
    {"video_id": 175, "model_runs": {"bytedance-seedance-1-pro": 1, "kling-v2-5-turbo-pro": 1, "minimax-hailuo-2.3": 1, "ray-2": 1, "sora-2": 1, "veo3-quality": 1, "wan2.5-t2v-preview": 1}},
    {"video_id": 138, "model_runs": {"bytedance-seedance-1-pro": 3, "kling-v2-5-turbo-pro": 3, "minimax-hailuo-2.3": 3, "ray-2": 3, "sora-2": 2, "veo3-quality": 1, "wan2.5-t2v-preview": 3}},
]
examples = build_examples(example_specs, csv_data)
# Provide a fallback dummy entry if loading fails
# NOTE(review): build_examples emits one entry per spec even when the CSV
# lookup misses, so this branch only triggers if example_specs is empty.
if not examples:
    examples = [{"video_id": 0, "topic": "No Data", "prompt": "No Data", "expected": "No Data", "keywords": "", "videos": {}, "model_runs": {}}]
TOTAL_EXAMPLES = len(examples)
# ===== Responsive rating bar rendering =====
def render_rating_bars(ratings_data, video_id, model1, model2, run1, run2):
    """Draw the side-by-side rating comparison for two models.

    First emits the scoped CSS plus a header row carrying both model names,
    then one row per entry of ``RATING_DIMENSIONS``: the left model's bar and
    score, the dimension label, and the right model's score and bar. A rating
    that is missing (``None``) or 0 is displayed as ``0`` with no bar.

    Args:
        ratings_data: Nested ratings structure passed on to ``get_rating``.
        video_id: Id of the example being shown.
        model1, model2: Model identifiers for the left / right column.
        run1, run2: 1-based run numbers for the left / right model.
    """
    st.markdown(
        f"""
<style>
.rating-container {{
font-family: monospace;
overflow-x: auto;
-webkit-overflow-scrolling: touch;
color: var(--text-color);
background-color: transparent;
}}
.rating-header {{
font-size: clamp(12px, 3vw, 20px);
margin: 12px 0;
white-space: nowrap;
}}
.rating-row {{
font-size: clamp(12px, 3vw, 20px);
margin: 10px 0;
line-height: 1.6;
white-space: nowrap;
}}
.model-left {{
color: #FF6B6B;
display: inline-block;
width: 28%;
min-width: 120px;
text-align: right;
font-weight: 700;
}}
.model-right {{
color: #4ECDC4;
display: inline-block;
width: 28%;
min-width: 120px;
text-align: left;
font-weight: 700;
}}
.dimension-center {{
display: inline-block;
width: 42%;
min-width: 150px;
text-align: center;
font-weight: bold;
}}
.score-left {{
color: #FF6B6B;
display: inline-block;
width: 28%;
min-width: 120px;
text-align: right;
font-weight: 600;
}}
.score-right {{
color: #4ECDC4;
display: inline-block;
width: 28%;
min-width: 120px;
text-align: left;
font-weight: 600;
}}
.dim-name {{
display: inline-block;
width: 42%;
min-width: 150px;
text-align: center;
font-weight: 700;
}}
@media (max-width: 768px) {{
.rating-container {{
padding: 0 8px;
}}
}}
</style>
<div class='rating-container'>
<div class='rating-header'>
<span class='model-left'>{model1}</span>
<span class='dimension-center'>Rating Dimensions</span>
<span class='model-right'>{model2}</span>
</div>
</div>
""",
        unsafe_allow_html=True
    )

    def bar_for(score):
        # Two full blocks per rating point plus a thin end cap; nothing for
        # non-positive scores.
        if score > 0:
            return "██" * score + "▍"
        return ""

    for dim_key, dim_name in RATING_DIMENSIONS:
        # A missing rating (None) is shown the same way as a 0.
        left_score = get_rating(ratings_data, video_id, model1, dim_key, run1) or 0
        right_score = get_rating(ratings_data, video_id, model2, dim_key, run2) or 0

        if left_score > 0:
            left_bar = f"{bar_for(left_score)} {left_score}"
        else:
            left_bar = f"{left_score}"
        if right_score > 0:
            right_bar = f"{right_score} {bar_for(right_score)}"
        else:
            right_bar = f"{right_score}"

        st.markdown(
            f"""
<div class='rating-container'>
<div class='rating-row'>
<span class='score-left'>{left_bar}</span>
<span class='dim-name'>{dim_name}</span>
<span class='score-right'>{right_bar}</span>
</div>
</div>
""",
            unsafe_allow_html=True
        )
# Seed per-session defaults; values already present in the session are kept.
for _state_key, _initial_value in (
    ("example_idx", 0),
    ("model1", MODELS[4]),  # Sora
    ("model2", MODELS[5]),  # Veo
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
# ===== Main layout =====
st.title("⚛️ VideoScience-Bench")
# Variable names follow the section numbering used below, not the display
# order: the leaderboard (tab2) is shown first and video comparison (tab1) last.
tab2, tab3, tab1 = st.tabs(["🤖 Auto-Judge Leaderboard", "👥 Human Evaluation", "📹 Video Comparison"])
# ===== TAB 1: Video Comparison =====
with tab1:
    # --- 1. Minimal top navigation bar ---
    col_nav_1, col_nav_2, col_nav_3 = st.columns([1, 10, 1])
    with col_nav_1:
        # Disabled on the first example so the index never goes below 0.
        if st.button("◀", key="prev", use_container_width=True, disabled=(st.session_state.example_idx == 0)):
            st.session_state.example_idx -= 1
            st.rerun()
    with col_nav_2:
        # Display centered example index and topic with keywords
        # `current` is assigned here but reused by every later section of this tab.
        current = examples[st.session_state.example_idx]
        keywords_html = render_keywords(current.get('keywords', ''))
        st.markdown(
            f"""
<div style='text-align: center; margin-top: -5px;'>
<span style='font-size: 14px; color: #888;'>EXAMPLE {st.session_state.example_idx + 1} / {TOTAL_EXAMPLES}</span><br>
<span style='font-size: 18px; font-weight: 700;'>{current['topic']}</span>
{keywords_html}
</div>
""", unsafe_allow_html=True
        )
    with col_nav_3:
        # Disabled on the last example so the index never exceeds the list.
        if st.button("▶", key="next", use_container_width=True, disabled=(st.session_state.example_idx == TOTAL_EXAMPLES - 1)):
            st.session_state.example_idx += 1
            st.rerun()
    # --- 2. Prompt and expectation (expanded by default) ---
    with st.expander("📝 View Prompt & Expectation Details", expanded=True):
        c1, c2 = st.columns(2)
        with c1:
            st.caption("PROMPT")
            st.write(current['prompt'])
        with c2:
            st.caption("EXPECTED PHENOMENON")
            st.write(current['expected'])
    # --- 3. Comparison control panel ---
    st.markdown("<div style='margin-bottom: 5px;'></div>", unsafe_allow_html=True)
    ctrl_col1, ctrl_col2, ctrl_col3 = st.columns([3, 1, 3])
    with ctrl_col1:
        # Hidden label to save vertical space
        model1 = st.selectbox("Model Left", MODELS, index=MODELS.index(st.session_state.model1),
                              key="m1_select", label_visibility="collapsed")
        # Mirror the choice into session state; it seeds `index` on the next run.
        st.session_state.model1 = model1
    with ctrl_col2:
        # Trigger playback of both videos
        play = st.button("▶ Play Both", use_container_width=True, type="primary")
    with ctrl_col3:
        model2 = st.selectbox("Model Right", MODELS, index=MODELS.index(st.session_state.model2),
                              key="m2_select", label_visibility="collapsed")
        st.session_state.model2 = model2
    # --- 4. Video playback section ---
    vid_col1, vid_col2 = st.columns(2)
    # JavaScript-based autoplay
    # The script reaches into window.parent.document -- presumably because
    # components.html renders in its own frame (confirm) -- and restarts every
    # <video> element it finds there after a 100 ms delay.
    if play:
        components.html("""
<script>
setTimeout(() => {
const videos = window.parent.document.querySelectorAll('video');
videos.forEach(v => { v.currentTime = 0; v.play(); });
}, 100);
</script>
""", height=0)
    with vid_col1:
        # A model missing from the mapping yields "", which os.path.exists
        # rejects, so it falls through to the error message.
        video_path1 = current["videos"].get(model1, "")
        if os.path.exists(video_path1):
            st.video(video_path1)
        else:
            st.error(f"❌ Video not found: {video_path1}")
    with vid_col2:
        video_path2 = current["videos"].get(model2, "")
        if os.path.exists(video_path2):
            st.video(video_path2)
        else:
            st.error(f"❌ Video not found: {video_path2}")
    # --- 5. Integrated rating bars ---
    # Ratings are looked up for the same run number whose video is shown above.
    render_rating_bars(
        ratings_data,
        current['video_id'],
        model1,
        model2,
        current["model_runs"].get(model1, 1),
        current["model_runs"].get(model2, 1)
    )
# ===== TAB 2: Auto-Judge Leaderboard (CL+CV) =====
with tab2:
    st.markdown("### 🤖 VideoScience-Judge Leaderboard")
    st.markdown("<small>Scores are computed using an evidence-grounded scheme integrating prompt-specific checklist and CV-based analysis, then averaged across all dimensions and normalized to 1.</small>", unsafe_allow_html=True)
    # Data from Table 2 – CL+CV column
    auto_data = {
        "Model": ["Kling-v3.0", "Sora-2", "Veo-3", "Kling-v2.5", "Wan-2.5", "Seedance-1.0-Pro", "Hailuo-2.3", "Ray2"],
        "Score": [0.78, 0.76, 0.65, 0.59, 0.59, 0.54, 0.50, 0.34]
    }
    df_auto = pd.DataFrame(auto_data).sort_values("Score", ascending=False).reset_index(drop=True)
    df_auto["Rank"] = range(1, len(df_auto) + 1)
    # Badge CSS class and medal for the top three; every other rank shares
    # the neutral badge and gets no medal.
    podium_styles = {
        1: ("rank-1", "🥇"),
        2: ("rank-2", "🥈"),
        3: ("rank-3", "🥉"),
    }
    # Build leaderboard visualization: one card per model, best first
    for _, row in df_auto.iterrows():
        rank, model, score = row["Rank"], row["Model"], row["Score"]
        badge_class, medal = podium_styles.get(rank, ("rank-other", ""))
        # Scores are normalized to 1, so this is the progress bar width in percent
        bar_width = score * 100
        st.markdown(f"""
<div style='background: linear-gradient(90deg, rgba(102,126,234,0.1) 0%, rgba(118,75,162,0.1) 100%);
padding: 12px; border-radius: 8px; margin: 8px 0; border-left: 4px solid #667eea;'>
<div style='display: flex; align-items: center; justify-content: space-between;'>
<div style='display: flex; align-items: center; gap: 12px;'>
<span class='rank-badge {badge_class}'>{rank}</span>
<span style='font-weight: 600; font-size: 16px;'>{medal} {model}</span>
</div>
<div style='font-weight: 700; font-size: 20px; color: #667eea;'>{score:.2f}</div>
</div>
<div style='width: 100%; height: 6px; background: #e0e0e0; border-radius: 3px; margin-top: 8px; overflow: hidden;'>
<div style='width: {bar_width}%; height: 100%; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);'></div>
</div>
</div>
""", unsafe_allow_html=True)
# ===== TAB 3: Human Evaluation =====
def render_human_rankings_section(df_human):
    """Render the "Complete Rankings" list for the human evaluation tab.

    Shows a heading with "Expand All" / "Collapse All" buttons, then one
    expander per row of ``df_human`` holding a metric for each of the five
    rating dimensions.

    Args:
        df_human: DataFrame with the columns ``Model``, ``Average``, ``PCS``,
            ``PCG``, ``CDN``, ``IMB`` and ``STC``, already ordered best-first.
            Rank is taken from row position, so the index labels do not
            matter.
    """
    # 1. Initialize session state for expander control
    if 'expander_state' not in st.session_state:
        st.session_state['expander_state'] = False
    # 2. Control buttons for expand/collapse all
    col1, col2, col3 = st.columns([6, 1, 1])
    with col1:
        st.markdown("### 📊 Complete Rankings")
    with col2:
        if st.button("➕ Expand All", use_container_width=True, type="secondary"):
            st.session_state['expander_state'] = True
            st.rerun()
    with col3:
        if st.button("➖ Collapse All", use_container_width=True, type="secondary"):
            st.session_state['expander_state'] = False
            st.rerun()
    medals = ["🥇", "🥈", "🥉"]
    # (column in df_human, label shown on the metric)
    dimensions = [("PCS", "Prompt Consistency"), ("PCG", "Phenomenon Congruency"),
                  ("CDN", "Correct Dynamism"), ("IMB", "Immutability"), ("STC", "Spatio-Temporal Coherence")]
    # NOTE(review): each st.markdown call is emitted as its own element, so
    # this opening <div> may not actually wrap the expanders below -- confirm
    # that the .rankings-section CSS takes effect.
    st.markdown("<div class='rankings-section'>", unsafe_allow_html=True)
    for rank, (_, row) in enumerate(df_human.iterrows(), start=1):
        model = row["Model"]
        avg = row["Average"]
        medal = medals[rank - 1] if rank <= len(medals) else ""
        # Expandable ranking entry
        with st.expander(f"**#{rank} {medal} {model}** — Avg: {avg:.2f}", expanded=st.session_state['expander_state']):
            cols = st.columns(5)
            for col, (key, name) in zip(cols, dimensions):
                score = row[key]
                col.metric(name, f"{score:.2f}", delta=None)
    st.markdown("</div>", unsafe_allow_html=True)
with tab3:
    st.markdown("### 👥 Human Evaluation Scores")
    st.markdown("<small>Mean annotator scores from a 1–4 Likert scale.</small>", unsafe_allow_html=True)
    # Human evaluation dataset (Table 1)
    human_data = {
        "Model": ["Sora-2", "Veo-3", "Kling-v2.5", "Wan-2.5", "Seedance-1.0-Pro", "Hailuo-2.3", "Ray2"],
        "PCS": [3.32, 3.01, 2.77, 2.87, 2.56, 2.39, 1.65],
        "PCG": [2.56, 2.35, 1.91, 1.84, 1.78, 1.67, 1.26],
        "CDN": [3.33, 2.83, 2.75, 2.83, 2.52, 2.57, 2.13],
        "IMB": [3.73, 3.30, 3.36, 3.36, 3.15, 3.16, 2.44],
        "STC": [3.71, 3.42, 3.60, 3.46, 3.46, 3.46, 2.92]
    }
    # Score columns, in the order matching `categories` below.
    score_columns = ["PCS", "PCG", "CDN", "IMB", "STC"]
    df_human = pd.DataFrame(human_data)
    df_human["Average"] = df_human[score_columns].mean(axis=1)
    df_human = df_human.sort_values("Average", ascending=False).reset_index(drop=True)
    # Radar chart for human scores: one filled trace per model, best first
    fig = go.Figure()
    categories = ['Prompt<br>Consistency', 'Phenomenon<br>Congruency', 'Dynamism', 'Immutability', 'Coherence']
    colors = ['#667eea', '#f093fb', '#4facfe', '#43cea2', '#ff9a9e', '#fbc2eb', '#90f7ec']
    for idx, (_, row) in enumerate(df_human.iterrows()):
        values = [row[column] for column in score_columns]
        trace = go.Scatterpolar(
            r=values,
            theta=categories,
            fill='toself',
            name=row["Model"],
            # Colours wrap around if there are more models than palette entries.
            line=dict(color=colors[idx % len(colors)], width=2),
            marker=dict(size=8)
        )
        fig.add_trace(trace)
    fig.update_layout(
        polar=dict(
            radialaxis=dict(visible=True, range=[0, 4], tickfont=dict(size=10, color='red')),
            angularaxis=dict(tickfont=dict(size=11))
        ),
        showlegend=True,
        height=450,
        margin=dict(l=80, r=80, t=40, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=-0.15, xanchor="center", x=0.5)
    )
    st.plotly_chart(fig, use_container_width=True)
    render_human_rankings_section(df_human)