import streamlit as st
import pandas as pd
import os
import json
import streamlit.components.v1 as components
import plotly.graph_objects as go
import hashlib
# Page configuration: set wide layout
st.set_page_config(page_title="VideoScience-Bench", layout="wide", initial_sidebar_state="collapsed")

# ===== CSS styling: compact layout and modernized UI =====
# NOTE(review): the CSS payload appears to have been stripped from this
# triple-quoted string; as written it injects nothing — confirm against VCS.
st.markdown("""
""", unsafe_allow_html=True)

# ===== Data loading section (actual logic) =====
RATINGS_FILE = "ratings.json"         # per-video/dimension/model rating lists (JSON)
VIDEO_BASE_DIR = "downloaded_videos"  # layout: <model>/vid_<id>_run_<n>.mp4
CSV_FILE = "Examples.csv"             # example metadata: prompts, phenomena, keywords

# Maps the directory-style model names (keys of MODELS) to the shorter
# names used as keys inside the ratings JSON.
MODEL_NAME_MAP = {
"bytedance-seedance-1-pro": "seed-dance",
"kling-v2-5-turbo-pro": "klingv2.5",
"minimax-hailuo-2.3": "hailuo2.3",
"ray-2": "ray-2",
"sora-2": "sora-2",
"veo3-quality": "veo3",
"wan2.5-t2v-preview": "wan2.5",
}

# Canonical model ordering used by the selectboxes and video-path builder.
MODELS = [
"bytedance-seedance-1-pro",
"kling-v2-5-turbo-pro",
"minimax-hailuo-2.3",
"ray-2",
"sora-2",
"veo3-quality",
"wan2.5-t2v-preview",
]

# (json_key, display_label) pairs for the five evaluation dimensions.
RATING_DIMENSIONS = [
("prompt_consistency", "Prompt Consistency"),
("expected_phenomenon", "Phenomenon Congruency"),
("dynamism", "Correct Dynamism"),
("immutability", "Immutability"),
("coherence", "Spatio-Temporal Coherence"),
]
def generate_tag_color(keyword):
    """Pick a deterministic (background, text) color pair for *keyword*.

    MD5 is used (rather than the builtin ``hash``) so the same keyword maps
    to the same palette entry across processes and interpreter runs.
    """
    # Color palette inspired by Notion tags: (background, text) hex pairs.
    palette = [
        ('#FEE2E2', '#991B1B'),  # Red
        ('#FFEDD5', '#9A3412'),  # Orange
        ('#FEF3C7', '#92400E'),  # Yellow
        ('#D1FAE5', '#065F46'),  # Green
        ('#DBEAFE', '#1E40AF'),  # Blue
        ('#E0E7FF', '#3730A3'),  # Indigo
        ('#F3E8FF', '#6B21A8'),  # Purple
        ('#FCE7F3', '#9F1239'),  # Pink
        ('#E5E7EB', '#374151'),  # Gray
        ('#D1F5FF', '#0369A1'),  # Cyan
    ]
    digest = hashlib.md5(keyword.encode()).hexdigest()
    return palette[int(digest, 16) % len(palette)]
def render_keywords(keywords_str):
    """Render a comma-separated keyword string as Notion-style HTML tags.

    Returns an HTML snippet (empty string when there are no keywords) meant
    to be passed to ``st.markdown(..., unsafe_allow_html=True)``.
    """
    # Guard order matters: `not keywords_str` short-circuits before pd.isna,
    # which would choke on some non-scalar inputs.
    if not keywords_str or pd.isna(keywords_str):
        return ""
    keywords = [kw.strip() for kw in str(keywords_str).split(',') if kw.strip()]
    if not keywords:
        return ""
    # FIX: the original assignments split string literals across physical
    # lines (an unterminated-string SyntaxError, apparently from stripped
    # HTML); rebuilt here as valid single-line literals.
    parts = ['<div style="display:flex;flex-wrap:wrap;gap:6px;justify-content:center;">']
    for keyword in keywords:
        bg_color, text_color = generate_tag_color(keyword)
        parts.append(
            f'<span style="background-color:{bg_color};color:{text_color};'
            f'padding:2px 10px;border-radius:4px;font-size:0.8em;">{keyword}</span>'
        )
    parts.append('</div>')
    return "".join(parts)
@st.cache_data
def load_ratings():
    """Load the ratings JSON from RATINGS_FILE; return {} when absent or unreadable."""
    try:
        # Missing file is the normal "no ratings yet" case, not an error.
        if not os.path.exists(RATINGS_FILE):
            return {}
        with open(RATINGS_FILE, 'r', encoding='utf-8') as fh:
            return json.load(fh)
    except Exception as e:
        # Surface the problem in the UI but keep the app running.
        st.error(f"Error loading ratings: {e}")
        return {}
@st.cache_data
def load_csv_data():
    """Parse CSV_FILE into ``{unique_id: {prompt, expected, topic, keywords}}``.

    Rows without a 'Unique ID' are skipped; returns {} when the file is
    missing or unreadable.
    """
    try:
        if not os.path.exists(CSV_FILE):
            return {}
        # utf-8-sig tolerates a BOM written by spreadsheet exports.
        frame = pd.read_csv(CSV_FILE, encoding='utf-8-sig')
        data_map = {}
        for _, record in frame.iterrows():
            unique_id = record.get('Unique ID')
            if pd.isna(unique_id):
                continue
            data_map[int(unique_id)] = {
                'prompt': record.get('Prompts', 'N/A'),
                'expected': record.get('Expected phenomenon', 'N/A'),
                'topic': record.get('Example Title', f'Example {unique_id}'),
                'keywords': record.get('Keywords', ''),  # Add keywords field
            }
        return data_map
    except Exception as e:
        st.error(f"Error loading CSV file: {e}")
        return {}
def get_rating(ratings_data, video_id, model_name, dimension, run_number):
    """Return the stored rating for (video, model, dimension, run), or None.

    ``run_number`` is 1-based; any missing lookup level or out-of-range run
    yields None instead of raising.
    """
    mapped_name = MODEL_NAME_MAP.get(model_name)
    if not mapped_name:
        return None
    try:
        runs = ratings_data[str(video_id)][dimension][mapped_name]
    except KeyError:
        return None
    if not (1 <= run_number <= len(runs)):
        return None
    return runs[run_number - 1]
def build_example(topic, prompt, expected, keywords, video_id, model_runs):
    """Assemble one comparison-example dict, resolving every model's video path.

    ``model_runs`` maps model name -> run number (defaulting to run 1); the
    path layout is VIDEO_BASE_DIR/<model>/vid_<id>_run_<n>.mp4.
    """
    videos = {
        model: os.path.join(
            VIDEO_BASE_DIR, model,
            f"vid_{video_id}_run_{model_runs.get(model, 1)}.mp4",
        )
        for model in MODELS
    }
    return {
        "topic": topic,
        "prompt": prompt,
        "expected": expected,
        "keywords": keywords,  # Add keywords to example dict
        "video_id": video_id,
        "model_runs": model_runs,
        "videos": videos,
    }
def build_examples(example_specs, csv_data):
    """Expand spec dicts into full example dicts, merging in CSV metadata.

    Each spec carries ``video_id`` and ``model_runs``; metadata missing from
    ``csv_data`` falls back to placeholder values.
    """
    examples = []
    for spec in example_specs:
        vid = spec["video_id"]
        meta = csv_data.get(vid, {})
        examples.append(
            build_example(
                meta.get('topic', f'Example {vid}'),
                meta.get('prompt', 'N/A'),
                meta.get('expected', 'N/A'),
                meta.get('keywords', ''),  # Get keywords
                vid,
                spec["model_runs"],
            )
        )
    return examples
# Load actual data
ratings_data = load_ratings()
csv_data = load_csv_data()

# Hand-picked showcase examples; each spec pins which run of each model to
# display for that video id.
example_specs = [
{"video_id": 113, "model_runs": {"bytedance-seedance-1-pro": 1, "kling-v2-5-turbo-pro": 1, "minimax-hailuo-2.3": 2, "ray-2": 1, "sora-2": 1, "veo3-quality": 1, "wan2.5-t2v-preview": 1}},
{"video_id": 143, "model_runs": {"bytedance-seedance-1-pro": 2, "kling-v2-5-turbo-pro": 2, "minimax-hailuo-2.3": 3, "ray-2": 2, "sora-2": 2, "veo3-quality": 2, "wan2.5-t2v-preview": 2}},
{"video_id": 175, "model_runs": {"bytedance-seedance-1-pro": 1, "kling-v2-5-turbo-pro": 1, "minimax-hailuo-2.3": 1, "ray-2": 1, "sora-2": 1, "veo3-quality": 1, "wan2.5-t2v-preview": 1}},
{"video_id": 138, "model_runs": {"bytedance-seedance-1-pro": 3, "kling-v2-5-turbo-pro": 3, "minimax-hailuo-2.3": 3, "ray-2": 3, "sora-2": 2, "veo3-quality": 1, "wan2.5-t2v-preview": 3}},
]
examples = build_examples(example_specs, csv_data)

# Provide a fallback dummy entry if loading fails
# (keeps the rest of the UI from indexing into an empty list).
if not examples:
    examples = [{"video_id": 0, "topic": "No Data", "prompt": "No Data", "expected": "No Data", "keywords": "", "videos": {}, "model_runs": {}}]
TOTAL_EXAMPLES = len(examples)
# ===== Responsive rating bar rendering =====
def render_rating_bars(ratings_data, video_id, model1, model2, run1, run2):
    """Render side-by-side per-dimension score bars for the two selected models.

    Missing ratings display as 0 with no bar glyphs.
    """
    # Container / header markup.
    # NOTE(review): this markup payload looks stripped — confirm against VCS.
    st.markdown(
        f"""
        """,
        unsafe_allow_html=True
    )
    for dim_key, dim_name in RATING_DIMENSIONS:
        score_left = get_rating(ratings_data, video_id, model1, dim_key, run1) or 0
        score_right = get_rating(ratings_data, video_id, model2, dim_key, run2) or 0
        # Bars are "██" repeated per point with a "▍" cap; zero scores show
        # just the number. Left bar precedes its number, right bar follows it.
        if score_left > 0:
            left_bar = f"{'██' * score_left}▍ {score_left}"
        else:
            left_bar = f"{score_left}"
        if score_right > 0:
            right_bar = f"{score_right} {'██' * score_right}▍"
        else:
            right_bar = f"{score_right}"
        st.markdown(
            f"""
            {left_bar}
            {dim_name}
            {right_bar}
            """,
            unsafe_allow_html=True
        )
# ===== Session state defaults (survive Streamlit reruns) =====
if "example_idx" not in st.session_state:
    st.session_state.example_idx = 0
if "model1" not in st.session_state:
    st.session_state.model1 = MODELS[4]  # Sora
if "model2" not in st.session_state:
    st.session_state.model2 = MODELS[5]  # Veo

# ===== Main layout =====
st.title("⚛️ VideoScience-Bench")
# NOTE: the variables are deliberately unpacked out of numeric order — tabs
# render left-to-right in the order listed, putting the leaderboard first
# while keeping the tab1/tab2/tab3 names used below.
tab2, tab3, tab1 = st.tabs(["🤖 Auto-Judge Leaderboard", "👥 Human Evaluation", "📹 Video Comparison"])
# ===== TAB 1: Video Comparison =====
with tab1:
    # --- 1. Minimal top navigation bar ---
    col_nav_1, col_nav_2, col_nav_3 = st.columns([1, 10, 1])
    with col_nav_1:
        # Previous-example button; disabled at the first example.
        if st.button("◀", key="prev", use_container_width=True, disabled=(st.session_state.example_idx == 0)):
            st.session_state.example_idx -= 1
            st.rerun()
    with col_nav_2:
        # Display centered example index and topic with keywords
        current = examples[st.session_state.example_idx]
        keywords_html = render_keywords(current.get('keywords', ''))
        # NOTE(review): surrounding HTML markup appears stripped from this
        # f-string; only raw text plus the keyword HTML is emitted — confirm.
        st.markdown(
            f"""
            EXAMPLE {st.session_state.example_idx + 1} / {TOTAL_EXAMPLES}
            {current['topic']}
            {keywords_html}
            """, unsafe_allow_html=True
        )
    with col_nav_3:
        # Next-example button; disabled at the last example.
        if st.button("▶", key="next", use_container_width=True, disabled=(st.session_state.example_idx == TOTAL_EXAMPLES - 1)):
            st.session_state.example_idx += 1
            st.rerun()
    # --- 2. Prompt and expectation (expanded by default) ---
    with st.expander("📝 View Prompt & Expectation Details", expanded=True):
        c1, c2 = st.columns(2)
        with c1:
            st.caption("PROMPT")
            st.write(current['prompt'])
        with c2:
            st.caption("EXPECTED PHENOMENON")
            st.write(current['expected'])
    # --- 3. Comparison control panel ---
    st.markdown("", unsafe_allow_html=True)
    ctrl_col1, ctrl_col2, ctrl_col3 = st.columns([3, 1, 3])
    with ctrl_col1:
        # Hidden label to save vertical space
        model1 = st.selectbox("Model Left", MODELS, index=MODELS.index(st.session_state.model1),
                              key="m1_select", label_visibility="collapsed")
        st.session_state.model1 = model1
    with ctrl_col2:
        # Trigger playback of both videos
        play = st.button("▶ Play Both", use_container_width=True, type="primary")
    with ctrl_col3:
        model2 = st.selectbox("Model Right", MODELS, index=MODELS.index(st.session_state.model2),
                              key="m2_select", label_visibility="collapsed")
        st.session_state.model2 = model2
    # --- 4. Video playback section ---
    vid_col1, vid_col2 = st.columns(2)
    # JavaScript-based autoplay
    if play:
        # NOTE(review): the autoplay <script> payload appears stripped from
        # this string; as written this injects an empty component — confirm.
        components.html("""
        """, height=0)
    with vid_col1:
        video_path1 = current["videos"].get(model1, "")
        if os.path.exists(video_path1):
            st.video(video_path1)
        else:
            st.error(f"❌ Video not found: {video_path1}")
    with vid_col2:
        video_path2 = current["videos"].get(model2, "")
        if os.path.exists(video_path2):
            st.video(video_path2)
        else:
            st.error(f"❌ Video not found: {video_path2}")
    # --- 5. Integrated rating bars ---
    render_rating_bars(
        ratings_data,
        current['video_id'],
        model1,
        model2,
        current["model_runs"].get(model1, 1),
        current["model_runs"].get(model2, 1)
    )
# ===== TAB 2: Auto-Judge Leaderboard (CL+CV) =====
with tab2:
    st.markdown("### 🤖 VideoScience-Judge Leaderboard")
    st.markdown("Scores are computed using an evidence-grounded scheme integrating prompt-specific checklist and CV-based analysis, then averaged across all dimensions and normalized to 1.", unsafe_allow_html=True)
    # Data from Table 2 – CL+CV column
    auto_data = {
        "Model": ["Kling-v3.0", "Sora-2", "Veo-3", "Kling-v2.5", "Wan-2.5", "Seedance-1.0-Pro", "Hailuo-2.3", "Ray2"],
        "Score": [0.78, 0.76, 0.65, 0.59, 0.59, 0.54, 0.50, 0.34]
    }
    df_auto = pd.DataFrame(auto_data).sort_values("Score", ascending=False).reset_index(drop=True)
    df_auto["Rank"] = range(1, len(df_auto) + 1)
    # Build leaderboard visualization
    for idx, row in df_auto.iterrows():
        rank = row["Rank"]
        model = row["Model"]
        score = row["Score"]
        # Assign visual badge style based on rank
        if rank == 1:
            badge_class = "rank-1"
            medal = "🥇"
        elif rank == 2:
            badge_class = "rank-2"
            medal = "🥈"
        elif rank == 3:
            badge_class = "rank-3"
            medal = "🥉"
        else:
            badge_class = "rank-other"
            medal = ""
        # Convert score to progress bar width
        bar_width = score * 100
        # NOTE(review): badge_class and bar_width are computed but unused in
        # the f-string below — the HTML that consumed them looks stripped;
        # confirm against VCS.
        st.markdown(f"""
        {rank}
        {medal} {model}
        {score:.2f}
        """, unsafe_allow_html=True)
# ===== TAB 3: Human Evaluation =====
@st.fragment
def render_human_rankings_section(df_human):
    """Render the expandable per-model human-score ranking list.

    Wrapped in ``st.fragment`` so the expand/collapse buttons rerun only this
    section instead of the whole page. Expects ``df_human`` sorted by the
    'Average' column with columns PCS/PCG/CDN/IMB/STC.
    """
    # 1. Initialize session state for expander control
    if 'expander_state' not in st.session_state:
        st.session_state['expander_state'] = False
    # 2. Control buttons for expand/collapse all
    col1, col2, col3 = st.columns([6, 1, 1])
    with col1:
        st.markdown("### 📊 Complete Rankings")
    with col2:
        if st.button("➕ Expand All", use_container_width=True, type="secondary"):
            st.session_state['expander_state'] = True
            st.rerun()
    with col3:
        if st.button("➖ Collapse All", use_container_width=True, type="secondary"):
            st.session_state['expander_state'] = False
            st.rerun()
    st.markdown("", unsafe_allow_html=True)
    for idx, row in df_human.iterrows():
        rank = idx + 1
        model = row["Model"]
        avg = row["Average"]
        # Medals only for the top three ranks.
        medal = ["🥇", "🥈", "🥉"][rank - 1] if rank <= 3 else ""
        # Expandable ranking entry
        with st.expander(f"**#{rank} {medal} {model}** — Avg: {avg:.2f}", expanded=st.session_state['expander_state']):
            cols = st.columns(5)
            # FIX: the STC label was garbled ("CoherSpatio-Temporal
            # Coherenceence"); restored to "Spatio-Temporal Coherence" to
            # match RATING_DIMENSIONS.
            dimensions = [("PCS", "Prompt Consistency"), ("PCG", "Phenomenon Congruency"),
                          ("CDN", "Correct Dynamism"), ("IMB", "Immutability"), ("STC", "Spatio-Temporal Coherence")]
            for col, (key, name) in zip(cols, dimensions):
                score = row[key]
                col.metric(name, f"{score:.2f}", delta=None)
        # FIX: this trailing spacer call contained an unterminated string
        # literal split across two lines (SyntaxError); collapsed to one line.
        st.markdown("", unsafe_allow_html=True)
with tab3:
    st.markdown("### 👥 Human Evaluation Scores")
    st.markdown("Mean annotator scores from a 1–4 Likert scale.", unsafe_allow_html=True)
    # Human evaluation dataset (Table 1)
    human_data = {
        "Model": ["Sora-2", "Veo-3", "Kling-v2.5", "Wan-2.5", "Seedance-1.0-Pro", "Hailuo-2.3", "Ray2"],
        "PCS": [3.32, 3.01, 2.77, 2.87, 2.56, 2.39, 1.65],
        "PCG": [2.56, 2.35, 1.91, 1.84, 1.78, 1.67, 1.26],
        "CDN": [3.33, 2.83, 2.75, 2.83, 2.52, 2.57, 2.13],
        "IMB": [3.73, 3.30, 3.36, 3.36, 3.15, 3.16, 2.44],
        "STC": [3.71, 3.42, 3.60, 3.46, 3.46, 3.46, 2.92]
    }
    df_human = pd.DataFrame(human_data)
    # Rank models by the unweighted mean of the five dimensions.
    df_human["Average"] = df_human[["PCS", "PCG", "CDN", "IMB", "STC"]].mean(axis=1)
    df_human = df_human.sort_values("Average", ascending=False).reset_index(drop=True)
    # Radar chart for human scores
    fig = go.Figure()
    # FIX: the first two labels contained raw newlines inside single-quoted
    # strings (a SyntaxError); Plotly tick labels use <br> for line breaks.
    categories = ['Prompt<br>Consistency', 'Phenomenon<br>Congruency', 'Dynamism', 'Immutability', 'Coherence']
    colors = ['#667eea', '#f093fb', '#4facfe', '#43cea2', '#ff9a9e', '#fbc2eb', '#90f7ec']
    # One polar trace per model, cycling through the color palette.
    for idx in range(len(df_human)):
        row = df_human.iloc[idx]
        values = [row["PCS"], row["PCG"], row["CDN"], row["IMB"], row["STC"]]
        fig.add_trace(go.Scatterpolar(
            r=values,
            theta=categories,
            fill='toself',
            name=row["Model"],
            line=dict(color=colors[idx % len(colors)], width=2),
            marker=dict(size=8)
        ))
    fig.update_layout(
        polar=dict(
            # Radial axis spans the full 1–4 Likert range (0 anchors the fill).
            # NOTE(review): red radial tick labels look like a debug leftover —
            # confirm the intended color.
            radialaxis=dict(visible=True, range=[0, 4], tickfont=dict(size=10, color='red')),
            angularaxis=dict(tickfont=dict(size=11))
        ),
        showlegend=True,
        height=450,
        margin=dict(l=80, r=80, t=40, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=-0.15, xanchor="center", x=0.5)
    )
    st.plotly_chart(fig, use_container_width=True)
    render_human_rankings_section(df_human)