import streamlit as st
import pandas as pd
import os
import json
import streamlit.components.v1 as components
import plotly.graph_objects as go
import hashlib
# Page configuration: set wide layout
st.set_page_config(page_title="VideoScience-Bench", layout="wide", initial_sidebar_state="collapsed")

# ===== CSS styling: compact layout and modernized UI =====
# NOTE(review): the CSS payload appears to have been stripped from this
# triple-quoted string; as written it injects nothing — confirm against VCS.
st.markdown("""
""", unsafe_allow_html=True)

# ===== Data loading section (actual logic) =====
RATINGS_FILE = "ratings.json"         # per-video/dimension/model rating lists (JSON)
VIDEO_BASE_DIR = "downloaded_videos"  # layout: <model>/vid_<id>_run_<n>.mp4
CSV_FILE = "Examples.csv"             # example metadata: prompts, phenomena, keywords

# Maps the directory-style model names (keys of MODELS) to the shorter
# names used as keys inside the ratings JSON.
MODEL_NAME_MAP = {
"bytedance-seedance-1-pro": "seed-dance",
"kling-v2-5-turbo-pro": "klingv2.5",
"minimax-hailuo-2.3": "hailuo2.3",
"ray-2": "ray-2",
"sora-2": "sora-2",
"veo3-quality": "veo3",
"wan2.5-t2v-preview": "wan2.5",
}

# Canonical model ordering used by the selectboxes and video-path builder.
MODELS = [
"bytedance-seedance-1-pro",
"kling-v2-5-turbo-pro",
"minimax-hailuo-2.3",
"ray-2",
"sora-2",
"veo3-quality",
"wan2.5-t2v-preview",
]

# (json_key, display_label) pairs for the five evaluation dimensions.
RATING_DIMENSIONS = [
("prompt_consistency", "Prompt Consistency"),
("expected_phenomenon", "Phenomenon Congruency"),
("dynamism", "Correct Dynamism"),
("immutability", "Immutability"),
("coherence", "Spatio-Temporal Coherence"),
]
def generate_tag_color(keyword):
    """Pick a deterministic (background, text) color pair for *keyword*.

    MD5 is used (rather than the builtin ``hash``) so the same keyword maps
    to the same palette entry across processes and interpreter runs.
    """
    # Color palette inspired by Notion tags: (background, text) hex pairs.
    palette = [
        ('#FEE2E2', '#991B1B'),  # Red
        ('#FFEDD5', '#9A3412'),  # Orange
        ('#FEF3C7', '#92400E'),  # Yellow
        ('#D1FAE5', '#065F46'),  # Green
        ('#DBEAFE', '#1E40AF'),  # Blue
        ('#E0E7FF', '#3730A3'),  # Indigo
        ('#F3E8FF', '#6B21A8'),  # Purple
        ('#FCE7F3', '#9F1239'),  # Pink
        ('#E5E7EB', '#374151'),  # Gray
        ('#D1F5FF', '#0369A1'),  # Cyan
    ]
    digest = hashlib.md5(keyword.encode()).hexdigest()
    return palette[int(digest, 16) % len(palette)]
def render_keywords(keywords_str):
    """Render a comma-separated keyword string as Notion-style HTML tags.

    Returns an HTML snippet (empty string when there are no keywords) meant
    to be passed to ``st.markdown(..., unsafe_allow_html=True)``.
    """
    # Guard order matters: `not keywords_str` short-circuits before pd.isna,
    # which would choke on some non-scalar inputs.
    if not keywords_str or pd.isna(keywords_str):
        return ""
    keywords = [kw.strip() for kw in str(keywords_str).split(',') if kw.strip()]
    if not keywords:
        return ""
    # FIX: the original assignments split string literals across physical
    # lines (an unterminated-string SyntaxError, apparently from stripped
    # HTML); rebuilt here as valid single-line literals.
    parts = ['<div style="display:flex;flex-wrap:wrap;gap:6px;justify-content:center;">']
    for keyword in keywords:
        bg_color, text_color = generate_tag_color(keyword)
        parts.append(
            f'<span style="background-color:{bg_color};color:{text_color};'
            f'padding:2px 10px;border-radius:4px;font-size:0.8em;">{keyword}</span>'
        )
    parts.append('</div>')
    return "".join(parts)
@st.cache_data
def load_ratings():
    """Load the ratings JSON from RATINGS_FILE; return {} when absent or unreadable."""
    try:
        # Missing file is the normal "no ratings yet" case, not an error.
        if not os.path.exists(RATINGS_FILE):
            return {}
        with open(RATINGS_FILE, 'r', encoding='utf-8') as fh:
            return json.load(fh)
    except Exception as e:
        # Surface the problem in the UI but keep the app running.
        st.error(f"Error loading ratings: {e}")
        return {}
@st.cache_data
def load_csv_data():
    """Parse CSV_FILE into ``{unique_id: {prompt, expected, topic, keywords}}``.

    Rows without a 'Unique ID' are skipped; returns {} when the file is
    missing or unreadable.
    """
    try:
        if not os.path.exists(CSV_FILE):
            return {}
        # utf-8-sig tolerates a BOM written by spreadsheet exports.
        frame = pd.read_csv(CSV_FILE, encoding='utf-8-sig')
        data_map = {}
        for _, record in frame.iterrows():
            unique_id = record.get('Unique ID')
            if pd.isna(unique_id):
                continue
            data_map[int(unique_id)] = {
                'prompt': record.get('Prompts', 'N/A'),
                'expected': record.get('Expected phenomenon', 'N/A'),
                'topic': record.get('Example Title', f'Example {unique_id}'),
                'keywords': record.get('Keywords', ''),  # Add keywords field
            }
        return data_map
    except Exception as e:
        st.error(f"Error loading CSV file: {e}")
        return {}
def get_rating(ratings_data, video_id, model_name, dimension, run_number):
    """Return the stored rating for (video, model, dimension, run), or None.

    ``run_number`` is 1-based; any missing lookup level or out-of-range run
    yields None instead of raising.
    """
    mapped_name = MODEL_NAME_MAP.get(model_name)
    if not mapped_name:
        return None
    try:
        runs = ratings_data[str(video_id)][dimension][mapped_name]
    except KeyError:
        return None
    if not (1 <= run_number <= len(runs)):
        return None
    return runs[run_number - 1]
def build_example(topic, prompt, expected, keywords, video_id, model_runs):
    """Assemble one comparison-example dict, resolving every model's video path.

    ``model_runs`` maps model name -> run number (defaulting to run 1); the
    path layout is VIDEO_BASE_DIR/<model>/vid_<id>_run_<n>.mp4.
    """
    videos = {
        model: os.path.join(
            VIDEO_BASE_DIR, model,
            f"vid_{video_id}_run_{model_runs.get(model, 1)}.mp4",
        )
        for model in MODELS
    }
    return {
        "topic": topic,
        "prompt": prompt,
        "expected": expected,
        "keywords": keywords,  # Add keywords to example dict
        "video_id": video_id,
        "model_runs": model_runs,
        "videos": videos,
    }
def build_examples(example_specs, csv_data):
    """Expand spec dicts into full example dicts, merging in CSV metadata.

    Each spec carries ``video_id`` and ``model_runs``; metadata missing from
    ``csv_data`` falls back to placeholder values.
    """
    examples = []
    for spec in example_specs:
        vid = spec["video_id"]
        meta = csv_data.get(vid, {})
        examples.append(
            build_example(
                meta.get('topic', f'Example {vid}'),
                meta.get('prompt', 'N/A'),
                meta.get('expected', 'N/A'),
                meta.get('keywords', ''),  # Get keywords
                vid,
                spec["model_runs"],
            )
        )
    return examples
# Load actual data
ratings_data = load_ratings()
csv_data = load_csv_data()

# Hand-picked showcase examples; each spec pins which run of each model to
# display for that video id.
example_specs = [
{"video_id": 113, "model_runs": {"bytedance-seedance-1-pro": 1, "kling-v2-5-turbo-pro": 1, "minimax-hailuo-2.3": 2, "ray-2": 1, "sora-2": 1, "veo3-quality": 1, "wan2.5-t2v-preview": 1}},
{"video_id": 143, "model_runs": {"bytedance-seedance-1-pro": 2, "kling-v2-5-turbo-pro": 2, "minimax-hailuo-2.3": 3, "ray-2": 2, "sora-2": 2, "veo3-quality": 2, "wan2.5-t2v-preview": 2}},
{"video_id": 175, "model_runs": {"bytedance-seedance-1-pro": 1, "kling-v2-5-turbo-pro": 1, "minimax-hailuo-2.3": 1, "ray-2": 1, "sora-2": 1, "veo3-quality": 1, "wan2.5-t2v-preview": 1}},
{"video_id": 138, "model_runs": {"bytedance-seedance-1-pro": 3, "kling-v2-5-turbo-pro": 3, "minimax-hailuo-2.3": 3, "ray-2": 3, "sora-2": 2, "veo3-quality": 1, "wan2.5-t2v-preview": 3}},
]
examples = build_examples(example_specs, csv_data)

# Provide a fallback dummy entry if loading fails
# (keeps the rest of the UI from indexing into an empty list).
if not examples:
    examples = [{"video_id": 0, "topic": "No Data", "prompt": "No Data", "expected": "No Data", "keywords": "", "videos": {}, "model_runs": {}}]
TOTAL_EXAMPLES = len(examples)
# ===== Responsive rating bar rendering =====
def render_rating_bars(ratings_data, video_id, model1, model2, run1, run2):
    """Render side-by-side per-dimension score bars for the two selected models.

    Missing ratings display as 0 with no bar glyphs.
    """
    # Container / header markup.
    # NOTE(review): this markup payload looks stripped — confirm against VCS.
    st.markdown(
        f"""
        """,
        unsafe_allow_html=True
    )
    for dim_key, dim_name in RATING_DIMENSIONS:
        score_left = get_rating(ratings_data, video_id, model1, dim_key, run1) or 0
        score_right = get_rating(ratings_data, video_id, model2, dim_key, run2) or 0
        # Bars are "██" repeated per point with a "▍" cap; zero scores show
        # just the number. Left bar precedes its number, right bar follows it.
        if score_left > 0:
            left_bar = f"{'██' * score_left}▍ {score_left}"
        else:
            left_bar = f"{score_left}"
        if score_right > 0:
            right_bar = f"{score_right} {'██' * score_right}▍"
        else:
            right_bar = f"{score_right}"
        st.markdown(
            f"""
            {left_bar}
            {dim_name}
            {right_bar}
            """,
            unsafe_allow_html=True
        )
# ===== Session state defaults (survive Streamlit reruns) =====
if "example_idx" not in st.session_state:
    st.session_state.example_idx = 0
if "model1" not in st.session_state:
    st.session_state.model1 = MODELS[4]  # Sora
if "model2" not in st.session_state:
    st.session_state.model2 = MODELS[5]  # Veo

# ===== Main layout =====
st.title("⚛️ VideoScience-Bench")
# NOTE: the variables are deliberately unpacked out of numeric order — tabs
# render left-to-right in the order listed, putting the leaderboard first
# while keeping the tab1/tab2/tab3 names used below.
tab2, tab3, tab1 = st.tabs(["🤖 Auto-Judge Leaderboard", "👥 Human Evaluation", "📹 Video Comparison"])
# ===== TAB 1: Video Comparison =====
with tab1:
    # --- 1. Minimal top navigation bar ---
    col_nav_1, col_nav_2, col_nav_3 = st.columns([1, 10, 1])
    with col_nav_1:
        # Previous-example button; disabled at the first example.
        if st.button("◀", key="prev", use_container_width=True, disabled=(st.session_state.example_idx == 0)):
            st.session_state.example_idx -= 1
            st.rerun()
    with col_nav_2:
        # Display centered example index and topic with keywords
        current = examples[st.session_state.example_idx]
        keywords_html = render_keywords(current.get('keywords', ''))
        # NOTE(review): surrounding HTML markup appears stripped from this
        # f-string; only raw text plus the keyword HTML is emitted — confirm.
        st.markdown(
            f"""
            EXAMPLE {st.session_state.example_idx + 1} / {TOTAL_EXAMPLES}
            {current['topic']}
            {keywords_html}
            """, unsafe_allow_html=True
        )
    with col_nav_3:
        # Next-example button; disabled at the last example.
        if st.button("▶", key="next", use_container_width=True, disabled=(st.session_state.example_idx == TOTAL_EXAMPLES - 1)):
            st.session_state.example_idx += 1
            st.rerun()
    # --- 2. Prompt and expectation (expanded by default) ---
    with st.expander("📝 View Prompt & Expectation Details", expanded=True):
        c1, c2 = st.columns(2)
        with c1:
            st.caption("PROMPT")
            st.write(current['prompt'])
        with c2:
            st.caption("EXPECTED PHENOMENON")
            st.write(current['expected'])
    # --- 3. Comparison control panel ---
    st.markdown("", unsafe_allow_html=True)
    ctrl_col1, ctrl_col2, ctrl_col3 = st.columns([3, 1, 3])
    with ctrl_col1:
        # Hidden label to save vertical space
        model1 = st.selectbox("Model Left", MODELS, index=MODELS.index(st.session_state.model1),
                              key="m1_select", label_visibility="collapsed")
        st.session_state.model1 = model1
    with ctrl_col2:
        # Trigger playback of both videos
        play = st.button("▶ Play Both", use_container_width=True, type="primary")
    with ctrl_col3:
        model2 = st.selectbox("Model Right", MODELS, index=MODELS.index(st.session_state.model2),
                              key="m2_select", label_visibility="collapsed")
        st.session_state.model2 = model2
    # --- 4. Video playback section ---
    vid_col1, vid_col2 = st.columns(2)
    # JavaScript-based autoplay
    if play:
        # NOTE(review): the autoplay <script> payload appears stripped from
        # this string; as written this injects an empty component — confirm.
        components.html("""
        """, height=0)
    with vid_col1:
        video_path1 = current["videos"].get(model1, "")
        if os.path.exists(video_path1):
            st.video(video_path1)
        else:
            st.error(f"❌ Video not found: {video_path1}")
    with vid_col2:
        video_path2 = current["videos"].get(model2, "")
        if os.path.exists(video_path2):
            st.video(video_path2)
        else:
            st.error(f"❌ Video not found: {video_path2}")
    # --- 5. Integrated rating bars ---
    render_rating_bars(
        ratings_data,
        current['video_id'],
        model1,
        model2,
        current["model_runs"].get(model1, 1),
        current["model_runs"].get(model2, 1)
    )
# ===== TAB 2: Auto-Judge Leaderboard (CL+CV) =====
with tab2:
    st.markdown("### 🤖 VideoScience-Judge Leaderboard")
    st.markdown("Scores are computed using an evidence-grounded scheme integrating prompt-specific checklist and CV-based analysis, then averaged across all dimensions and normalized to 1.", unsafe_allow_html=True)
    # Data from Table 2 – CL+CV column
    auto_data = {
        "Model": ["Kling-v3.0", "Sora-2", "Veo-3", "Kling-v2.5", "Wan-2.5", "Seedance-1.0-Pro", "Hailuo-2.3", "Ray2"],
        "Score": [0.78, 0.76, 0.65, 0.59, 0.59, 0.54, 0.50, 0.34]
    }
    df_auto = pd.DataFrame(auto_data).sort_values("Score", ascending=False).reset_index(drop=True)
    df_auto["Rank"] = range(1, len(df_auto) + 1)
    # Build leaderboard visualization
    for idx, row in df_auto.iterrows():
        rank = row["Rank"]
        model = row["Model"]
        score = row["Score"]
        # Assign visual badge style based on rank
        if rank == 1:
            badge_class = "rank-1"
            medal = "🥇"
        elif rank == 2:
            badge_class = "rank-2"
            medal = "🥈"
        elif rank == 3:
            badge_class = "rank-3"
            medal = "🥉"
        else:
            badge_class = "rank-other"
            medal = ""
        # Convert score to progress bar width
        bar_width = score * 100
        # NOTE(review): badge_class and bar_width are computed but unused in
        # the f-string below — the HTML that consumed them looks stripped;
        # confirm against VCS.
        st.markdown(f"""
        {rank}
        {medal} {model}
        {score:.2f}
        """, unsafe_allow_html=True)
# ===== TAB 3: Human Evaluation =====
@st.fragment
def render_human_rankings_section(df_human):
    """Render the expandable per-model human-score ranking list.

    Wrapped in ``st.fragment`` so the expand/collapse buttons rerun only this
    section instead of the whole page. Expects ``df_human`` sorted by the
    'Average' column with columns PCS/PCG/CDN/IMB/STC.
    """
    # 1. Initialize session state for expander control
    if 'expander_state' not in st.session_state:
        st.session_state['expander_state'] = False
    # 2. Control buttons for expand/collapse all
    col1, col2, col3 = st.columns([6, 1, 1])
    with col1:
        st.markdown("### 📊 Complete Rankings")
    with col2:
        if st.button("➕ Expand All", use_container_width=True, type="secondary"):
            st.session_state['expander_state'] = True
            st.rerun()
    with col3:
        if st.button("➖ Collapse All", use_container_width=True, type="secondary"):
            st.session_state['expander_state'] = False
            st.rerun()
    st.markdown("", unsafe_allow_html=True)
    for idx, row in df_human.iterrows():
        rank = idx + 1
        model = row["Model"]
        avg = row["Average"]
        # Medals only for the top three ranks.
        medal = ["🥇", "🥈", "🥉"][rank - 1] if rank <= 3 else ""
        # Expandable ranking entry
        with st.expander(f"**#{rank} {medal} {model}** — Avg: {avg:.2f}", expanded=st.session_state['expander_state']):
            cols = st.columns(5)
            # FIX: the STC label was garbled ("CoherSpatio-Temporal
            # Coherenceence"); restored to "Spatio-Temporal Coherence" to
            # match RATING_DIMENSIONS.
            dimensions = [("PCS", "Prompt Consistency"), ("PCG", "Phenomenon Congruency"),
                          ("CDN", "Correct Dynamism"), ("IMB", "Immutability"), ("STC", "Spatio-Temporal Coherence")]
            for col, (key, name) in zip(cols, dimensions):
                score = row[key]
                col.metric(name, f"{score:.2f}", delta=None)
        # FIX: this trailing spacer call contained an unterminated string
        # literal split across two lines (SyntaxError); collapsed to one line.
        st.markdown("", unsafe_allow_html=True)
with tab3:
    st.markdown("### 👥 Human Evaluation Scores")
    st.markdown("Mean annotator scores from a 1–4 Likert scale.", unsafe_allow_html=True)
    # Human evaluation dataset (Table 1)
    human_data = {
        "Model": ["Sora-2", "Veo-3", "Kling-v2.5", "Wan-2.5", "Seedance-1.0-Pro", "Hailuo-2.3", "Ray2"],
        "PCS": [3.32, 3.01, 2.77, 2.87, 2.56, 2.39, 1.65],
        "PCG": [2.56, 2.35, 1.91, 1.84, 1.78, 1.67, 1.26],
        "CDN": [3.33, 2.83, 2.75, 2.83, 2.52, 2.57, 2.13],
        "IMB": [3.73, 3.30, 3.36, 3.36, 3.15, 3.16, 2.44],
        "STC": [3.71, 3.42, 3.60, 3.46, 3.46, 3.46, 2.92]
    }
    df_human = pd.DataFrame(human_data)
    # Rank models by the unweighted mean of the five dimensions.
    df_human["Average"] = df_human[["PCS", "PCG", "CDN", "IMB", "STC"]].mean(axis=1)
    df_human = df_human.sort_values("Average", ascending=False).reset_index(drop=True)
    # Radar chart for human scores
    fig = go.Figure()
    # FIX: the first two labels contained raw newlines inside single-quoted
    # strings (a SyntaxError); Plotly tick labels use <br> for line breaks.
    categories = ['Prompt<br>Consistency', 'Phenomenon<br>Congruency', 'Dynamism', 'Immutability', 'Coherence']
    colors = ['#667eea', '#f093fb', '#4facfe', '#43cea2', '#ff9a9e', '#fbc2eb', '#90f7ec']
    # One polar trace per model, cycling through the color palette.
    for idx in range(len(df_human)):
        row = df_human.iloc[idx]
        values = [row["PCS"], row["PCG"], row["CDN"], row["IMB"], row["STC"]]
        fig.add_trace(go.Scatterpolar(
            r=values,
            theta=categories,
            fill='toself',
            name=row["Model"],
            line=dict(color=colors[idx % len(colors)], width=2),
            marker=dict(size=8)
        ))
    fig.update_layout(
        polar=dict(
            # Radial axis spans the full 1–4 Likert range (0 anchors the fill).
            # NOTE(review): red radial tick labels look like a debug leftover —
            # confirm the intended color.
            radialaxis=dict(visible=True, range=[0, 4], tickfont=dict(size=10, color='red')),
            angularaxis=dict(tickfont=dict(size=11))
        ),
        showlegend=True,
        height=450,
        margin=dict(l=80, r=80, t=40, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=-0.15, xanchor="center", x=0.5)
    )
    st.plotly_chart(fig, use_container_width=True)
    render_human_rankings_section(df_human)