videoscience-bench / src /streamlit_app.py
lmgame's picture
Add Kling-v3.0
e6515dd verified
import streamlit as st
import pandas as pd
import os
import json
import streamlit.components.v1 as components
import plotly.graph_objects as go
import hashlib
# Page configuration: wide layout, sidebar collapsed on first load.
# NOTE(review): Streamlit documents set_page_config as having to run before
# other st.* calls — keep this statement ahead of the CSS injection below.
st.set_page_config(page_title="VideoScience-Bench", layout="wide", initial_sidebar_state="collapsed")
# ===== CSS styling: compact layout and modernized UI =====
# One <style> element is injected through st.markdown; unsafe_allow_html=True is
# required so the tag is emitted as HTML instead of being displayed as text.
# The rules cover: page padding, block gaps, tabs, select boxes, buttons, the
# rating container, leaderboard metric cards, rank badges, the ".rankings-section"
# expander theme (with a light-mode override) and the keyword tags.
# The CSS text is runtime content: edit it deliberately, not cosmetically.
st.markdown("""
<style>
/* 1. Reduce top spacing */
.block-container {
padding-top: 1.5rem;
padding-bottom: 2rem;
padding-left: 2rem;
padding-right: 2rem;
}
/* 2. Compress global component spacing */
div[data-testid="stVerticalBlock"] > div {
gap: 0.5rem !important;
}
div[data-testid="stHorizontalBlock"] {
gap: 0.5rem !important;
}
/* 3. Refined styling for Tabs */
.stTabs [data-baseweb="tab-list"] {
gap: 4px;
margin-bottom: 0.5rem;
}
.stTabs [data-baseweb="tab"] {
padding: 4px 12px;
font-size: 14px;
}
/* 4. Dropdown styling refinement for compact appearance */
div[data-baseweb="select"] > div {
min-height: 32px;
padding-top: 0;
padding-bottom: 0;
}
/* 5. Enhanced button styling for play controls */
div.stButton > button {
width: 100%;
border-radius: 6px;
padding: 0.25rem 0.5rem;
line-height: 1.2;
}
/* 6. Container styling for rating bars */
.rating-container {
background-color: #f8f9fa;
border-radius: 6px;
padding: 8px;
margin-top: 0px;
border: 1px solid #eee;
}
/* 7. Metric card styling used in leaderboard */
.metric-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 1rem;
border-radius: 8px;
color: white;
text-align: center;
margin: 0.5rem 0;
}
/* Rank badge styling */
.rank-badge {
display: inline-block;
width: 24px;
height: 24px;
line-height: 24px;
border-radius: 50%;
text-align: center;
font-weight: bold;
font-size: 12px;
margin-right: 6px;
}
.rank-1 { background: linear-gradient(135deg, #FFD700, #FFA500); color: #000; }
.rank-2 { background: linear-gradient(135deg, #C0C0C0, #808080); color: #000; }
.rank-3 { background: linear-gradient(135deg, #CD7F32, #8B4513); color: #fff; }
.rank-other { background: linear-gradient(135deg, #e0e0e0, #bdbdbd); color: #000; }
/* Styling for complete rankings expanders to align with Auto-Judge theme */
.rankings-section div[data-testid="stExpander"] {
background: linear-gradient(90deg, rgba(102, 126, 234, 0.15) 0%, rgba(118, 75, 162, 0.15) 100%) !important;
border: 1px solid rgba(118, 75, 162, 0.3) !important;
border-radius: 10px !important;
color: #f0f4ff !important;
}
/* Larger, centered text for expander headers */
.rankings-section div[data-testid="stExpander"] summary {
font-size: 20px !important;
font-weight: 600 !important;
padding: 16px 0 !important;
display: flex !important;
align-items: center !important;
justify-content: center !important;
text-align: center !important;
line-height: 1.6 !important;
min-height: 56px !important;
width: 100%;
color: #f0f4ff !important;
}
/* Ensure nested text elements in headers remain centered */
.rankings-section div[data-testid="stExpander"] summary p,
.rankings-section div[data-testid="stExpander"] summary span,
.rankings-section div[data-testid="stExpander"] summary div {
margin: 0 auto !important;
display: inline-flex !important;
align-items: center !important;
justify-content: center !important;
line-height: 1.6 !important;
text-align: center !important;
color: #f0f4ff !important;
}
/* Larger metric font inside expanders */
.rankings-section div[data-testid="stExpander"] [data-testid="stMetricValue"] {
font-size: 24px !important;
}
.rankings-section div[data-testid="stExpander"] [data-testid="stMetricLabel"] {
font-size: 16px !important;
}
/* Adjustments for light mode to match leaderboard color scheme */
body[data-theme="light"] .rankings-section div[data-testid="stExpander"] {
background: linear-gradient(90deg, rgba(102, 126, 234, 0.1) 0%, rgba(118, 75, 162, 0.1) 100%) !important;
border: 1px solid rgba(118, 75, 162, 0.25) !important;
color: #1f1f2d !important;
}
body[data-theme="light"] .rankings-section div[data-testid="stExpander"] summary,
body[data-theme="light"] .rankings-section div[data-testid="stExpander"] summary *,
body[data-theme="light"] .rankings-section div[data-testid="stExpander"] [data-testid="stMetricLabel"],
body[data-theme="light"] .rankings-section div[data-testid="stExpander"] [data-testid="stMetricValue"] {
color: #1f1f2d !important;
}
/* Keyword tag styling - Notion-like */
.keyword-tag {
display: inline-block;
padding: 3px 10px;
margin: 3px 4px;
border-radius: 4px;
font-size: 13px;
font-weight: 500;
white-space: nowrap;
transition: transform 0.2s ease;
}
.keyword-tag:hover {
transform: translateY(-1px);
}
.keywords-container {
display: inline-flex;
flex-wrap: wrap;
align-items: center;
justify-content: center;
margin-top: 8px;
gap: 2px;
}
@media (max-width: 768px) {
.keyword-tag {
font-size: 11px;
padding: 2px 8px;
margin: 2px 3px;
}
}
</style>
""", unsafe_allow_html=True)
# ===== Data loading section (actual logic) =====
# Input locations, relative to the working directory of the app.
RATINGS_FILE = "ratings.json"
VIDEO_BASE_DIR = "downloaded_videos"
CSV_FILE = "Examples.csv"

# Video-directory model name -> model key used inside the ratings JSON.
MODEL_NAME_MAP = dict(
    [
        ("bytedance-seedance-1-pro", "seed-dance"),
        ("kling-v2-5-turbo-pro", "klingv2.5"),
        ("minimax-hailuo-2.3", "hailuo2.3"),
        ("ray-2", "ray-2"),
        ("sora-2", "sora-2"),
        ("veo3-quality", "veo3"),
        ("wan2.5-t2v-preview", "wan2.5"),
    ]
)

# Selectable models, in display order (same order as the mapping above).
MODELS = list(MODEL_NAME_MAP)

# (ratings JSON key, label shown in the UI) for every rating dimension.
RATING_DIMENSIONS = [
    ("prompt_consistency", "Prompt Consistency"),
    ("expected_phenomenon", "Phenomenon Congruency"),
    ("dynamism", "Correct Dynamism"),
    ("immutability", "Immutability"),
    ("coherence", "Spatio-Temporal Coherence"),
]
def generate_tag_color(keyword):
    """Pick a (background, text) colour pair for *keyword*.

    The pair is selected from a fixed ten-entry palette using the MD5 digest
    of the keyword, so the same keyword always maps to the same colours.
    """
    # Notion-inspired palette: (background hex, text hex).
    palette = (
        ('#FEE2E2', '#991B1B'),  # Red
        ('#FFEDD5', '#9A3412'),  # Orange
        ('#FEF3C7', '#92400E'),  # Yellow
        ('#D1FAE5', '#065F46'),  # Green
        ('#DBEAFE', '#1E40AF'),  # Blue
        ('#E0E7FF', '#3730A3'),  # Indigo
        ('#F3E8FF', '#6B21A8'),  # Purple
        ('#FCE7F3', '#9F1239'),  # Pink
        ('#E5E7EB', '#374151'),  # Gray
        ('#D1F5FF', '#0369A1'),  # Cyan
    )
    # The digest read as a big-endian integer equals int(hexdigest, 16).
    digest = hashlib.md5(keyword.encode()).digest()
    slot = int.from_bytes(digest, "big") % len(palette)
    return palette[slot]
def render_keywords(keywords_str):
    """Render a comma-separated keyword string as Notion-style HTML tags.

    Args:
        keywords_str: Comma-separated keywords. Falsy values and NaN (as
            detected by ``pd.isna``) are treated as "no keywords".

    Returns:
        An HTML fragment: one ``<span class='keyword-tag'>`` per non-blank
        keyword inside a ``<div class='keywords-container'>``, or ``""`` when
        there is nothing to render.
    """
    # Function-scope import keeps the block self-contained.
    import html

    if not keywords_str or pd.isna(keywords_str):
        return ""
    keywords = [kw.strip() for kw in str(keywords_str).split(',') if kw.strip()]
    if not keywords:
        return ""
    tags = []
    for keyword in keywords:
        # Colours are derived from the raw keyword so they stay stable.
        bg_color, text_color = generate_tag_color(keyword)
        # The fragment is rendered with unsafe_allow_html, so the keyword text
        # must be escaped or characters such as "<" and "&" corrupt the markup.
        tags.append(
            f"""<span class='keyword-tag' style='background-color: {bg_color}; color: {text_color};'>
{html.escape(keyword)}
</span>"""
        )
    return "<div class='keywords-container'>" + "".join(tags) + "</div>"
@st.cache_data
def load_ratings():
    """Read the ratings JSON file and return its parsed content.

    Returns:
        The decoded JSON from ``RATINGS_FILE``, or ``{}`` when the file does
        not exist or any error occurs while reading it (the error is reported
        with ``st.error``).
    """
    try:
        if not os.path.exists(RATINGS_FILE):
            return {}
        with open(RATINGS_FILE, 'r', encoding='utf-8') as handle:
            ratings = json.load(handle)
    except Exception as e:
        st.error(f"Error loading ratings: {e}")
        return {}
    return ratings
@st.cache_data
def load_csv_data():
    """Load example metadata from ``CSV_FILE`` keyed by integer example ID.

    Rows without a ``Unique ID`` are skipped. For every other row the
    ``Prompts``, ``Expected phenomenon``, ``Example Title`` and ``Keywords``
    columns are read; a column that is missing *or* whose cell is empty falls
    back to ``'N/A'``, ``'N/A'``, ``'Example <id>'`` and ``''`` respectively.

    Returns:
        ``{int_id: {'prompt', 'expected', 'topic', 'keywords'}}``, or ``{}``
        when the file does not exist or loading fails (the error is reported
        with ``st.error``).
    """

    def _cell(row, column, fallback):
        # Series.get only applies its default when the column is absent; an
        # empty cell comes back as NaN, so it needs the same fallback.
        value = row.get(column)
        return fallback if pd.isna(value) else value

    try:
        if not os.path.exists(CSV_FILE):
            return {}
        df = pd.read_csv(CSV_FILE, encoding='utf-8-sig')
        data_map = {}
        for _, row in df.iterrows():
            unique_id = row.get('Unique ID')
            if pd.isna(unique_id):
                continue
            # IDs may be parsed as floats (e.g. 113.0); normalise once so the
            # dict key and the fallback title both use the integer form.
            example_id = int(unique_id)
            data_map[example_id] = {
                'prompt': _cell(row, 'Prompts', 'N/A'),
                'expected': _cell(row, 'Expected phenomenon', 'N/A'),
                'topic': _cell(row, 'Example Title', f'Example {example_id}'),
                'keywords': _cell(row, 'Keywords', ''),
            }
        return data_map
    except Exception as e:
        st.error(f"Error loading CSV file: {e}")
        return {}
def get_rating(ratings_data, video_id, model_name, dimension, run_number):
    """Look up one rating, or return None when any level of the lookup is missing.

    The ratings are addressed as
    ``ratings_data[str(video_id)][dimension][json_model_name][run_number - 1]``,
    where ``json_model_name`` is ``model_name`` translated via ``MODEL_NAME_MAP``.
    ``run_number`` is 1-based.
    """
    video_key = str(video_id)
    if video_key not in ratings_data:
        return None
    by_dimension = ratings_data[video_key]
    if dimension not in by_dimension:
        return None

    json_model_name = MODEL_NAME_MAP.get(model_name)
    if not json_model_name:
        return None
    by_model = by_dimension[dimension]
    if json_model_name not in by_model:
        return None

    runs = by_model[json_model_name]
    out_of_range = run_number < 1 or run_number > len(runs)
    return None if out_of_range else runs[run_number - 1]
def build_example(topic, prompt, expected, keywords, video_id, model_runs):
    """Assemble the dict describing one showcase example.

    For every entry of ``MODELS`` the video path is
    ``VIDEO_BASE_DIR/<model>/vid_<video_id>_run_<run>.mp4``, where ``<run>`` is
    taken from ``model_runs`` and defaults to 1 for models not listed there.
    """
    video_paths = {
        name: os.path.join(
            VIDEO_BASE_DIR,
            name,
            f"vid_{video_id}_run_{model_runs.get(name, 1)}.mp4",
        )
        for name in MODELS
    }
    return dict(
        topic=topic,
        prompt=prompt,
        expected=expected,
        keywords=keywords,
        video_id=video_id,
        model_runs=model_runs,
        videos=video_paths,
    )
def build_examples(example_specs, csv_data):
    """Turn example specs into example dicts, enriched with CSV metadata.

    Each spec supplies ``video_id`` and ``model_runs``. Topic, prompt,
    expectation and keywords come from ``csv_data[video_id]``; when the entry
    or a field is missing, the placeholders ``'Example <id>'``, ``'N/A'``,
    ``'N/A'`` and ``''`` are used.
    """

    def _from_spec(spec):
        vid = spec["video_id"]
        runs = spec["model_runs"]
        meta = csv_data.get(vid, {})
        return build_example(
            topic=meta.get('topic', f'Example {vid}'),
            prompt=meta.get('prompt', 'N/A'),
            expected=meta.get('expected', 'N/A'),
            keywords=meta.get('keywords', ''),
            video_id=vid,
            model_runs=runs,
        )

    return [_from_spec(spec) for spec in example_specs]
# Load the ratings and the per-example CSV metadata.
ratings_data = load_ratings()
csv_data = load_csv_data()

# Examples shown in the comparison tab: the video ID plus, per model, which
# run's video (and rating) to display.
example_specs = [
    {
        "video_id": 113,
        "model_runs": {
            "bytedance-seedance-1-pro": 1,
            "kling-v2-5-turbo-pro": 1,
            "minimax-hailuo-2.3": 2,
            "ray-2": 1,
            "sora-2": 1,
            "veo3-quality": 1,
            "wan2.5-t2v-preview": 1,
        },
    },
    {
        "video_id": 143,
        "model_runs": {
            "bytedance-seedance-1-pro": 2,
            "kling-v2-5-turbo-pro": 2,
            "minimax-hailuo-2.3": 3,
            "ray-2": 2,
            "sora-2": 2,
            "veo3-quality": 2,
            "wan2.5-t2v-preview": 2,
        },
    },
    {
        "video_id": 175,
        "model_runs": {
            "bytedance-seedance-1-pro": 1,
            "kling-v2-5-turbo-pro": 1,
            "minimax-hailuo-2.3": 1,
            "ray-2": 1,
            "sora-2": 1,
            "veo3-quality": 1,
            "wan2.5-t2v-preview": 1,
        },
    },
    {
        "video_id": 138,
        "model_runs": {
            "bytedance-seedance-1-pro": 3,
            "kling-v2-5-turbo-pro": 3,
            "minimax-hailuo-2.3": 3,
            "ray-2": 3,
            "sora-2": 2,
            "veo3-quality": 1,
            "wan2.5-t2v-preview": 3,
        },
    },
]

# Fall back to a single placeholder entry when no example could be built.
examples = build_examples(example_specs, csv_data) or [
    {
        "video_id": 0,
        "topic": "No Data",
        "prompt": "No Data",
        "expected": "No Data",
        "keywords": "",
        "videos": {},
        "model_runs": {},
    }
]
TOTAL_EXAMPLES = len(examples)
# ===== Responsive rating bar rendering =====
def render_rating_bars(ratings_data, video_id, model1, model2, run1, run2):
    """Render a mirrored, text-based rating comparison for two models.

    A header row (left model, "Rating Dimensions", right model) is emitted
    first together with the CSS it needs, followed by one row per entry of
    ``RATING_DIMENSIONS``. Ratings are fetched with ``get_rating``; a missing
    rating is shown as 0 with no bar.

    Args:
        ratings_data: Ratings structure accepted by ``get_rating``.
        video_id: ID of the example being shown.
        model1, model2: Model names for the left and right side.
        run1, run2: 1-based run numbers for the left and right model.
    """

    def _bar(score):
        # Two block glyphs per rating point, closed by a thin end cap.
        return "██" * score + "▍"

    header_html = f"""
<style>
.rating-container {{
font-family: monospace;
overflow-x: auto;
-webkit-overflow-scrolling: touch;
color: var(--text-color);
background-color: transparent;
}}
.rating-header {{
font-size: clamp(12px, 3vw, 20px);
margin: 12px 0;
white-space: nowrap;
}}
.rating-row {{
font-size: clamp(12px, 3vw, 20px);
margin: 10px 0;
line-height: 1.6;
white-space: nowrap;
}}
.model-left {{
color: #FF6B6B;
display: inline-block;
width: 28%;
min-width: 120px;
text-align: right;
font-weight: 700;
}}
.model-right {{
color: #4ECDC4;
display: inline-block;
width: 28%;
min-width: 120px;
text-align: left;
font-weight: 700;
}}
.dimension-center {{
display: inline-block;
width: 42%;
min-width: 150px;
text-align: center;
font-weight: bold;
}}
.score-left {{
color: #FF6B6B;
display: inline-block;
width: 28%;
min-width: 120px;
text-align: right;
font-weight: 600;
}}
.score-right {{
color: #4ECDC4;
display: inline-block;
width: 28%;
min-width: 120px;
text-align: left;
font-weight: 600;
}}
.dim-name {{
display: inline-block;
width: 42%;
min-width: 150px;
text-align: center;
font-weight: 700;
}}
@media (max-width: 768px) {{
.rating-container {{
padding: 0 8px;
}}
}}
</style>
<div class='rating-container'>
<div class='rating-header'>
<span class='model-left'>{model1}</span>
<span class='dimension-center'>Rating Dimensions</span>
<span class='model-right'>{model2}</span>
</div>
</div>
"""
    st.markdown(header_html, unsafe_allow_html=True)

    for dim_key, dim_name in RATING_DIMENSIONS:
        # A rating that cannot be found (None) is displayed as 0.
        left_score = get_rating(ratings_data, video_id, model1, dim_key, run1) or 0
        right_score = get_rating(ratings_data, video_id, model2, dim_key, run2) or 0

        # Left side reads "bar score", right side mirrors it as "score bar".
        if left_score > 0:
            left_bar = f"{_bar(left_score)} {left_score}"
        else:
            left_bar = f"{left_score}"
        if right_score > 0:
            right_bar = f"{right_score} {_bar(right_score)}"
        else:
            right_bar = f"{right_score}"

        row_html = f"""
<div class='rating-container'>
<div class='rating-row'>
<span class='score-left'>{left_bar}</span>
<span class='dim-name'>{dim_name}</span>
<span class='score-right'>{right_bar}</span>
</div>
</div>
"""
        st.markdown(row_html, unsafe_allow_html=True)
if "example_idx" not in st.session_state:
st.session_state.example_idx = 0
if "model1" not in st.session_state:
st.session_state.model1 = MODELS[4] # Sora
if "model2" not in st.session_state:
st.session_state.model2 = MODELS[5] # Veo
# ===== Main layout =====
st.title("⚛️ VideoScience-Bench")
tab2, tab3, tab1 = st.tabs(["🤖 Auto-Judge Leaderboard", "👥 Human Evaluation", "📹 Video Comparison"])
# ===== TAB 1: Video Comparison =====
# Side-by-side comparison of two models on the currently selected example:
# navigation bar, prompt/expectation details, model pickers, the two videos and
# the per-dimension rating bars. State that has to persist (example index and
# both model choices) is kept in st.session_state.
with tab1:
    # --- 1. Minimal top navigation bar ---
    col_nav_1, col_nav_2, col_nav_3 = st.columns([1, 10, 1])
    with col_nav_1:
        # "Previous" is disabled on the first example, so the index never goes below 0.
        if st.button("◀", key="prev", use_container_width=True, disabled=(st.session_state.example_idx == 0)):
            st.session_state.example_idx -= 1
            st.rerun()
    with col_nav_2:
        # Display centered example index and topic with keywords
        # `current` is assigned here and reused by every section further down.
        current = examples[st.session_state.example_idx]
        keywords_html = render_keywords(current.get('keywords', ''))
        st.markdown(
            f"""
<div style='text-align: center; margin-top: -5px;'>
<span style='font-size: 14px; color: #888;'>EXAMPLE {st.session_state.example_idx + 1} / {TOTAL_EXAMPLES}</span><br>
<span style='font-size: 18px; font-weight: 700;'>{current['topic']}</span>
{keywords_html}
</div>
""", unsafe_allow_html=True
        )
    with col_nav_3:
        # "Next" is disabled on the last example.
        if st.button("▶", key="next", use_container_width=True, disabled=(st.session_state.example_idx == TOTAL_EXAMPLES - 1)):
            st.session_state.example_idx += 1
            st.rerun()
    # --- 2. Prompt and expectation (expanded by default) ---
    with st.expander("📝 View Prompt & Expectation Details", expanded=True):
        c1, c2 = st.columns(2)
        with c1:
            st.caption("PROMPT")
            st.write(current['prompt'])
        with c2:
            st.caption("EXPECTED PHENOMENON")
            st.write(current['expected'])
    # --- 3. Comparison control panel ---
    st.markdown("<div style='margin-bottom: 5px;'></div>", unsafe_allow_html=True)
    ctrl_col1, ctrl_col2, ctrl_col3 = st.columns([3, 1, 3])
    with ctrl_col1:
        # Hidden label to save vertical space
        # The selection is written back to session state right after the widget.
        model1 = st.selectbox("Model Left", MODELS, index=MODELS.index(st.session_state.model1),
                              key="m1_select", label_visibility="collapsed")
        st.session_state.model1 = model1
    with ctrl_col2:
        # Trigger playback of both videos
        play = st.button("▶ Play Both", use_container_width=True, type="primary")
    with ctrl_col3:
        model2 = st.selectbox("Model Right", MODELS, index=MODELS.index(st.session_state.model2),
                              key="m2_select", label_visibility="collapsed")
        st.session_state.model2 = model2
    # --- 4. Video playback section ---
    vid_col1, vid_col2 = st.columns(2)
    # JavaScript-based autoplay
    # The snippet runs inside a zero-height component and, after 100 ms, rewinds
    # and plays every <video> element it finds in the parent document.
    if play:
        components.html("""
<script>
setTimeout(() => {
const videos = window.parent.document.querySelectorAll('video');
videos.forEach(v => { v.currentTime = 0; v.play(); });
}, 100);
</script>
""", height=0)
    with vid_col1:
        # An unknown model yields "", which os.path.exists rejects, so the
        # error branch also covers the placeholder example without videos.
        video_path1 = current["videos"].get(model1, "")
        if os.path.exists(video_path1):
            st.video(video_path1)
        else:
            st.error(f"❌ Video not found: {video_path1}")
    with vid_col2:
        video_path2 = current["videos"].get(model2, "")
        if os.path.exists(video_path2):
            st.video(video_path2)
        else:
            st.error(f"❌ Video not found: {video_path2}")
    # --- 5. Integrated rating bars ---
    # The run numbers match the videos shown above; models without an entry use run 1.
    render_rating_bars(
        ratings_data,
        current['video_id'],
        model1,
        model2,
        current["model_runs"].get(model1, 1),
        current["model_runs"].get(model2, 1)
    )
# ===== TAB 2: Auto-Judge Leaderboard (CL+CV) =====
with tab2:
    st.markdown("### 🤖 VideoScience-Judge Leaderboard")
    st.markdown("<small>Scores are computed using an evidence-grounded scheme integrating prompt-specific checklist and CV-based analysis, then averaged across all dimensions and normalized to 1.</small>", unsafe_allow_html=True)
    # Data from Table 2 – CL+CV column
    auto_data = {
        "Model": ["Kling-v3.0", "Sora-2", "Veo-3", "Kling-v2.5", "Wan-2.5", "Seedance-1.0-Pro", "Hailuo-2.3", "Ray2"],
        "Score": [0.78, 0.76, 0.65, 0.59, 0.59, 0.54, 0.50, 0.34]
    }
    # Highest score first; the rank is the 1-based position after sorting.
    df_auto = pd.DataFrame(auto_data).sort_values("Score", ascending=False).reset_index(drop=True)
    df_auto["Rank"] = range(1, len(df_auto) + 1)
    # Badge CSS class and medal for the top three ranks; all other ranks get
    # the neutral badge and no medal.
    podium_styles = {
        1: ("rank-1", "🥇"),
        2: ("rank-2", "🥈"),
        3: ("rank-3", "🥉"),
    }
    # Build leaderboard visualization
    for idx, row in df_auto.iterrows():
        rank, model, score = row["Rank"], row["Model"], row["Score"]
        badge_class, medal = podium_styles.get(rank, ("rank-other", ""))
        # Convert score to progress bar width
        bar_width = score * 100
        card_html = f"""
<div style='background: linear-gradient(90deg, rgba(102,126,234,0.1) 0%, rgba(118,75,162,0.1) 100%);
padding: 12px; border-radius: 8px; margin: 8px 0; border-left: 4px solid #667eea;'>
<div style='display: flex; align-items: center; justify-content: space-between;'>
<div style='display: flex; align-items: center; gap: 12px;'>
<span class='rank-badge {badge_class}'>{rank}</span>
<span style='font-weight: 600; font-size: 16px;'>{medal} {model}</span>
</div>
<div style='font-weight: 700; font-size: 20px; color: #667eea;'>{score:.2f}</div>
</div>
<div style='width: 100%; height: 6px; background: #e0e0e0; border-radius: 3px; margin-top: 8px; overflow: hidden;'>
<div style='width: {bar_width}%; height: 100%; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);'></div>
</div>
</div>
"""
        st.markdown(card_html, unsafe_allow_html=True)
# ===== TAB 3: Human Evaluation =====
@st.fragment
def render_human_rankings_section(df_human):
    """Render the "Complete Rankings" list with expand/collapse-all controls.

    One expander is drawn per row of ``df_human``, in row order, showing the
    model's rank, name and average score in the header and its five dimension
    scores as metrics inside.

    Args:
        df_human: DataFrame with the columns ``Model``, ``Average``, ``PCS``,
            ``PCG``, ``CDN``, ``IMB`` and ``STC``. Rows are expected to be
            sorted best-first already; the rank is the 1-based row position.
    """
    # Whether all expanders are open; shared through session state so the
    # two buttons below can flip it.
    if 'expander_state' not in st.session_state:
        st.session_state['expander_state'] = False

    col1, col2, col3 = st.columns([6, 1, 1])
    with col1:
        st.markdown("### 📊 Complete Rankings")
    with col2:
        if st.button("➕ Expand All", use_container_width=True, type="secondary"):
            st.session_state['expander_state'] = True
            st.rerun()
    with col3:
        if st.button("➖ Collapse All", use_container_width=True, type="secondary"):
            st.session_state['expander_state'] = False
            st.rerun()

    # Loop invariants: medals for the top three and the (column, label) pairs.
    medals = ["🥇", "🥈", "🥉"]
    dimensions = [
        ("PCS", "Prompt Consistency"),
        ("PCG", "Phenomenon Congruency"),
        ("CDN", "Correct Dynamism"),
        ("IMB", "Immutability"),
        ("STC", "Spatio-Temporal Coherence"),
    ]

    # NOTE(review): the opening and closing div are emitted by separate
    # st.markdown calls, so they may not actually wrap the expanders in the
    # rendered DOM — confirm that the ".rankings-section" CSS takes effect.
    st.markdown("<div class='rankings-section'>", unsafe_allow_html=True)
    # Rank by position rather than by index label, so the numbering stays
    # correct even if the caller did not reset the index after sorting.
    for rank, (_, row) in enumerate(df_human.iterrows(), start=1):
        model = row["Model"]
        avg = row["Average"]
        medal = medals[rank - 1] if rank <= len(medals) else ""
        # Expandable ranking entry
        with st.expander(f"**#{rank} {medal} {model}** — Avg: {avg:.2f}", expanded=st.session_state['expander_state']):
            cols = st.columns(len(dimensions))
            for col, (key, name) in zip(cols, dimensions):
                col.metric(name, f"{row[key]:.2f}", delta=None)
    st.markdown("</div>", unsafe_allow_html=True)
with tab3:
    st.markdown("### 👥 Human Evaluation Scores")
    st.markdown("<small>Mean annotator scores from a 1–4 Likert scale.</small>", unsafe_allow_html=True)
    # Human evaluation dataset (Table 1)
    human_data = {
        "Model": ["Sora-2", "Veo-3", "Kling-v2.5", "Wan-2.5", "Seedance-1.0-Pro", "Hailuo-2.3", "Ray2"],
        "PCS": [3.32, 3.01, 2.77, 2.87, 2.56, 2.39, 1.65],
        "PCG": [2.56, 2.35, 1.91, 1.84, 1.78, 1.67, 1.26],
        "CDN": [3.33, 2.83, 2.75, 2.83, 2.52, 2.57, 2.13],
        "IMB": [3.73, 3.30, 3.36, 3.36, 3.15, 3.16, 2.44],
        "STC": [3.71, 3.42, 3.60, 3.46, 3.46, 3.46, 2.92]
    }
    # Score columns, in the order they are averaged and plotted.
    score_columns = ["PCS", "PCG", "CDN", "IMB", "STC"]
    df_human = pd.DataFrame(human_data)
    df_human["Average"] = df_human[score_columns].mean(axis=1)
    # Best average first, with a fresh 0..n-1 index.
    df_human = df_human.sort_values("Average", ascending=False).reset_index(drop=True)
    # Radar chart for human scores: one trace per model, one axis per dimension.
    fig = go.Figure()
    categories = ['Prompt<br>Consistency', 'Phenomenon<br>Congruency', 'Dynamism', 'Immutability', 'Coherence']
    colors = ['#667eea', '#f093fb', '#4facfe', '#43cea2', '#ff9a9e', '#fbc2eb', '#90f7ec']
    for idx, (_, row) in enumerate(df_human.iterrows()):
        trace = go.Scatterpolar(
            r=[row[column] for column in score_columns],
            theta=categories,
            fill='toself',
            name=row["Model"],
            # Colours cycle if there are more models than palette entries.
            line=dict(color=colors[idx % len(colors)], width=2),
            marker=dict(size=8),
        )
        fig.add_trace(trace)
    radial_axis = dict(visible=True, range=[0, 4], tickfont=dict(size=10, color='red'))
    angular_axis = dict(tickfont=dict(size=11))
    fig.update_layout(
        polar=dict(radialaxis=radial_axis, angularaxis=angular_axis),
        showlegend=True,
        height=450,
        margin=dict(l=80, r=80, t=40, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=-0.15, xanchor="center", x=0.5),
    )
    st.plotly_chart(fig, use_container_width=True)
    render_human_rankings_section(df_human)