# Hugging Face Space: lmgame/videoscience-bench (status: Running; file size: 26,177 bytes)

import streamlit as st
import pandas as pd
import os
import json
import streamlit.components.v1 as components
import plotly.graph_objects as go
import hashlib

# Page configuration: browser tab title, wide layout, sidebar collapsed on load
st.set_page_config(page_title="VideoScience-Bench", layout="wide", initial_sidebar_state="collapsed")

# ===== Global CSS, injected as raw HTML on every run =====
# Covers: compact page/component spacing, tabs, select boxes, buttons, the
# rating container, leaderboard metric cards and rank badges, the
# `.rankings-section` expander theme (with light-theme overrides), and the
# keyword tags produced by render_keywords().
st.markdown("""
<style>
    /* 1. Reduce top spacing */
    .block-container {
        padding-top: 1.5rem;
        padding-bottom: 2rem;
        padding-left: 2rem;
        padding-right: 2rem;
    }
    
    /* 2. Compress global component spacing */
    div[data-testid="stVerticalBlock"] > div {
        gap: 0.5rem !important;
    }
    div[data-testid="stHorizontalBlock"] {
        gap: 0.5rem !important;
    }
    
    /* 3. Refined styling for Tabs */
    .stTabs [data-baseweb="tab-list"] {
        gap: 4px;
        margin-bottom: 0.5rem;
    }
    .stTabs [data-baseweb="tab"] {
        padding: 4px 12px;
        font-size: 14px;
    }

    /* 4. Dropdown styling refinement for compact appearance */
    div[data-baseweb="select"] > div {
        min-height: 32px;
        padding-top: 0;
        padding-bottom: 0;
    }
    
    /* 5. Enhanced button styling for play controls */
    div.stButton > button {
        width: 100%;
        border-radius: 6px;
        padding: 0.25rem 0.5rem;
        line-height: 1.2;
    }
    
    /* 6. Container styling for rating bars */
    .rating-container {
        background-color: #f8f9fa;
        border-radius: 6px;
        padding: 8px;
        margin-top: 0px;
        border: 1px solid #eee;
    }
    
    /* 7. Metric card styling used in leaderboard */
    .metric-card {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        padding: 1rem;
        border-radius: 8px;
        color: white;
        text-align: center;
        margin: 0.5rem 0;
    }

    /* Rank badge styling */
    .rank-badge {
        display: inline-block;
        width: 24px;
        height: 24px;
        line-height: 24px;
        border-radius: 50%;
        text-align: center;
        font-weight: bold;
        font-size: 12px;
        margin-right: 6px;
    }
    .rank-1 { background: linear-gradient(135deg, #FFD700, #FFA500); color: #000; }
    .rank-2 { background: linear-gradient(135deg, #C0C0C0, #808080); color: #000; }
    .rank-3 { background: linear-gradient(135deg, #CD7F32, #8B4513); color: #fff; }
    .rank-other { background: linear-gradient(135deg, #e0e0e0, #bdbdbd); color: #000; }
    
    /* Styling for complete rankings expanders to align with Auto-Judge theme */
    .rankings-section div[data-testid="stExpander"] {
        background: linear-gradient(90deg, rgba(102, 126, 234, 0.15) 0%, rgba(118, 75, 162, 0.15) 100%) !important;
        border: 1px solid rgba(118, 75, 162, 0.3) !important;
        border-radius: 10px !important;
        color: #f0f4ff !important;
    }
    
    /* Larger, centered text for expander headers */
    .rankings-section div[data-testid="stExpander"] summary {
        font-size: 20px !important;
        font-weight: 600 !important;
        padding: 16px 0 !important;
        display: flex !important;
        align-items: center !important;
        justify-content: center !important;
        text-align: center !important;
        line-height: 1.6 !important;
        min-height: 56px !important;
        width: 100%;
        color: #f0f4ff !important;
    }
    
    /* Ensure nested text elements in headers remain centered */
    .rankings-section div[data-testid="stExpander"] summary p,
    .rankings-section div[data-testid="stExpander"] summary span,
    .rankings-section div[data-testid="stExpander"] summary div {
        margin: 0 auto !important;
        display: inline-flex !important;
        align-items: center !important;
        justify-content: center !important;
        line-height: 1.6 !important;
        text-align: center !important;
        color: #f0f4ff !important;
    }
    
    /* Larger metric font inside expanders */
    .rankings-section div[data-testid="stExpander"] [data-testid="stMetricValue"] {
        font-size: 24px !important;
    }
    
    .rankings-section div[data-testid="stExpander"] [data-testid="stMetricLabel"] {
        font-size: 16px !important;
    }

    /* Adjustments for light mode to match leaderboard color scheme */
    body[data-theme="light"] .rankings-section div[data-testid="stExpander"] {
        background: linear-gradient(90deg, rgba(102, 126, 234, 0.1) 0%, rgba(118, 75, 162, 0.1) 100%) !important;
        border: 1px solid rgba(118, 75, 162, 0.25) !important;
        color: #1f1f2d !important;
    }
    
    body[data-theme="light"] .rankings-section div[data-testid="stExpander"] summary,
    body[data-theme="light"] .rankings-section div[data-testid="stExpander"] summary *,
    body[data-theme="light"] .rankings-section div[data-testid="stExpander"] [data-testid="stMetricLabel"],
    body[data-theme="light"] .rankings-section div[data-testid="stExpander"] [data-testid="stMetricValue"] {
        color: #1f1f2d !important;
    }
    
    /* Keyword tag styling - Notion-like */
    .keyword-tag {
        display: inline-block;
        padding: 3px 10px;
        margin: 3px 4px;
        border-radius: 4px;
        font-size: 13px;
        font-weight: 500;
        white-space: nowrap;
        transition: transform 0.2s ease;
    }
    
    .keyword-tag:hover {
        transform: translateY(-1px);
    }
    
    .keywords-container {
        display: inline-flex;
        flex-wrap: wrap;
        align-items: center;
        justify-content: center;
        margin-top: 8px;
        gap: 2px;
    }
    
    @media (max-width: 768px) {
        .keyword-tag {
            font-size: 11px;
            padding: 2px 8px;
            margin: 2px 3px;
        }
    }
</style>
""", unsafe_allow_html=True)

# ===== Data sources and benchmark constants =====
RATINGS_FILE = "ratings.json"
VIDEO_BASE_DIR = "downloaded_videos"
CSV_FILE = "Examples.csv"

# Model identifier used by the app (and as the video sub-directory name)
# -> the key under which that model's ratings are stored in the ratings file.
MODEL_NAME_MAP = {
    "bytedance-seedance-1-pro": "seed-dance",
    "kling-v2-5-turbo-pro": "klingv2.5",
    "minimax-hailuo-2.3": "hailuo2.3",
    "ray-2": "ray-2",
    "sora-2": "sora-2",
    "veo3-quality": "veo3",
    "wan2.5-t2v-preview": "wan2.5",
}

# Selectable models, in the same order as the mapping above
# (dicts preserve insertion order, so positions such as MODELS[4] are stable).
MODELS = list(MODEL_NAME_MAP)

# (key in the ratings data, label shown in the UI) for every rating dimension.
RATING_DIMENSIONS = [
    ("prompt_consistency", "Prompt Consistency"),
    ("expected_phenomenon", "Phenomenon Congruency"),
    ("dynamism", "Correct Dynamism"),
    ("immutability", "Immutability"),
    ("coherence", "Spatio-Temporal Coherence"),
]

# Notion-inspired (background, text) color pairs available to keyword tags.
_TAG_PALETTE = (
    ('#FEE2E2', '#991B1B'),  # Red
    ('#FFEDD5', '#9A3412'),  # Orange
    ('#FEF3C7', '#92400E'),  # Yellow
    ('#D1FAE5', '#065F46'),  # Green
    ('#DBEAFE', '#1E40AF'),  # Blue
    ('#E0E7FF', '#3730A3'),  # Indigo
    ('#F3E8FF', '#6B21A8'),  # Purple
    ('#FCE7F3', '#9F1239'),  # Pink
    ('#E5E7EB', '#374151'),  # Gray
    ('#D1F5FF', '#0369A1'),  # Cyan
)


def generate_tag_color(keyword):
    """Return the ``(background, text)`` hex color pair assigned to ``keyword``.

    The keyword's MD5 digest, read as one big integer, picks the palette slot,
    so the same keyword always maps to the same colors.
    """
    digest = hashlib.md5(keyword.encode()).digest()
    slot = int.from_bytes(digest, "big") % len(_TAG_PALETTE)
    return _TAG_PALETTE[slot]

def render_keywords(keywords_str):
    """Render a comma-separated keyword string as Notion-style HTML tags.

    Args:
        keywords_str: Comma-separated keywords. ``None``, NaN, empty strings
            and strings containing only separators/whitespace are accepted.

    Returns:
        An HTML fragment: one ``<div class='keywords-container'>`` holding a
        ``<span class='keyword-tag'>`` per keyword, or ``""`` when there is
        nothing to render.
    """
    from html import escape  # function-scope import: only this renderer needs it

    if not keywords_str or pd.isna(keywords_str):
        return ""

    keywords = [kw.strip() for kw in str(keywords_str).split(',') if kw.strip()]
    if not keywords:
        return ""

    tags = []
    for keyword in keywords:
        # The color is derived from the raw keyword so it stays stable;
        # only the displayed text is escaped.
        bg_color, text_color = generate_tag_color(keyword)
        # The fragment is rendered with unsafe_allow_html=True, so characters
        # such as '<', '>' and '&' must be escaped or they corrupt the markup.
        tags.append(
            f"""<span class='keyword-tag' style='background-color: {bg_color}; color: {text_color};'>
                {escape(keyword)}
            </span>"""
        )

    return "<div class='keywords-container'>" + "".join(tags) + "</div>"

@st.cache_data
def load_ratings():
    """Load the ratings data from ``RATINGS_FILE``.

    Returns:
        The parsed JSON content. An empty dict is returned when the file does
        not exist (silently) or when it cannot be read or parsed (reported in
        the UI via ``st.error``).
    """
    try:
        # Open directly instead of checking os.path.exists() first: one less
        # filesystem call and no window between the check and the open.
        with open(RATINGS_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        # A missing ratings file is an expected state, not an error.
        return {}
    except (OSError, ValueError) as e:
        # OSError: unreadable file; ValueError: malformed JSON or bad encoding
        # (json.JSONDecodeError and UnicodeDecodeError both derive from it).
        st.error(f"Error loading ratings: {e}")
        return {}

@st.cache_data
def load_csv_data():
    """Load per-example metadata from ``CSV_FILE``, keyed by integer ``Unique ID``.

    Rows without a ``Unique ID`` are skipped. A missing column *or* an empty
    cell falls back to the field's default ('N/A' for prompt and expected
    phenomenon, 'Example <id>' for the topic, '' for keywords).

    Returns:
        dict mapping ``int`` id -> ``{'prompt', 'expected', 'topic', 'keywords'}``.
        An empty dict is returned when the file does not exist, or when loading
        fails (the failure is reported in the UI via ``st.error``).
    """
    def _cell(row, column, default):
        # pandas represents an empty cell as NaN and row.get() returns it as-is,
        # so the default has to be applied explicitly for blank cells.
        value = row.get(column, default)
        return default if pd.isna(value) else value

    try:
        if not os.path.exists(CSV_FILE):
            return {}

        df = pd.read_csv(CSV_FILE, encoding='utf-8-sig')
        data_map = {}
        for _, row in df.iterrows():
            unique_id = row.get('Unique ID')
            if pd.isna(unique_id):
                continue
            # The column may be parsed as float (e.g. 113.0); normalise once and
            # use the integer both as the key and in the default title.
            example_id = int(unique_id)
            data_map[example_id] = {
                'prompt': _cell(row, 'Prompts', 'N/A'),
                'expected': _cell(row, 'Expected phenomenon', 'N/A'),
                'topic': _cell(row, 'Example Title', f'Example {example_id}'),
                'keywords': _cell(row, 'Keywords', ''),
            }
        return data_map
    except Exception as e:
        # Deliberately broad: any read/parse/conversion failure is surfaced in
        # the UI and the app continues without CSV metadata.
        st.error(f"Error loading CSV file: {e}")
        return {}

def get_rating(ratings_data, video_id, model_name, dimension, run_number):
    """Look up one rating, or return ``None`` when any part of the path is missing.

    The lookup path is ``ratings_data[str(video_id)][dimension][<mapped model>]``,
    where the model name is translated through ``MODEL_NAME_MAP``; the result is
    the entry for the 1-based ``run_number`` in that list.
    """
    video_key = str(video_id)
    if video_key not in ratings_data:
        return None

    by_dimension = ratings_data[video_key]
    if dimension not in by_dimension:
        return None

    ratings_model = MODEL_NAME_MAP.get(model_name)
    if not ratings_model:
        return None

    by_model = by_dimension[dimension]
    if ratings_model not in by_model:
        return None

    runs = by_model[ratings_model]
    # run_number is 1-based; anything outside 1..len(runs) has no rating
    if 1 <= run_number <= len(runs):
        return runs[run_number - 1]
    return None

def build_example(topic, prompt, expected, keywords, video_id, model_runs):
    """Assemble the example dict consumed by the Video Comparison tab.

    For every model in ``MODELS`` the video path is
    ``VIDEO_BASE_DIR/<model>/vid_<video_id>_run_<run>.mp4``, where ``<run>`` is
    taken from ``model_runs`` and defaults to 1 for models not listed there.
    """
    def clip_path(model):
        run = model_runs.get(model, 1)
        return os.path.join(VIDEO_BASE_DIR, model, f"vid_{video_id}_run_{run}.mp4")

    return dict(
        topic=topic,
        prompt=prompt,
        expected=expected,
        keywords=keywords,
        video_id=video_id,
        model_runs=model_runs,
        videos={model: clip_path(model) for model in MODELS},
    )

def build_examples(example_specs, csv_data):
    """Turn example specs into full example dicts, enriched with CSV metadata.

    Each spec supplies ``video_id`` and ``model_runs``. Topic, prompt, expected
    phenomenon and keywords come from ``csv_data[video_id]`` when present and
    otherwise fall back to placeholder values.
    """
    def from_spec(spec):
        vid = spec["video_id"]
        runs = spec["model_runs"]
        meta = csv_data.get(vid, {})
        return build_example(
            meta.get('topic', f'Example {vid}'),
            meta.get('prompt', 'N/A'),
            meta.get('expected', 'N/A'),
            meta.get('keywords', ''),
            vid,
            runs,
        )

    return [from_spec(spec) for spec in example_specs]

# Load ratings and CSV metadata through the cached loaders
ratings_data = load_ratings()
csv_data = load_csv_data()

# Showcased examples: for each video id, the run number displayed for each model
example_specs = [
    {
        "video_id": 113,
        "model_runs": {
            "bytedance-seedance-1-pro": 1,
            "kling-v2-5-turbo-pro": 1,
            "minimax-hailuo-2.3": 2,
            "ray-2": 1,
            "sora-2": 1,
            "veo3-quality": 1,
            "wan2.5-t2v-preview": 1,
        },
    },
    {
        "video_id": 143,
        "model_runs": {
            "bytedance-seedance-1-pro": 2,
            "kling-v2-5-turbo-pro": 2,
            "minimax-hailuo-2.3": 3,
            "ray-2": 2,
            "sora-2": 2,
            "veo3-quality": 2,
            "wan2.5-t2v-preview": 2,
        },
    },
    {
        "video_id": 175,
        "model_runs": {
            "bytedance-seedance-1-pro": 1,
            "kling-v2-5-turbo-pro": 1,
            "minimax-hailuo-2.3": 1,
            "ray-2": 1,
            "sora-2": 1,
            "veo3-quality": 1,
            "wan2.5-t2v-preview": 1,
        },
    },
    {
        "video_id": 138,
        "model_runs": {
            "bytedance-seedance-1-pro": 3,
            "kling-v2-5-turbo-pro": 3,
            "minimax-hailuo-2.3": 3,
            "ray-2": 3,
            "sora-2": 2,
            "veo3-quality": 1,
            "wan2.5-t2v-preview": 3,
        },
    },
]

# Placeholder shown when no example could be built, so the UI always has one entry
_NO_DATA_EXAMPLE = {
    "video_id": 0,
    "topic": "No Data",
    "prompt": "No Data",
    "expected": "No Data",
    "keywords": "",
    "videos": {},
    "model_runs": {},
}

examples = build_examples(example_specs, csv_data) or [_NO_DATA_EXAMPLE]
TOTAL_EXAMPLES = len(examples)

# ===== Responsive rating bar rendering =====
def render_rating_bars(ratings_data, video_id, model1, model2, run1, run2):
    """Render a side-by-side rating comparison for two models.

    Emits a header row (left model, "Rating Dimensions", right model) followed
    by one row per entry of ``RATING_DIMENSIONS``. Each row shows the rating of
    ``model1``/``run1`` on the left and ``model2``/``run2`` on the right as a
    block-character bar plus the number. A rating that cannot be found is
    shown as 0 without a bar.
    """
    # Header: scoped CSS for the rating rows plus the model-name line
    st.markdown(
        f"""
        <style>
            .rating-container {{
                font-family: monospace;
                overflow-x: auto;
                -webkit-overflow-scrolling: touch;
                color: var(--text-color);
                background-color: transparent;
            }}
            
            .rating-header {{
                font-size: clamp(12px, 3vw, 20px);
                margin: 12px 0;
                white-space: nowrap;
            }}
            
            .rating-row {{
                font-size: clamp(12px, 3vw, 20px);
                margin: 10px 0;
                line-height: 1.6;
                white-space: nowrap;
            }}
            
            .model-left {{
                color: #FF6B6B;
                display: inline-block;
                width: 28%;
                min-width: 120px;
                text-align: right;
                font-weight: 700;
            }}
            
            .model-right {{
                color: #4ECDC4;
                display: inline-block;
                width: 28%;
                min-width: 120px;
                text-align: left;
                font-weight: 700;
            }}
            
            .dimension-center {{
                display: inline-block;
                width: 42%;
                min-width: 150px;
                text-align: center;
                font-weight: bold;
            }}
            
            .score-left {{
                color: #FF6B6B;
                display: inline-block;
                width: 28%;
                min-width: 120px;
                text-align: right;
                font-weight: 600;
            }}
            
            .score-right {{
                color: #4ECDC4;
                display: inline-block;
                width: 28%;
                min-width: 120px;
                text-align: left;
                font-weight: 600;
            }}
            
            .dim-name {{
                display: inline-block;
                width: 42%;
                min-width: 150px;
                text-align: center;
                font-weight: 700;
            }}
            
            @media (max-width: 768px) {{
                .rating-container {{
                    padding: 0 8px;
                }}
            }}
        </style>
        
        <div class='rating-container'>
            <div class='rating-header'>
                <span class='model-left'>{model1}</span>
                <span class='dimension-center'>Rating Dimensions</span>
                <span class='model-right'>{model2}</span>
            </div>
        </div>
        """,
        unsafe_allow_html=True
    )

    def score_cell(score, bar_first):
        # Non-positive scores are printed as the bare number, with no bar
        if not score > 0:
            return f"{score}"
        bar = "██" * score + "▍"
        return f"{bar} {score}" if bar_first else f"{score} {bar}"

    for dim_key, dim_name in RATING_DIMENSIONS:
        # A missing rating (None) is displayed as 0
        left_score = get_rating(ratings_data, video_id, model1, dim_key, run1) or 0
        right_score = get_rating(ratings_data, video_id, model2, dim_key, run2) or 0

        # The left cell grows toward the number, the right cell away from it
        left_bar = score_cell(left_score, bar_first=True)
        right_bar = score_cell(right_score, bar_first=False)

        st.markdown(
            f"""
            <div class='rating-container'>
                <div class='rating-row'>
                    <span class='score-left'>{left_bar}</span>
                    <span class='dim-name'>{dim_name}</span>
                    <span class='score-right'>{right_bar}</span>
                </div>
            </div>
            """,
            unsafe_allow_html=True
        )

# Seed per-session defaults the first time the script runs for a session
for _state_key, _initial_value in (
    ("example_idx", 0),
    ("model1", MODELS[4]),  # Sora
    ("model2", MODELS[5]),  # Veo
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

# ===== Main layout =====
st.title("⚛️ VideoScience-Bench")

# Tabs appear in the order listed here; the variable numbers follow the
# "TAB n" section headings used further down in this file.
_TAB_LABELS = ["🤖 Auto-Judge Leaderboard", "👥 Human Evaluation", "📹 Video Comparison"]
tab2, tab3, tab1 = st.tabs(_TAB_LABELS)

# ===== TAB 1: Video Comparison =====
with tab1:
    # --- 1. Top navigation: previous button, centered title, next button ---
    nav_prev, nav_title, nav_next = st.columns([1, 10, 1])

    with nav_prev:
        on_first = st.session_state.example_idx == 0
        if st.button("◀", key="prev", use_container_width=True, disabled=on_first):
            st.session_state.example_idx -= 1
            st.rerun()

    with nav_title:
        # Example position, topic and keyword tags, centered
        current = examples[st.session_state.example_idx]
        keywords_html = render_keywords(current.get('keywords', ''))
        position = st.session_state.example_idx + 1

        st.markdown(
            f"""
            <div style='text-align: center; margin-top: -5px;'>
                <span style='font-size: 14px; color: #888;'>EXAMPLE {position} / {TOTAL_EXAMPLES}</span><br>
                <span style='font-size: 18px; font-weight: 700;'>{current['topic']}</span>
                {keywords_html}
            </div>
            """, unsafe_allow_html=True
        )

    with nav_next:
        on_last = st.session_state.example_idx == TOTAL_EXAMPLES - 1
        if st.button("▶", key="next", use_container_width=True, disabled=on_last):
            st.session_state.example_idx += 1
            st.rerun()

    # --- 2. Prompt and expected phenomenon (expanded by default) ---
    with st.expander("📝 View Prompt & Expectation Details", expanded=True):
        prompt_col, expected_col = st.columns(2)
        with prompt_col:
            st.caption("PROMPT")
            st.write(current['prompt'])
        with expected_col:
            st.caption("EXPECTED PHENOMENON")
            st.write(current['expected'])

    # --- 3. Comparison controls: left model, play button, right model ---
    st.markdown("<div style='margin-bottom: 5px;'></div>", unsafe_allow_html=True)

    pick_left, play_slot, pick_right = st.columns([3, 1, 3])

    with pick_left:
        # Label is collapsed to save vertical space
        model1 = st.selectbox(
            "Model Left",
            MODELS,
            index=MODELS.index(st.session_state.model1),
            key="m1_select",
            label_visibility="collapsed",
        )
        st.session_state.model1 = model1

    with play_slot:
        # Starts both videos at once
        play = st.button("▶ Play Both", use_container_width=True, type="primary")

    with pick_right:
        model2 = st.selectbox(
            "Model Right",
            MODELS,
            index=MODELS.index(st.session_state.model2),
            key="m2_select",
            label_visibility="collapsed",
        )
        st.session_state.model2 = model2

    # --- 4. Video playback ---
    left_pane, right_pane = st.columns(2)

    # Rewind and start every <video> element of the parent page via JavaScript
    if play:
        components.html("""
            <script>
            setTimeout(() => {
                const videos = window.parent.document.querySelectorAll('video');
                videos.forEach(v => { v.currentTime = 0; v.play(); });
            }, 100);
            </script>
        """, height=0)

    for pane, chosen_model in ((left_pane, model1), (right_pane, model2)):
        with pane:
            clip = current["videos"].get(chosen_model, "")
            if not os.path.exists(clip):
                st.error(f"❌ Video not found: {clip}")
            else:
                st.video(clip)

    # --- 5. Rating bars for the two selected models ---
    selected_runs = current["model_runs"]
    render_rating_bars(
        ratings_data,
        current['video_id'],
        model1,
        model2,
        selected_runs.get(model1, 1),
        selected_runs.get(model2, 1),
    )

# ===== TAB 2: Auto-Judge Leaderboard (CL+CV) =====
with tab2:
    st.markdown("### 🤖 VideoScience-Judge Leaderboard")
    st.markdown("<small>Scores are computed using an evidence-grounded scheme integrating prompt-specific checklist and CV-based analysis, then averaged across all dimensions and normalized to 1.</small>", unsafe_allow_html=True)

    # Data from Table 2 – CL+CV column
    auto_data = {
        "Model": ["Kling-v3.0", "Sora-2", "Veo-3", "Kling-v2.5", "Wan-2.5", "Seedance-1.0-Pro", "Hailuo-2.3", "Ray2"],
        "Score": [0.78, 0.76, 0.65, 0.59, 0.59, 0.54, 0.50, 0.34]
    }

    # Highest score first; rank is the 1-based position after sorting
    df_auto = (
        pd.DataFrame(auto_data)
        .sort_values("Score", ascending=False)
        .reset_index(drop=True)
    )
    df_auto["Rank"] = range(1, len(df_auto) + 1)

    # rank -> (badge CSS class, medal); every other rank gets the neutral badge
    podium_styles = {
        1: ("rank-1", "🥇"),
        2: ("rank-2", "🥈"),
        3: ("rank-3", "🥉"),
    }

    # One leaderboard card per model
    for _, row in df_auto.iterrows():
        rank, model, score = row["Rank"], row["Model"], row["Score"]
        badge_class, medal = podium_styles.get(rank, ("rank-other", ""))

        # Score is in [0, 1]; the progress bar width is a percentage
        bar_width = score * 100

        st.markdown(f"""
        <div style='background: linear-gradient(90deg, rgba(102,126,234,0.1) 0%, rgba(118,75,162,0.1) 100%); 
                    padding: 12px; border-radius: 8px; margin: 8px 0; border-left: 4px solid #667eea;'>
            <div style='display: flex; align-items: center; justify-content: space-between;'>
                <div style='display: flex; align-items: center; gap: 12px;'>
                    <span class='rank-badge {badge_class}'>{rank}</span>
                    <span style='font-weight: 600; font-size: 16px;'>{medal} {model}</span>
                </div>
                <div style='font-weight: 700; font-size: 20px; color: #667eea;'>{score:.2f}</div>
            </div>
            <div style='width: 100%; height: 6px; background: #e0e0e0; border-radius: 3px; margin-top: 8px; overflow: hidden;'>
                <div style='width: {bar_width}%; height: 100%; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);'></div>
            </div>
        </div>
        """, unsafe_allow_html=True)
    
    
# ===== TAB 3: Human Evaluation =====
@st.fragment
def render_human_rankings_section(df_human):
    """Render the "Complete Rankings" list with expand/collapse-all controls.

    Each row of ``df_human`` becomes one expander titled with rank, medal,
    model name and average score; inside, the five per-dimension scores are
    shown as metrics.

    Args:
        df_human: DataFrame with the columns "Model", "Average", "PCS", "PCG",
            "CDN", "IMB" and "STC". Rows are displayed in the given order and
            that order is used as the ranking.
    """
    medals = ["🥇", "🥈", "🥉"]
    # (score column, label shown on the metric)
    dimensions = [
        ("PCS", "Prompt Consistency"),
        ("PCG", "Phenomenon Congruency"),
        ("CDN", "Correct Dynamism"),
        ("IMB", "Immutability"),
        ("STC", "Spatio-Temporal Coherence"),
    ]

    # 1. Shared flag that sets the expanded state of every expander
    if 'expander_state' not in st.session_state:
        st.session_state['expander_state'] = False

    # 2. Heading plus the expand-all / collapse-all buttons
    col1, col2, col3 = st.columns([6, 1, 1])

    with col1:
        st.markdown("### 📊 Complete Rankings")

    with col2:
        if st.button("➕ Expand All", use_container_width=True, type="secondary"):
            st.session_state['expander_state'] = True
            st.rerun()

    with col3:
        if st.button("➖ Collapse All", use_container_width=True, type="secondary"):
            st.session_state['expander_state'] = False
            st.rerun()

    # NOTE(review): the opening and closing tags are emitted by two separate
    # st.markdown calls — confirm that the `.rankings-section` CSS selectors
    # actually match the expanders rendered in between.
    st.markdown("<div class='rankings-section'>", unsafe_allow_html=True)

    # Rank comes from the row position, so it stays correct even when the
    # frame's index labels are not 0..n-1.
    for rank, (_, row) in enumerate(df_human.iterrows(), start=1):
        model = row["Model"]
        avg = row["Average"]
        medal = medals[rank - 1] if rank <= 3 else ""

        # Expandable ranking entry
        with st.expander(f"**#{rank} {medal} {model}** — Avg: {avg:.2f}", expanded=st.session_state['expander_state']):
            cols = st.columns(5)
            for col, (key, name) in zip(cols, dimensions):
                score = row[key]
                col.metric(name, f"{score:.2f}", delta=None)

    st.markdown("</div>", unsafe_allow_html=True)

with tab3:
    st.markdown("### 👥 Human Evaluation Scores")
    st.markdown("<small>Mean annotator scores from a 1–4 Likert scale.</small>", unsafe_allow_html=True)

    # Human evaluation dataset (Table 1)
    human_data = {
        "Model": ["Sora-2", "Veo-3", "Kling-v2.5", "Wan-2.5", "Seedance-1.0-Pro", "Hailuo-2.3", "Ray2"],
        "PCS": [3.32, 3.01, 2.77, 2.87, 2.56, 2.39, 1.65],
        "PCG": [2.56, 2.35, 1.91, 1.84, 1.78, 1.67, 1.26],
        "CDN": [3.33, 2.83, 2.75, 2.83, 2.52, 2.57, 2.13],
        "IMB": [3.73, 3.30, 3.36, 3.36, 3.15, 3.16, 2.44],
        "STC": [3.71, 3.42, 3.60, 3.46, 3.46, 3.46, 2.92]
    }
    score_columns = ["PCS", "PCG", "CDN", "IMB", "STC"]

    # Rank models by their mean over the five dimensions, best first
    df_human = pd.DataFrame(human_data)
    df_human["Average"] = df_human[score_columns].mean(axis=1)
    df_human = df_human.sort_values("Average", ascending=False).reset_index(drop=True)

    # Radar chart: one trace per model, one axis per dimension
    fig = go.Figure()

    # Axis labels, in the same order as score_columns
    categories = ['Prompt<br>Consistency', 'Phenomenon<br>Congruency', 'Dynamism', 'Immutability', 'Coherence']

    colors = ['#667eea', '#f093fb', '#4facfe', '#43cea2', '#ff9a9e', '#fbc2eb', '#90f7ec']

    # After reset_index the index label equals the row position
    for idx, row in df_human.iterrows():
        trace_color = colors[idx % len(colors)]
        fig.add_trace(go.Scatterpolar(
            r=[row[column] for column in score_columns],
            theta=categories,
            fill='toself',
            name=row["Model"],
            line=dict(color=trace_color, width=2),
            marker=dict(size=8)
        ))

    # Radial axis spans 0-4 to cover the 1-4 rating scale
    polar_axes = dict(
        radialaxis=dict(visible=True, range=[0, 4], tickfont=dict(size=10, color='red')),
        angularaxis=dict(tickfont=dict(size=11))
    )
    legend_below = dict(orientation="h", yanchor="bottom", y=-0.15, xanchor="center", x=0.5)

    fig.update_layout(
        polar=polar_axes,
        showlegend=True,
        height=450,
        margin=dict(l=80, r=80, t=40, b=40),
        legend=legend_below
    )

    st.plotly_chart(fig, use_container_width=True)

    render_human_rankings_section(df_human)