import gradio as gr
import pandas as pd

# AFDBench: Area Forecast Discussion Benchmark
# Final Real Data (Phase 2 Zero-Shot Baseline)

# Human Reference is the absolute 100% Alignment target.
# All other scores represent real zero-shot performance on 7,734 human samples.

# Leaderboard entries, one tuple per row, in the same field order as _COLUMNS.
_COLUMNS = ("Model", "Met-Align (%)", "Style-Align (0-1)")
_ROWS = (
    ("Human Reference (NWS)", 100.0, 1.00),
    ("Nous/Hermes-3-Llama-3.1-8B", 11.38, 0.68),
    ("Qwen/Qwen2.5-7B-Instruct", 9.89, 0.52),
    ("Microsoft/Phi-3.5-mini", 7.13, 0.52),
    ("Mistral-7B-Instruct-v0.3", 5.69, 0.52),
)

# Pivot the row tuples into the column -> list-of-values mapping that
# pd.DataFrame consumes.
data = {
    column: [row[position] for row in _ROWS]
    for position, column in enumerate(_COLUMNS)
}

# Order the table so the highest Met-Align score comes first.
_unsorted = pd.DataFrame(data)
df = _unsorted.sort_values(by="Met-Align (%)", ascending=False)

# Text shown beneath the table: a horizontal rule followed by one line per
# metric column explaining what it measures. Each entry becomes its own
# Markdown component, in this order.
_LEGEND = (
    "---",
    "**Met-Align**: Physical accuracy vs. Human Meteorologist choices.",
    "**Style-Align**: Linguistic alignment with NWS professional prose.",
)

# Page layout: title, one-line description, read-only results table, legend.
with gr.Blocks(title="AFDBench") as demo:
    gr.Markdown(value="# 🌦 AFDBench Leaderboard")
    gr.Markdown(
        value="Evaluating AI alignment with professional NWS Forecast Discussions."
    )

    # The table is display-only; visitors cannot edit the scores.
    gr.DataFrame(value=df, interactive=False)

    for _legend_line in _LEGEND:
        gr.Markdown(_legend_line)

# Launch the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    demo.launch()