File size: 1,142 Bytes
69e0075
 
 
bd00fe6
 
 
 
 
69e0075
 
 
 
 
 
bd00fe6
69e0075
 
 
bd00fe6
69e0075
 
 
 
bd00fe6
 
 
69e0075
 
 
 
bd00fe6
 
69e0075
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import gradio as gr
import pandas as pd

# AFDBench: Area Forecast Discussion Benchmark
# Final Real Data (Phase 2 Zero-Shot Baseline)

# Human Reference is the absolute 100% Alignment target.
# All other scores represent real zero-shot performance on 7,734 human samples.

# Leaderboard entries, one tuple per model:
# (model name, Met-Align percentage, Style-Align score).
_LEADERBOARD_ROWS = [
    ("Human Reference (NWS)", 100.0, 1.00),
    ("Nous/Hermes-3-Llama-3.1-8B", 11.38, 0.68),
    ("Qwen/Qwen2.5-7B-Instruct", 9.89, 0.52),
    ("Microsoft/Phi-3.5-mini", 7.13, 0.52),
    ("Mistral-7B-Instruct-v0.3", 5.69, 0.52),
]
_COLUMN_NAMES = ("Model", "Met-Align (%)", "Style-Align (0-1)")

# Pivot the row tuples into a column-name -> list-of-values mapping,
# which is the shape the DataFrame constructor is given below.
data = {
    column: list(values)
    for column, values in zip(_COLUMN_NAMES, zip(*_LEADERBOARD_ROWS))
}

# Rank the table by Met-Align, highest score first.
_unranked = pd.DataFrame(data)
df = _unranked.sort_values(by="Met-Align (%)", ascending=False)

# Assemble the leaderboard page: a title and tagline, the read-only
# results table, then a divider followed by the metric descriptions.
with gr.Blocks(title="AFDBench") as demo:
    for heading_text in (
        "# 🌦 AFDBench Leaderboard",
        "Evaluating AI alignment with professional NWS Forecast Discussions.",
    ):
        gr.Markdown(heading_text)

    # interactive=False keeps the table display-only.
    gr.DataFrame(value=df, interactive=False)

    for footnote_text in (
        "---",
        "**Met-Align**: Physical accuracy vs. Human Meteorologist choices.",
        "**Style-Align**: Linguistic alignment with NWS professional prose.",
    ):
        gr.Markdown(footnote_text)

# Launch the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    demo.launch()