import gradio as gr
import pandas as pd

# AFDBench: Area Forecast Discussion Benchmark
# Final real data (Phase 2 zero-shot baseline).
# The human reference row is the absolute 100% alignment target; every other
# row is real zero-shot performance measured on 7,734 human samples.
data = {
    "Model": [
        "Human Reference (NWS)",
        "Nous/Hermes-3-Llama-3.1-8B",
        "Qwen/Qwen2.5-7B-Instruct",
        "Microsoft/Phi-3.5-mini",
        "Mistral-7B-Instruct-v0.3",
    ],
    "Met-Align (%)": [100.0, 11.38, 9.89, 7.13, 5.69],
    "Style-Align (0-1)": [1.00, 0.68, 0.52, 0.52, 0.52],
}

# Build the leaderboard table, then order it best-first by Met-Align so the
# displayed ranking does not depend on the order rows are listed in above.
df = pd.DataFrame(data)
df = df.sort_values(by="Met-Align (%)", ascending=False)

# Page layout: title, tagline, the read-only table, then a metric legend.
with gr.Blocks(title="AFDBench") as demo:
    gr.Markdown("# 🌦 AFDBench Leaderboard")
    gr.Markdown(
        "Evaluating AI alignment with professional NWS Forecast Discussions."
    )
    gr.DataFrame(value=df, interactive=False)
    gr.Markdown("---")
    gr.Markdown(
        "**Met-Align**: Physical accuracy vs. Human Meteorologist choices."
    )
    gr.Markdown(
        "**Style-Align**: Linguistic alignment with NWS professional prose."
    )

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()