# Spaces: Runtime error
import gradio as gr
import pandas as pd
# AFDBench: Area Forecast Discussion Benchmark
# Final real data (Phase 2 zero-shot baseline).
# The human reference is the absolute 100% alignment target.
# All other scores represent real zero-shot performance on 7,734 human samples.
#
# Column-oriented leaderboard table: position i of every list describes the
# same model, so the three lists must stay the same length and in the same
# order.
data = {
    "Model": [
        "Human Reference (NWS)",
        "Nous/Hermes-3-Llama-3.1-8B",
        "Qwen/Qwen2.5-7B-Instruct",
        "Microsoft/Phi-3.5-mini",
        "Mistral-7B-Instruct-v0.3",
    ],
    "Met-Align (%)": [100.0, 11.38, 9.89, 7.13, 5.69],
    "Style-Align (0-1)": [1.00, 0.68, 0.52, 0.52, 0.52],
}

# Rank rows by Met-Align, best first. A stable sort keeps models with tied
# scores in the order they are listed above, and ignore_index renumbers the
# rows 0..n-1 so the index follows the ranked order.
df = pd.DataFrame(data).sort_values(
    "Met-Align (%)",
    ascending=False,
    kind="stable",
    ignore_index=True,
)
# Build the leaderboard page: a title, a one-line description, the read-only
# score table, and a short legend explaining the two metric columns.
with gr.Blocks(title="AFDBench") as demo:
    gr.Markdown("# 🌦 AFDBench Leaderboard")
    gr.Markdown("Evaluating AI alignment with professional NWS Forecast Discussions.")
    # `gr.Dataframe` is the spelling used in Gradio's component documentation;
    # interactive=False keeps the published scores read-only for visitors.
    gr.Dataframe(value=df, interactive=False)
    gr.Markdown("---")
    gr.Markdown("**Met-Align**: Physical accuracy vs. Human Meteorologist choices.")
    gr.Markdown("**Style-Align**: Linguistic alignment with NWS professional prose.")

# Start the app only when this file is run as a script, so importing the
# module (e.g. to reuse `demo`) does not launch a server as a side effect.
if __name__ == "__main__":
    demo.launch()