# AFDBench / app.py
# Uploaded by manmeet3591 using huggingface_hub (commit bd00fe6, verified).
import gradio as gr
import pandas as pd
# AFDBench: Area Forecast Discussion Benchmark
# Final Real Data (Phase 2 Zero-Shot Baseline)
# Human Reference is the absolute 100% Alignment target.
# All other scores represent real zero-shot performance on 7,734 human samples.
# Leaderboard records, one tuple per entry, laid out in the same order as
# the column headers below. Keeping a row together makes it harder to
# misalign a model name with its scores.
_COLUMNS = ("Model", "Met-Align (%)", "Style-Align (0-1)")
_ROWS = [
    ("Human Reference (NWS)", 100.0, 1.00),
    ("Nous/Hermes-3-Llama-3.1-8B", 11.38, 0.68),
    ("Qwen/Qwen2.5-7B-Instruct", 9.89, 0.52),
    ("Microsoft/Phi-3.5-mini", 7.13, 0.52),
    ("Mistral-7B-Instruct-v0.3", 5.69, 0.52),
]

# Transpose the records into the column -> list-of-values mapping that the
# DataFrame constructor consumes.
data = {
    column: list(values)
    for column, values in zip(_COLUMNS, zip(*_ROWS))
}

# Rank entries from highest to lowest Met-Align score for display.
df = pd.DataFrame(data).sort_values(by="Met-Align (%)", ascending=False)
# Markdown snippets rendered above and below the score table, in display
# order. The footer starts with a horizontal rule and then gives a one-line
# definition for each metric column.
_INTRO_MARKDOWN = (
    "# 🌦 AFDBench Leaderboard",
    "Evaluating AI alignment with professional NWS Forecast Discussions.",
)
_FOOTER_MARKDOWN = (
    "---",
    "**Met-Align**: Physical accuracy vs. Human Meteorologist choices.",
    "**Style-Align**: Linguistic alignment with NWS professional prose.",
)

# Build the page: intro text, the leaderboard table, then the metric legend.
with gr.Blocks(title="AFDBench") as demo:
    for _snippet in _INTRO_MARKDOWN:
        gr.Markdown(_snippet)

    # The table is passed with interactive=False so it is display-only.
    gr.DataFrame(value=df, interactive=False)

    for _snippet in _FOOTER_MARKDOWN:
        gr.Markdown(_snippet)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()