{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading data: 100%|██████████| 4.62k/4.62k [00:01<00:00, 4.14kB/s]\n",
      "Generating test split: 100%|██████████| 8/8 [00:00<00:00, 933.60 examples/s]\n"
     ]
    }
   ],
   "source": [
    "# All imports live in this first cell so a fresh kernel can Run All top to bottom.\n",
    "import datasets\n",
    "import pandas as pd\n",
    "\n",
    "# Dataset repository and configuration name passed to `datasets.load_dataset`.\n",
    "# Change RESULTS_CONFIG to pull a different configuration; the CSV name follows it.\n",
    "RESULTS_REPO = \"lmms-lab/LiveBenchResults\"\n",
    "RESULTS_CONFIG = \"2024-07\"\n",
    "# Local CSV snapshot written and re-read by the cells below.\n",
    "RESULTS_CSV = f\"{RESULTS_CONFIG}.csv\"\n",
    "\n",
    "# Downloads the dataset (network access required on first run).\n",
    "data = datasets.load_dataset(RESULTS_REPO, RESULTS_CONFIG)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the \"test\" split of the loaded dataset to a pandas DataFrame.\n",
    "df = data[\"test\"].to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist a CSV snapshot in the working directory; index=False omits the row index column.\n",
    "df.to_csv(RESULTS_CSV, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload from the CSV snapshot. NOTE: this rebinds `df`, so from here on the frame\n",
    "# holds whatever the CSV round-trip produced rather than the in-memory conversion.\n",
    "df = pd.read_csv(RESULTS_CSV)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
| \n", " | Model Name | \n", "Total | \n", "Concrete Recognition | \n", "Contextual Analysis | \n", "Deeper Implications | \n", "Broader Implications | \n", "Further Insights | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "gpt-4o-mini | \n", "86.240000 | \n", "89.0 | \n", "81.0 | \n", "89.6 | \n", "87.600000 | \n", "84.0 | \n", "
| 1 | \n", "gemini-1.5-flash | \n", "80.760000 | \n", "84.4 | \n", "69.2 | \n", "81.7 | \n", "90.300000 | \n", "78.2 | \n", "
| 2 | \n", "gpt-4o | \n", "91.300000 | \n", "92.2 | \n", "83.4 | \n", "94.1 | \n", "94.400000 | \n", "92.4 | \n", "
| 3 | \n", "gemini-1.5-pro | \n", "86.280000 | \n", "90.6 | \n", "76.6 | \n", "85.6 | \n", "91.600000 | \n", "87.0 | \n", "
| 4 | \n", "llama3-llava-next-8b | \n", "62.650602 | \n", "60.1 | \n", "61.4 | \n", "74.8 | \n", "63.673469 | \n", "53.3 | \n", "
| 5 | \n", "llava-1.5-7b | \n", "41.940000 | \n", "38.6 | \n", "34.5 | \n", "58.8 | \n", "42.800000 | \n", "35.0 | \n", "
| 6 | \n", "Idefics2-8B | \n", "25.860000 | \n", "18.0 | \n", "16.3 | \n", "43.8 | \n", "27.000000 | \n", "24.2 | \n", "
| 7 | \n", "InternVL2-2B | \n", "56.840000 | \n", "65.8 | \n", "49.9 | \n", "64.2 | \n", "55.800000 | \n", "48.5 | \n", "