{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Generating test split: 100%|██████████| 4/4 [00:00<00:00, 860.15 examples/s]\n" ] } ], "source": [ "import datasets\n", "\n", "data = datasets.load_dataset(\"lmms-lab/LiveBenchResults\", \"2024-09\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "df = pd.DataFrame(data[\"test\"])\n", "df = df.drop(columns=\"__index_level_0__\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | Model Name | \n", "Total | \n", "Concrete Recognition | \n", "Analytical Questions | \n", "Divergent Thinking | \n", "Real-world Assistance | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "LLaVA-1.5-7B | \n", "30.15000 | \n", "9.400 | \n", "36.4 | \n", "45.4 | \n", "29.400 | \n", "
| 1 | \n", "GPT-4o-mini | \n", "91.90475 | \n", "94.644 | \n", "93.4 | \n", "95.3 | \n", "84.275 | \n", "
| 2 | \n", "LLaVA-OneVision-0.5B | \n", "32.36300 | \n", "25.052 | \n", "33.6 | \n", "40.2 | \n", "30.600 | \n", "
| 3 | \n", "LLaVA-OneVision-7B | \n", "64.85775 | \n", "57.206 | \n", "67.0 | \n", "76.2 | \n", "59.025 | \n", "