{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Downloading data: 100%|██████████| 4.62k/4.62k [00:01<00:00, 4.14kB/s]\n", "Generating test split: 100%|██████████| 8/8 [00:00<00:00, 933.60 examples/s]\n" ] } ], "source": [ "# All imports live in this first cell so the notebook runs top to bottom on a fresh kernel.\n", "import datasets\n", "import pandas as pd\n", "\n", "# Load the \"2024-07\" configuration of the LiveBenchResults dataset from the Hub.\n", "data = datasets.load_dataset(\"lmms-lab/LiveBenchResults\", \"2024-07\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Convert the \"test\" split to a pandas DataFrame.\n", "df = data[\"test\"].to_pandas()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Write the table to a local CSV file without the index column.\n", "df.to_csv(\"2024-07.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# NOTE(review): this reads back the file written by the previous cell; presumably the CSV\n", "# is edited by hand in between -- confirm, otherwise the round trip can be dropped.\n", "df = pd.read_csv(\"2024-07.csv\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Model NameTotalConcrete RecognitionContextual AnalysisDeeper ImplicationsBroader ImplicationsFurther Insights
0gpt-4o-mini86.24000089.081.089.687.60000084.0
1gemini-1.5-flash80.76000084.469.281.790.30000078.2
2gpt-4o91.30000092.283.494.194.40000092.4
3gemini-1.5-pro86.28000090.676.685.691.60000087.0
4llama3-llava-next-8b62.65060260.161.474.863.67346953.3
5llava-1.5-7b41.94000038.634.558.842.80000035.0
6Idefics2-8B25.86000018.016.343.827.00000024.2
7InternVL2-2B56.84000065.849.964.255.80000048.5
\n", "
" ], "text/plain": [ " Model Name Total Concrete Recognition Contextual Analysis \\\n", "0 gpt-4o-mini 86.240000 89.0 81.0 \n", "1 gemini-1.5-flash 80.760000 84.4 69.2 \n", "2 gpt-4o 91.300000 92.2 83.4 \n", "3 gemini-1.5-pro 86.280000 90.6 76.6 \n", "4 llama3-llava-next-8b 62.650602 60.1 61.4 \n", "5 llava-1.5-7b 41.940000 38.6 34.5 \n", "6 Idefics2-8B 25.860000 18.0 16.3 \n", "7 InternVL2-2B 56.840000 65.8 49.9 \n", "\n", " Deeper Implications Broader Implications Further Insights \n", "0 89.6 87.600000 84.0 \n", "1 81.7 90.300000 78.2 \n", "2 94.1 94.400000 92.4 \n", "3 85.6 91.600000 87.0 \n", "4 74.8 63.673469 53.3 \n", "5 58.8 42.800000 35.0 \n", "6 43.8 27.000000 24.2 \n", "7 64.2 55.800000 48.5 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Convert the DataFrame into a datasets.Dataset so it can be uploaded.\n", "summary_dataset = datasets.Dataset.from_pandas(df)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 473.45ba/s]\n", "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.17s/it]\n" ] }, { "data": { "text/plain": [ "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBenchResults/commit/a29f8ecb399dbd7ab7475f0de2c48ee54affbff9', commit_message='Upload dataset', commit_description='', oid='a29f8ecb399dbd7ab7475f0de2c48ee54affbff9', pr_url=None, pr_revision=None, pr_num=None)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Upload it as the \"test\" split of the \"2024-07\" configuration.\n", "summary_dataset.push_to_hub(\"lmms-lab/LiveBenchResults\", config_name=\"2024-07\", split=\"test\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# Load the \"2024-07\" configuration of the detailed results dataset.\n", "data = datasets.load_dataset(\"lmms-lab/LiveBenchDetailedResults\", name=\"2024-07\")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [],
"source": [ "# Make the \"idefics2\" entry available under the key \"Idefics2_8B\" as well.\n", "data[\"Idefics2_8B\"] = data[\"idefics2\"]" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " gpt_4o_mini: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " gemini_1.5_flash: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " gpt_4o: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " gemini_1.5_pro: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " llama3_llava_next_8b: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " llava_1.5_7b: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " idefics2: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " InternVL2_2B: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " Idefics2_8B: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", "})" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] },
{ "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# Rebuild the DatasetDict without the old \"idefics2\" key; the same entry is already\n", "# stored under \"Idefics2_8B\", and the original key order is kept.\n", "kept_splits = {name: split for name, split in data.items() if name != \"idefics2\"}\n", "data = datasets.DatasetDict(kept_splits)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " gpt_4o_mini: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " gemini_1.5_flash: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " gpt_4o: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " gemini_1.5_pro: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " llama3_llava_next_8b: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " llava_1.5_7b: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " InternVL2_2B: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", " Idefics2_8B: Dataset({\n", " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", " num_rows: 250\n", " })\n", "})" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 250/250 [00:00<00:00, 347.35 examples/s]it/s]\n", "Creating parquet from Arrow 
format: 100%|██████████| 3/3 [00:00<00:00, 12.58ba/s]\n", "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.63s/it]\n", "Map: 100%|██████████| 250/250 [00:00<00:00, 363.40 examples/s]it/s]\n", "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.70ba/s]\n", "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.59s/it]\n", "Map: 100%|██████████| 250/250 [00:00<00:00, 472.60 examples/s]it/s]\n", "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.62ba/s]\n", "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.43s/it]\n", "Map: 100%|██████████| 250/250 [00:00<00:00, 352.11 examples/s]it/s]\n", "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 11.55ba/s]\n", "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.63s/it]\n", "Map: 100%|██████████| 250/250 [00:00<00:00, 475.90 examples/s]it/s]\n", "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 11.38ba/s]\n", "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.46s/it]\n", "Map: 100%|██████████| 250/250 [00:00<00:00, 364.89 examples/s]it/s]\n", "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 10.94ba/s]\n", "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.59s/it]\n", "Map: 100%|██████████| 250/250 [00:00<00:00, 529.96 examples/s]it/s]\n", "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 13.51ba/s]\n", "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.33s/it]\n", "Map: 100%|██████████| 250/250 [00:00<00:00, 349.67 examples/s]it/s]\n", "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.74ba/s]\n", "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.57s/it]\n" ] }, { "data": { "text/plain": [ "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBenchDetailedResults/commit/047d6dc66759e0a8b57b4e6015db6208da1cd4da', 
commit_message='Upload dataset', commit_description='', oid='047d6dc66759e0a8b57b4e6015db6208da1cd4da', pr_url=None, pr_revision=None, pr_num=None)" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Upload the DatasetDict to the Hub under the \"2024-07\" configuration.\n", "data.push_to_hub(\"lmms-lab/LiveBenchDetailedResults\", config_name=\"2024-07\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "live_bench", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 2 }