{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build and upload the VideoDetailDescription dataset\n",
    "\n",
    "This notebook shows how to package a dataset in the format expected by the Hugging Face Hub: stored as Parquet and browsable in the Hub's dataset viewer.\n",
    "\n",
    "As a worked example, it reads a folder of caption text files (one per video), pairs every caption with a fixed question, and pushes the result to `lmms-lab/VideoDetailDescription` as the `test` split."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/tiger/miniconda3/envs/llava/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n",
      "100%|██████████| 499/499 [00:18<00:00, 26.87it/s]\n"
     ]
    }
   ],
   "source": [
    "from datasets import Dataset, Features, Value, Image\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import os\n",
    "\n",
    "# Directory of caption text files; each file name (minus its extension) is used as the video name.\n",
    "# Set the VIDEO_DESCRIPTION_ROOT environment variable to run against another copy of the data.\n",
    "description_root = os.environ.get(\n",
    "    \"VIDEO_DESCRIPTION_ROOT\",\n",
    "    \"/mnt/bn/vl-research/workspace/yhzhang/data/llava_video/video_detail_description/Test_Human_Annotated_Captions\",\n",
    ")\n",
    "\n",
    "# Every sample is paired with this same prompt, so it is defined once outside the loop.\n",
    "question = \"Please provide a detailed description of the video, focusing on the main subjects, their actions, and the background scenes\"\n",
    "\n",
    "# Define the features for the dataset (all three columns are plain strings)\n",
    "features = Features(\n",
    "    {\n",
    "        \"video_name\": Value(dtype=\"string\"),\n",
    "        \"question\": Value(dtype=\"string\"),\n",
    "        \"answer\": Value(dtype=\"string\"),\n",
    "    }\n",
    ")\n",
    "\n",
    "# os.listdir returns entries in arbitrary order; sort so the row order is the same on every run.\n",
    "videos = sorted(os.listdir(description_root))\n",
    "\n",
    "records = []\n",
    "for cur_video_name in tqdm(videos):\n",
    "    caption_path = os.path.join(description_root, cur_video_name)\n",
    "    # Skip sub-directories and hidden entries (e.g. .ipynb_checkpoints) that are not caption files.\n",
    "    if cur_video_name.startswith(\".\") or not os.path.isfile(caption_path):\n",
    "        continue\n",
    "\n",
    "    # utf-8-sig drops a leading byte-order mark if the file has one.\n",
    "    with open(caption_path, encoding=\"utf-8-sig\") as f:\n",
    "        # Only the first line is kept as the caption, line terminator included.\n",
    "        # NOTE(review): confirm captions are single-line and whether the trailing newline should be stripped.\n",
    "        description = f.readline()\n",
    "    if not description:\n",
    "        # readline() returns \"\" only for an empty file; fail with the offending path.\n",
    "        raise ValueError(f\"Caption file is empty: {caption_path}\")\n",
    "\n",
    "    records.append(\n",
    "        {\n",
    "            # splitext removes only the final extension, so names that contain dots stay intact.\n",
    "            \"video_name\": os.path.splitext(cur_video_name)[0],\n",
    "            \"question\": question,\n",
    "            \"answer\": description,\n",
    "            # Add other fields as necessary\n",
    "        }\n",
    "    )\n",
    "\n",
    "# Build the frame once from the collected records; explicit columns keep the schema even if no file was found.\n",
    "df_items = pd.DataFrame(records, columns=[\"video_name\", \"question\", \"answer\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
video_namequestionanswer
0v_-6dz6tBH77IPlease provide a detailed description of the v...The video is of a man in athletic clothes stan...
1v_-D1gdv_gQywPlease provide a detailed description of the v...The video begins with a man holding a knife in...
2v_-HpCLXdtcasPlease provide a detailed description of the v...A man is standing behind a barbell placed on t...
3v_-IMXSEIabMMPlease provide a detailed description of the v...The video starts with two people standing behi...
4v_-MbZ-W0AbN0Please provide a detailed description of the v...The video starts with an advertisement for fur...
\n", "
" ],
      "text/plain": [
       " video_name question \\\n",
       "0 v_-6dz6tBH77I Please provide a detailed description of the v... \n",
       "1 v_-D1gdv_gQyw Please provide a detailed description of the v... \n",
       "2 v_-HpCLXdtcas Please provide a detailed description of the v... \n",
       "3 v_-IMXSEIabMM Please provide a detailed description of the v... \n",
       "4 v_-MbZ-W0AbN0 Please provide a detailed description of the v... \n",
       "\n",
       " answer \n",
       "0 The video is of a man in athletic clothes stan... \n",
       "1 The video begins with a man holding a knife in... \n",
       "2 A man is standing behind a barbell placed on t... \n",
       "3 The video starts with two people standing behi... \n",
       "4 The video starts with an advertisement for fur... "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows to sanity-check the three columns before converting.\n",
    "df_items.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the DataFrame into a Hugging Face Dataset using the schema declared in `features`.\n",
    "# preserve_index=False keeps the pandas index out of the dataset, so only the declared columns are written.\n",
    "dataset = Dataset.from_pandas(df_items, features=features, preserve_index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 340.67ba/s]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00, 2.46it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/VideoDetailDescription/commit/ad8e58fa42ad8daf56808724a4bcf4724688194e', commit_message='Upload dataset', commit_description='', oid='ad8e58fa42ad8daf56808724a4bcf4724688194e', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Push the dataset to the Hub as the `test` split; the log below shows it being written as Parquet on upload.\n",
    "hub_dataset_path = \"lmms-lab/VideoDetailDescription\"\n",
    "dataset.push_to_hub(repo_id=hub_dataset_path, split=\"test\")"
   ]
  }
 ],
 "metadata": {
"kernelspec": { "display_name": "lmms-eval", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 2 }