{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This notebook shows how to build a Hugging Face dataset in the proper parquet format,\n",
    "# so that it can be previewed in the Hugging Face dataset hub viewer.\n",
    "# The example below turns a folder of per-video caption text files into a\n",
    "# (video_name, question, answer) table.\n",
    "# NOTE(review): this header originally named \"Otter-AI/MMVet\" as the example dataset, but the\n",
    "# code below reads video description files instead - confirm the intended target repo."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/tiger/miniconda3/envs/llava/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n",
      "100%|██████████| 499/499 [00:18<00:00, 26.87it/s]\n"
     ]
    }
   ],
   "source": [
    "from datasets import Dataset, Features, Value, Image\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import os\n",
    "\n",
    "# Schema of the dataset: every column is a plain string.\n",
    "features = Features(\n",
    "    {\n",
    "        \"video_name\": Value(dtype=\"string\"),\n",
    "        \"question\": Value(dtype=\"string\"),\n",
    "        \"answer\": Value(dtype=\"string\"),\n",
    "    }\n",
    ")\n",
    "\n",
    "# Directory that holds one description text file per video.\n",
    "description_root = \"/mnt/bn/vl-research/workspace/yhzhang/data/llava_video/video_detail_description/Test_Human_Annotated_Captions\"\n",
    "\n",
    "# Every row shares the same prompt, so it is built once outside the loop.\n",
    "question = \"Please provide a detailed description of the video, focusing on the main subjects, their actions, and the background scenes\"\n",
    "\n",
    "# os.listdir() returns entries in arbitrary order: sort them so the row order is reproducible,\n",
    "# and keep only regular files so that stray sub-directories do not crash open().\n",
    "videos = sorted(\n",
    "    name for name in os.listdir(description_root) if os.path.isfile(os.path.join(description_root, name))\n",
    ")\n",
    "\n",
    "records = []\n",
    "for cur_video_name in tqdm(videos):\n",
    "    # Drop only the final extension, so a name that itself contains dots is not truncated.\n",
    "    video_name = os.path.splitext(cur_video_name)[0]\n",
    "    # \"utf-8-sig\" decodes UTF-8 and skips a leading byte-order mark if the file has one.\n",
    "    with open(os.path.join(description_root, cur_video_name), encoding=\"utf-8-sig\") as f:\n",
    "        # Only the first line of the file is used as the answer; it is stored verbatim.\n",
    "        description = f.readline()\n",
    "    if not description:\n",
    "        # Fail with the offending file name instead of an anonymous IndexError.\n",
    "        raise ValueError(f\"Empty description file: {cur_video_name}\")\n",
    "    records.append({\"video_name\": video_name, \"question\": question, \"answer\": description})\n",
    "    # Add other fields as necessary\n",
    "\n",
    "# Naming the columns explicitly keeps all three of them even when no file was found.\n",
    "df_items = pd.DataFrame(records, columns=[\"video_name\", \"question\", \"answer\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
| \n", " | video_name | \n", "question | \n", "answer | \n", "
|---|---|---|---|
| 0 | \n", "v_-6dz6tBH77I | \n", "Please provide a detailed description of the v... | \n", "The video is of a man in athletic clothes stan... | \n", "
| 1 | \n", "v_-D1gdv_gQyw | \n", "Please provide a detailed description of the v... | \n", "The video begins with a man holding a knife in... | \n", "
| 2 | \n", "v_-HpCLXdtcas | \n", "Please provide a detailed description of the v... | \n", "A man is standing behind a barbell placed on t... | \n", "
| 3 | \n", "v_-IMXSEIabMM | \n", "Please provide a detailed description of the v... | \n", "The video starts with two people standing behi... | \n", "
| 4 | \n", "v_-MbZ-W0AbN0 | \n", "Please provide a detailed description of the v... | \n", "The video starts with an advertisement for fur... | \n", "