File size: 8,473 Bytes

55500d6

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tool\n",
    "import json\n",
    "import os\n",
    "import random\n",
    "import shutil\n",
    "from tqdm import tqdm\n",
    "import re\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "def load_jsonl(path):\n",
    "    \"\"\"Read a JSON Lines file and return its records as a list.\n",
    "\n",
    "    Args:\n",
    "        path: Path of the file; every non-blank line must hold one JSON value.\n",
    "\n",
    "    Returns:\n",
    "        list: The decoded values, in file order.\n",
    "\n",
    "    Raises:\n",
    "        json.JSONDecodeError: If a non-blank line is not valid JSON.\n",
    "    \"\"\"\n",
    "    # JSON text is UTF-8 by specification, so do not depend on the locale default.\n",
    "    with open(path, 'r', encoding='utf-8') as file:\n",
    "        # Blank lines (for example a trailing empty line) carry no record; skip them\n",
    "        # rather than letting json.loads fail on an empty string.\n",
    "        return [json.loads(line) for line in file if line.strip()]\n",
    "\n",
    "def load_jsonl_fromdir(res_dir):\n",
    "    \"\"\"Load every entry of a directory with ``load_jsonl`` and merge the results.\n",
    "\n",
    "    Entries of ``res_dir`` are taken in sorted name order; the records of each\n",
    "    file are appended to one list in that order, and that list is returned.\n",
    "    \"\"\"\n",
    "    file_paths = []\n",
    "    for entry in sorted(os.listdir(res_dir)):\n",
    "        file_paths.append(os.path.join(res_dir, entry))\n",
    "\n",
    "    merged = []\n",
    "    for file_path in file_paths:\n",
    "        merged.extend(load_jsonl(file_path))\n",
    "    return merged\n",
    "\n",
    "def load_json(path):\n",
    "    \"\"\"Return the decoded contents of the JSON file at ``path``.\"\"\"\n",
    "    with open(path, 'r') as handle:\n",
    "        return json.load(handle)\n",
    "\n",
    "def save_json(datas, path, indent=4):\n",
    "    \"\"\"Write ``datas`` to ``path`` as JSON, replacing any existing file.\n",
    "\n",
    "    ``indent`` is forwarded to ``json.dump`` unchanged.\n",
    "    \"\"\"\n",
    "    with open(path, 'w') as sink:\n",
    "        json.dump(obj=datas, fp=sink, indent=indent)\n",
    "\n",
    "def parse(generated_text):\n",
    "    \"\"\"Decode a generated reply that should hold one JSON or Python-literal value.\n",
    "\n",
    "    A surrounding ```json ... ``` fence is removed first. The text is then decoded\n",
    "    as strict JSON; if that fails, as a Python literal (this accepts single-quoted\n",
    "    dicts); if that fails too, the quotes around the 'Q' / 'A' fields are rewritten\n",
    "    to double quotes and the literal decode is retried once.\n",
    "\n",
    "    Args:\n",
    "        generated_text: Raw text to decode.\n",
    "\n",
    "    Returns:\n",
    "        The decoded Python object.\n",
    "\n",
    "    Raises:\n",
    "        ValueError, SyntaxError: If the text cannot be decoded even after the repair.\n",
    "    \"\"\"\n",
    "    import ast  # local import: only this helper needs it\n",
    "\n",
    "    generated_text = generated_text.strip()\n",
    "    if \"```json\" in generated_text:\n",
    "        generated_text = re.sub(r\"^```json\\s*|\\s*```$\", \"\", generated_text)\n",
    "\n",
    "    # Strict JSON first, so that true / false / null are understood.\n",
    "    try:\n",
    "        return json.loads(generated_text)\n",
    "    except ValueError:\n",
    "        pass\n",
    "\n",
    "    # SECURITY: the text is generated content, so it must never reach eval().\n",
    "    # ast.literal_eval only builds literals (dict, list, str, numbers, ...) and\n",
    "    # rejects calls, names and attribute access.\n",
    "    try:\n",
    "        return ast.literal_eval(generated_text)\n",
    "    except (ValueError, SyntaxError, TypeError):\n",
    "        # Typical failure: an apostrophe inside a single-quoted value. Switch the\n",
    "        # delimiters of the Q / A fields to double quotes and try once more.\n",
    "        repaired = generated_text.replace('\\'Q\\': \\'', \"\\\"Q\\\": \\\"\").replace('\\', \\'A\\': \\'', \"\\\", \\\"A\\\": \\\"\").replace('\\'}', \"\\\"}\")\n",
    "        return ast.literal_eval(repaired)\n",
    "\n",
    "def formating_conversations(data):\n",
    "    \"\"\"Build a two-turn conversation from one multiple-choice QA record.\n",
    "\n",
    "    Reads ``data['Q']``, ``data['Options']`` and ``data['Answer']``. The human turn\n",
    "    is the ``<image>`` tag on its own line, then the question, then the options\n",
    "    joined one per line; the gpt turn is the answer unchanged.\n",
    "    \"\"\"\n",
    "    question, options, answer = data['Q'], data['Options'], data['Answer']\n",
    "\n",
    "    human_turn = {\n",
    "        \"from\": \"human\",\n",
    "        \"value\": '<image>\\n' + question + '\\n' + '\\n'.join(options),\n",
    "    }\n",
    "    gpt_turn = {\"from\": \"gpt\", \"value\": answer}\n",
    "\n",
    "    return [human_turn, gpt_turn]\n",
    "\n",
    "def time_to_seconds(time_str):\n",
    "    \"\"\"Convert an ``H:MM:SS`` or ``H:MM:SS.fff`` timestamp to seconds.\n",
    "\n",
    "    Args:\n",
    "        time_str: Timestamp such as ``'0:01:04.760'``. The fractional part is\n",
    "            optional and may have any number of digits.\n",
    "\n",
    "    Returns:\n",
    "        float: Total number of seconds.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If a field is not numeric.\n",
    "        IndexError: If fewer than three ``:``-separated fields are present.\n",
    "    \"\"\"\n",
    "    # Separate the clock part from the optional fractional digits.\n",
    "    parts = time_str.split('.')\n",
    "    clock = parts[0]\n",
    "    digits = parts[1].strip() if len(parts) == 2 else ''\n",
    "\n",
    "    # Scale the fraction by its own digit count: '.5' is 0.5 s and '.36' is 0.36 s.\n",
    "    # Reading the digits as a millisecond count is only right for exactly 3 digits.\n",
    "    fraction = int(digits) / 10 ** len(digits) if digits else 0.0\n",
    "\n",
    "    # Split the clock part by colon to get hours, minutes and whole seconds.\n",
    "    fields = clock.split(':')\n",
    "    hours = int(fields[0])\n",
    "    minutes = int(fields[1])\n",
    "    seconds = float(fields[2])\n",
    "\n",
    "    return hours * 3600 + minutes * 60 + seconds + fraction\n",
    "\n",
    "def get_datas_from_df(df_path):\n",
    "    \"\"\"Read the CSV at ``df_path`` and return its rows as a list of dicts.\n",
    "\n",
    "    Each dict maps a column name to that row's value.\n",
    "    \"\"\"\n",
    "    return pd.read_csv(df_path).to_dict(orient='records')\n",
    "\n",
    "def list_2_dict(datas, key='video_id'):\n",
    "    \"\"\"Group records by the value stored under ``key``.\n",
    "\n",
    "    Args:\n",
    "        datas: Iterable of dict-like records.\n",
    "        key: Name of the field to group on. Defaults to ``'video_id'``.\n",
    "\n",
    "    Returns:\n",
    "        dict: Maps each distinct ``data[key]`` value to the list of records that\n",
    "        carry it, in input order.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If a record has no ``key`` field.\n",
    "    \"\"\"\n",
    "    datas_dict = {}\n",
    "    for data in tqdm(datas, desc='list_2_dict'):\n",
    "        # Group on the caller-supplied ``key`` rather than a fixed field name, so\n",
    "        # records keyed differently (e.g. 'videoID') can be grouped as well.\n",
    "        datas_dict.setdefault(data[key], []).append(data)\n",
    "\n",
    "    return datas_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'get_datas_from_df' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m/share/minghao/VideoProjects/Sythesis2/LongCaption/tmp.ipynb Cell 2\u001b[0m line \u001b[0;36m4\n\u001b[1;32m      <a href='vscode-notebook-cell://dsw-gateway-cn-wulanchabu.data.aliyun.com/share/minghao/VideoProjects/Sythesis2/LongCaption/tmp.ipynb#W1sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mpd\u001b[39;00m\n\u001b[1;32m      <a href='vscode-notebook-cell://dsw-gateway-cn-wulanchabu.data.aliyun.com/share/minghao/VideoProjects/Sythesis2/LongCaption/tmp.ipynb#W1sdnNjb2RlLXJlbW90ZQ%3D%3D?line=2'>3</a>\u001b[0m path \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m/share_2/minghao/Datasets/Panda70M/panda70m_training_full.csv\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m----> <a href='vscode-notebook-cell://dsw-gateway-cn-wulanchabu.data.aliyun.com/share/minghao/VideoProjects/Sythesis2/LongCaption/tmp.ipynb#W1sdnNjb2RlLXJlbW90ZQ%3D%3D?line=3'>4</a>\u001b[0m panda_70M_datas \u001b[39m=\u001b[39m get_datas_from_df(path)\n",
      "\u001b[0;31mNameError\u001b[0m: name 'get_datas_from_df' is not defined"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the CSV at `path` through get_datas_from_df.\n",
    "# NOTE(review): get_datas_from_df is defined in the helper cell at the top of the\n",
    "# notebook; the NameError saved with this cell suggests it was last run before\n",
    "# that cell - rerun the notebook top to bottom.\n",
    "path = '/share_2/minghao/Datasets/Panda70M/panda70m_training_full.csv'\n",
    "panda_70M_datas = get_datas_from_df(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "size: 50000\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/50000 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'videoID': 'QLfFIVI-Ows', 'url': 'https://www.youtube.com/watch?v=QLfFIVI-Ows', 'timestamp': \"[['0:00:09.000', '0:00:17.360'], ['0:00:21.400', '0:00:32.000'], ['0:00:43.480', '0:00:46.680'], ['0:00:48.040', '0:00:56.000'], ['0:00:57.800', '0:01:04.760'], ['0:01:39.480', '0:01:42.760'], ['0:01:43.440', '0:01:45.880'], ['0:02:43.480', '0:02:48.680'], ['0:03:37.800', '0:03:53.480']]\", 'caption': \"['The guitar player is sitting in front of a microphone and playing an electric guitar.', 'The guitar player is sitting on a stool in front of a wall with a black background, holding an electric guitar and wearing a black shirt.', 'A carbon copy pedal is sitting on a table next to a guitar.', 'A man sitting on a chair with a guitar in front of him.', 'A finger pressing a button on a guitar pedal.', 'A person pressing a button on a guitar amplifier.', 'A person is using a guitar amplifier and plugging in a cable.', 'A person is using a guitar amplifier and plugging it in.', 'The guitarist is playing an electric guitar and sitting on a chair in front of a microphone.']\", 'matching_score': '[0.462158203125, 0.4716796875, 0.44482421875, 0.45166015625, 0.47705078125, 0.48193359375, 0.482421875, 0.482421875, 0.449951171875]', 'desirable_filtering': \"['desirable', 'desirable', '2_tiny_camera_movement', 'desirable', 'desirable', 'desirable', 'desirable', 'desirable', 'desirable']\", 'shot_boundary_detection': \"[[['0:00:00.000', '0:00:08.320']], [['0:00:00.000', '0:00:10.560']], [['0:00:00.000', '0:00:03.160']], [['0:00:00.000', '0:00:07.920']], [['0:00:00.000', '0:00:06.920']], [['0:00:00.000', '0:00:03.240']], [['0:00:00.000', '0:00:02.400']], [['0:00:00.000', '0:00:05.160']], [['0:00:00.000', '0:00:15.640']]]\", 'video_path': '/share_2/minghao/Datasets/Panda70M/0_5min_50k/00036169.mp4'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Load the records of one JSONL file and report how many there are.\n",
    "# Note: this rebinds `path`, which an earlier cell also assigns.\n",
    "path = '/share/minghao/VideoProjects/Sythesis/Ordering/Task1/Candidates/0_5min.jsonl'\n",
    "datas = load_jsonl(path)\n",
    "print(f'size: {len(datas)}')\n",
    "\n",
    "# Peek at the first record only: the loop breaks after one iteration.\n",
    "for data in tqdm(datas):\n",
    "    print(data)\n",
    "    break"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}