{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ReactionGIF data preparation\n",
    "\n",
    "Builds the `train.json` / `val.json` JSON Lines files from `ReactionGIF.json`, pairing the first frame of each clip with the record's `text` and `label`, then copies the validation frames into the demo image folder."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "source": [
    "import os\n",
    "import shutil\n",
    "from pathlib import Path\n",
    "\n",
    "import cv2\n",
    "import jax\n",
    "import jsonlines"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Settings a reader might tune. The directory defaults are the original\n",
    "# machine-specific locations; set the environment variables to relocate them.\n",
    "DATASET_FILE = './ReactionGIF.json'\n",
    "GIF_DIR = './gifs'\n",
    "IMG_DIR = os.environ.get('REACTIONGIF_IMG_DIR', '/home/ceyda/data/ReactionGIF/imgs/')\n",
    "DEMO_IMG_DIR = os.environ.get('REACTIONGIF_DEMO_IMG_DIR', '/home/ceyda/code/clip-reply-demo/imgs/')\n",
    "\n",
    "# Records are written to train.json until the number of kept records reaches\n",
    "# this value; that record and every later one go to val.json.\n",
    "VAL_START = 19000\n",
    "\n",
    "# Set to True to create missing frames from GIF_DIR while building the split.\n",
    "EXTRACT_FRAMES = False"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Frame extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "source": [
    "def extract_frame(mp4, file_name, out_dir='./imgs/'):\n",
    "    \"\"\"Save the first frame of a video file as a JPEG image.\n",
    "\n",
    "    Args:\n",
    "        mp4: Path of the video file to read.\n",
    "        file_name: Name of the video file; its extension is replaced by\n",
    "            '.jpg' to form the output file name.\n",
    "        out_dir: Directory the image is written to; created when missing.\n",
    "\n",
    "    Returns:\n",
    "        None. Nothing is done when the image already exists, and a failed\n",
    "        read or write is reported with print() instead of being raised.\n",
    "    \"\"\"\n",
    "    os.makedirs(out_dir, exist_ok=True)\n",
    "    save_path = os.path.join(out_dir, os.path.splitext(file_name)[0] + '.jpg')\n",
    "    if os.path.exists(save_path):\n",
    "        return\n",
    "\n",
    "    cap = None\n",
    "    try:\n",
    "        cap = cv2.VideoCapture(mp4)\n",
    "        ret, frame = cap.read()\n",
    "        # read() reports an unreadable video through ret and imwrite() a\n",
    "        # failed write through its return value, so check both explicitly.\n",
    "        if not ret or not cv2.imwrite(save_path, frame):\n",
    "            print(f\"error in {save_path} file\")\n",
    "    except Exception:\n",
    "        # Best effort: one broken clip must not stop a batch run.\n",
    "        print(f\"error in {save_path} file\")\n",
    "    finally:\n",
    "        # Always free the decoder handle, even after a failure.\n",
    "        if cap is not None:\n",
    "            cap.release()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train / validation split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "source": [
    "# Write one record per dataset entry whose frame exists on disk.\n",
    "im_un = []  # path of every frame written to either file, in write order\n",
    "\n",
    "# The context managers close (and therefore flush) all three files, so\n",
    "# train.json and val.json are complete before any later cell reads them.\n",
    "with jsonlines.open(DATASET_FILE) as data, \\\n",
    "        jsonlines.open('train.json', mode='w') as train_writer, \\\n",
    "        jsonlines.open('val.json', mode='w') as val_writer:\n",
    "    for x in data:\n",
    "        text = x['text']\n",
    "        sentiment = x['label']\n",
    "        image_name = x['reply']\n",
    "        if image_name is None:\n",
    "            continue\n",
    "\n",
    "        # Keep only the last path component of the reply.\n",
    "        image_name = image_name.split('/')[-1]\n",
    "        jpg_name = os.path.join(IMG_DIR, os.path.splitext(image_name)[0] + '.jpg')\n",
    "\n",
    "        if EXTRACT_FRAMES:\n",
    "            extract_frame(os.path.join(GIF_DIR, image_name), image_name, out_dir=IMG_DIR)\n",
    "\n",
    "        if not os.path.exists(jpg_name):\n",
    "            continue\n",
    "\n",
    "        im_un.append(jpg_name)\n",
    "        writer = train_writer if len(im_un) < VAL_START else val_writer\n",
    "        writer.write({'image_path': jpg_name, 'captions': [text, sentiment]})"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sanity checks\n",
    "\n",
    "Number of written records, then number of distinct frames; a gap between the two means several records share the same frame."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "source": [
    "len(im_un)"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "19387"
      ]
     },
     "metadata": {},
     "execution_count": 13
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "source": [
    "len(set(im_un))"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "19385"
      ]
     },
     "metadata": {},
     "execution_count": 14
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "source": [
    "jax.device_count()"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "8"
      ]
     },
     "metadata": {},
     "execution_count": 18
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Copy validation frames to the demo folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Copy every frame listed in val.json into the demo image folder,\n",
    "# keeping only the file name of each source path.\n",
    "demo_dir = Path(DEMO_IMG_DIR)\n",
    "demo_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "with jsonlines.open('./val.json') as val:\n",
    "    for v in val:\n",
    "        frame_path = Path(v['image_path'])\n",
    "        shutil.copy(frame_path, demo_dir / frame_path.name)"
   ],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}