Upload 2 files
Browse files
Notebooks/GSI_VideoRetrieval_EmbedVideos.ipynb
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"attachments": {},
|
| 5 |
+
"cell_type": "markdown",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# GSI Technology Video Search Demo - Embedding Videos Notebook:\n",
|
| 9 |
+
"\n",
|
| 10 |
+
"This notebook demonstrates the process of video embedding.<br>\n",
|
| 11 |
+
"It specifically focuses on embedding a single video using the [Diangle/clip4clip-webvid](https://huggingface.co/Diangle/clip4clip-webvid) model."
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"cell_type": "code",
|
| 16 |
+
"execution_count": 1,
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
# Caption of the example clip: "Close-up women's hands scratch"
# (the original cell left this as a bare string literal — a no-op statement).
example = './example/34721191.mp4'
|
| 22 |
+
]
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"cell_type": "code",
|
| 26 |
+
"execution_count": 2,
|
| 27 |
+
"metadata": {},
|
| 28 |
+
"outputs": [],
|
| 29 |
+
"source": [
|
| 30 |
+
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode
from PIL import Image
import cv2
import numpy as np
import torch

def video2image(video_path, frame_rate=1.0, size=224):
    """Sample frames from *video_path* at ``frame_rate`` frames per second
    and return them as a ``torch.Tensor`` of shape (n_frames, 3, size, size),
    preprocessed for CLIP (bicubic resize, center crop, CLIP normalization).

    On a read failure a single all-zero frame is returned, so the caller
    always receives a 4-D tensor.
    """
    def preprocess(size, n_px):
        # Standard CLIP image pipeline; mean/std are the CLIP constants.
        return Compose([
            Resize(size, interpolation=InterpolationMode.BICUBIC),
            CenterCrop(size),
            lambda image: image.convert("RGB"),
            ToTensor(),
            Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
        ])(n_px)

    # BUGFIX: the original opened the capture twice, leaking the first handle.
    cap = cv2.VideoCapture(video_path, cv2.CAP_FFMPEG)
    frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    # BUGFIX: `last_frame` was unbound on the error path (and when the very
    # first cap.read() failed), raising NameError at the slice below.
    last_frame = 0
    if fps < 1:
        # BUGFIX: keep the error-path output 4-D (1 zero frame) so the
        # returned shape is consistent with the success path.
        images = np.zeros([1, 3, size, size], dtype=np.float32)
        print("ERROR: problem reading video file: ", video_path)
    else:
        total_duration = (frameCount + fps - 1) // fps  # ceil(frames/fps) seconds
        start_sec, end_sec = 0, total_duration
        interval = fps / frame_rate  # source frames between two samples
        frames_idx = np.floor(np.arange(start_sec * fps, end_sec * fps, interval))
        images = np.zeros([len(frames_idx), 3, size, size], dtype=np.float32)

        for i, idx in enumerate(frames_idx):
            # Seek to the sampled frame index, then decode it.
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes BGR
            last_frame = i
            images[i, :, :, :] = preprocess(size, Image.fromarray(frame).convert("RGB"))

        # Drop the tail of frames that could not be read.
        images = images[:last_frame + 1]

    cap.release()
    return torch.tensor(images)

video = video2image(example)
|
| 76 |
+
]
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"cell_type": "code",
|
| 80 |
+
"execution_count": 3,
|
| 81 |
+
"metadata": {},
|
| 82 |
+
"outputs": [
|
| 83 |
+
{
|
| 84 |
+
"name": "stderr",
|
| 85 |
+
"output_type": "stream",
|
| 86 |
+
"text": [
|
| 87 |
+
"Some weights of the model checkpoint at Diangle/clip4clip-webvid were not used when initializing CLIPVisionModelWithProjection: ['text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'logit_scale', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.0.layer_norm2.bias', 
'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.final_layer_norm.bias', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.final_layer_norm.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 
'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 
'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_projection.weight', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.3.layer_norm2.weight', 
'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 
'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias']\n",
|
| 88 |
+
"- This IS expected if you are initializing CLIPVisionModelWithProjection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
|
| 89 |
+
"- This IS NOT expected if you are initializing CLIPVisionModelWithProjection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
|
| 90 |
+
]
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"name": "stdout",
|
| 94 |
+
"output_type": "stream",
|
| 95 |
+
"text": [
|
| 96 |
+
"tensor([-2.9570e-02, 6.0339e-03, 1.7294e-02, -1.3951e-02, 4.8329e-02,\n",
|
| 97 |
+
" 2.4099e-02, 3.3340e-02, 3.1769e-02, 2.1997e-03, 4.2602e-03,\n",
|
| 98 |
+
" -1.3887e-02, 8.2744e-03, 2.5123e-03, -2.2163e-02, -4.1139e-02,\n",
|
| 99 |
+
" -1.2101e-02, -6.1914e-02, 6.7091e-03, 4.2834e-02, -2.2604e-02,\n",
|
| 100 |
+
" -2.7443e-02, 1.0600e-02, 2.9430e-03, 3.2580e-02, -1.3577e-02,\n",
|
| 101 |
+
" 7.8084e-03, 1.2397e-02, -5.3404e-03, 1.4736e-02, -2.4564e-02,\n",
|
| 102 |
+
" -5.4057e-02, 3.9507e-02, 1.2754e-02, 4.6864e-04, 7.4087e-03,\n",
|
| 103 |
+
" 3.8710e-03, 7.9482e-03, 1.3444e-02, -1.7326e-02, -1.2486e-01,\n",
|
| 104 |
+
" -8.4992e-02, -3.9097e-02, -2.1903e-02, -7.1480e-03, -2.7220e-03,\n",
|
| 105 |
+
" 4.1397e-03, 1.7315e-02, 4.4724e-02, 9.1722e-04, 3.1429e-02,\n",
|
| 106 |
+
" 3.8212e-02, -2.1133e-02, 2.4437e-03, -1.4371e-03, -2.9859e-03,\n",
|
| 107 |
+
" 7.8939e-04, 2.4093e-02, -2.2199e-02, -3.9110e-02, 1.7673e-02,\n",
|
| 108 |
+
" 1.1360e-01, 3.3466e-03, -1.9643e-02, 1.7798e-03, 1.5112e-02,\n",
|
| 109 |
+
" -6.2003e-03, -2.0564e-02, 6.4936e-02, 6.6286e-02, -2.0585e-02,\n",
|
| 110 |
+
" 2.0740e-02, 1.0476e-02, -5.9948e-03, -2.4672e-02, 2.3725e-02,\n",
|
| 111 |
+
" -4.6442e-03, 1.8887e-02, 3.7517e-02, 3.1605e-02, -3.7756e-03,\n",
|
| 112 |
+
" 2.7584e-02, 5.7234e-03, 3.4368e-02, 1.4564e-02, 2.6392e-02,\n",
|
| 113 |
+
" -1.9975e-02, 1.2648e-01, -5.3093e-03, 7.3013e-02, 4.8827e-03,\n",
|
| 114 |
+
" -2.8492e-02, -4.9734e-02, -6.6967e-01, 1.2463e-02, 2.4013e-02,\n",
|
| 115 |
+
" 1.3702e-02, 2.9382e-02, 1.4373e-02, -2.1994e-02, 3.6824e-03,\n",
|
| 116 |
+
" 2.9366e-02, -2.1474e-03, 1.7371e-02, -6.1958e-02, -4.6649e-02,\n",
|
| 117 |
+
" -4.3063e-03, 1.0081e-01, -3.1598e-02, 9.4211e-03, -9.7909e-03,\n",
|
| 118 |
+
" 4.4678e-02, -4.8716e-03, 1.8896e-02, 9.5822e-03, -2.3881e-02,\n",
|
| 119 |
+
" -9.0785e-03, 5.4653e-03, 3.0017e-02, -3.0415e-02, -1.3150e-03,\n",
|
| 120 |
+
" 2.9047e-02, 3.2315e-02, -1.0728e-02, 4.7503e-02, -4.0033e-02,\n",
|
| 121 |
+
" 3.4482e-02, 6.2684e-02, 3.0337e-02, 5.0680e-02, -8.6022e-03,\n",
|
| 122 |
+
" 1.5261e-02, 3.7766e-02, -2.4730e-02, 8.6131e-02, 4.5388e-02,\n",
|
| 123 |
+
" 5.4677e-02, 3.9401e-02, 4.4164e-02, -5.2270e-02, -8.8473e-03,\n",
|
| 124 |
+
" 8.1178e-03, -1.0574e-02, -7.6409e-05, -8.3209e-03, -8.1179e-04,\n",
|
| 125 |
+
" 3.2574e-02, -1.4150e-02, -4.0937e-02, 1.0180e-02, 1.3868e-03,\n",
|
| 126 |
+
" 3.4978e-02, -1.1991e-02, -2.1560e-02, 2.0833e-02, 3.8494e-02,\n",
|
| 127 |
+
" 1.4916e-02, -1.5102e-02, -1.0009e-02, -9.6670e-03, 3.6516e-03,\n",
|
| 128 |
+
" 2.6473e-02, -9.1190e-03, -1.9326e-02, 3.2072e-02, -2.9562e-02,\n",
|
| 129 |
+
" -4.1949e-02, -9.4430e-03, 2.7654e-02, 3.1868e-02, 2.6336e-03,\n",
|
| 130 |
+
" -1.6622e-02, -3.4676e-02, -3.4540e-02, 8.5971e-03, -9.4823e-03,\n",
|
| 131 |
+
" -3.6754e-02, 4.9925e-02, 9.8040e-04, -6.7678e-02, 5.0645e-03,\n",
|
| 132 |
+
" -7.5227e-03, 1.2880e-02, 5.5055e-02, -5.1705e-02, -6.1548e-02,\n",
|
| 133 |
+
" 1.4440e-03, -6.8204e-03, -1.4279e-02, -2.8179e-02, -2.2386e-02,\n",
|
| 134 |
+
" 5.2374e-02, -3.4718e-02, 5.3560e-03, -6.3553e-02, 8.3361e-02,\n",
|
| 135 |
+
" -2.7192e-02, 4.2078e-02, 3.2605e-03, -5.6035e-02, -8.2745e-03,\n",
|
| 136 |
+
" -2.8813e-02, 4.3161e-02, -5.0922e-02, 3.0529e-02, 2.0102e-02,\n",
|
| 137 |
+
" 2.9533e-02, -7.8186e-03, -3.0819e-02, -2.1356e-02, -2.7967e-02,\n",
|
| 138 |
+
" 2.4877e-02, 2.3300e-02, 2.8305e-02, 2.9761e-02, 1.2363e-02,\n",
|
| 139 |
+
" -1.4158e-02, -1.1000e-02, 2.3479e-02, 4.8863e-02, -1.3325e-02,\n",
|
| 140 |
+
" 1.2415e-02, -1.0494e-02, -5.3160e-04, -1.3253e-02, -2.4968e-03,\n",
|
| 141 |
+
" 2.0370e-02, -5.9943e-03, -9.5419e-03, 5.9531e-03, -8.3129e-03,\n",
|
| 142 |
+
" -4.0607e-03, 6.1272e-03, -2.9724e-02, -1.8579e-02, 1.2740e-02,\n",
|
| 143 |
+
" -2.6391e-02, 4.1079e-03, -4.0331e-03, 3.4990e-02, -3.4697e-04,\n",
|
| 144 |
+
" -9.6936e-03, -2.2701e-02, 3.2625e-02, 1.1973e-02, -3.9408e-02,\n",
|
| 145 |
+
" -6.4848e-02, 4.3097e-02, 2.6910e-02, -3.9942e-02, 3.4112e-02,\n",
|
| 146 |
+
" -7.8409e-03, -4.3240e-02, -1.6996e-02, 3.8101e-02, -3.8530e-02,\n",
|
| 147 |
+
" 2.1452e-04, 3.7173e-02, 2.3474e-02, 1.9435e-03, -2.1596e-02,\n",
|
| 148 |
+
" 1.2855e-02, 4.8854e-03, 2.1395e-02, -2.4349e-02, 7.3487e-03,\n",
|
| 149 |
+
" -2.7641e-02, -1.5773e-02, 1.1367e-02, 8.7802e-03, 2.3783e-02,\n",
|
| 150 |
+
" 3.3420e-02, 3.4498e-02, 2.2979e-02, -1.2473e-02, 3.1100e-02,\n",
|
| 151 |
+
" 6.0752e-02, -2.5795e-02, 1.7830e-02, -1.3168e-02, 8.0613e-04,\n",
|
| 152 |
+
" 1.3292e-02, 8.1109e-03, 2.1875e-03, -1.0863e-02, 3.8718e-02,\n",
|
| 153 |
+
" 4.5967e-02, -1.2454e-01, 2.6564e-02, -4.4082e-04, 1.8394e-02,\n",
|
| 154 |
+
" 2.9872e-02, 6.4751e-03, 5.4129e-03, 2.0823e-02, -4.9624e-02,\n",
|
| 155 |
+
" -2.3234e-02, -5.7144e-02, -1.3117e-02, -5.3304e-02, -1.9084e-02,\n",
|
| 156 |
+
" -1.9121e-02, 2.5556e-04, -3.9970e-02, -3.3640e-02, 1.0532e-02,\n",
|
| 157 |
+
" 5.7862e-02, -4.0414e-02, 6.6390e-03, 1.6265e-03, 1.0555e-02,\n",
|
| 158 |
+
" -5.1818e-03, -3.9941e-02, 8.6119e-02, 2.5038e-02, 1.1136e-02,\n",
|
| 159 |
+
" -8.5421e-03, -2.0004e-02, 3.0798e-02, -4.8180e-03, -1.1030e-02,\n",
|
| 160 |
+
" 7.1489e-03, 7.0376e-02, -4.2558e-02, -5.4193e-02, 6.0990e-03,\n",
|
| 161 |
+
" 1.5232e-02, 1.3667e-02, -1.5016e-02, -1.0382e-03, -6.4072e-03,\n",
|
| 162 |
+
" 2.3970e-03, 3.7884e-02, -1.7684e-02, 2.0192e-02, -2.1400e-02,\n",
|
| 163 |
+
" 1.6529e-02, 1.8982e-02, 1.6748e-02, -2.0919e-02, 1.2904e-02,\n",
|
| 164 |
+
" -1.5105e-02, -1.7961e-02, 2.2824e-03, 9.0103e-04, 1.3905e-02,\n",
|
| 165 |
+
" -5.2162e-02, 5.7747e-03, 6.7262e-03, 6.3685e-03, -1.2071e-02,\n",
|
| 166 |
+
" -2.7873e-02, -1.4171e-04, -4.8872e-02, -8.9744e-03, -1.0448e-02,\n",
|
| 167 |
+
" 4.9146e-02, -2.0365e-02, -6.8874e-02, 1.3715e-02, -2.8159e-02,\n",
|
| 168 |
+
" 5.1973e-03, -4.1494e-02, 1.7353e-02, -1.4510e-02, -4.5331e-03,\n",
|
| 169 |
+
" 1.0267e-02, -2.9127e-02, 1.0169e-02, -5.0776e-03, -2.0463e-02,\n",
|
| 170 |
+
" 1.6880e-02, 2.4789e-02, -3.2186e-02, -1.5043e-02, -9.5236e-03,\n",
|
| 171 |
+
" -1.8453e-02, 1.9968e-01, -3.1110e-02, -3.4481e-02, -5.3706e-03,\n",
|
| 172 |
+
" -2.3295e-02, -6.6525e-02, 1.5241e-02, -5.3700e-02, -1.3558e-02,\n",
|
| 173 |
+
" -7.4800e-02, 4.6305e-02, 4.3405e-03, 1.0513e-02, -1.4961e-02,\n",
|
| 174 |
+
" 1.2347e-01, -4.1887e-02, -2.9692e-02, -2.0832e-02, 2.5459e-03,\n",
|
| 175 |
+
" 1.5311e-02, -1.3357e-02, 1.3205e-02, 2.8943e-02, 4.9173e-02,\n",
|
| 176 |
+
" 3.3758e-02, 1.1087e-02, 4.2151e-02, 6.3205e-04, -4.3288e-02,\n",
|
| 177 |
+
" 2.3333e-02, 1.5167e-02, -1.0237e-02, -7.9236e-02, 4.3594e-03,\n",
|
| 178 |
+
" 3.1445e-02, 4.2794e-03, -9.3492e-03, -3.5418e-02, -1.9242e-02,\n",
|
| 179 |
+
" -3.0336e-02, 7.7880e-03, 6.6255e-02, -7.5213e-03, 2.5932e-02,\n",
|
| 180 |
+
" -1.7802e-02, 1.8590e-03, 5.3834e-03, 9.6787e-02, 2.8787e-02,\n",
|
| 181 |
+
" 9.1017e-04, -1.8586e-02, 2.2730e-02, -9.7814e-02, 4.2616e-02,\n",
|
| 182 |
+
" 4.0229e-02, -8.9988e-03, -2.0952e-02, 7.7816e-03, -4.0449e-04,\n",
|
| 183 |
+
" -1.3639e-02, -1.7206e-03, -9.1304e-03, 4.3670e-03, 1.9919e-02,\n",
|
| 184 |
+
" -2.0095e-02, -2.6256e-03, 3.0235e-02, 3.7728e-03, 6.3254e-04,\n",
|
| 185 |
+
" -6.9728e-02, 2.5881e-03, 1.0343e-02, 3.3831e-02, 2.2356e-03,\n",
|
| 186 |
+
" -2.7363e-02, 3.5232e-02, 5.3659e-02, -7.8222e-03, -2.0881e-03,\n",
|
| 187 |
+
" 2.2187e-02, 2.0626e-02, 3.6413e-02, -4.4460e-03, 4.6213e-02,\n",
|
| 188 |
+
" -1.4652e-03, 2.1768e-02, 3.3055e-03, -2.3867e-02, -2.7972e-02,\n",
|
| 189 |
+
" -6.7086e-02, 2.4510e-02, 4.0885e-02, -1.6748e-03, 1.2575e-02,\n",
|
| 190 |
+
" -2.0675e-04, -1.1889e-02, 4.2555e-03, -2.6686e-02, -9.5006e-03,\n",
|
| 191 |
+
" -1.3144e-02, 3.0939e-02, -1.9938e-02, 4.2527e-02, -1.4343e-02,\n",
|
| 192 |
+
" 5.5876e-03, 2.4495e-02, 3.9814e-03, 2.8102e-02, 4.3181e-02,\n",
|
| 193 |
+
" -1.7406e-02, -4.2736e-02, -8.1578e-03, -5.3989e-03, 2.9429e-03,\n",
|
| 194 |
+
" 4.3196e-02, -2.0857e-02, -3.0203e-02, -4.0288e-03, -4.4894e-02,\n",
|
| 195 |
+
" 2.7039e-02, 3.5724e-02, -1.4012e-02, -2.3949e-03, 1.4861e-02,\n",
|
| 196 |
+
" 3.1610e-02, 4.8555e-02, 1.8550e-02, 1.2663e-02, -6.1358e-03,\n",
|
| 197 |
+
" -4.1771e-02, 2.8252e-02, -1.1711e-02, -4.0601e-03, -2.9267e-02,\n",
|
| 198 |
+
" -3.0001e-02, 1.6215e-02], grad_fn=<DivBackward0>)\n"
|
| 199 |
+
]
|
| 200 |
+
}
|
| 201 |
+
],
|
| 202 |
+
"source": [
|
| 203 |
+
from transformers import CLIPVisionModelWithProjection

# Load the CLIP4Clip vision tower; .eval() disables dropout/batch-norm updates.
model = CLIPVisionModelWithProjection.from_pretrained("Diangle/clip4clip-webvid")
model = model.eval()

# Inference only — no_grad() avoids building an autograd graph over every
# sampled frame (the original version kept gradients, wasting memory).
with torch.no_grad():
    visual_output = model(video)

    # Normalize each per-frame embedding, mean-pool across frames, then
    # re-normalize so the final video embedding is unit-length and ready
    # for cosine-similarity search.
    visual_output = visual_output["image_embeds"]
    visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)
    visual_output = torch.mean(visual_output, dim=0)
    visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)

print(visual_output)
|
| 217 |
+
]
|
| 218 |
+
}
|
| 219 |
+
],
|
| 220 |
+
"metadata": {
|
| 221 |
+
"kernelspec": {
|
| 222 |
+
"display_name": "Python 3",
|
| 223 |
+
"language": "python",
|
| 224 |
+
"name": "python3"
|
| 225 |
+
},
|
| 226 |
+
"language_info": {
|
| 227 |
+
"codemirror_mode": {
|
| 228 |
+
"name": "ipython",
|
| 229 |
+
"version": 3
|
| 230 |
+
},
|
| 231 |
+
"file_extension": ".py",
|
| 232 |
+
"mimetype": "text/x-python",
|
| 233 |
+
"name": "python",
|
| 234 |
+
"nbconvert_exporter": "python",
|
| 235 |
+
"pygments_lexer": "ipython3",
|
| 236 |
+
"version": "3.10.9"
|
| 237 |
+
},
|
| 238 |
+
"orig_nbformat": 4
|
| 239 |
+
},
|
| 240 |
+
"nbformat": 4,
|
| 241 |
+
"nbformat_minor": 2
|
| 242 |
+
}
|
Notebooks/example/34721191.mp4
ADDED
|
Binary file (875 kB). View file
|
|
|