Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / computer-vision-course /pr_397 /en /unit7 /video-processing /multimodal-based-video-models.html

rtrm

about 1 month ago

download

raw

29.7 kB

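	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Multimodal Based Video Models&quot;,&quot;local&quot;:&quot;multimodal-based-video-models&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;What Modalities Are Present in Video?&quot;,&quot;local&quot;:&quot;what-modalities-are-present-in-video&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Video and Text&quot;,&quot;local&quot;:&quot;video-and-text&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;VideoBERT&quot;,&quot;local&quot;:&quot;videobert&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;MERLOT&quot;,&quot;local&quot;:&quot;merlot&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Video and Audio, Text&quot;,&quot;local&quot;:&quot;video-and-audio-text&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;VATT(Visual-Audio-Text Transformer)&quot;,&quot;local&quot;:&quot;vattvisual-audio-text-transformer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Video-Llama&quot;,&quot;local&quot;:&quot;video-llama&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Video and Multiple Modalities&quot;,&quot;local&quot;:&quot;video-and-multiple-modalities&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;ImageBind&quot;,&quot;local&quot;:&quot;imagebind&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Conclusion&quot;,&quot;local&quot;:&quot;conclusion&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">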
	<link href="/docs/computer-vision-course/pr_397/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/entry/start.7f209408.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/scheduler.7bc62968.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/singletons.b15acae1.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/paths.11cdc4b4.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/entry/app.32e8338e.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/index.2f8492b0.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/nodes/0.e37092e8.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/nodes/76.f46c7b52.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/index.514d62da.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Multimodal Based Video Models&quot;,&quot;local&quot;:&quot;multimodal-based-video-models&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;What Modalities Are Present in Video?&quot;,&quot;local&quot;:&quot;what-modalities-are-present-in-video&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Video and Text&quot;,&quot;local&quot;:&quot;video-and-text&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;VideoBERT&quot;,&quot;local&quot;:&quot;videobert&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;MERLOT&quot;,&quot;local&quot;:&quot;merlot&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Video and Audio, Text&quot;,&quot;local&quot;:&quot;video-and-audio-text&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;VATT(Visual-Audio-Text Transformer)&quot;,&quot;local&quot;:&quot;vattvisual-audio-text-transformer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Video-Llama&quot;,&quot;local&quot;:&quot;video-llama&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Video and Multiple Modalities&quot;,&quot;local&quot;:&quot;video-and-multiple-modalities&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;ImageBind&quot;,&quot;local&quot;:&quot;imagebind&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Conclusion&quot;,&quot;local&quot;:&quot;conclusion&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="multimodal-based-video-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multimodal-based-video-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 
0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Multimodal Based Video Models</span></h1> <p data-svelte-h="svelte-kajwg9">As discussed in previous chapters, a video can be simply defined as a sequence of images. However, unlike simple images, videos contain various modalities such as sound, text, and movement. From this perspective, to properly understand a video, we must consider multiple modalities at the same time. In this chapter, we first briefly explain what modalities can exist in a video. Then, we introduce architectures that can learn by aligning videos with different modalities.</p> <h2 class="relative group"><a id="what-modalities-are-present-in-video" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-modalities-are-present-in-video"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What Modalities Are Present in Video?</span></h2> <p data-svelte-h="svelte-zaaa0o">Videos encompass a variety of modalities beyond just sequences of images. 
Understanding these different modalities is crucial for comprehensive video analysis and processing. The primary modalities present in videos include:</p> <ol data-svelte-h="svelte-z2pnld"><li>Visual Modality(Frames/Images): The most common modality, consisting of a sequence of images that provides the visual information for the video.</li> <li>Audio Modality(Sound): Includes dialogue, background music, and environmental sounds that can convey contextual information about the video.</li> <li>Text Modality(Captions/Subtitles): Appears as subtitles, captions, or on-screen text, offering explicit information related to the video’s context.</li> <li>Motion Modality(Movement Dynamics): Captures temporal changes between video frames, reflecting movement and transitions.</li> <li>Depth Modality: Represents the 3D spatial information of the video.</li> <li>Sensor Modality: In some applications, videos may include modalities like temperature or biometric data.</li></ol> <div class="flex justify-center" data-svelte-h="svelte-1j42n3q"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Multimodal_Based_Video_Models/Modality_example.jpg" alt="Modality examples. The image is from the original LanguageBind paper"></div> <p data-svelte-h="svelte-542f97">Beyond the modalities mentioned above, videos can incorporate even more diverse types of modalities. Be sure to consider which modalities are necessary for your specific work or project. 
In the next section, we will explore video architectures that can align and represent these modalities jointly.</p> <h2 class="relative group"><a id="video-and-text" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#video-and-text"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Video and Text</span></h2> <h3 class="relative group"><a id="videobert" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#videobert"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 
11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>VideoBERT</span></h3> <p data-svelte-h="svelte-9pdvmw"><strong>Overview</strong></p> <div class="flex justify-center" data-svelte-h="svelte-jhssv2"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Multimodal_Based_Video_Models/Overview_VideoBERT.png" alt="VideoBERT model architecture."></div> <p data-svelte-h="svelte-16c55p1"><a href="https://arxiv.org/abs/1904.01766" rel="nofollow">VideoBERT</a> is an attempt to apply the BERT architecture directly to video data. Just like BERT in language models, the goal is to learn good visual-linguistic representation without any supervision. For the text modality, VideoBERT uses ASR (Automatic Speech Recognition) to convert audio into text, and then obtains BERT token embeddings. For the video, it uses S3D to get token embeddings for each frame.</p> <p data-svelte-h="svelte-1i4nuav"><strong>Key Features</strong></p> <ol data-svelte-h="svelte-1m33s9o"><li><strong>Linguistic-visual alignment</strong>: Classifies whether a given text and video frames are aligned or not.</li> <li><strong>Masked Language Modeling</strong>: Predicts masked tokens in the text (just like in BERT).</li> <li><strong>Masked Frame Modeling</strong>: Predicts the masked video frames (like MLM predicts masked tokens in text).</li></ol> <p data-svelte-h="svelte-1w10pj6"><strong>Why It Matters</strong></p> <p data-svelte-h="svelte-16un5kb">VideoBERT was one of the first models to effectively integrate video-language understanding by learning joint representations.
	Unlike previous methods, VideoBERT does not use a detection model for image-text labeling. Instead, it uses a <em>clustering algorithm</em> to enable Masked Frame modeling, allowing the model to predict masked frames without needing explicit labeled data.</p> <h3 class="relative group"><a id="merlot" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#merlot"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>MERLOT</span></h3> <p data-svelte-h="svelte-9pdvmw"><strong>Overview</strong></p> <p data-svelte-h="svelte-1lcyl5s"><a href="https://arxiv.org/abs/2106.02636" rel="nofollow">MERLOT</a> is designed to improve multimodal reasoning by learning from large-scale video-text datasets. It focuses on understanding interactions between visual and textual information using no labeled data. 
By leveraging the large-scale unlabeled dataset <strong>YT-Temporal-180M</strong>, <strong>MERLOT</strong> demonstrates strong performance in visual commonsense reasoning without relying on heavy visual supervision.</p> <p data-svelte-h="svelte-1i4nuav"><strong>Key Features</strong></p> <ol data-svelte-h="svelte-rbeuwb"><li>Temporal Reordering Task (from <a href="https://aclanthology.org/2020.emnlp-main.161.pdf" rel="nofollow">HERO</a>)</li> <li>Frame-Caption Matching Task (from <a href="https://arxiv.org/pdf/1906.05743" rel="nofollow">CBT</a>, <a href="https://aclanthology.org/2020.emnlp-main.161.pdf" rel="nofollow">HAMMER</a>)</li> <li>Masked Language Modeling</li></ol> <p data-svelte-h="svelte-y42e9n"><strong>Why It Matters</strong></p> <p data-svelte-h="svelte-1k33y5e">While the model architecture and training method are not entirely new, MERLOT achieves performance improvements by training on <strong>YT-Temporal-180M</strong>, a large-scale visual-text dataset. This extensive dataset enables the model to better understand temporal dynamics and multimodal interactions, leading to enhanced reasoning and prediction capabilities in video-language tasks.</p> <u data-svelte-h="svelte-248d1k">Note</u>: If you're looking to understand the detailed training process of MERLOT, make sure to refer to the MERLOT paper as well as earlier works like <a href="https://aclanthology.org/2020.emnlp-main.161.pdf" rel="nofollow">HERO</a>, <a href="https://arxiv.org/pdf/1906.05743" rel="nofollow">CBT</a> and <a href="https://aclanthology.org/2020.emnlp-main.161.pdf" rel="nofollow">HAMMER</a>.
	<h2 class="relative group"><a id="video-and-audio-text" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#video-and-audio-text"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Video and Audio, Text</span></h2> <h3 class="relative group"><a id="vattvisual-audio-text-transformer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#vattvisual-audio-text-transformer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 
28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>VATT(Visual-Audio-Text Transformer)</span></h3> <p data-svelte-h="svelte-9pdvmw"><strong>Overview</strong></p> <div class="flex justify-center" data-svelte-h="svelte-imol36"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Multimodal_Based_Video_Models/Overview_VATT.png" alt="VATT model architecture."></div> <p data-svelte-h="svelte-1mr0o6t"><a href="https://arxiv.org/abs/2104.11178" rel="nofollow">VATT</a> is a model designed for self-supervised learning from raw video, audio, and text. Different tokenization and positional encoding methods were applied to each modality, and VATT used the Transformer Encoder to effectively integrate the representations from the raw multimodal data. As a result, it achieved strong performance in various downstream tasks such as action recognition and text-to-video retrieval.</p> <p data-svelte-h="svelte-1i4nuav"><strong>Key Features</strong></p> <ol data-svelte-h="svelte-n7mh5y"><li>Modality-Specific & Modality-Agnostic: The <strong>modality-specific</strong> version uses separate Transformer encoders for each modality, while the modality-agnostic version integrates all modalities with a single Transformer encoder. 
While modality-specific demonstrated better performance, the <strong>modality-agnostic</strong> still showed strong performance in downstream tasks with fewer parameters.</li> <li>Droptoken: Due to the redundancies in video (with audio and text data), sampling only a subset of tokens allows for more efficient training.</li> <li>Multimodal Contrastive Learning: Noise Contrastive Estimation (NCE) was used for video-audio pairs, while Multiple Instance Learning NCE (MIL-NCE) was applied to video-text pairs.</li></ol> <p data-svelte-h="svelte-f0v7ot"><strong>Why It Matters</strong></p> <p data-svelte-h="svelte-aaa8bv">Previous models using transformers for video multimodal tasks tended to rely heavily on visual data and required extensive training time and computational complexity. In contrast, VATT utilizes <strong>Droptoken</strong> and <strong>weight sharing</strong> to learn powerful multimodal representations from raw visual, audio, and text data with relatively lower computational complexity.</p> <h3 class="relative group"><a id="video-llama" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#video-llama"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>Video-Llama</span></h3> <p data-svelte-h="svelte-9pdvmw"><strong>Overview</strong></p> <p data-svelte-h="svelte-oyhjzp"><a href="https://arxiv.org/abs/2306.02858" rel="nofollow">Video-LLaMA</a> is a multimodal framework designed to extend Large Language Models (LLMs) to understand both visual and auditory content in videos. It integrates video, audio and text, allowing the model to process and generate meaningful responses grounded in audiovisual information. Video-LLaMA addresses two key challenges: capturing temporal changes in visual scenes and integrating audio-visual signals into a unified system.</p> <p data-svelte-h="svelte-1i4nuav"><strong>Key Features</strong></p> <p data-svelte-h="svelte-ony8cp">Video-LLaMA has two branches</p> <ol data-svelte-h="svelte-o2scpm"><li><u>Vision-Language branch</u> for processing video frames</li> <li><u>Audio-Language branch</u> for handling audio signals.</li></ol> <p data-svelte-h="svelte-90mn1x">These branches are trained separately, undergoing both pre-training and fine-tuning phases. In the pre-training phase, the model learns to integrate different modalities, while in the fine-tuning phase, it focuses on improving its ability to follow instructions accurately.</p> <p data-svelte-h="svelte-1l2di1q">In the case of the vision-language branch, there is an abundance of visual-text data available. However, for the audio-language branch, there is a lack of sufficient audio-text data. To address this, the model utilizes <strong>ImageBind</strong>, allowing the audio-language branch to be trained using visual-text data instead.</p> <p data-svelte-h="svelte-1w10pj6"><strong>Why It Matters</strong></p> <p data-svelte-h="svelte-7boocd">Previous models struggled to handle both visual and auditory content together. Video-LLaMA addresses this by integrating these modalities in a single framework, capturing temporal changes in video and aligning audio-visual signals. 
It overcomes the limitations of earlier research by using cross-modal pre-training and instruction fine-tuning, achieving strong performance in multimodal tasks like video-based conversations without relying on separate models.</p> <h2 class="relative group"><a id="video-and-multiple-modalities" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#video-and-multiple-modalities"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Video and Multiple Modalities</span></h2> <h3 class="relative group"><a id="imagebind" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#imagebind"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 
0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ImageBind</span></h3> <p data-svelte-h="svelte-9pdvmw"><strong>Overview</strong></p> <div class="flex justify-center" data-svelte-h="svelte-xk041a"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Multimodal_Based_Video_Models/Overview_ImageBind.png" alt="ImageBind model architecture."></div> <p data-svelte-h="svelte-evyoj6">ImageBind utilizes paired data between images and other modalities to integrate diverse modality representations, centering around image data.</p> <p data-svelte-h="svelte-1i4nuav"><strong>Key Features</strong></p> <p data-svelte-h="svelte-xs0er2">ImageBind unifies many kinds of modalities by utilizing pairs of images and other modalities. By leveraging <em>InfoNCE</em> as the loss function, the model aligns representations between the various inputs. Even in cases where paired data between non-image modalities are absent, ImageBind can effectively perform cross-modal retrieval and zero-shot tasks.
	Additionally, the training process of ImageBind is relatively simple compared to other models and can be implemented in various ways.</p> <p data-svelte-h="svelte-1w10pj6"><strong>Why It Matters</strong></p> <p data-svelte-h="svelte-6hhak8">ImageBind’s key contribution is its ability to integrate various modalities without the need for specific modality-paired datasets. Using images as a reference, it aligns and combines up to six different modalities — such as audio, text, depth, and more — into a unified representation space. The significance lies in its capacity to achieve this alignment across multiple modalities simultaneously, without requiring direct pairing for each combination, making it highly efficient for multimodal learning.</p> <h2 class="relative group"><a id="conclusion" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#conclusion"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Conclusion</span></h2> <p data-svelte-h="svelte-fec7cm">We have briefly examined the different modalities present in videos and then explored models that integrate visual information with various other modalities.
	As time goes on, there is a growing body of research focused on integrating a wide range of modalities all at once.</p> <p data-svelte-h="svelte-h4ky2m">I’m excited to see what future models will emerge, integrating even more diverse modalities within the video content. The potential for advancing multimodal representation learning through videos feels limitless!</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/computer-vision-course/blob/main/chapters/en/unit7/video-processing/multimodal-based-video-models.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Client bootstrap for this prerendered page: publish the path
		// configuration, then load the two entry modules and start the app.

		// No var/let/const here: in this classic (non-module) script the
		// assignment creates a property on the global object.
		__sveltekit_1p6gie1 = {
			assets: "/docs/computer-vision-course/pr_397/en",
			base: "/docs/computer-vision-course/pr_397/en",
			env: {}
		};

		// Captured synchronously, before the asynchronous imports resolve.
		const mountPoint = document.currentScript.parentElement;

		const serverData = [null, null];

		// Options handed to kit.start() once both modules are available.
		const startOptions = {
			node_ids: [0, 76],
			data: serverData,
			form: null,
			error: null
		};

		const entryModules = [
			import("/docs/computer-vision-course/pr_397/en/_app/immutable/entry/start.7f209408.js"),
			import("/docs/computer-vision-course/pr_397/en/_app/immutable/entry/app.32e8338e.js")
		];

		Promise.all(entryModules).then(function (loaded) {
			// Results arrive in request order: start module first, app second.
			const kit = loaded[0];
			const app = loaded[1];
			kit.start(app, mountPoint, startOptions);
		});
	}
	</script>

Xet Storage Details

Size:: 29.7 kB
Xet hash:: bb9e9316255c82f5e110e9867b271d77958b613efa81a1a8567cbc87eeec2224

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.