Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev / transformers /pr_31809 /en /_app /immutable /nodes /167.fbb4bae6.js

rtrm's picture

about 2 months ago

91.5 kB

	import{s as dn,f as cn,o as mn,n as mt}from"../chunks/scheduler.25b97de1.js";import{S as pn,i as un,g as d,s as n,r as p,A as fn,h as c,f as t,c as r,j as V,u,x as v,k as w,y as s,a as l,v as f,d as g,t as h,w as _}from"../chunks/index.d9030fc9.js";import{T as ln}from"../chunks/Tip.baa67368.js";import{D as B}from"../chunks/Docstring.e257edda.js";import{C as St}from"../chunks/CodeBlock.e6cd0d95.js";import{E as Lt}from"../chunks/ExampleCodeBlock.20db4b6e.js";import{H as Z,E as gn}from"../chunks/EditOnGithub.91d95064.js";function hn(J){let i,T="Example:",b,m,y;return m=new St({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMCglMEElMjAlMjAlMjAlMjBJbnN0cnVjdEJsaXBWaWRlb1Zpc2lvbkNvbmZpZyUyQyUwQSUyMCUyMCUyMCUyMEluc3RydWN0QmxpcFZpZGVvUUZvcm1lckNvbmZpZyUyQyUwQSUyMCUyMCUyMCUyME9QVENvbmZpZyUyQyUwQSUyMCUyMCUyMCUyMEluc3RydWN0QmxpcFZpZGVvQ29uZmlnJTJDJTBBJTIwJTIwJTIwJTIwSW5zdHJ1Y3RCbGlwVmlkZW9Gb3JDb25kaXRpb25hbEdlbmVyYXRpb24lMkMlMEEpJTBBJTBBJTIzJTIwSW5pdGlhbGl6aW5nJTIwYSUyMEluc3RydWN0QmxpcFZpZGVvQ29uZmlnJTIwd2l0aCUyMFNhbGVzZm9yY2UlMkZpbnN0cnVjdC1ibGlwLWZsYW4tdDUlMjBzdHlsZSUyMGNvbmZpZ3VyYXRpb24lMEFjb25maWd1cmF0aW9uJTIwJTNEJTIwSW5zdHJ1Y3RCbGlwVmlkZW9Db25maWcoKSUwQSUwQSUyMyUyMEluaXRpYWxpemluZyUyMGElMjBJbnN0cnVjdEJsaXBWaWRlb0ZvckNvbmRpdGlvbmFsR2VuZXJhdGlvbiUyMCh3aXRoJTIwcmFuZG9tJTIwd2VpZ2h0cyklMjBmcm9tJTIwdGhlJTIwU2FsZXNmb3JjZSUyRmluc3RydWN0LWJsaXAtZmxhbi10NSUyMHN0eWxlJTIwY29uZmlndXJhdGlvbiUwQW1vZGVsJTIwJTNEJTIwSW5zdHJ1Y3RCbGlwVmlkZW9Gb3JDb25kaXRpb25hbEdlbmVyYXRpb24oY29uZmlndXJhdGlvbiklMEElMEElMjMlMjBBY2Nlc3NpbmclMjB0aGUlMjBtb2RlbCUyMGNvbmZpZ3VyYXRpb24lMEFjb25maWd1cmF0aW9uJTIwJTNEJTIwbW9kZWwuY29uZmlnJTBBJTBBJTIzJTIwV2UlMjBjYW4lMjBhbHNvJTIwaW5pdGlhbGl6ZSUyMGElMjBJbnN0cnVjdEJsaXBWaWRlb0NvbmZpZyUyMGZyb20lMjBhJTIwSW5zdHJ1Y3RCbGlwVmlkZW9WaXNpb25Db25maWclMkMlMjBJbnN0cnVjdEJsaXBWaWRlb1FGb3JtZXJDb25maWclMjBhbmQlMjBhbnklMjBQcmV0cmFpbmVkQ29uZmlnJTBBJTBBJTIzJTIwSW5pdGlhbGl6aW5nJTIwSW5zdHJ1Y3RibGlwdmlkZW8lMjB2aXNpb24lMkMlMjBJbnN0cnVjdGJsaXB2aWRlbyUyMFEtRm9ybWVyJTIwYW5kJTIwbGFuZ3VhZ2UlMjBtb
2RlbCUyMGNvbmZpZ3VyYXRpb25zJTBBdmlzaW9uX2NvbmZpZyUyMCUzRCUyMEluc3RydWN0QmxpcFZpZGVvVmlzaW9uQ29uZmlnKCklMEFxZm9ybWVyX2NvbmZpZyUyMCUzRCUyMEluc3RydWN0QmxpcFZpZGVvUUZvcm1lckNvbmZpZygpJTBBdGV4dF9jb25maWclMjAlM0QlMjBPUFRDb25maWcoKSUwQSUwQWNvbmZpZyUyMCUzRCUyMEluc3RydWN0QmxpcFZpZGVvQ29uZmlnLmZyb21fdGV4dF92aXNpb25fY29uZmlncyh2aXNpb25fY29uZmlnJTJDJTIwcWZvcm1lcl9jb25maWclMkMlMjB0ZXh0X2NvbmZpZyk=",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> (
	<span class="hljs-meta">... </span> InstructBlipVideoVisionConfig,
	<span class="hljs-meta">... </span> InstructBlipVideoQFormerConfig,
	<span class="hljs-meta">... </span> OPTConfig,
	<span class="hljs-meta">... </span> InstructBlipVideoConfig,
	<span class="hljs-meta">... </span> InstructBlipVideoForConditionalGeneration,
	<span class="hljs-meta">... </span>)

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># Initializing a InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration</span>
	<span class="hljs-meta">>>> </span>configuration = InstructBlipVideoConfig()

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># Initializing a InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration</span>
	<span class="hljs-meta">>>> </span>model = InstructBlipVideoForConditionalGeneration(configuration)

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># Accessing the model configuration</span>
	<span class="hljs-meta">>>> </span>configuration = model.config

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># We can also initialize a InstructBlipVideoConfig from a InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PretrainedConfig</span>

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations</span>
	<span class="hljs-meta">>>> </span>vision_config = InstructBlipVideoVisionConfig()
	<span class="hljs-meta">>>> </span>qformer_config = InstructBlipVideoQFormerConfig()
	<span class="hljs-meta">>>> </span>text_config = OPTConfig()

	<span class="hljs-meta">>>> </span>config = InstructBlipVideoConfig.from_text_vision_configs(vision_config, qformer_config, text_config)`,wrap:!1}}),{c(){i=d("p"),i.textContent=T,b=n(),p(m.$$.fragment)},l(a){i=c(a,"P",{"data-svelte-h":!0}),v(i)!=="svelte-11lpom8"&&(i.textContent=T),b=r(a),u(m.$$.fragment,a)},m(a,I){l(a,i,I),l(a,b,I),f(m,a,I),y=!0},p:mt,i(a){y\|\|(g(m.$$.fragment,a),y=!0)},o(a){h(m.$$.fragment,a),y=!1},d(a){a&&(t(i),t(b)),_(m,a)}}}function _n(J){let i,T="Example:",b,m,y;return m=new St({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEluc3RydWN0QmxpcFZpZGVvVmlzaW9uQ29uZmlnJTJDJTIwSW5zdHJ1Y3RCbGlwVmlkZW9WaXNpb25Nb2RlbCUwQSUwQSUyMyUyMEluaXRpYWxpemluZyUyMGElMjBJbnN0cnVjdEJsaXBWaWRlb1Zpc2lvbkNvbmZpZyUyMHdpdGglMjBTYWxlc2ZvcmNlJTJGaW5zdHJ1Y3QtYmxpcC1mbGFuLXQ1JTIwc3R5bGUlMjBjb25maWd1cmF0aW9uJTBBY29uZmlndXJhdGlvbiUyMCUzRCUyMEluc3RydWN0QmxpcFZpZGVvVmlzaW9uQ29uZmlnKCklMEElMEElMjMlMjBJbml0aWFsaXppbmclMjBhJTIwSW5zdHJ1Y3RCbGlwVmlkZW9WaXNpb25Nb2RlbCUyMCh3aXRoJTIwcmFuZG9tJTIwd2VpZ2h0cyklMjBmcm9tJTIwdGhlJTIwU2FsZXNmb3JjZSUyRmluc3RydWN0LWJsaXAtZmxhbi10NSUyMHN0eWxlJTIwY29uZmlndXJhdGlvbiUwQW1vZGVsJTIwJTNEJTIwSW5zdHJ1Y3RCbGlwVmlkZW9WaXNpb25Nb2RlbChjb25maWd1cmF0aW9uKSUwQSUwQSUyMyUyMEFjY2Vzc2luZyUyMHRoZSUyMG1vZGVsJTIwY29uZmlndXJhdGlvbiUwQWNvbmZpZ3VyYXRpb24lMjAlM0QlMjBtb2RlbC5jb25maWc=",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> InstructBlipVideoVisionConfig, InstructBlipVideoVisionModel

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># Initializing a InstructBlipVideoVisionConfig with Salesforce/instruct-blip-flan-t5 style configuration</span>
	<span class="hljs-meta">>>> </span>configuration = InstructBlipVideoVisionConfig()

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># Initializing a InstructBlipVideoVisionModel (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration</span>
	<span class="hljs-meta">>>> </span>model = InstructBlipVideoVisionModel(configuration)

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># Accessing the model configuration</span>
	<span class="hljs-meta">>>> </span>configuration = model.config`,wrap:!1}}),{c(){i=d("p"),i.textContent=T,b=n(),p(m.$$.fragment)},l(a){i=c(a,"P",{"data-svelte-h":!0}),v(i)!=="svelte-11lpom8"&&(i.textContent=T),b=r(a),u(m.$$.fragment,a)},m(a,I){l(a,i,I),l(a,b,I),f(m,a,I),y=!0},p:mt,i(a){y\|\|(g(m.$$.fragment,a),y=!0)},o(a){h(m.$$.fragment,a),y=!1},d(a){a&&(t(i),t(b)),_(m,a)}}}function bn(J){let i,T="Examples:",b,m,y;return m=new St({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEluc3RydWN0QmxpcFZpZGVvUUZvcm1lckNvbmZpZyUyQyUyMEluc3RydWN0QmxpcFZpZGVvUUZvcm1lck1vZGVsJTBBJTBBJTIzJTIwSW5pdGlhbGl6aW5nJTIwYSUyMEluc3RydWN0YmxpcHZpZGVvJTIwU2FsZXNmb3JjZSUyRmluc3RydWN0LWJsaXAtZmxhbi10NSUyMHN0eWxlJTIwY29uZmlndXJhdGlvbiUwQWNvbmZpZ3VyYXRpb24lMjAlM0QlMjBJbnN0cnVjdEJsaXBWaWRlb1FGb3JtZXJDb25maWcoKSUwQSUwQSUyMyUyMEluaXRpYWxpemluZyUyMGElMjBtb2RlbCUyMCh3aXRoJTIwcmFuZG9tJTIwd2VpZ2h0cyklMjBmcm9tJTIwdGhlJTIwU2FsZXNmb3JjZSUyRmluc3RydWN0LWJsaXAtZmxhbi10NSUyMHN0eWxlJTIwY29uZmlndXJhdGlvbiUwQW1vZGVsJTIwJTNEJTIwSW5zdHJ1Y3RCbGlwVmlkZW9RRm9ybWVyTW9kZWwoY29uZmlndXJhdGlvbiklMEElMjMlMjBBY2Nlc3NpbmclMjB0aGUlMjBtb2RlbCUyMGNvbmZpZ3VyYXRpb24lMEFjb25maWd1cmF0aW9uJTIwJTNEJTIwbW9kZWwuY29uZmln",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> InstructBlipVideoQFormerConfig, InstructBlipVideoQFormerModel

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># Initializing a Instructblipvideo Salesforce/instruct-blip-flan-t5 style configuration</span>
	<span class="hljs-meta">>>> </span>configuration = InstructBlipVideoQFormerConfig()

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># Initializing a model (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration</span>
	<span class="hljs-meta">>>> </span>model = InstructBlipVideoQFormerModel(configuration)
	<span class="hljs-meta">>>> </span><span class="hljs-comment"># Accessing the model configuration</span>
	<span class="hljs-meta">>>> </span>configuration = model.config`,wrap:!1}}),{c(){i=d("p"),i.textContent=T,b=n(),p(m.$$.fragment)},l(a){i=c(a,"P",{"data-svelte-h":!0}),v(i)!=="svelte-kvfsh7"&&(i.textContent=T),b=r(a),u(m.$$.fragment,a)},m(a,I){l(a,i,I),l(a,b,I),f(m,a,I),y=!0},p:mt,i(a){y\|\|(g(m.$$.fragment,a),y=!0)},o(a){h(m.$$.fragment,a),y=!1},d(a){a&&(t(i),t(b)),_(m,a)}}}function vn(J){let i,T=`Although the recipe for forward pass needs to be defined within this function, one should call the <code>Module</code>
	instance afterwards instead of this since the former takes care of running the pre and post processing steps while
	the latter silently ignores them.`;return{c(){i=d("p"),i.innerHTML=T},l(b){i=c(b,"P",{"data-svelte-h":!0}),v(i)!=="svelte-fincs2"&&(i.innerHTML=T)},m(b,m){l(b,i,m)},p:mt,d(b){b&&t(i)}}}function yn(J){let i,T=`Although the recipe for forward pass needs to be defined within this function, one should call the <code>Module</code>
	instance afterwards instead of this since the former takes care of running the pre and post processing steps while
	the latter silently ignores them.`;return{c(){i=d("p"),i.innerHTML=T},l(b){i=c(b,"P",{"data-svelte-h":!0}),v(i)!=="svelte-fincs2"&&(i.innerHTML=T)},m(b,m){l(b,i,m)},p:mt,d(b){b&&t(i)}}}function In(J){let i,T="Examples:",b,m,y;return m=new St({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEluc3RydWN0QmxpcFZpZGVvUHJvY2Vzc29yJTJDJTIwSW5zdHJ1Y3RCbGlwVmlkZW9Gb3JDb25kaXRpb25hbEdlbmVyYXRpb24lMEFpbXBvcnQlMjB0b3JjaCUwQWZyb20lMjBodWdnaW5nZmFjZV9odWIlMjBpbXBvcnQlMjBoZl9odWJfZG93bmxvYWQlMEFmcm9tJTIwYXYlMEElMEFkZWYlMjByZWFkX3ZpZGVvX3B5YXYoY29udGFpbmVyJTJDJTIwaW5kaWNlcyklM0ElMEElMjAlMjAlMjAlMjAnJyclMEElMjAlMjAlMjAlMjBEZWNvZGUlMjB0aGUlMjB2aWRlbyUyMHdpdGglMjBQeUFWJTIwZGVjb2Rlci4lMEElMjAlMjAlMjAlMjBBcmdzJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwY29udGFpbmVyJTIwKCU2MGF2LmNvbnRhaW5lci5pbnB1dC5JbnB1dENvbnRhaW5lciU2MCklM0ElMjBQeUFWJTIwY29udGFpbmVyLiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGluZGljZXMlMjAoJTYwTGlzdCU1QmludCU1RCU2MCklM0ElMjBMaXN0JTIwb2YlMjBmcmFtZSUyMGluZGljZXMlMjB0byUyMGRlY29kZS4lMEElMjAlMjAlMjAlMjBSZXR1cm5zJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcmVzdWx0JTIwKG5wLm5kYXJyYXkpJTNBJTIwbnAlMjBhcnJheSUyMG9mJTIwZGVjb2RlZCUyMGZyYW1lcyUyMG9mJTIwc2hhcGUlMjAobnVtX2ZyYW1lcyUyQyUyMGhlaWdodCUyQyUyMHdpZHRoJTJDJTIwMykuJTBBJTIwJTIwJTIwJTIwJycnJTBBJTIwJTIwJTIwJTIwZnJhbWVzJTIwJTNEJTIwJTVCJTVEJTBBJTIwJTIwJTIwJTIwY29udGFpbmVyLnNlZWsoMCklMEElMjAlMjAlMjAlMjBzdGFydF9pbmRleCUyMCUzRCUyMGluZGljZXMlNUIwJTVEJTBBJTIwJTIwJTIwJTIwZW5kX2luZGV4JTIwJTNEJTIwaW5kaWNlcyU1Qi0xJTVEJTBBJTIwJTIwJTIwJTIwZm9yJTIwaSUyQyUyMGZyYW1lJTIwaW4lMjBlbnVtZXJhdGUoY29udGFpbmVyLmRlY29kZSh2aWRlbyUzRDApKSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwaSUyMCUzRSUyMGVuZF9pbmRleCUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGJyZWFrJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwaWYlMjBpJTIwJTNFJTNEJTIwc3RhcnRfaW5kZXglMjBhbmQlMjBpJTIwaW4lMjBpbmRpY2VzJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZnJhbWVzLmFwcGVuZChmcmFtZSklMEElMjAlMjAlMjAlMjByZXR1cm4lMjBucC5zdGFjayglNUJ4LnRvX25kYXJyYXkoZm9ybWF0JTNEJTIycmdiMjQlMjIpJTIwZm9yJTIweCU
yMGluJTIwZnJhbWVzJTVEKSUwQSUwQW1vZGVsJTIwJTNEJTIwSW5zdHJ1Y3RCbGlwVmlkZW9Qcm9jZXNzb3IuZnJvbV9wcmV0cmFpbmVkKCUyMlNhbGVzZm9yY2UlMkZpbnN0cnVjdGJsaXAtdmljdW5hLTdiJTIyJTJDJTIwZGV2aWNlX21hcCUzRCUyMmF1dG8lMjIpJTBBcHJvY2Vzc29yJTIwJTNEJTIwSW5zdHJ1Y3RCbGlwVmlkZW9Gb3JDb25kaXRpb25hbEdlbmVyYXRpb24uZnJvbV9wcmV0cmFpbmVkKCUyMlNhbGVzZm9yY2UlMkZpbnN0cnVjdGJsaXAtdmljdW5hLTdiJTIyKSUwQSUwQWZpbGVfcGF0aCUyMCUzRCUyMGhmX2h1Yl9kb3dubG9hZCglMEFjb250YWluZXIlMjAlM0QlMjBhdi5vcGVuKHZpZGVvX3BhdGgpJTBBJTIzJTIwc2FtcGxlJTIwdW5pZm9ybWx5JTIwNCUyMGZyYW1lcyUyMGZyb20lMjB0aGUlMjB2aWRlV2h5JTIwaXMlMjB0aGlzJTIwdmlkZW8lMjBmdW5ueSUzRm8lMEF0b3RhbF9mcmFtZXMlMjAlM0QlMjBjb250YWluZXIuc3RyZWFtcy52aWRlbyU1QjAlNUQuZnJhbWVzJTBBaW5kaWNlcyUyMCUzRCUyMG5wLmFyYW5nZSgwJTJDJTIwdG90YWxfZnJhbWVzJTJDJTIwdG90YWxfZnJhbWVzJTIwJTJGJTIwNCkuYXN0eXBlKGludCklMEFjbGlwJTIwJTNEJTIwcmVhZF92aWRlb19weWF2KGNvbnRhaW5lciUyQyUyMGluZGljZXMpJTBBJTBBcHJvbXB0JTIwJTNEJTIwJTIyV2hhdCUyMGlzJTIwaGFwcGVuaW5nJTIwaW4lMjB0aGUlMjB2aWRlbyUzRiUyMiUwQWlucHV0cyUyMCUzRCUyMHByb2Nlc3Nvcih2aWRlb3MlM0RjbGlwJTJDJTIwdGV4dCUzRHByb21wdCUyQyUyMHJldHVybl90ZW5zb3JzJTNEJTIycHQlMjIpLnRvKGRldmljZSklMEElMEFvdXRwdXRzJTIwJTNEJTIwbW9kZWwuZ2VuZXJhdGUoJTBBJTIwJTIwJTIwJTIwKippbnB1dHMlMkMlMEElMjAlMjAlMjAlMjBkb19zYW1wbGUlM0RGYWxzZSUyQyUwQSUyMCUyMCUyMCUyMG51bV9iZWFtcyUzRDUlMkMlMEElMjAlMjAlMjAlMjBtYXhfbGVuZ3RoJTNEMjU2JTJDJTBBJTIwJTIwJTIwJTIwcmVwZXRpdGlvbl9wZW5hbHR5JTNEMS41JTJDJTBBJTIwJTIwJTIwJTIwbGVuZ3RoX3BlbmFsdHklM0QxLjAlMkMlMEEpJTBBZ2VuZXJhdGVkX3RleHQlMjAlM0QlMjBwcm9jZXNzb3IuYmF0Y2hfZGVjb2RlKG91dHB1dHMlMkMlMjBza2lwX3NwZWNpYWxfdG9rZW5zJTNEVHJ1ZSklNUIwJTVELnN0cmlwKCklMEFwcmludChnZW5lcmF0ZWRfdGV4dCk=",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> torch
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> hf_hub_download
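	<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> av
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np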

	<span class="hljs-meta">>>> </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">read_video_pyav</span>(<span class="hljs-params">container, indices</span>):
	<span class="hljs-meta">... </span> <span class="hljs-string">'''
	<span class="hljs-meta">... </span> Decode the video with PyAV decoder.
	<span class="hljs-meta">... </span> Args:
	<span class="hljs-meta">... </span> container (\`av.container.input.InputContainer\`): PyAV container.
	<span class="hljs-meta">... </span> indices (\`List[int]\`): List of frame indices to decode.
	<span class="hljs-meta">... </span> Returns:
	<span class="hljs-meta">... </span> result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
	<span class="hljs-meta">... </span> '''</span>
	<span class="hljs-meta">... </span> frames = []
	<span class="hljs-meta">... </span> container.seek(<span class="hljs-number">0</span>)
	<span class="hljs-meta">... </span> start_index = indices[<span class="hljs-number">0</span>]
	<span class="hljs-meta">... </span> end_index = indices[-<span class="hljs-number">1</span>]
	<span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> i, frame <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(container.decode(video=<span class="hljs-number">0</span>)):
	<span class="hljs-meta">... </span> <span class="hljs-keyword">if</span> i > end_index:
	<span class="hljs-meta">... </span> <span class="hljs-keyword">break</span>
	<span class="hljs-meta">... </span> <span class="hljs-keyword">if</span> i >= start_index <span class="hljs-keyword">and</span> i <span class="hljs-keyword">in</span> indices:
	<span class="hljs-meta">... </span> frames.append(frame)
	<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> np.stack([x.to_ndarray(<span class="hljs-built_in">format</span>=<span class="hljs-string">"rgb24"</span>) <span class="hljs-keyword">for</span> x <span class="hljs-keyword">in</span> frames])

	<span class="hljs-meta">>>> </span>model = InstructBlipVideoForConditionalGeneration.from_pretrained(<span class="hljs-string">"Salesforce/instructblip-vicuna-7b"</span>, device_map=<span class="hljs-string">"auto"</span>)
	<span class="hljs-meta">>>> </span>processor = InstructBlipVideoProcessor.from_pretrained(<span class="hljs-string">"Salesforce/instructblip-vicuna-7b"</span>)

	<span class="hljs-meta">>>> </span>file_path = hf_hub_download(
	repo_id=<span class="hljs-string">"nielsr/video-demo"</span>, filename=<span class="hljs-string">"eating_spaghetti.mp4"</span>, repo_type=<span class="hljs-string">"dataset"</span>
	)
	<span class="hljs-meta">>>> </span>container = av.<span class="hljs-built_in">open</span>(file_path)
	<span class="hljs-meta">>>> </span><span class="hljs-comment"># sample uniformly 4 frames from the video</span>
	<span class="hljs-meta">>>> </span>total_frames = container.streams.video[<span class="hljs-number">0</span>].frames
	<span class="hljs-meta">>>> </span>indices = np.arange(<span class="hljs-number">0</span>, total_frames, total_frames / <span class="hljs-number">4</span>).astype(<span class="hljs-built_in">int</span>)
	<span class="hljs-meta">>>> </span>clip = read_video_pyav(container, indices)

	<span class="hljs-meta">>>> </span>prompt = <span class="hljs-string">"What is happening in the video?"</span>
	<span class="hljs-meta">>>> </span>inputs = processor(videos=clip, text=prompt, return_tensors=<span class="hljs-string">"pt"</span>).to(device)

	<span class="hljs-meta">>>> </span>outputs = model.generate(
	<span class="hljs-meta">... </span> **inputs,
	<span class="hljs-meta">... </span> do_sample=<span class="hljs-literal">False</span>,
	<span class="hljs-meta">... </span> num_beams=<span class="hljs-number">5</span>,
	<span class="hljs-meta">... </span> max_length=<span class="hljs-number">256</span>,
	<span class="hljs-meta">... </span> repetition_penalty=<span class="hljs-number">1.5</span>,
	<span class="hljs-meta">... </span> length_penalty=<span class="hljs-number">1.0</span>,
	<span class="hljs-meta">... </span>)
	<span class="hljs-meta">>>> </span>generated_text = processor.batch_decode(outputs, skip_special_tokens=<span class="hljs-literal">True</span>)[<span class="hljs-number">0</span>].strip()
	<span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(generated_text)
	<span class="hljs-string">"A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"</span>`,wrap:!1}}),{c(){i=d("p"),i.textContent=T,b=n(),p(m.$$.fragment)},l(a){i=c(a,"P",{"data-svelte-h":!0}),v(i)!=="svelte-kvfsh7"&&(i.textContent=T),b=r(a),u(m.$$.fragment,a)},m(a,I){l(a,i,I),l(a,b,I),f(m,a,I),y=!0},p:mt,i(a){y\|\|(g(m.$$.fragment,a),y=!0)},o(a){h(m.$$.fragment,a),y=!1},d(a){a&&(t(i),t(b)),_(m,a)}}}function Tn(J){let i,T,b,m,y,a,I,ut,ne,ft,re,xo=`The InstructBLIPVideo is an extension of the models proposed in <a href="https://arxiv.org/abs/2305.06500" rel="nofollow">InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning</a> by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
	InstructBLIPVideo uses the same architecture as <a href="instructblip">InstructBLIP</a> and works with the same checkpoints as <a href="instructblip">InstructBLIP</a>. The only difference is the ability to process videos.`,gt,se,ko="The abstract from the paper is the following:",ht,ae,zo="<em>General-purpose language models that can solve various language-domain tasks have emerged driven by the pre-training and instruction-tuning pipeline. However, building general-purpose vision-language models is challenging due to the increased task discrepancy introduced by the additional visual input. Although vision-language pre-training has been widely studied, vision-language instruction tuning remains relatively less explored. In this paper, we conduct a systematic and comprehensive study on vision-language instruction tuning based on the pre-trained BLIP-2 models. We gather a wide variety of 26 publicly available datasets, transform them into instruction tuning format and categorize them into two clusters for held-in instruction tuning and held-out zero-shot evaluation. Additionally, we introduce instruction-aware visual feature extraction, a crucial method that enables the model to extract informative features tailored to the given instruction. The resulting InstructBLIP models achieve state-of-the-art zero-shot performance across all 13 held-out datasets, substantially outperforming BLIP-2 and the larger Flamingo. Our models also lead to state-of-the-art performance when finetuned on individual downstream tasks (e.g., 90.7% accuracy on ScienceQA IMG). Furthermore, we qualitatively demonstrate the advantages of InstructBLIP over concurrent multimodal models.</em>",_t,Q,Uo,bt,ie,Wo='InstructBLIPVideo architecture. Taken from the <a href="https://arxiv.org/abs/2305.06500">original paper.</a>',vt,le,Zo=`This model was contributed by <a href="https://huggingface.co/RaushanTurganbay" rel="nofollow">RaushanTurganbay</a>.
	The original code can be found <a href="https://github.com/salesforce/LAVIS/tree/main/projects/instructblip" rel="nofollow">here</a>.`,yt,de,It,ce,Fo="<li>The model was trained by sampling 4 frames per video, so it’s recommended to sample 4 frames</li>",Tt,me,wt,C,pe,Ht,Ge,Go=`<a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoConfig">InstructBlipVideoConfig</a> is the configuration class to store the configuration of a
	<a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoForConditionalGeneration">InstructBlipVideoForConditionalGeneration</a>. It is used to instantiate a Instructblipvideo model according to the specified
	arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
	the defaults will yield a similar configuration to that of the Instructblipvideo
	<a href="https://huggingface.co/Salesforce/instruct-blip-flan-t5" rel="nofollow">Salesforce/instruct-blip-flan-t5</a> architecture.`,Xt,Ne,No=`Configuration objects inherit from <a href="/docs/transformers/pr_31809/en/main_classes/configuration#transformers.PretrainedConfig">PretrainedConfig</a> and can be used to control the model outputs. Read the
	documentation from <a href="/docs/transformers/pr_31809/en/main_classes/configuration#transformers.PretrainedConfig">PretrainedConfig</a> for more information.`,Yt,L,At,S,ue,Dt,Pe,Po=`Instantiate a <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoConfig">InstructBlipVideoConfig</a> (or a derived class) from a Instructblipvideo vision model, Q-Former and
	language model configurations.`,Mt,fe,Vt,x,ge,Ot,Re,Ro=`This is the configuration class to store the configuration of a <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoVisionModel">InstructBlipVideoVisionModel</a>. It is used to
	instantiate a Instructblipvideo vision encoder according to the specified arguments, defining the model architecture.
	Instantiating a configuration with the defaults will yield a similar configuration to that of the Instructblipvideo
	<a href="https://huggingface.co/Salesforce/instruct-blip-flan-t5" rel="nofollow">Salesforce/instruct-blip-flan-t5</a> architecture.`,Kt,qe,qo=`Configuration objects inherit from <a href="/docs/transformers/pr_31809/en/main_classes/configuration#transformers.PretrainedConfig">PretrainedConfig</a> and can be used to control the model outputs. Read the
	documentation from <a href="/docs/transformers/pr_31809/en/main_classes/configuration#transformers.PretrainedConfig">PretrainedConfig</a> for more information.`,eo,H,Bt,he,Ct,k,_e,to,Ee,Eo=`This is the configuration class to store the configuration of a <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoQFormerModel">InstructBlipVideoQFormerModel</a>. It is used to
	instantiate a Instructblipvideo Querying Transformer (Q-Former) model according to the specified arguments, defining the
	model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
	the Instructblipvideo <a href="https://huggingface.co/Salesforce/instruct-blip-flan-t5" rel="nofollow">Salesforce/instruct-blip-flan-t5</a>
	architecture. Configuration objects inherit from <a href="/docs/transformers/pr_31809/en/main_classes/configuration#transformers.PretrainedConfig">PretrainedConfig</a> and can be used to control the model outputs.
	Read the documentation from <a href="/docs/transformers/pr_31809/en/main_classes/configuration#transformers.PretrainedConfig">PretrainedConfig</a> for more information.`,oo,Qe,Qo='Note that <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoQFormerModel">InstructBlipVideoQFormerModel</a> is very similar to <a href="/docs/transformers/pr_31809/en/model_doc/bert#transformers.BertLMHeadModel">BertLMHeadModel</a> with interleaved cross-attention.',no,X,$t,be,Jt,$,ve,ro,Le,Lo=`Constructs an InstructBLIPVideo processor which wraps a InstructBLIP image processor and a LLaMa/T5 tokenizer into a single
	processor.`,so,Se,So=`<a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoProcessor">InstructBlipVideoProcessor</a> offers all the functionalities of <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoImageProcessor">InstructBlipVideoImageProcessor</a> and <a href="/docs/transformers/pr_31809/en/model_doc/auto#transformers.AutoTokenizer">AutoTokenizer</a>. See the
	docstring of <code>__call__()</code> and <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoProcessor.decode">decode()</a> for more information.`,ao,Y,ye,io,He,Ho=`This method forwards all its arguments to PreTrainedTokenizer’s <a href="/docs/transformers/pr_31809/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.batch_decode">batch_decode()</a>. Please
	refer to the docstring of this method for more information.`,lo,A,Ie,co,Xe,Xo=`This method forwards all its arguments to PreTrainedTokenizer’s <a href="/docs/transformers/pr_31809/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.decode">decode()</a>. Please refer to
	the docstring of this method for more information.`,jt,Te,xt,F,we,mo,Ye,Yo="Constructs a InstructBLIPVideo image processor.",po,D,Me,uo,Ae,Ao="Preprocess a video or batch of images/videos.",kt,Ve,zt,E,Be,fo,R,Ce,go,De,Do='The <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoVisionModel">InstructBlipVideoVisionModel</a> forward method, overrides the <code>__call__</code> special method.',ho,O,Ut,$e,Wt,G,Je,_o,Oe,Oo=`Querying Transformer (Q-Former), used in Instructblipvideo. Slightly modified from BLIP-2 as it also takes the
	instruction as input.`,bo,q,je,vo,Ke,Ko=`encoder_hidden_states (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length, hidden_size)</code>, <em>optional</em>):
	Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
	the model is configured as a decoder.
	encoder_attention_mask (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length)</code>, <em>optional</em>):
	Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
	the cross-attention if the model is configured as a decoder. Mask values selected in <code>[0, 1]</code>:`,yo,et,en=`<li>1 for tokens that are <strong>not masked</strong>,</li> <li>0 for tokens that are <strong>masked</strong>.
	past_key_values (<code>tuple(tuple(torch.FloatTensor))</code> of length <code>config.n_layers</code> with each tuple having 4 tensors of:
	shape <code>(batch_size, num_heads, sequence_length - 1, embed_size_per_head)</code>): Contains precomputed key and
	value hidden states of the attention blocks. Can be used to speed up decoding. If <code>past_key_values</code> are
	used, the user can optionally input only the last <code>decoder_input_ids</code> (those that don’t have their past key
	value states given to this model) of shape <code>(batch_size, 1)</code> instead of all <code>decoder_input_ids</code> of shape
	<code>(batch_size, sequence_length)</code>.
	use_cache (<code>bool</code>, <em>optional</em>):
	If set to <code>True</code>, <code>past_key_values</code> key value states are returned and can be used to speed up decoding (see
	<code>past_key_values</code>).</li>`,Zt,xe,Ft,M,ke,Io,tt,tn=`Instructblipvideo Model for generating text given an image and an optional text prompt. The model consists of a vision
	encoder, Querying Transformer (Q-Former) and a language model.`,To,ot,on=`One can optionally pass <code>input_ids</code> to the model, which serve as a text prompt, to make the language model continue
	the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.`,wo,nt,nn=`This model inherits from <a href="/docs/transformers/pr_31809/en/main_classes/model#transformers.PreTrainedModel">PreTrainedModel</a>. Check the superclass documentation for the generic methods the
	library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
	etc.)`,Mo,rt,rn=`This model is also a PyTorch <a href="https://pytorch.org/docs/stable/nn.html#torch.nn.Module" rel="nofollow">torch.nn.Module</a> subclass.
	Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
	and behavior.`,Vo,W,ze,Bo,st,sn='The <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoForConditionalGeneration">InstructBlipVideoForConditionalGeneration</a> forward method, overrides the <code>__call__</code> special method.',Co,K,$o,ee,Jo,te,Ue,jo,at,an="Overrides <code>generate</code> function to be able to use the model as a conditional generator.",Gt,We,Nt,pt,Pt;return y=new Z({props:{title:"InstructBlipVideo",local:"instructblipvideo",headingTag:"h1"}}),I=new Z({props:{title:"Overview",local:"overview",headingTag:"h2"}}),ne=new Z({props:{title:"Overview",local:"overview",headingTag:"h2"}}),de=new Z({props:{title:"Usage tips",local:"usage-tips",headingTag:"h2"}}),me=new Z({props:{title:"InstructBlipVideoConfig",local:"transformers.InstructBlipVideoConfig",headingTag:"h2"}}),pe=new B({props:{name:"class transformers.InstructBlipVideoConfig",anchor:"transformers.InstructBlipVideoConfig",parameters:[{name:"vision_config",val:" = None"},{name:"qformer_config",val:" = None"},{name:"text_config",val:" = None"},{name:"num_query_tokens",val:" = 32"},{name:"**kwargs",val:""}],parametersDescription:[{anchor:"transformers.InstructBlipVideoConfig.vision_config",description:`<strong>vision_config</strong> (<code>dict</code>, <em>optional</em>) —
	Dictionary of configuration options used to initialize <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoVisionConfig">InstructBlipVideoVisionConfig</a>.`,name:"vision_config"},{anchor:"transformers.InstructBlipVideoConfig.qformer_config",description:`<strong>qformer_config</strong> (<code>dict</code>, <em>optional</em>) —
	Dictionary of configuration options used to initialize <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoQFormerConfig">InstructBlipVideoQFormerConfig</a>.`,name:"qformer_config"},{anchor:"transformers.InstructBlipVideoConfig.text_config",description:`<strong>text_config</strong> (<code>dict</code>, <em>optional</em>) —
	Dictionary of configuration options used to initialize any <a href="/docs/transformers/pr_31809/en/main_classes/configuration#transformers.PretrainedConfig">PretrainedConfig</a>.`,name:"text_config"},{anchor:"transformers.InstructBlipVideoConfig.num_query_tokens",description:`<strong>num_query_tokens</strong> (<code>int</code>, <em>optional</em>, defaults to 32) —
	The number of query tokens passed through the Transformer.`,name:"num_query_tokens"},{anchor:"transformers.InstructBlipVideoConfig.kwargs",description:`<strong>kwargs</strong> (<em>optional</em>) —
	Dictionary of keyword arguments.`,name:"kwargs"}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py#L258"}}),L=new Lt({props:{anchor:"transformers.InstructBlipVideoConfig.example",$$slots:{default:[hn]},$$scope:{ctx:J}}}),ue=new B({props:{name:"from_vision_qformer_text_configs",anchor:"transformers.InstructBlipVideoConfig.from_vision_qformer_text_configs",parameters:[{name:"vision_config",val:": InstructBlipVideoVisionConfig"},{name:"qformer_config",val:": InstructBlipVideoQFormerConfig"},{name:"text_config",val:": PretrainedConfig"},{name:"**kwargs",val:""}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py#L343",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>


	<p>An instance of a configuration object</p>
	`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>


	<p><a
	href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoConfig"
	>InstructBlipVideoConfig</a></p>
	`}}),fe=new Z({props:{title:"InstructBlipVideoVisionConfig",local:"transformers.InstructBlipVideoVisionConfig",headingTag:"h2"}}),ge=new B({props:{name:"class transformers.InstructBlipVideoVisionConfig",anchor:"transformers.InstructBlipVideoVisionConfig",parameters:[{name:"hidden_size",val:" = 1408"},{name:"intermediate_size",val:" = 6144"},{name:"num_hidden_layers",val:" = 39"},{name:"num_attention_heads",val:" = 16"},{name:"image_size",val:" = 224"},{name:"patch_size",val:" = 14"},{name:"hidden_act",val:" = 'gelu'"},{name:"layer_norm_eps",val:" = 1e-06"},{name:"attention_dropout",val:" = 0.0"},{name:"initializer_range",val:" = 1e-10"},{name:"qkv_bias",val:" = True"},{name:"**kwargs",val:""}],parametersDescription:[{anchor:"transformers.InstructBlipVideoVisionConfig.hidden_size",description:`<strong>hidden_size</strong> (<code>int</code>, <em>optional</em>, defaults to 1408) —
	Dimensionality of the encoder layers and the pooler layer.`,name:"hidden_size"},{anchor:"transformers.InstructBlipVideoVisionConfig.intermediate_size",description:`<strong>intermediate_size</strong> (<code>int</code>, <em>optional</em>, defaults to 6144) —
	Dimensionality of the “intermediate” (i.e., feed-forward) layer in the Transformer encoder.`,name:"intermediate_size"},{anchor:"transformers.InstructBlipVideoVisionConfig.num_hidden_layers",description:`<strong>num_hidden_layers</strong> (<code>int</code>, <em>optional</em>, defaults to 39) —
	Number of hidden layers in the Transformer encoder.`,name:"num_hidden_layers"},{anchor:"transformers.InstructBlipVideoVisionConfig.num_attention_heads",description:`<strong>num_attention_heads</strong> (<code>int</code>, <em>optional</em>, defaults to 16) —
	Number of attention heads for each attention layer in the Transformer encoder.`,name:"num_attention_heads"},{anchor:"transformers.InstructBlipVideoVisionConfig.image_size",description:`<strong>image_size</strong> (<code>int</code>, <em>optional</em>, defaults to 224) —
	The size (resolution) of each image.`,name:"image_size"},{anchor:"transformers.InstructBlipVideoVisionConfig.patch_size",description:`<strong>patch_size</strong> (<code>int</code>, <em>optional</em>, defaults to 14) —
	The size (resolution) of each patch.`,name:"patch_size"},{anchor:"transformers.InstructBlipVideoVisionConfig.hidden_act",description:`<strong>hidden_act</strong> (<code>str</code> or <code>function</code>, <em>optional</em>, defaults to <code>"gelu"</code>) —
	The non-linear activation function (function or string) in the encoder and pooler. If string, <code>"gelu"</code>,
	<code>"relu"</code>, <code>"selu"</code> and <code>"gelu_new"</code> \`<code>"gelu"</code> are supported. to 1e-5): The epsilon used by the layer
	normalization layers.`,name:"hidden_act"},{anchor:"transformers.InstructBlipVideoVisionConfig.layer_norm_eps",description:`<strong>layer_norm_eps</strong> (<code>float</code>, <em>optional</em>, defaults to 1e-06) —
	The epsilon used by the layer normalization layers.`,name:"layer_norm_eps"},{anchor:"transformers.InstructBlipVideoVisionConfig.attention_dropout",description:`<strong>attention_dropout</strong> (<code>float</code>, <em>optional</em>, defaults to 0.0) —
	The dropout ratio for the attention probabilities.`,name:"attention_dropout"},{anchor:"transformers.InstructBlipVideoVisionConfig.initializer_range",description:`<strong>initializer_range</strong> (<code>float</code>, <em>optional</em>, defaults to 1e-10) —
	The standard deviation of the truncated_normal_initializer for initializing all weight matrices.`,name:"initializer_range"},{anchor:"transformers.InstructBlipVideoVisionConfig.qkv_bias",description:`<strong>qkv_bias</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) —
	Whether to add a bias to the queries and values in the self-attention layers.`,name:"qkv_bias"}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py#L36"}}),H=new Lt({props:{anchor:"transformers.InstructBlipVideoVisionConfig.example",$$slots:{default:[_n]},$$scope:{ctx:J}}}),he=new Z({props:{title:"InstructBlipVideoQFormerConfig",local:"transformers.InstructBlipVideoQFormerConfig",headingTag:"h2"}}),_e=new B({props:{name:"class transformers.InstructBlipVideoQFormerConfig",anchor:"transformers.InstructBlipVideoQFormerConfig",parameters:[{name:"vocab_size",val:" = 30522"},{name:"hidden_size",val:" = 768"},{name:"num_hidden_layers",val:" = 12"},{name:"num_attention_heads",val:" = 12"},{name:"intermediate_size",val:" = 3072"},{name:"hidden_act",val:" = 'gelu'"},{name:"hidden_dropout_prob",val:" = 0.1"},{name:"attention_probs_dropout_prob",val:" = 0.1"},{name:"max_position_embeddings",val:" = 512"},{name:"initializer_range",val:" = 0.02"},{name:"layer_norm_eps",val:" = 1e-12"},{name:"pad_token_id",val:" = 0"},{name:"position_embedding_type",val:" = 'absolute'"},{name:"cross_attention_frequency",val:" = 2"},{name:"encoder_hidden_size",val:" = 1408"},{name:"**kwargs",val:""}],parametersDescription:[{anchor:"transformers.InstructBlipVideoQFormerConfig.vocab_size",description:`<strong>vocab_size</strong> (<code>int</code>, <em>optional</em>, defaults to 30522) —
	Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by
	the <code>inputs_ids</code> passed when calling the model.`,name:"vocab_size"},{anchor:"transformers.InstructBlipVideoQFormerConfig.hidden_size",description:`<strong>hidden_size</strong> (<code>int</code>, <em>optional</em>, defaults to 768) —
	Dimensionality of the encoder layers and the pooler layer.`,name:"hidden_size"},{anchor:"transformers.InstructBlipVideoQFormerConfig.num_hidden_layers",description:`<strong>num_hidden_layers</strong> (<code>int</code>, <em>optional</em>, defaults to 12) —
	Number of hidden layers in the Transformer encoder.`,name:"num_hidden_layers"},{anchor:"transformers.InstructBlipVideoQFormerConfig.num_attention_heads",description:`<strong>num_attention_heads</strong> (<code>int</code>, <em>optional</em>, defaults to 12) —
	Number of attention heads for each attention layer in the Transformer encoder.`,name:"num_attention_heads"},{anchor:"transformers.InstructBlipVideoQFormerConfig.intermediate_size",description:`<strong>intermediate_size</strong> (<code>int</code>, <em>optional</em>, defaults to 3072) —
	Dimensionality of the “intermediate” (often named feed-forward) layer in the Transformer encoder.`,name:"intermediate_size"},{anchor:"transformers.InstructBlipVideoQFormerConfig.hidden_act",description:`<strong>hidden_act</strong> (<code>str</code> or <code>Callable</code>, <em>optional</em>, defaults to <code>"gelu"</code>) —
	The non-linear activation function (function or string) in the encoder and pooler. If string, <code>"gelu"</code>,
	<code>"relu"</code>, <code>"silu"</code> and <code>"gelu_new"</code> are supported.`,name:"hidden_act"},{anchor:"transformers.InstructBlipVideoQFormerConfig.hidden_dropout_prob",description:`<strong>hidden_dropout_prob</strong> (<code>float</code>, <em>optional</em>, defaults to 0.1) —
	The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.`,name:"hidden_dropout_prob"},{anchor:"transformers.InstructBlipVideoQFormerConfig.attention_probs_dropout_prob",description:`<strong>attention_probs_dropout_prob</strong> (<code>float</code>, <em>optional</em>, defaults to 0.1) —
	The dropout ratio for the attention probabilities.`,name:"attention_probs_dropout_prob"},{anchor:"transformers.InstructBlipVideoQFormerConfig.max_position_embeddings",description:`<strong>max_position_embeddings</strong> (<code>int</code>, <em>optional</em>, defaults to 512) —
	The maximum sequence length that this model might ever be used with. Typically set this to something large
	just in case (e.g., 512 or 1024 or 2048).`,name:"max_position_embeddings"},{anchor:"transformers.InstructBlipVideoQFormerConfig.initializer_range",description:`<strong>initializer_range</strong> (<code>float</code>, <em>optional</em>, defaults to 0.02) —
	The standard deviation of the truncated_normal_initializer for initializing all weight matrices.`,name:"initializer_range"},{anchor:"transformers.InstructBlipVideoQFormerConfig.layer_norm_eps",description:`<strong>layer_norm_eps</strong> (<code>float</code>, <em>optional</em>, defaults to 1e-12) —
	The epsilon used by the layer normalization layers.`,name:"layer_norm_eps"},{anchor:"transformers.InstructBlipVideoQFormerConfig.pad_token_id",description:`<strong>pad_token_id</strong> (<code>int</code>, <em>optional</em>, defaults to 0) —
	Token id used for padding sequences.`,name:"pad_token_id"},{anchor:"transformers.InstructBlipVideoQFormerConfig.position_embedding_type",description:`<strong>position_embedding_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"absolute"</code>) —
	Type of position embedding. Choose one of <code>"absolute"</code>, <code>"relative_key"</code>, <code>"relative_key_query"</code>. For
	positional embeddings use <code>"absolute"</code>. For more information on <code>"relative_key"</code>, please refer to
	<a href="https://arxiv.org/abs/1803.02155" rel="nofollow">Self-Attention with Relative Position Representations (Shaw et al.)</a>.
	For more information on <code>"relative_key_query"</code>, please refer to <em>Method 4</em> in <a href="https://arxiv.org/abs/2009.13658" rel="nofollow">Improve Transformer Models
	with Better Relative Position Embeddings (Huang et al.)</a>.`,name:"position_embedding_type"},{anchor:"transformers.InstructBlipVideoQFormerConfig.cross_attention_frequency",description:`<strong>cross_attention_frequency</strong> (<code>int</code>, <em>optional</em>, defaults to 2) —
	The frequency of adding cross-attention to the Transformer layers.`,name:"cross_attention_frequency"},{anchor:"transformers.InstructBlipVideoQFormerConfig.encoder_hidden_size",description:`<strong>encoder_hidden_size</strong> (<code>int</code>, <em>optional</em>, defaults to 1408) —
	The hidden size of the hidden states for cross-attention.`,name:"encoder_hidden_size"}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py#L137"}}),X=new Lt({props:{anchor:"transformers.InstructBlipVideoQFormerConfig.example",$$slots:{default:[bn]},$$scope:{ctx:J}}}),be=new Z({props:{title:"InstructBlipVideoProcessor",local:"transformers.InstructBlipVideoProcessor",headingTag:"h2"}}),ve=new B({props:{name:"class transformers.InstructBlipVideoProcessor",anchor:"transformers.InstructBlipVideoProcessor",parameters:[{name:"image_processor",val:""},{name:"tokenizer",val:""},{name:"qformer_tokenizer",val:""}],parametersDescription:[{anchor:"transformers.InstructBlipVideoProcessor.image_processor",description:`<strong>image_processor</strong> (<code>InstructBlipVideoImageProcessor</code>) —
	An instance of <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoImageProcessor">InstructBlipVideoImageProcessor</a>. The image processor is a required input.`,name:"image_processor"},{anchor:"transformers.InstructBlipVideoProcessor.tokenizer",description:"<strong>tokenizer</strong> (<code>AutoTokenizer</code>) —\nAn instance of [‘PreTrainedTokenizer`]. The tokenizer is a required input.",name:"tokenizer"},{anchor:"transformers.InstructBlipVideoProcessor.qformer_tokenizer",description:"<strong>qformer_tokenizer</strong> (<code>AutoTokenizer</code>) —\nAn instance of [‘PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.",name:"qformer_tokenizer"}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/processing_instructblipvideo.py#L30"}}),ye=new B({props:{name:"batch_decode",anchor:"transformers.InstructBlipVideoProcessor.batch_decode",parameters:[{name:"args",val:""},{name:"kwargs",val:""}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/processing_instructblipvideo.py#L133"}}),Ie=new B({props:{name:"decode",anchor:"transformers.InstructBlipVideoProcessor.decode",parameters:[{name:"args",val:""},{name:"kwargs",val:""}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/processing_instructblipvideo.py#L141"}}),Te=new Z({props:{title:"InstructBlipVideoImageProcessor",local:"transformers.InstructBlipVideoImageProcessor",headingTag:"h2"}}),we=new B({props:{name:"class transformers.InstructBlipVideoImageProcessor",anchor:"transformers.InstructBlipVideoImageProcessor",parameters:[{name:"do_resize",val:": bool = True"},{name:"size",val:": Dict = None"},{name:"resample",val:": Resampling = <Resampling.BICUBIC: 3>"},{name:"do_rescale",val:": bool = True"},{name:"rescale_factor",val:": Union = 0.00392156862745098"},{name:"do_normalize",val:": 
bool = True"},{name:"image_mean",val:": Union = None"},{name:"image_std",val:": Union = None"},{name:"do_convert_rgb",val:": bool = True"},{name:"kwargs",val:""}],parametersDescription:[{anchor:"transformers.InstructBlipVideoImageProcessor.do_resize",description:`<strong>do_resize</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) —
	Whether to resize the image’s (height, width) dimensions to the specified <code>size</code>. Can be overridden by the
	<code>do_resize</code> parameter in the <code>preprocess</code> method.`,name:"do_resize"},{anchor:"transformers.InstructBlipVideoImageProcessor.size",description:`<strong>size</strong> (<code>dict</code>, <em>optional</em>, defaults to <code>{"height" -- 384, "width": 384}</code>):
	Size of the output image after resizing. Can be overridden by the <code>size</code> parameter in the <code>preprocess</code>
	method.`,name:"size"},{anchor:"transformers.InstructBlipVideoImageProcessor.resample",description:`<strong>resample</strong> (<code>PILImageResampling</code>, <em>optional</em>, defaults to <code>Resampling.BICUBIC</code>) —
	Resampling filter to use if resizing the image. Only has an effect if <code>do_resize</code> is set to <code>True</code>. Can be
	overridden by the <code>resample</code> parameter in the <code>preprocess</code> method.`,name:"resample"},{anchor:"transformers.InstructBlipVideoImageProcessor.do_rescale",description:`<strong>do_rescale</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) —
	Whether to rescale the image by the specified scale <code>rescale_factor</code>. Can be overridden by the
	<code>do_rescale</code> parameter in the <code>preprocess</code> method.`,name:"do_rescale"},{anchor:"transformers.InstructBlipVideoImageProcessor.rescale_factor",description:`<strong>rescale_factor</strong> (<code>int</code> or <code>float</code>, <em>optional</em>, defaults to <code>1/255</code>) —
	Scale factor to use if rescaling the image. Only has an effect if <code>do_rescale</code> is set to <code>True</code>. Can be
	overridden by the <code>rescale_factor</code> parameter in the <code>preprocess</code> method.`,name:"rescale_factor"},{anchor:"transformers.InstructBlipVideoImageProcessor.do_normalize",description:`<strong>do_normalize</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) —
	Whether to normalize the image. Can be overridden by the <code>do_normalize</code> parameter in the <code>preprocess</code>
	method. Can be overridden by the <code>do_normalize</code> parameter in the <code>preprocess</code> method.`,name:"do_normalize"},{anchor:"transformers.InstructBlipVideoImageProcessor.image_mean",description:`<strong>image_mean</strong> (<code>float</code> or <code>List[float]</code>, <em>optional</em>, defaults to <code>IMAGENET_STANDARD_MEAN</code>) —
	Mean to use if normalizing the image. This is a float or list of floats the length of the number of
	channels in the image. Can be overridden by the <code>image_mean</code> parameter in the <code>preprocess</code> method. Can be
	overridden by the <code>image_mean</code> parameter in the <code>preprocess</code> method.`,name:"image_mean"},{anchor:"transformers.InstructBlipVideoImageProcessor.image_std",description:`<strong>image_std</strong> (<code>float</code> or <code>List[float]</code>, <em>optional</em>, defaults to <code>IMAGENET_STANDARD_STD</code>) —
	Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
	number of channels in the image. Can be overridden by the <code>image_std</code> parameter in the <code>preprocess</code> method.
	Can be overridden by the <code>image_std</code> parameter in the <code>preprocess</code> method.`,name:"image_std"},{anchor:"transformers.InstructBlipVideoImageProcessor.do_convert_rgb",description:`<strong>do_convert_rgb</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) —
	Whether to convert the image to RGB.`,name:"do_convert_rgb"}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py#L68"}}),Me=new B({props:{name:"preprocess",anchor:"transformers.InstructBlipVideoImageProcessor.preprocess",parameters:[{name:"images",val:": Union = None"},{name:"do_resize",val:": Optional = None"},{name:"size",val:": Optional = None"},{name:"resample",val:": Resampling = None"},{name:"do_rescale",val:": Optional = None"},{name:"rescale_factor",val:": Optional = None"},{name:"do_normalize",val:": Optional = None"},{name:"image_mean",val:": Union = None"},{name:"image_std",val:": Union = None"},{name:"return_tensors",val:": Union = None"},{name:"do_convert_rgb",val:": bool = None"},{name:"data_format",val:": ChannelDimension = <ChannelDimension.FIRST: 'channels_first'>"},{name:"input_data_format",val:": Union = None"},{name:"**kwargs",val:""}],parametersDescription:[{anchor:"transformers.InstructBlipVideoImageProcessor.preprocess.videos",description:`<strong>videos</strong> (<code>VideoInput</code>) —
	Video frames to preprocess. Expects a single or batch of videos as a list of frames with pixel values
	ranging from 0 to 255. If passing in video with pixel values between 0 and 1, set <code>do_rescale=False</code>.`,name:"videos"},{anchor:"transformers.InstructBlipVideoImageProcessor.preprocess.do_resize",description:`<strong>do_resize</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>self.do_resize</code>) —
	Whether to resize the video.`,name:"do_resize"},{anchor:"transformers.InstructBlipVideoImageProcessor.preprocess.size",description:`<strong>size</strong> (<code>Dict[str, int]</code>, <em>optional</em>, defaults to <code>self.size</code>) —
	Controls the size of the video after <code>resize</code>. The shortest edge of the image is resized to
	<code>size["shortest_edge"]</code> whilst preserving the aspect ratio. If the longest edge of this resized image
	is > <code>int(size["shortest_edge"] * (1333 / 800))</code>, then the image is resized again to make the longest
	edge equal to <code>int(size["shortest_edge"] * (1333 / 800))</code>.`,name:"size"},{anchor:"transformers.InstructBlipVideoImageProcessor.preprocess.resample",description:`<strong>resample</strong> (<code>PILImageResampling</code>, <em>optional</em>, defaults to <code>self.resample</code>) —
	Resampling filter to use if resizing the video. Only has an effect if <code>do_resize</code> is set to <code>True</code>.`,name:"resample"},{anchor:"transformers.InstructBlipVideoImageProcessor.preprocess.do_rescale",description:`<strong>do_rescale</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>self.do_rescale</code>) —
	Whether to rescale the video values between [0 - 1].`,name:"do_rescale"},{anchor:"transformers.InstructBlipVideoImageProcessor.preprocess.rescale_factor",description:`<strong>rescale_factor</strong> (<code>float</code>, <em>optional</em>, defaults to <code>self.rescale_factor</code>) —
	Rescale factor to rescale the video by if <code>do_rescale</code> is set to <code>True</code>.`,name:"rescale_factor"},{anchor:"transformers.InstructBlipVideoImageProcessor.preprocess.do_normalize",description:`<strong>do_normalize</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>self.do_normalize</code>) —
	Whether to normalize the video.`,name:"do_normalize"},{anchor:"transformers.InstructBlipVideoImageProcessor.preprocess.image_mean",description:`<strong>image_mean</strong> (<code>float</code> or <code>List[float]</code>, <em>optional</em>, defaults to <code>self.image_mean</code>) —
	Image mean to normalize the video by if <code>do_normalize</code> is set to <code>True</code>.`,name:"image_mean"},{anchor:"transformers.InstructBlipVideoImageProcessor.preprocess.image_std",description:`<strong>image_std</strong> (<code>float</code> or <code>List[float]</code>, <em>optional</em>, defaults to <code>self.image_std</code>) —
	Image standard deviation to normalize the video by if <code>do_normalize</code> is set to <code>True</code>.`,name:"image_std"},{anchor:"transformers.InstructBlipVideoImageProcessor.preprocess.do_convert_rgb",description:`<strong>do_convert_rgb</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>self.do_convert_rgb</code>) —
	Whether to convert the image to RGB.`,name:"do_convert_rgb"},{anchor:"transformers.InstructBlipVideoImageProcessor.preprocess.return_tensors",description:`<strong>return_tensors</strong> (<code>str</code> or <code>TensorType</code>, <em>optional</em>) —
	The type of tensors to return. Can be one of:<ul>
	<li>Unset: Return a list of <code>np.ndarray</code>.</li>
	<li><code>TensorType.TENSORFLOW</code> or <code>'tf'</code>: Return a batch of type <code>tf.Tensor</code>.</li>
	<li><code>TensorType.PYTORCH</code> or <code>'pt'</code>: Return a batch of type <code>torch.Tensor</code>.</li>
	<li><code>TensorType.NUMPY</code> or <code>'np'</code>: Return a batch of type <code>np.ndarray</code>.</li>
	<li><code>TensorType.JAX</code> or <code>'jax'</code>: Return a batch of type <code>jax.numpy.ndarray</code>.</li>
	</ul>`,name:"return_tensors"},{anchor:"transformers.InstructBlipVideoImageProcessor.preprocess.data_format",description:`<strong>data_format</strong> (<code>ChannelDimension</code> or <code>str</code>, <em>optional</em>, defaults to <code>ChannelDimension.FIRST</code>) —
	The channel dimension format for the output image. Can be one of:<ul>
	<li><code>"channels_first"</code> or <code>ChannelDimension.FIRST</code>: image in (num_channels, height, width) format.</li>
	<li><code>"channels_last"</code> or <code>ChannelDimension.LAST</code>: image in (height, width, num_channels) format.</li>
	<li>Unset: Use the channel dimension format of the input image.</li>
	</ul>`,name:"data_format"},{anchor:"transformers.InstructBlipVideoImageProcessor.preprocess.input_data_format",description:`<strong>input_data_format</strong> (<code>ChannelDimension</code> or <code>str</code>, <em>optional</em>) —
	The channel dimension format for the input image. If unset, the channel dimension format is inferred
	from the input image. Can be one of:<ul>
	<li><code>"channels_first"</code> or <code>ChannelDimension.FIRST</code>: image in (num_channels, height, width) format.</li>
	<li><code>"channels_last"</code> or <code>ChannelDimension.LAST</code>: image in (height, width, num_channels) format.</li>
	<li><code>"none"</code> or <code>ChannelDimension.NONE</code>: image in (height, width) format.</li>
	</ul>`,name:"input_data_format"}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py#L198"}}),Ve=new Z({props:{title:"InstructBlipVideoVisionModel",local:"transformers.InstructBlipVideoVisionModel",headingTag:"h2"}}),Be=new B({props:{name:"class transformers.InstructBlipVideoVisionModel",anchor:"transformers.InstructBlipVideoVisionModel",parameters:[{name:"config",val:": InstructBlipVideoVisionConfig"}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py#L534"}}),Ce=new B({props:{name:"forward",anchor:"transformers.InstructBlipVideoVisionModel.forward",parameters:[{name:"pixel_values",val:": Optional = None"},{name:"output_attentions",val:": Optional = None"},{name:"output_hidden_states",val:": Optional = None"},{name:"return_dict",val:": Optional = None"},{name:"interpolate_pos_encoding",val:": bool = False"}],parametersDescription:[{anchor:"transformers.InstructBlipVideoVisionModel.forward.pixel_values",description:`<strong>pixel_values</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, num_channels, height, width)</code>) —
	Pixel values. Pixel values can be obtained using <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoProcessor">InstructBlipVideoProcessor</a>. See
	<code>InstructBlipVideoProcessor.__call__()</code> for details.`,name:"pixel_values"},{anchor:"transformers.InstructBlipVideoVisionModel.forward.output_attentions",description:`<strong>output_attentions</strong> (<code>bool</code>, <em>optional</em>) —
	Whether or not to return the attentions tensors of all attention layers. See <code>attentions</code> under returned
	tensors for more detail.`,name:"output_attentions"},{anchor:"transformers.InstructBlipVideoVisionModel.forward.output_hidden_states",description:`<strong>output_hidden_states</strong> (<code>bool</code>, <em>optional</em>) —
	Whether or not to return the hidden states of all layers. See <code>hidden_states</code> under returned tensors for
	more detail.`,name:"output_hidden_states"},{anchor:"transformers.InstructBlipVideoVisionModel.forward.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>) —
	Whether or not to return a <a href="/docs/transformers/pr_31809/en/main_classes/output#transformers.utils.ModelOutput">ModelOutput</a> instead of a plain tuple.`,name:"return_dict"},{anchor:"transformers.InstructBlipVideoVisionModel.forward.interpolate_pos_encoding",description:`<strong>interpolate_pos_encoding</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) —
	Whether to interpolate the pre-trained position encodings.`,name:"interpolate_pos_encoding"}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py#L549",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>


	<p>A <a
	href="/docs/transformers/pr_31809/en/main_classes/output#transformers.modeling_outputs.BaseModelOutputWithPooling"
	>transformers.modeling_outputs.BaseModelOutputWithPooling</a> or a tuple of
	<code>torch.FloatTensor</code> (if <code>return_dict=False</code> is passed or when <code>config.return_dict=False</code>) comprising various
	elements depending on the configuration (<code><class 'transformers.models.instructblipvideo.configuration_instructblipvideo.InstructBlipVideoVisionConfig'></code>) and inputs.</p>
	<ul>
	<li>
	<p><strong>last_hidden_state</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length, hidden_size)</code>) — Sequence of hidden-states at the output of the last layer of the model.</p>
	</li>
	<li>
	<p><strong>pooler_output</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, hidden_size)</code>) — Last layer hidden-state of the first token of the sequence (classification token) after further processing
	through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
	the classification token after processing through a linear layer and a tanh activation function. The linear
	layer weights are trained from the next sentence prediction (classification) objective during pretraining.</p>
	</li>
	<li>
	<p><strong>hidden_states</strong> (<code>tuple(torch.FloatTensor)</code>, <em>optional</em>, returned when <code>output_hidden_states=True</code> is passed or when <code>config.output_hidden_states=True</code>) — Tuple of <code>torch.FloatTensor</code> (one for the output of the embeddings, if the model has an embedding layer, +
	one for the output of each layer) of shape <code>(batch_size, sequence_length, hidden_size)</code>.</p>
	<p>Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.</p>
	</li>
	<li>
	<p><strong>attentions</strong> (<code>tuple(torch.FloatTensor)</code>, <em>optional</em>, returned when <code>output_attentions=True</code> is passed or when <code>config.output_attentions=True</code>) — Tuple of <code>torch.FloatTensor</code> (one for each layer) of shape <code>(batch_size, num_heads, sequence_length, sequence_length)</code>.</p>
	<p>Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
	heads.</p>
	</li>
	</ul>
	`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>


	<p><a
	href="/docs/transformers/pr_31809/en/main_classes/output#transformers.modeling_outputs.BaseModelOutputWithPooling"
	>transformers.modeling_outputs.BaseModelOutputWithPooling</a> or <code>tuple(torch.FloatTensor)</code></p>
	`}}),O=new ln({props:{$$slots:{default:[vn]},$$scope:{ctx:J}}}),$e=new Z({props:{title:"InstructBlipVideoQFormerModel",local:"transformers.InstructBlipVideoQFormerModel",headingTag:"h2"}}),Je=new B({props:{name:"class transformers.InstructBlipVideoQFormerModel",anchor:"transformers.InstructBlipVideoQFormerModel",parameters:[{name:"config",val:": InstructBlipVideoQFormerConfig"}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py#L1080"}}),je=new B({props:{name:"forward",anchor:"transformers.InstructBlipVideoQFormerModel.forward",parameters:[{name:"input_ids",val:": LongTensor"},{name:"attention_mask",val:": Optional = None"},{name:"position_ids",val:": Optional = None"},{name:"query_embeds",val:": Optional = None"},{name:"head_mask",val:": Optional = None"},{name:"encoder_hidden_states",val:": Optional = None"},{name:"encoder_attention_mask",val:": Optional = None"},{name:"past_key_values",val:": Optional = None"},{name:"use_cache",val:": Optional = None"},{name:"output_attentions",val:": Optional = None"},{name:"output_hidden_states",val:": Optional = None"},{name:"return_dict",val:": Optional = None"}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py#L1153"}}),xe=new Z({props:{title:"InstructBlipVideoForConditionalGeneration",local:"transformers.InstructBlipVideoForConditionalGeneration",headingTag:"h2"}}),ke=new B({props:{name:"class transformers.InstructBlipVideoForConditionalGeneration",anchor:"transformers.InstructBlipVideoForConditionalGeneration",parameters:[{name:"config",val:": InstructBlipVideoConfig"}],parametersDescription:[{anchor:"transformers.InstructBlipVideoForConditionalGeneration.config",description:`<strong>config</strong> (<a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoConfig">InstructBlipVideoConfig</a>) — Model 
configuration class with all the parameters of the model.
	Initializing with a config file does not load the weights associated with the model, only the
	configuration. Check out the <a href="/docs/transformers/pr_31809/en/main_classes/model#transformers.PreTrainedModel.from_pretrained">from_pretrained()</a> method to load the model weights.`,name:"config"}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py#L1276"}}),ze=new B({props:{name:"forward",anchor:"transformers.InstructBlipVideoForConditionalGeneration.forward",parameters:[{name:"pixel_values",val:": FloatTensor"},{name:"qformer_input_ids",val:": FloatTensor"},{name:"qformer_attention_mask",val:": Optional = None"},{name:"input_ids",val:": Optional = None"},{name:"attention_mask",val:": Optional = None"},{name:"decoder_input_ids",val:": Optional = None"},{name:"decoder_attention_mask",val:": Optional = None"},{name:"output_attentions",val:": Optional = None"},{name:"output_hidden_states",val:": Optional = None"},{name:"labels",val:": Optional = None"},{name:"return_dict",val:": Optional = None"},{name:"interpolate_pos_encoding",val:": bool = False"}],parametersDescription:[{anchor:"transformers.InstructBlipVideoForConditionalGeneration.forward.pixel_values",description:`<strong>pixel_values</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, num_channels, height, width)</code>) —
	Pixel values. Pixel values can be obtained using <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoProcessor">InstructBlipVideoProcessor</a>. See
	<code>InstructBlipVideoProcessor.__call__()</code> for details.`,name:"pixel_values"},{anchor:"transformers.InstructBlipVideoForConditionalGeneration.forward.qformer_input_ids",description:`<strong>qformer_input_ids</strong> (<code>torch.LongTensor</code> of shape <code>(batch_size, sequence_length)</code>, <em>optional</em>) —
	Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
	to serve as text prompt, which the Q-Former model will encode.</p>
	<p>Indices can be obtained using <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoProcessor">InstructBlipVideoProcessor</a>. See <code>InstructBlipVideoProcessor.__call__()</code> for
	details.</p>
	<p><a href="../glossary#input-ids">What are input IDs?</a>`,name:"qformer_input_ids"},{anchor:"transformers.InstructBlipVideoForConditionalGeneration.forward.qformer_attention_mask",description:`<strong>qformer_attention_mask</strong> (<code>torch.Tensor</code> of shape <code>(batch_size, sequence_length)</code>, <em>optional</em>) —
	Mask to avoid performing attention on padding token indices. Mask values selected in <code>[0, 1]</code>:</p>
	<ul>
	<li>1 for tokens that are <strong>not masked</strong>,</li>
	<li>0 for tokens that are <strong>masked</strong>.</li>
	</ul>
	<p><a href="../glossary#attention-mask">What are attention masks?</a>`,name:"qformer_attention_mask"},{anchor:"transformers.InstructBlipVideoForConditionalGeneration.forward.input_ids",description:`<strong>input_ids</strong> (<code>torch.LongTensor</code> of shape <code>(batch_size, sequence_length)</code>, <em>optional</em>) —
	Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
	provided to serve as text prompt, which the language model can continue.</p>
	<p>Indices can be obtained using <a href="/docs/transformers/pr_31809/en/model_doc/instructblipvideo#transformers.InstructBlipVideoProcessor">InstructBlipVideoProcessor</a>. See <code>InstructBlipVideoProcessor.__call__()</code> for
	details.</p>
	<p><a href="../glossary#input-ids">What are input IDs?</a>`,name:"input_ids"},{anchor:"transformers.InstructBlipVideoForConditionalGeneration.forward.attention_mask",description:`<strong>attention_mask</strong> (<code>torch.Tensor</code> of shape <code>(batch_size, sequence_length)</code>, <em>optional</em>) —
	Mask to avoid performing attention on padding token indices. Mask values selected in <code>[0, 1]</code>:</p>
	<ul>
	<li>1 for tokens that are <strong>not masked</strong>,</li>
	<li>0 for tokens that are <strong>masked</strong>.</li>
	</ul>
	<p><a href="../glossary#attention-mask">What are attention masks?</a>`,name:"attention_mask"},{anchor:"transformers.InstructBlipVideoForConditionalGeneration.forward.decoder_input_ids",description:`<strong>decoder_input_ids</strong> (<code>torch.LongTensor</code> of shape <code>(batch_size, target_sequence_length)</code>, <em>optional</em>) —
	Indices of decoder input sequence tokens in the vocabulary of the language model. Only relevant in case an
	encoder-decoder language model (like T5) is used.</p>
	<p>Indices can be obtained using <a href="/docs/transformers/pr_31809/en/model_doc/auto#transformers.AutoTokenizer">AutoTokenizer</a>. See <a href="/docs/transformers/pr_31809/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.encode">PreTrainedTokenizer.encode()</a> and
	<a href="/docs/transformers/pr_31809/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.__call__">PreTrainedTokenizer.<strong>call</strong>()</a> for details. <a href="../glossary#decoder-input-ids">What are decoder input IDs?</a>`,name:"decoder_input_ids"},{anchor:"transformers.InstructBlipVideoForConditionalGeneration.forward.decoder_attention_mask",description:`<strong>decoder_attention_mask</strong> (<code>torch.BoolTensor</code> of shape <code>(batch_size, target_sequence_length)</code>, <em>optional</em>) —
	Default behavior: generate a tensor that ignores pad tokens in <code>decoder_input_ids</code>. Causal mask will also
	be used by default.</p>
	<p>Only relevant in case an encoder-decoder language model (like T5) is used.`,name:"decoder_attention_mask"},{anchor:"transformers.InstructBlipVideoForConditionalGeneration.forward.output_attentions",description:`<strong>output_attentions</strong> (<code>bool</code>, <em>optional</em>) —
	Whether or not to return the attentions tensors of all attention layers. See <code>attentions</code> under returned
	tensors for more detail.`,name:"output_attentions"},{anchor:"transformers.InstructBlipVideoForConditionalGeneration.forward.output_hidden_states",description:`<strong>output_hidden_states</strong> (<code>bool</code>, <em>optional</em>) —
	Whether or not to return the hidden states of all layers. See <code>hidden_states</code> under returned tensors for
	more detail.`,name:"output_hidden_states"},{anchor:"transformers.InstructBlipVideoForConditionalGeneration.forward.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>) —
	Whether or not to return a <a href="/docs/transformers/pr_31809/en/main_classes/output#transformers.utils.ModelOutput">ModelOutput</a> instead of a plain tuple.`,name:"return_dict"},{anchor:"transformers.InstructBlipVideoForConditionalGeneration.forward.interpolate_pos_encoding",description:`<strong>interpolate_pos_encoding</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) —
	Whether to interpolate the pre-trained position encodings.`,name:"interpolate_pos_encoding"},{anchor:"transformers.InstructBlipVideoForConditionalGeneration.forward.labels",description:`<strong>labels</strong> (<code>torch.LongTensor</code> of shape <code>(batch_size,)</code>, <em>optional</em>) —
	Labels for computing the language modeling loss. Indices should be in <code>[-100, 0, ..., config.vocab_size - 1]</code>. All labels set to <code>-100</code> are ignored (masked), the loss is only computed for labels in <code>[0, ..., config.vocab_size]</code>`,name:"labels"}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py#L1363",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>


	<p>A <code>transformers.models.instructblipvideo.modeling_instructblipvideo.InstructBlipVideoForConditionalGenerationModelOutput</code> or a tuple of
	<code>torch.FloatTensor</code> (if <code>return_dict=False</code> is passed or when <code>config.return_dict=False</code>) comprising various
	elements depending on the configuration (<code><class 'transformers.models.instructblipvideo.configuration_instructblipvideo.InstructBlipVideoVisionConfig'></code>) and inputs.</p>
	<ul>
	<li><strong>loss</strong> (<code>torch.FloatTensor</code>, <em>optional</em>, returned when <code>labels</code> is provided, <code>torch.FloatTensor</code> of shape <code>(1,)</code>) — Language modeling loss from the language model.</li>
	<li><strong>logits</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length, config.vocab_size)</code>) — Prediction scores of the language modeling head of the language model.</li>
	<li><strong>vision_outputs</strong> (<code>BaseModelOutputWithPooling</code>) — Outputs of the vision encoder.</li>
	<li><strong>qformer_outputs</strong> (<code>BaseModelOutputWithPoolingAndCrossAttentions</code>) — Outputs of the Q-Former (Querying Transformer).</li>
	<li><strong>language_model_outputs</strong> (<code>CausalLMOutputWithPast</code> or <code>Seq2SeqLMOutput</code>) — Outputs of the language model.</li>
	</ul>
	`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>


	<p><code>transformers.models.instructblipvideo.modeling_instructblipvideo.InstructBlipVideoForConditionalGenerationModelOutput</code> or <code>tuple(torch.FloatTensor)</code></p>
	`}}),K=new ln({props:{$$slots:{default:[yn]},$$scope:{ctx:J}}}),ee=new Lt({props:{anchor:"transformers.InstructBlipVideoForConditionalGeneration.forward.example",$$slots:{default:[In]},$$scope:{ctx:J}}}),Ue=new B({props:{name:"generate",anchor:"transformers.InstructBlipVideoForConditionalGeneration.generate",parameters:[{name:"pixel_values",val:": FloatTensor"},{name:"qformer_input_ids",val:": Optional = None"},{name:"qformer_attention_mask",val:": Optional = None"},{name:"input_ids",val:": Optional = None"},{name:"attention_mask",val:": Optional = None"},{name:"interpolate_pos_encoding",val:": bool = False"},{name:"**generate_kwargs",val:""}],parametersDescription:[{anchor:"transformers.InstructBlipVideoForConditionalGeneration.generate.pixel_values",description:`<strong>pixel_values</strong> (<code>torch.FloatTensor</code> of shape (batch_size, num_channels, height, width) or —
	(batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
	qformer_input_ids (<code>torch.LongTensor</code> of shape (batch_size, sequence_length), <em>optional</em>):
	The sequence used as a prompt to be fed to the Q-Former module.
	qformer_attention_mask (<code>torch.LongTensor</code> of shape (batch_size, sequence_length), <em>optional</em>):
	Mask to avoid performing attention on padding token indices.
	input_ids (<code>torch.LongTensor</code> of shape (batch_size, sequence_length), <em>optional</em>):
	The sequence used as a prompt for the generation.
	attention_mask (<code>torch.LongTensor</code> of shape (batch_size, sequence_length), <em>optional</em>):
	Mask to avoid performing attention on padding token indices.
	interpolate_pos_encoding (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>):
	Whether to interpolate the positional encoding of the image embeddings.`,name:"pixel_values"}],source:"https://github.com/huggingface/transformers/blob/vr_31809/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py#L1550",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>


	<p>A list of strings of length batch_size * num_captions.</p>
	`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>


	<p>captions (list)</p>
	`}}),We=new gn({props:{source:"https://github.com/huggingface/transformers/blob/main/docs/source/en/model_doc/instructblipvideo.md"}}),{c(){i=d("meta"),T=n(),b=d("p"),m=n(),p(y.$$.fragment),a=n(),p(I.$$.fragment),ut=n(),p(ne.$$.fragment),ft=n(),re=d("p"),re.innerHTML=xo,gt=n(),se=d("p"),se.textContent=ko,ht=n(),ae=d("p"),ae.innerHTML=zo,_t=n(),Q=d("img"),bt=n(),ie=d("small"),ie.innerHTML=Wo,vt=n(),le=d("p"),le.innerHTML=Zo,yt=n(),p(de.$$.fragment),It=n(),ce=d("ul"),ce.innerHTML=Fo,Tt=n(),p(me.$$.fragment),wt=n(),C=d("div"),p(pe.$$.fragment),Ht=n(),Ge=d("p"),Ge.innerHTML=Go,Xt=n(),Ne=d("p"),Ne.innerHTML=No,Yt=n(),p(L.$$.fragment),At=n(),S=d("div"),p(ue.$$.fragment),Dt=n(),Pe=d("p"),Pe.innerHTML=Po,Mt=n(),p(fe.$$.fragment),Vt=n(),x=d("div"),p(ge.$$.fragment),Ot=n(),Re=d("p"),Re.innerHTML=Ro,Kt=n(),qe=d("p"),qe.innerHTML=qo,eo=n(),p(H.$$.fragment),Bt=n(),p(he.$$.fragment),Ct=n(),k=d("div"),p(_e.$$.fragment),to=n(),Ee=d("p"),Ee.innerHTML=Eo,oo=n(),Qe=d("p"),Qe.innerHTML=Qo,no=n(),p(X.$$.fragment),$t=n(),p(be.$$.fragment),Jt=n(),$=d("div"),p(ve.$$.fragment),ro=n(),Le=d("p"),Le.textContent=Lo,so=n(),Se=d("p"),Se.innerHTML=So,ao=n(),Y=d("div"),p(ye.$$.fragment),io=n(),He=d("p"),He.innerHTML=Ho,lo=n(),A=d("div"),p(Ie.$$.fragment),co=n(),Xe=d("p"),Xe.innerHTML=Xo,jt=n(),p(Te.$$.fragment),xt=n(),F=d("div"),p(we.$$.fragment),mo=n(),Ye=d("p"),Ye.textContent=Yo,po=n(),D=d("div"),p(Me.$$.fragment),uo=n(),Ae=d("p"),Ae.textContent=Ao,kt=n(),p(Ve.$$.fragment),zt=n(),E=d("div"),p(Be.$$.fragment),fo=n(),R=d("div"),p(Ce.$$.fragment),go=n(),De=d("p"),De.innerHTML=Do,ho=n(),p(O.$$.fragment),Ut=n(),p($e.$$.fragment),Wt=n(),G=d("div"),p(Je.$$.fragment),_o=n(),Oe=d("p"),Oe.textContent=Oo,bo=n(),q=d("div"),p(je.$$.fragment),vo=n(),Ke=d("p"),Ke.innerHTML=Ko,yo=n(),et=d("ul"),et.innerHTML=en,Zt=n(),p(xe.$$.fragment),Ft=n(),M=d("div"),p(ke.$$.fragment),Io=n(),tt=d("p"),tt.textContent=tn,To=n(),ot=d("p"),ot.innerHTML=on,wo=n(),nt=d("p"),nt.innerHTML=nn,Mo=n(),rt=d("p"),rt.innerHTML=rn,Vo=n(),W=
d("div"),p(ze.$$.fragment),Bo=n(),st=d("p"),st.innerHTML=sn,Co=n(),p(K.$$.fragment),$o=n(),p(ee.$$.fragment),Jo=n(),te=d("div"),p(Ue.$$.fragment),jo=n(),at=d("p"),at.innerHTML=an,Gt=n(),p(We.$$.fragment),Nt=n(),pt=d("p"),this.h()},l(e){const o=fn("svelte-u9bgzb",document.head);i=c(o,"META",{name:!0,content:!0}),o.forEach(t),T=r(e),b=c(e,"P",{}),V(b).forEach(t),m=r(e),u(y.$$.fragment,e),a=r(e),u(I.$$.fragment,e),ut=r(e),u(ne.$$.fragment,e),ft=r(e),re=c(e,"P",{"data-svelte-h":!0}),v(re)!=="svelte-1yas5ek"&&(re.innerHTML=xo),gt=r(e),se=c(e,"P",{"data-svelte-h":!0}),v(se)!=="svelte-vfdo9a"&&(se.textContent=ko),ht=r(e),ae=c(e,"P",{"data-svelte-h":!0}),v(ae)!=="svelte-g3w4hv"&&(ae.innerHTML=zo),_t=r(e),Q=c(e,"IMG",{src:!0,alt:!0,width:!0}),bt=r(e),ie=c(e,"SMALL",{"data-svelte-h":!0}),v(ie)!=="svelte-1odm781"&&(ie.innerHTML=Wo),vt=r(e),le=c(e,"P",{"data-svelte-h":!0}),v(le)!=="svelte-1plrd7p"&&(le.innerHTML=Zo),yt=r(e),u(de.$$.fragment,e),It=r(e),ce=c(e,"UL",{"data-svelte-h":!0}),v(ce)!=="svelte-l3xryg"&&(ce.innerHTML=Fo),Tt=r(e),u(me.$$.fragment,e),wt=r(e),C=c(e,"DIV",{class:!0});var z=V(C);u(pe.$$.fragment,z),Ht=r(z),Ge=c(z,"P",{"data-svelte-h":!0}),v(Ge)!=="svelte-1r18biq"&&(Ge.innerHTML=Go),Xt=r(z),Ne=c(z,"P",{"data-svelte-h":!0}),v(Ne)!=="svelte-klwkvv"&&(Ne.innerHTML=No),Yt=r(z),u(L.$$.fragment,z),At=r(z),S=c(z,"DIV",{class:!0});var Ze=V(S);u(ue.$$.fragment,Ze),Dt=r(Ze),Pe=c(Ze,"P",{"data-svelte-h":!0}),v(Pe)!=="svelte-rstzz1"&&(Pe.innerHTML=Po),Ze.forEach(t),z.forEach(t),Mt=r(e),u(fe.$$.fragment,e),Vt=r(e),x=c(e,"DIV",{class:!0});var N=V(x);u(ge.$$.fragment,N),Ot=r(N),Re=c(N,"P",{"data-svelte-h":!0}),v(Re)!=="svelte-fubd3o"&&(Re.innerHTML=Ro),Kt=r(N),qe=c(N,"P",{"data-svelte-h":!0}),v(qe)!=="svelte-klwkvv"&&(qe.innerHTML=qo),eo=r(N),u(H.$$.fragment,N),N.forEach(t),Bt=r(e),u(he.$$.fragment,e),Ct=r(e),k=c(e,"DIV",{class:!0});var 
P=V(k);u(_e.$$.fragment,P),to=r(P),Ee=c(P,"P",{"data-svelte-h":!0}),v(Ee)!=="svelte-1rho4ik"&&(Ee.innerHTML=Eo),oo=r(P),Qe=c(P,"P",{"data-svelte-h":!0}),v(Qe)!=="svelte-9vwsni"&&(Qe.innerHTML=Qo),no=r(P),u(X.$$.fragment,P),P.forEach(t),$t=r(e),u(be.$$.fragment,e),Jt=r(e),$=c(e,"DIV",{class:!0});var U=V($);u(ve.$$.fragment,U),ro=r(U),Le=c(U,"P",{"data-svelte-h":!0}),v(Le)!=="svelte-vkjqyr"&&(Le.textContent=Lo),so=r(U),Se=c(U,"P",{"data-svelte-h":!0}),v(Se)!=="svelte-1ed1ly5"&&(Se.innerHTML=So),ao=r(U),Y=c(U,"DIV",{class:!0});var Fe=V(Y);u(ye.$$.fragment,Fe),io=r(Fe),He=c(Fe,"P",{"data-svelte-h":!0}),v(He)!=="svelte-1fp5f0g"&&(He.innerHTML=Ho),Fe.forEach(t),lo=r(U),A=c(U,"DIV",{class:!0});var Rt=V(A);u(Ie.$$.fragment,Rt),co=r(Rt),Xe=c(Rt,"P",{"data-svelte-h":!0}),v(Xe)!=="svelte-l01ru"&&(Xe.innerHTML=Xo),Rt.forEach(t),U.forEach(t),jt=r(e),u(Te.$$.fragment,e),xt=r(e),F=c(e,"DIV",{class:!0});var it=V(F);u(we.$$.fragment,it),mo=r(it),Ye=c(it,"P",{"data-svelte-h":!0}),v(Ye)!=="svelte-7qjslq"&&(Ye.textContent=Yo),po=r(it),D=c(it,"DIV",{class:!0});var qt=V(D);u(Me.$$.fragment,qt),uo=r(qt),Ae=c(qt,"P",{"data-svelte-h":!0}),v(Ae)!=="svelte-llto5r"&&(Ae.textContent=Ao),qt.forEach(t),it.forEach(t),kt=r(e),u(Ve.$$.fragment,e),zt=r(e),E=c(e,"DIV",{class:!0});var Et=V(E);u(Be.$$.fragment,Et),fo=r(Et),R=c(Et,"DIV",{class:!0});var lt=V(R);u(Ce.$$.fragment,lt),go=r(lt),De=c(lt,"P",{"data-svelte-h":!0}),v(De)!=="svelte-yjpm2g"&&(De.innerHTML=Do),ho=r(lt),u(O.$$.fragment,lt),lt.forEach(t),Et.forEach(t),Ut=r(e),u($e.$$.fragment,e),Wt=r(e),G=c(e,"DIV",{class:!0});var dt=V(G);u(Je.$$.fragment,dt),_o=r(dt),Oe=c(dt,"P",{"data-svelte-h":!0}),v(Oe)!=="svelte-1d58c7r"&&(Oe.textContent=Oo),bo=r(dt),q=c(dt,"DIV",{class:!0});var 
ct=V(q);u(je.$$.fragment,ct),vo=r(ct),Ke=c(ct,"P",{"data-svelte-h":!0}),v(Ke)!=="svelte-1h74cdd"&&(Ke.innerHTML=Ko),yo=r(ct),et=c(ct,"UL",{"data-svelte-h":!0}),v(et)!=="svelte-1yyv1jy"&&(et.innerHTML=en),ct.forEach(t),dt.forEach(t),Zt=r(e),u(xe.$$.fragment,e),Ft=r(e),M=c(e,"DIV",{class:!0});var j=V(M);u(ke.$$.fragment,j),Io=r(j),tt=c(j,"P",{"data-svelte-h":!0}),v(tt)!=="svelte-hrl25d"&&(tt.textContent=tn),To=r(j),ot=c(j,"P",{"data-svelte-h":!0}),v(ot)!=="svelte-1ks26sg"&&(ot.innerHTML=on),wo=r(j),nt=c(j,"P",{"data-svelte-h":!0}),v(nt)!=="svelte-ewmvdz"&&(nt.innerHTML=nn),Mo=r(j),rt=c(j,"P",{"data-svelte-h":!0}),v(rt)!=="svelte-hswkmf"&&(rt.innerHTML=rn),Vo=r(j),W=c(j,"DIV",{class:!0});var oe=V(W);u(ze.$$.fragment,oe),Bo=r(oe),st=c(oe,"P",{"data-svelte-h":!0}),v(st)!=="svelte-11fx30m"&&(st.innerHTML=sn),Co=r(oe),u(K.$$.fragment,oe),$o=r(oe),u(ee.$$.fragment,oe),oe.forEach(t),Jo=r(j),te=c(j,"DIV",{class:!0});var Qt=V(te);u(Ue.$$.fragment,Qt),jo=r(Qt),at=c(Qt,"P",{"data-svelte-h":!0}),v(at)!=="svelte-eq620n"&&(at.innerHTML=an),Qt.forEach(t),j.forEach(t),Gt=r(e),u(We.$$.fragment,e),Nt=r(e),pt=c(e,"P",{}),V(pt).forEach(t),this.h()},h(){w(i,"name","hf:doc:metadata"),w(i,"content",wn),cn(Q.src,Uo="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/instructblip_architecture.jpg")\|\|w(Q,"src",Uo),w(Q,"alt","drawing"),w(Q,"width","600"),w(S,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),w(C,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),w(x,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),w(k,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),w(Y,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),w(A,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 
mt-8"),w($,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),w(D,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),w(F,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),w(R,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),w(E,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),w(q,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),w(G,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),w(W,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),w(te,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),w(M,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8")},m(e,o){s(document.head,i),l(e,T,o),l(e,b,o),l(e,m,o),f(y,e,o),l(e,a,o),f(I,e,o),l(e,ut,o),f(ne,e,o),l(e,ft,o),l(e,re,o),l(e,gt,o),l(e,se,o),l(e,ht,o),l(e,ae,o),l(e,_t,o),l(e,Q,o),l(e,bt,o),l(e,ie,o),l(e,vt,o),l(e,le,o),l(e,yt,o),f(de,e,o),l(e,It,o),l(e,ce,o),l(e,Tt,o),f(me,e,o),l(e,wt,o),l(e,C,o),f(pe,C,null),s(C,Ht),s(C,Ge),s(C,Xt),s(C,Ne),s(C,Yt),f(L,C,null),s(C,At),s(C,S),f(ue,S,null),s(S,Dt),s(S,Pe),l(e,Mt,o),f(fe,e,o),l(e,Vt,o),l(e,x,o),f(ge,x,null),s(x,Ot),s(x,Re),s(x,Kt),s(x,qe),s(x,eo),f(H,x,null),l(e,Bt,o),f(he,e,o),l(e,Ct,o),l(e,k,o),f(_e,k,null),s(k,to),s(k,Ee),s(k,oo),s(k,Qe),s(k,no),f(X,k,null),l(e,$t,o),f(be,e,o),l(e,Jt,o),l(e,$,o),f(ve,$,null),s($,ro),s($,Le),s($,so),s($,Se),s($,ao),s($,Y),f(ye,Y,null),s(Y,io),s(Y,He),s($,lo),s($,A),f(Ie,A,null),s(A,co),s(A,Xe),l(e,jt,o),f(Te,e,o),l(e,xt,o),l(e,F,o),f(we,F,null),s(F,mo),s(F,Ye),s(F,po),s(F,D),f(Me,D,null),s(D,uo),s(D,Ae),l(e,kt,o),f(Ve,e,o),l(e,zt,o),l(e,E,o),f(Be,E,null),s(E,fo),s(E,R),f(Ce,R,null),s(R,go),s(R,De),s(R,ho
),f(O,R,null),l(e,Ut,o),f($e,e,o),l(e,Wt,o),l(e,G,o),f(Je,G,null),s(G,_o),s(G,Oe),s(G,bo),s(G,q),f(je,q,null),s(q,vo),s(q,Ke),s(q,yo),s(q,et),l(e,Zt,o),f(xe,e,o),l(e,Ft,o),l(e,M,o),f(ke,M,null),s(M,Io),s(M,tt),s(M,To),s(M,ot),s(M,wo),s(M,nt),s(M,Mo),s(M,rt),s(M,Vo),s(M,W),f(ze,W,null),s(W,Bo),s(W,st),s(W,Co),f(K,W,null),s(W,$o),f(ee,W,null),s(M,Jo),s(M,te),f(Ue,te,null),s(te,jo),s(te,at),l(e,Gt,o),f(We,e,o),l(e,Nt,o),l(e,pt,o),Pt=!0},p(e,[o]){const z={};o&2&&(z.$$scope={dirty:o,ctx:e}),L.$set(z);const Ze={};o&2&&(Ze.$$scope={dirty:o,ctx:e}),H.$set(Ze);const N={};o&2&&(N.$$scope={dirty:o,ctx:e}),X.$set(N);const P={};o&2&&(P.$$scope={dirty:o,ctx:e}),O.$set(P);const U={};o&2&&(U.$$scope={dirty:o,ctx:e}),K.$set(U);const Fe={};o&2&&(Fe.$$scope={dirty:o,ctx:e}),ee.$set(Fe)},i(e){Pt\|\|(g(y.$$.fragment,e),g(I.$$.fragment,e),g(ne.$$.fragment,e),g(de.$$.fragment,e),g(me.$$.fragment,e),g(pe.$$.fragment,e),g(L.$$.fragment,e),g(ue.$$.fragment,e),g(fe.$$.fragment,e),g(ge.$$.fragment,e),g(H.$$.fragment,e),g(he.$$.fragment,e),g(_e.$$.fragment,e),g(X.$$.fragment,e),g(be.$$.fragment,e),g(ve.$$.fragment,e),g(ye.$$.fragment,e),g(Ie.$$.fragment,e),g(Te.$$.fragment,e),g(we.$$.fragment,e),g(Me.$$.fragment,e),g(Ve.$$.fragment,e),g(Be.$$.fragment,e),g(Ce.$$.fragment,e),g(O.$$.fragment,e),g($e.$$.fragment,e),g(Je.$$.fragment,e),g(je.$$.fragment,e),g(xe.$$.fragment,e),g(ke.$$.fragment,e),g(ze.$$.fragment,e),g(K.$$.fragment,e),g(ee.$$.fragment,e),g(Ue.$$.fragment,e),g(We.$$.fragment,e),Pt=!0)},o(e){h(y.$$.fragment,e),h(I.$$.fragment,e),h(ne.$$.fragment,e),h(de.$$.fragment,e),h(me.$$.fragment,e),h(pe.$$.fragment,e),h(L.$$.fragment,e),h(ue.$$.fragment,e),h(fe.$$.fragment,e),h(ge.$$.fragment,e),h(H.$$.fragment,e),h(he.$$.fragment,e),h(_e.$$.fragment,e),h(X.$$.fragment,e),h(be.$$.fragment,e),h(ve.$$.fragment,e),h(ye.$$.fragment,e),h(Ie.$$.fragment,e),h(Te.$$.fragment,e),h(we.$$.fragment,e),h(Me.$$.fragment,e),h(Ve.$$.fragment,e),h(Be.$$.fragment,e),h(Ce.$$.fragment,e),h(O.$$.fragment,e),h($e.$$.f
ragment,e),h(Je.$$.fragment,e),h(je.$$.fragment,e),h(xe.$$.fragment,e),h(ke.$$.fragment,e),h(ze.$$.fragment,e),h(K.$$.fragment,e),h(ee.$$.fragment,e),h(Ue.$$.fragment,e),h(We.$$.fragment,e),Pt=!1},d(e){e&&(t(T),t(b),t(m),t(a),t(ut),t(ft),t(re),t(gt),t(se),t(ht),t(ae),t(_t),t(Q),t(bt),t(ie),t(vt),t(le),t(yt),t(It),t(ce),t(Tt),t(wt),t(C),t(Mt),t(Vt),t(x),t(Bt),t(Ct),t(k),t($t),t(Jt),t($),t(jt),t(xt),t(F),t(kt),t(zt),t(E),t(Ut),t(Wt),t(G),t(Zt),t(Ft),t(M),t(Gt),t(Nt),t(pt)),t(i),_(y,e),_(I,e),_(ne,e),_(de,e),_(me,e),_(pe),_(L),_(ue),_(fe,e),_(ge),_(H),_(he,e),_(_e),_(X),_(be,e),_(ve),_(ye),_(Ie),_(Te,e),_(we),_(Me),_(Ve,e),_(Be),_(Ce),_(O),_($e,e),_(Je),_(je),_(xe,e),_(ke),_(ze),_(K),_(ee),_(Ue),_(We,e)}}}
/**
 * Page metadata serialized as a JSON string: the page title and anchor, plus one
 * depth-2 entry per section heading. The fragment's h() method passes it as the
 * "content" value of the element whose "name" is set to "hf:doc:metadata".
 * NOTE(review): the "Overview" section is listed twice with the same `local` anchor;
 * presumably inherited from the source markdown -- confirm upstream.
 */
const wn='{"title":"InstructBlipVideo","local":"instructblipvideo","sections":[{"title":"Overview","local":"overview","sections":[],"depth":2},{"title":"Overview","local":"overview","sections":[],"depth":2},{"title":"Usage tips","local":"usage-tips","sections":[],"depth":2},{"title":"InstructBlipVideoConfig","local":"transformers.InstructBlipVideoConfig","sections":[],"depth":2},{"title":"InstructBlipVideoVisionConfig","local":"transformers.InstructBlipVideoVisionConfig","sections":[],"depth":2},{"title":"InstructBlipVideoQFormerConfig","local":"transformers.InstructBlipVideoQFormerConfig","sections":[],"depth":2},{"title":"InstructBlipVideoProcessor","local":"transformers.InstructBlipVideoProcessor","sections":[],"depth":2},{"title":"InstructBlipVideoImageProcessor","local":"transformers.InstructBlipVideoImageProcessor","sections":[],"depth":2},{"title":"InstructBlipVideoVisionModel","local":"transformers.InstructBlipVideoVisionModel","sections":[],"depth":2},{"title":"InstructBlipVideoQFormerModel","local":"transformers.InstructBlipVideoQFormerModel","sections":[],"depth":2},{"title":"InstructBlipVideoForConditionalGeneration","local":"transformers.InstructBlipVideoForConditionalGeneration","sections":[],"depth":2}],"depth":1}';
/**
 * Instance initializer that the `kn` constructor hands to `un`.
 * Registers a callback with `mn` that parses `window.location.search` and reads the
 * "fw" query parameter; the value read is discarded. Always returns an empty array.
 * The `J` parameter is not used.
 * NOTE(review): `mn` is the scheduler chunk's `o` export -- presumably Svelte's
 * onMount; confirm against that chunk.
 */
function Mn(J){return mn(()=>{new 
URLSearchParams(window.location.search).get("fw")}),[]}
/**
 * Page component class, exported under the name `component`.
 * The constructor calls super() and then `un(this, i, Mn, Tn, dn, {})`.
 * NOTE(review): `pn` / `un` / `dn` are imported from the shared index and scheduler
 * chunks -- presumably SvelteComponent / init / safe_not_equal; `Tn` is not named in
 * this excerpt and is presumably the fragment factory defined earlier in the file.
 */
class kn extends pn{constructor(i){super(),un(this,i,Mn,Tn,dn,{})}}export{kn as component};

Xet Storage Details

Size: 91.5 kB
Xet hash: 12d9241d562de8b046c9e431c89c1a5027be60f3eda72058acc99b2eaf7dbd35

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.