lematt1991 committed
Commit 115ef33 · verified · Parent(s): cd44074

Upload README.md with huggingface_hub

Files changed (1): README.md (+40 -48)
README.md CHANGED
@@ -125,24 +125,18 @@ def encode_video_plus_text(
  )
  ```

- ## `transformers` Usage 🤗
-
- Install `transformers` from source
-
- ```bash
- pip install git+https://github.com/huggingface/transformers
- ```
-
- For more information, check the documentation [here](https://huggingface.co/docs/transformers/main/en/model_doc/pe_audio_video).
+ ## `transformers` Usage

  ```python
  from transformers import PeAudioVideoModel, PeAudioVideoProcessor
  import torch

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- model = PeAudioVideoModel.from_pretrained("facebook/pe-av-large", device_map=device, dtype=torch.bfloat16)
+ model = PeAudioVideoModel.from_pretrained("facebook/pe-av-large")
  processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-large")

+ model = model.to(device)
+
  video_files = ["video1.mp4", "video2.mp4"]
  descriptions = ["description1", "description2"]
  audio_files = ["audio1.wav", "audio2.wav"]
@@ -153,66 +147,64 @@ inputs = processor(
  )

  with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
-     outputs = model(**inputs.to(device, dtype=model.dtype))
+     outputs = model(**inputs.to(device), return_loss=True)

  audio_embeds = outputs.audio_embeds  # Audio-only embeddings
  video_embeds = outputs.video_embeds  # Video-only embeddings
  audio_video_embeds = outputs.audio_video_embeds  # Joint audio-video embeddings
+ text_audio_video_embeds = outputs.audio_video_text_embeds  # Text embeddings aligned to audio-video
  text_audio_embeds = outputs.text_audio_embeds  # Text embeddings aligned to audio
  text_video_embeds = outputs.text_video_embeds  # Text embeddings aligned to video
- text_audio_video_embeds = outputs.text_audio_video_embeds  # Text embeddings aligned to audio-video
  audio_plus_text_embeds = outputs.audio_plus_text_embeds  # Joint audio and text embedding
  video_plus_text_embeds = outputs.video_plus_text_embeds  # Joint video and text embedding
- ```
-
- Note that you can omit any of the modalities and still use the same `forward` method; the corresponding embeddings in `outputs` will be `None`.
-
- Moreover, with `transformers` you can load only the sub-model of interest, avoiding the full model when, for example, you only need audio embeddings:
-
- ```python
- from transformers import PeAudioModel, PeAudioProcessor
- import torch
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- model = PeAudioModel.from_pretrained("facebook/pe-av-large", device_map=device, dtype=torch.bfloat16)
- processor = PeAudioProcessor.from_pretrained("facebook/pe-av-large")
-
- descriptions = ["description1", "description2"]
- audio_files = ["audio1.wav", "audio2.wav"]
-
- # Process inputs and get embeddings
- inputs = processor(
-     text=descriptions, audio=audio_files, return_tensors="pt", padding=True
- )
-
- with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
-     outputs = model(**inputs.to(device, dtype=model.dtype))
-
- audio_embeds = outputs.audio_embeds  # Audio-only embeddings
- text_audio_embeds = outputs.text_audio_embeds  # Text embeddings aligned to audio
- ```
-
- Likewise for video embeddings:
-
- ```python
- from transformers import PeVideoModel, PeVideoProcessor
- import torch
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- model = PeVideoModel.from_pretrained("facebook/pe-av-large", device_map=device, dtype=torch.bfloat16)
- processor = PeVideoProcessor.from_pretrained("facebook/pe-av-large")
-
- descriptions = ["description1", "description2"]
- video_files = ["video1.mp4", "video2.mp4"]
-
- # Process inputs and get embeddings
- inputs = processor(
-     text=descriptions, videos=video_files, return_tensors="pt", padding=True
- )
-
- with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
-     outputs = model(**inputs.to(device, dtype=model.dtype))
-
- video_embeds = outputs.video_embeds  # Video-only embeddings
- text_video_embeds = outputs.text_video_embeds  # Text embeddings aligned to video
+
+ # For classification, you can use the logits_* fields of the output
+ audio_text_preds = outputs.logits_audio_text.sigmoid()
+
+ # The overall loss is also available in the output (requires passing return_loss=True)
+ loss = outputs.loss
+ ```
+
+ We also provide methods for directly encoding an individual modality:
+
+ ```python
+ def get_text_audio_embeds(self, input_ids, attention_mask=None)
+
+ def get_text_video_embeds(self, input_ids, attention_mask=None)
+
+ def get_text_audio_video_embeds(self, input_ids, attention_mask=None)
+
+ def get_audio_embeds(self, input_values, padding_mask=None)
+
+ def get_video_embeds(self, pixel_values_videos, padding_mask_videos=None)
+
+ def get_audio_video_embeds(
+     self,
+     input_values: torch.Tensor,
+     pixel_values_videos: torch.Tensor,
+     padding_mask: Optional[torch.Tensor] = None,
+     padding_mask_videos: Optional[torch.Tensor] = None,
+     return_audio_embeds: bool = False,
+     return_video_embeds: bool = False,
+ )
+
+ def get_audio_plus_text_embeds(
+     self,
+     input_ids: torch.Tensor,
+     input_values: torch.Tensor,
+     attention_mask: Optional[torch.Tensor] = None,
+     padding_mask: Optional[torch.Tensor] = None,
+ )
+
+ def get_video_plus_text_embeds(
+     self,
+     input_ids: torch.Tensor,
+     pixel_values_videos: torch.Tensor,
+     attention_mask: Optional[torch.Tensor] = None,
+     padding_mask_videos: Optional[torch.Tensor] = None,
+ )

  ```

  ## Citation
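Beyond the diff itself, a few illustrative sketches of the documented API follow; they rely only on the fields and signatures shown above, and anything not spelled out there (shapes, normalization, processor keyword names) is an assumption. First, CLIP-style text-to-audio retrieval using the paired `audio_embeds` and `text_audio_embeds` fields from the `forward` outputs; the README does not say whether these come back unit-normalized, so the sketch normalizes explicitly.

```python
import torch
import torch.nn.functional as F

def rank_audio_for_text(
    audio_embeds: torch.Tensor,       # assumed [n_audio, d], from outputs.audio_embeds
    text_audio_embeds: torch.Tensor,  # assumed [n_text, d], from outputs.text_audio_embeds
) -> torch.Tensor:
    # Cosine similarity between every text query and every audio clip.
    a = F.normalize(audio_embeds.float(), dim=-1)
    t = F.normalize(text_audio_embeds.float(), dim=-1)
    sim = t @ a.T  # [n_text, n_audio]
    # Audio indices ranked best-first for each text query.
    return sim.argsort(dim=-1, descending=True)
```

The same pattern should apply to the other aligned pairs (`video_embeds` with `text_video_embeds`, `audio_video_embeds` with `text_audio_video_embeds`).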
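The new README also notes that the `logits_*` output fields can be used for classification via `sigmoid()`. A zero-shot tagging sketch, assuming `logits_audio_text` has shape `[n_audio, n_text]` (the label prompts are made up for illustration):

```python
from transformers import PeAudioVideoModel, PeAudioVideoProcessor
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PeAudioVideoModel.from_pretrained("facebook/pe-av-large").to(device)
processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-large")

labels = ["dog barking", "rain", "speech"]  # hypothetical label prompts
inputs = processor(text=labels, audio=["audio1.wav"], return_tensors="pt", padding=True)

with torch.inference_mode():
    outputs = model(**inputs.to(device))

scores = outputs.logits_audio_text.sigmoid()  # assumed [n_audio, n_labels]
print(labels[scores[0].argmax().item()])      # highest-scoring label for the clip
```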
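The text-side encoders take `input_ids` and `attention_mask`, per the signatures in the diff. Continuing with the `model`, `processor`, and `device` from the sketch above, and assuming the processor accepts text-only input:

```python
inputs = processor(text=["a dog barking"], return_tensors="pt", padding=True).to(device)

with torch.inference_mode():
    # Text embedding in the audio-aligned space; swap in get_text_video_embeds
    # or get_text_audio_video_embeds for the other spaces.
    text_audio_embeds = model.get_text_audio_embeds(
        input_ids=inputs["input_ids"],
        attention_mask=inputs.get("attention_mask"),
    )
```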
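Likewise, `get_audio_embeds` embeds audio alone. The `input_values`/`padding_mask` keys below are assumed to be what the processor returns for audio-only input (same `model`, `processor`, `device` as above):

```python
inputs = processor(audio=["audio1.wav", "audio2.wav"], return_tensors="pt", padding=True).to(device)

with torch.inference_mode():
    audio_embeds = model.get_audio_embeds(
        input_values=inputs["input_values"],
        padding_mask=inputs.get("padding_mask"),  # assumed present only for variable-length batches
    )
```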
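Finally, `get_audio_video_embeds` exposes `return_audio_embeds`/`return_video_embeds` flags, which suggests the per-modality embeddings can be returned alongside the fused one; the structure of the return value is an assumption here (same `model`, `processor`, `device` as above):

```python
# "clip.wav"/"clip.mp4" are hypothetical file names.
inputs = processor(audio=["clip.wav"], videos=["clip.mp4"], return_tensors="pt", padding=True).to(device)

with torch.inference_mode():
    out = model.get_audio_video_embeds(
        input_values=inputs["input_values"],
        pixel_values_videos=inputs["pixel_values_videos"],
        padding_mask=inputs.get("padding_mask"),
        padding_mask_videos=inputs.get("padding_mask_videos"),
        return_audio_embeds=True,   # also return the audio-only embedding (assumed)
        return_video_embeds=True,   # also return the video-only embedding (assumed)
    )
```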