Upload README.md with huggingface_hub
README.md CHANGED
````diff
@@ -125,23 +125,17 @@ def encode_video_plus_text(
 )
 ```
 
-## `transformers` Usage
-
-Install `transformers` from source
-
-```bash
-pip install git+https://github.com/huggingface/transformers
-```
-
-For more information, check the documentation [here](https://huggingface.co/docs/transformers/main/en/model_doc/pe_audio_video).
+## `transformers` Usage
 
 ```python
 from transformers import PeAudioVideoModel, PeAudioVideoProcessor
 import torch
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model = PeAudioVideoModel.from_pretrained("facebook/pe-av-base", device_map=device, dtype=torch.bfloat16)
-processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-base")
+model = PeAudioVideoModel.from_pretrained("facebook/pe-av-large")
+processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-large")
+
+model = model.to(device)
 
 video_files = ["video1.mp4", "video2.mp4"]
 descriptions = ["description1", "description2"]
@@ -153,66 +147,64 @@ inputs = processor(
 )
 
 with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
-    outputs = model(**inputs.to(device, dtype=model.dtype))
+    outputs = model(**inputs.to(device), return_loss=True)
 
 audio_embeds = outputs.audio_embeds  # Audio-only embeddings
 video_embeds = outputs.video_embeds  # Video-only embeddings
 audio_video_embeds = outputs.audio_video_embeds  # Joint audio-video embeddings
+text_audio_video_embeds = outputs.audio_video_text_embeds  # Text embeddings aligned to audio-video
 text_audio_embeds = outputs.text_audio_embeds  # Text embeddings aligned to audio
 text_video_embeds = outputs.text_video_embeds  # Text embeddings aligned to video
-text_audio_video_embeds = outputs.text_audio_video_embeds  # Text embeddings aligned to audio-video
 audio_plus_text_embeds = outputs.audio_plus_text_embeds  # Joint audio and text embedding
 video_plus_text_embeds = outputs.video_plus_text_embeds  # Joint video and text embedding
-```
 
-
+# For classification, you can use the logits_* fields of the output
+audio_text_preds = outputs.logits_audio_text.sigmoid()
 
-
-```python
-from transformers import PeAudioModel, PeAudioProcessor
-import torch
+# The overall loss is also available in the output (requires passing return_loss=True)
+loss = outputs.loss
 
-
-model = PeAudioVideoModel.from_pretrained("facebook/pe-av-base", device_map=device, dtype=torch.bfloat16)
-processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-base")
+```
 
-
-audio_files = ["audio1.wav", "audio2.wav"]
+We also provide methods for directly encoding an individual modality:
 
-
-inputs = processor(
-    text=descriptions, audio=audio_files, return_tensors="pt", padding=True
-)
+```python
+def get_text_audio_embeds(self, input_ids, attention_mask=None)
 
-
-outputs = model(**inputs.to(device, dtype=model.dtype))
+def get_text_video_embeds(self, input_ids, attention_mask=None)
 
-
-text_audio_embeds = outputs.text_audio_embeds  # Text embeddings aligned to audio
-```
+def get_text_audio_video_embeds(self, input_ids, attention_mask=None)
 
-
-```python
-from transformers import PeVideoModel, PeVideoProcessor
-import torch
+def get_audio_embeds(self, input_values, padding_mask=None)
 
-
-model = PeVideoModel.from_pretrained("facebook/pe-av-base", device_map=device, dtype=torch.bfloat16)
-processor = PeVideoProcessor.from_pretrained("facebook/pe-av-base")
+def get_video_embeds(self, pixel_values_videos, padding_mask_videos=None)
 
-
-video_files = ["video1.mp4", "video2.mp4"]
+def get_audio_video_embeds(
+    self,
+    input_values: torch.Tensor,
+    pixel_values_videos: torch.Tensor,
+    padding_mask: Optional[torch.Tensor] = None,
+    padding_mask_videos: Optional[torch.Tensor] = None,
+    return_audio_embeds: bool = False,
+    return_video_embeds: bool = False,
+)
 
-
-inputs = processor(
-    text=descriptions, videos=video_files, return_tensors="pt", padding=True
+def get_audio_plus_text_embeds(
+    self,
+    input_ids: torch.Tensor,
+    input_values: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    padding_mask: Optional[torch.Tensor] = None,
 )
 
-
-outputs = model(**inputs.to(device, dtype=model.dtype))
+def get_video_plus_text_embeds(
+    self,
+    input_ids: torch.Tensor,
+    pixel_values_videos: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    padding_mask_videos: Optional[torch.Tensor] = None,
+)
 
-video_embeds = outputs.video_embeds  # Video-only embeddings
-text_video_embeds = outputs.text_video_embeds  # Text embeddings aligned to video
 ```
 
 ## Citation
````
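The new README lists the `get_*` encoders only as bare signatures. As a rough illustration, here is a minimal sketch of how they might be called end to end. The checkpoint name, the processor keyword arguments, and the returned tensor shapes are assumptions carried over from the README examples above, not verified against the released model.

```python
# Hypothetical usage of the get_* encoders added in this commit; checkpoint
# name and processor kwargs are assumptions taken from the README examples.
import torch
from transformers import PeAudioVideoModel, PeAudioVideoProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PeAudioVideoModel.from_pretrained("facebook/pe-av-large").to(device)
processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-large")

# Text only: tokenize, then embed into the audio-aligned text space.
text_inputs = processor(text=["a dog barking"], return_tensors="pt", padding=True)
with torch.inference_mode():
    text_audio_embeds = model.get_text_audio_embeds(
        input_ids=text_inputs["input_ids"].to(device),
        attention_mask=text_inputs["attention_mask"].to(device),
    )

# Audio only: the processor is assumed to accept audio files via `audio=`,
# mirroring the full pipeline example in the README.
audio_inputs = processor(audio=["audio1.wav"], return_tensors="pt", padding=True)
with torch.inference_mode():
    audio_embeds = model.get_audio_embeds(
        input_values=audio_inputs["input_values"].to(device)
    )

# Embeddings from matching spaces can then be compared, e.g. by cosine similarity.
similarity = torch.nn.functional.cosine_similarity(text_audio_embeds, audio_embeds)
```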
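The `logits_audio_text` field added by this commit suggests zero-shot audio classification by scoring a clip against a set of text prompts. The sketch below assumes CLIP-style pairing of every text with every audio and an `(audio, text)` logit layout; both are assumptions, since the README does not spell this out.

```python
# Hypothetical zero-shot audio classification via logits_audio_text; class
# prompts, pairing behavior, and logit layout are illustrative assumptions.
import torch
from transformers import PeAudioVideoModel, PeAudioVideoProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PeAudioVideoModel.from_pretrained("facebook/pe-av-large").to(device)
processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-large")

class_prompts = ["a dog barking", "rain falling", "a car engine"]
inputs = processor(
    text=class_prompts, audio=["audio1.wav"], return_tensors="pt", padding=True
)

with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
    outputs = model(**inputs.to(device))

# Sigmoid turns pairwise logits into scores, following the README's own
# use of outputs.logits_audio_text.sigmoid(); pick the best-scoring prompt.
probs = outputs.logits_audio_text.sigmoid()
predicted = class_prompts[probs.argmax(dim=-1).item()]
```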