Upload README.md with huggingface_hub
README.md CHANGED
````diff
@@ -125,23 +125,17 @@ def encode_video_plus_text(
 )
 ```
 
-## `transformers` Usage
-
-Install `transformers` from source
-
-```bash
-pip install git+https://github.com/huggingface/transformers
-```
-
-For more information, check the documentation [here](https://huggingface.co/docs/transformers/main/en/model_doc/pe_audio_video).
+## `transformers` Usage
 
 ```python
 from transformers import PeAudioVideoModel, PeAudioVideoProcessor
 import torch
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model = PeAudioVideoModel.from_pretrained("facebook/pe-av-base", device_map=device, dtype=torch.bfloat16)
-processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-base")
+model = PeAudioVideoModel.from_pretrained("facebook/pe-av-large")
+processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-large")
+
+model = model.to(device)
 
 video_files = ["video1.mp4", "video2.mp4"]
 descriptions = ["description1", "description2"]
@@ -153,66 +147,64 @@ inputs = processor(
 )
 
 with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
-    outputs = model(**inputs.to(device, dtype=model.dtype))
+    outputs = model(**inputs.to(device), return_loss=True)
 
 audio_embeds = outputs.audio_embeds  # Audio-only embeddings
 video_embeds = outputs.video_embeds  # Video-only embeddings
 audio_video_embeds = outputs.audio_video_embeds  # Joint audio-video embeddings
+text_audio_video_embeds = outputs.audio_video_text_embeds  # Text embeddings aligned to audio-video
 text_audio_embeds = outputs.text_audio_embeds  # Text embeddings aligned to audio
 text_video_embeds = outputs.text_video_embeds  # Text embeddings aligned to video
-text_audio_video_embeds = outputs.text_audio_video_embeds  # Text embeddings aligned to audio-video
 audio_plus_text_embeds = outputs.audio_plus_text_embeds  # Joint audio and text embedding
 video_plus_text_embeds = outputs.video_plus_text_embeds  # Joint video and text embedding
-```
 
-
+# For classification, you can use the logits_* fields of the output
+audio_text_preds = outputs.logits_audio_text.sigmoid()
 
-
-```python
-from transformers import PeAudioModel, PeAudioProcessor
-import torch
+# The overall loss is also available in the output (requires passing return_loss=True)
+loss = outputs.loss
 
-
-model = PeAudioVideoModel.from_pretrained("facebook/pe-av-base", device_map=device, dtype=torch.bfloat16)
-processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-base")
+```
 
-
-audio_files = ["audio1.wav", "audio2.wav"]
+We also provide methods for directly encoding an individual modality:
 
-
-inputs = processor(
-    text=descriptions, audio=audio_files, return_tensors="pt", padding=True
-)
+```python
+def get_text_audio_embeds(self, input_ids, attention_mask=None)
 
-
-outputs = model(**inputs.to(device, dtype=model.dtype))
+def get_text_video_embeds(self, input_ids, attention_mask=None)
 
-
-text_audio_embeds = outputs.text_audio_embeds  # Text embeddings aligned to audio
-```
+def get_text_audio_video_embeds(self, input_ids, attention_mask=None)
 
-
-```python
-from transformers import PeVideoModel, PeVideoProcessor
-import torch
+def get_audio_embeds(self, input_values, padding_mask=None)
 
-
-model = PeVideoModel.from_pretrained("facebook/pe-av-base", device_map=device, dtype=torch.bfloat16)
-processor = PeVideoProcessor.from_pretrained("facebook/pe-av-base")
+def get_video_embeds(self, pixel_values_videos, padding_mask_videos=None)
 
-
-video_files = ["video1.mp4", "video2.mp4"]
+def get_audio_video_embeds(
+    self,
+    input_values: torch.Tensor,
+    pixel_values_videos: torch.Tensor,
+    padding_mask: Optional[torch.Tensor] = None,
+    padding_mask_videos: Optional[torch.Tensor] = None,
+    return_audio_embeds: bool = False,
+    return_video_embeds: bool = False,
+)
 
-
-inputs = processor(
-    text=descriptions, videos=video_files, return_tensors="pt", padding=True
+def get_audio_plus_text_embeds(
+    self,
+    input_ids: torch.Tensor,
+    input_values: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    padding_mask: Optional[torch.Tensor] = None,
 )
 
-
-outputs = model(**inputs.to(device, dtype=model.dtype))
+def get_video_plus_text_embeds(
+    self,
+    input_ids: torch.Tensor,
+    pixel_values_videos: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    padding_mask_videos: Optional[torch.Tensor] = None,
+)
 
-video_embeds = outputs.video_embeds  # Video-only embeddings
-text_video_embeds = outputs.text_video_embeds  # Text embeddings aligned to video
 ```
 
 ## Citation
````
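The new README lists the `get_*` encoders only as bare signatures. As a rough illustration, here is a minimal sketch of how they might be called end to end. The checkpoint name, the processor keyword arguments, and the returned tensor shapes are assumptions carried over from the README examples above, not verified against the released model.

```python
# Hypothetical usage of the get_* encoders added in this commit; checkpoint
# name and processor kwargs are assumptions taken from the README examples.
import torch
from transformers import PeAudioVideoModel, PeAudioVideoProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PeAudioVideoModel.from_pretrained("facebook/pe-av-large").to(device)
processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-large")

# Text only: tokenize, then embed into the audio-aligned text space.
text_inputs = processor(text=["a dog barking"], return_tensors="pt", padding=True)
with torch.inference_mode():
    text_audio_embeds = model.get_text_audio_embeds(
        input_ids=text_inputs["input_ids"].to(device),
        attention_mask=text_inputs["attention_mask"].to(device),
    )

# Audio only: the processor is assumed to accept audio files via `audio=`,
# mirroring the full pipeline example in the README.
audio_inputs = processor(audio=["audio1.wav"], return_tensors="pt", padding=True)
with torch.inference_mode():
    audio_embeds = model.get_audio_embeds(
        input_values=audio_inputs["input_values"].to(device)
    )

# Embeddings from matching spaces can then be compared, e.g. by cosine similarity.
similarity = torch.nn.functional.cosine_similarity(text_audio_embeds, audio_embeds)
```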
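The `logits_audio_text` field added by this commit suggests zero-shot audio classification by scoring a clip against a set of text prompts. The sketch below assumes CLIP-style pairing of every text with every audio and an `(audio, text)` logit layout; both are assumptions, since the README does not spell this out.

```python
# Hypothetical zero-shot audio classification via logits_audio_text; class
# prompts, pairing behavior, and logit layout are illustrative assumptions.
import torch
from transformers import PeAudioVideoModel, PeAudioVideoProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PeAudioVideoModel.from_pretrained("facebook/pe-av-large").to(device)
processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-large")

class_prompts = ["a dog barking", "rain falling", "a car engine"]
inputs = processor(
    text=class_prompts, audio=["audio1.wav"], return_tensors="pt", padding=True
)

with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
    outputs = model(**inputs.to(device))

# Sigmoid turns pairwise logits into scores, following the README's own
# use of outputs.logits_audio_text.sigmoid(); pick the best-scoring prompt.
probs = outputs.logits_audio_text.sigmoid()
predicted = class_prompts[probs.argmax(dim=-1).item()]
```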