SajayR
/

Triad2

Safetensors

Model card Files Files and versions

xet

Community

SajayR committed on Mar 1, 2025

Commit

50d0a45

verified ·

1 Parent(s): 29c8bf8

Update hf_model.py

Browse files

Files changed (1) hide show

hf_model.py +36 -9

hf_model.py CHANGED Viewed

@@ -205,18 +205,44 @@ class Triad(nn.Module):
         assert image is not None or audio is not None or text_list is not None, "At least one modality must be provided"
         if image is not None: assert image is not str, "Frames should be a path to an image"
         if audio is not None:
-            assert isinstance(audio, torch.Tensor) and audio.shape[0] == 1 and len(audio.shape) == 2, "Audio must be a PyTorch tensor of shape (1, T)"
         if text_list is not None:
             assert isinstance(text_list, list) and len(text_list) == 1, "Text list must be a list of strings of length 1"
         if image is not None:
-            image = Image.open(image).convert('RGB')
-            transform = transforms.Compose([
-            transforms.Resize((224, 224)),
-            transforms.ToTensor(),
-            transforms.Normalize(mean=[0.485, 0.456, 0.406],
-                              std=[0.229, 0.224, 0.225])
-            ])
-            image = transform(image)
         embeddings = {}
         if image is not None:
             embeddings['visual_feats'] = self.visual_embedder(image)
@@ -233,3 +259,4 @@ class Triad(nn.Module):
             embeddings['text_audio_sim_matrix'] = self.compute_similarity_matrix(embeddings['text_feats'], embeddings['audio_feats'])
         return embeddings

         assert image is not None or audio is not None or text_list is not None, "At least one modality must be provided"
         if image is not None: assert image is not str, "Frames should be a path to an image"
         if audio is not None:
+            assert isinstance(audio, torch.Tensor) and len(audio.shape) == 2, "Audio must be a PyTorch tensor of shape (B, T)"
         if text_list is not None:
             assert isinstance(text_list, list) and len(text_list) == 1, "Text list must be a list of strings of length 1"
         if image is not None:
+            device = next(self.parameters()).device
+            # Handle batch of file paths
+            if isinstance(image, list):
+                # Process a list of image paths
+                processed_images = []
+                for img_path in image:
+                    img = Image.open(img_path).convert('RGB')
+                    transform = transforms.Compose([
+                        transforms.Resize((224, 224)),
+                        transforms.ToTensor(),
+                        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+                    ])
+                    processed_img = transform(img).to(device)
+                    processed_images.append(processed_img)
+                image = torch.stack(processed_images, dim=0)  # [B, 3, 224, 224]
+            # Handle single file path
+            elif isinstance(image, str):
+                img = Image.open(image).convert('RGB')
+                transform = transforms.Compose([
+                    transforms.Resize((224, 224)),
+                    transforms.ToTensor(),
+                    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+                ])
+                image = transform(img).to(device).unsqueeze(0)  # Add batch dimension [1, 3, 224, 224]
+            # Handle tensor input (assume it's already processed but may need device transfer)
+            elif isinstance(image, torch.Tensor):
+                # If single image without batch dimension
+                if image.dim() == 3:
+                    image = image.unsqueeze(0)  # Add batch dimension
+                image = image.to(device)
         embeddings = {}
         if image is not None:
             embeddings['visual_feats'] = self.visual_embedder(image)
             embeddings['text_audio_sim_matrix'] = self.compute_similarity_matrix(embeddings['text_feats'], embeddings['audio_feats'])
         return embeddings