fcastrovilli committed on
Commit
64e55d6
·
1 Parent(s): 5f1fdf7

feat: imagebind

Browse files
Files changed (6) hide show
  1. .gitignore +11 -0
  2. Dockerfile +67 -0
  3. README.md +50 -0
  4. main.py +193 -0
  5. requirements.txt +18 -0
  6. setup_imagebind.py +62 -0
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **__pycache__
2
+ .vscode
3
+ .idea/
4
+ .python-version
5
+ build/
6
+ imagebind.egg-info
7
+ .DS_Store
8
+ venv/
9
+ .checkpoints/
10
+ imagebind/
11
+ setup.py
Dockerfile ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Stage 1: Build stage
FROM python:3.10-slim AS builder

# Install build dependencies.
# git is needed because requirements.txt installs pytorchvideo from a
# git+https URL and the slim base image does not include a git client.
RUN apt-get update && apt-get install -y \
    ffmpeg \
    git \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Create a non-root user
RUN useradd -m -u 1000 user

# Switch to non-root user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy requirements and setup scripts
COPY --chown=user requirements.txt setup_imagebind.py ./

# Install dependencies into a virtual environment
RUN python -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Run setup script to download ImageBind (writes ./imagebind and ./setup.py)
RUN python setup_imagebind.py

# Install ImageBind using the setup.py downloaded in the previous step
RUN pip install --no-cache-dir .

# Stage 2: Runtime stage
FROM python:3.10-slim

# Install runtime dependencies only (git is not needed at runtime)
RUN apt-get update && apt-get install -y \
    ffmpeg \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Create a non-root user
RUN useradd -m -u 1000 user

# Switch to non-root user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy virtual environment from builder
COPY --from=builder --chown=user /app/venv /app/venv
ENV PATH="/app/venv/bin:$PATH"

# Copy ImageBind from builder
COPY --from=builder --chown=user /app/imagebind /app/imagebind

# Copy application code
COPY --chown=user main.py .

# Expose the port
EXPOSE 8000

# Command to run the application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
README.md CHANGED
@@ -9,4 +9,54 @@ license: mit
9
  short_description: Small imagebind api implementation
10
  ---
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
9
  short_description: Small imagebind api implementation
10
  ---
11
 
12
+ # ImageBind API Implementation
13
+
14
+ A FastAPI implementation of Facebook's ImageBind model for cross-modal embeddings.
15
+
16
+ ## Local Setup
17
+
18
+ 1. Install system dependencies:
19
+
20
+ ```bash
21
+ sudo apt-get update && sudo apt-get install -y ffmpeg libsndfile1
22
+ ```
23
+
24
+ 2. Create and activate a virtual environment:
25
+
26
+ ```bash
27
+ python -m venv venv
28
+ source venv/bin/activate # On Windows: venv\Scripts\activate
29
+ ```
30
+
31
+ 3. Install Python dependencies:
32
+
33
+ ```bash
34
+ pip install --no-cache-dir --upgrade -r requirements.txt
35
+ ```
36
+
37
+ 4. Download and setup ImageBind:
38
+
39
+ ```bash
40
+ python setup_imagebind.py
41
+ pip install --no-cache-dir .
42
+ ```
43
+
44
+ ## Docker Setup
45
+
46
+ Build and run the container:
47
+
48
+ ```bash
49
+ docker build -t imagebind-api .
50
+ docker run -p 8000:8000 imagebind-api
51
+ ```
52
+
53
+ ## API Endpoints
54
+
55
+ The API will be available at `http://localhost:8000` with the following endpoints:
56
+
57
+ - POST `/compute_embeddings`: Generate embeddings for images, audio files, and text
58
+ - POST `/compute_similarities`: Compute similarities between embeddings
59
+
60
+ For detailed API documentation, visit `http://localhost:8000/docs`
61
+
62
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
main.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from imagebind import data
4
+ from imagebind.models import imagebind_model
5
+ from imagebind.models.imagebind_model import ModalityType
6
+ from pydub import AudioSegment
7
+ from fastapi import FastAPI, UploadFile, File, Form
8
+ from typing import List, Dict
9
+ import tempfile
10
+ from pydantic import BaseModel
11
+ import uvicorn
12
+ import numpy as np
13
+
14
+ app = FastAPI()
15
+
16
def convert_audio_to_wav(audio_path: str) -> str:
    """Return a WAV path for *audio_path*, converting from MP3 when needed.

    Paths that do not end in ``.mp3`` (case-insensitive) are returned
    unchanged. For MP3 input the WAV file is written next to the source with
    the extension swapped; an already existing WAV file at that location is
    reused without re-converting.
    """
    if not audio_path.lower().endswith('.mp3'):
        return audio_path

    stem = audio_path.rsplit('.', 1)[0]
    wav_path = stem + '.wav'
    if os.path.exists(wav_path):
        return wav_path

    AudioSegment.from_mp3(audio_path).export(wav_path, format='wav')
    return wav_path
25
+
26
class EmbeddingManager:
    """Wrap an ImageBind model for embedding and similarity computation."""

    def __init__(self):
        """Load the pretrained ImageBind "huge" model onto GPU 0 if available, else CPU."""
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.model = imagebind_model.imagebind_huge(pretrained=True)
        self.model.eval()
        self.model.to(self.device)

    def compute_embeddings(self,
                           images: List[str] = None,
                           audio_files: List[str] = None,
                           texts: List[str] = None) -> dict:
        """Embed whichever modalities were supplied.

        Args:
            images: Paths of image files, or None/empty to skip vision.
            audio_files: Paths of audio files, or None/empty to skip audio.
            texts: Raw text strings, or None/empty to skip text.

        Returns:
            A dict with any of the keys ``'vision'``, ``'audio'`` and
            ``'text'`` mapped to nested lists of floats; empty when no
            modality was supplied.
        """
        with torch.no_grad():
            model_inputs = {}

            if texts:
                model_inputs[ModalityType.TEXT] = data.load_and_transform_text(texts, self.device)
            if images:
                model_inputs[ModalityType.VISION] = data.load_and_transform_vision_data(images, self.device)
            if audio_files:
                model_inputs[ModalityType.AUDIO] = data.load_and_transform_audio_data(audio_files, self.device)

            if not model_inputs:
                return {}

            outputs = self.model(model_inputs)

            # Output keys are emitted in a fixed order: vision, audio, text.
            labelled_modalities = (
                ('vision', ModalityType.VISION),
                ('audio', ModalityType.AUDIO),
                ('text', ModalityType.TEXT),
            )
            return {
                label: outputs[modality].cpu().numpy().tolist()
                for label, modality in labelled_modalities
                if modality in model_inputs
            }

    @staticmethod
    def compute_similarities(embeddings: Dict[str, List[List[float]]]) -> dict:
        """Compute softmax-normalised dot-product similarities.

        Args:
            embeddings: Mapping of modality name (``'vision'``, ``'audio'``,
                ``'text'``) to a list of embedding vectors. Empty entries and
                entries that are not a list/ndarray are ignored.

        Returns:
            A dict keyed ``'<a>_<b>'`` holding one row-wise softmax matrix
            per available cross-modal pair, followed by one per available
            modality compared with itself.
        """
        # Keep only the non-empty embedding collections, as tensors.
        tensors = {}
        for name, vectors in embeddings.items():
            if isinstance(vectors, (list, np.ndarray)) and len(vectors) > 0:
                tensors[name] = torch.tensor(vectors)

        def _softmax_scores(left: str, right: str) -> list:
            scores = tensors[left] @ tensors[right].T
            return torch.softmax(scores, dim=-1).numpy().tolist()

        similarities = {}

        # Cross-modal pairs first.
        for left, right in (('vision', 'audio'), ('vision', 'text'), ('audio', 'text')):
            if left in tensors and right in tensors:
                similarities[f'{left}_{right}'] = _softmax_scores(left, right)

        # Then each modality against itself.
        for modality in ('vision', 'audio', 'text'):
            if modality in tensors:
                similarities[f'{modality}_{modality}'] = _softmax_scores(modality, modality)

        return similarities
98
+
99
+ # Initialize the embedding manager
100
+ embedding_manager = EmbeddingManager()
101
+
102
# Response body of POST /compute_embeddings.
class EmbeddingResponse(BaseModel):
    # Embeddings keyed by modality name ('vision', 'audio', 'text').
    embeddings: dict
    # Uploaded file names / submitted texts keyed by 'images', 'audio', 'texts'.
    file_names: dict
105
+
106
# Response body of POST /compute_similarities.
class SimilarityResponse(BaseModel):
    # Similarity matrices keyed by modality pair, e.g. 'vision_text'.
    similarities: dict
108
+
109
async def _save_upload_to_temp(upload: UploadFile, temp_files: List[str]) -> str:
    """Write an uploaded file to a named temporary file and return its path.

    The path is appended to *temp_files* before any data is read, so the
    caller's cleanup still removes the file if reading or writing fails.
    The file is closed (and therefore flushed) before this function returns.
    """
    # An upload may arrive without a filename; fall back to no suffix.
    suffix = os.path.splitext(upload.filename or "")[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        temp_files.append(tmp.name)
        tmp.write(await upload.read())
    return tmp.name


@app.post("/compute_embeddings", response_model=EmbeddingResponse)
async def generate_embeddings(
    texts: str | None = Form(None),
    images: List[UploadFile] | None = File(default=None),
    audio_files: List[UploadFile] | None = File(default=None)
):
    """Generate embeddings for any provided files and texts.

    Args:
        texts: Newline-separated texts; blank lines are ignored.
        images: Uploaded image files.
        audio_files: Uploaded audio files; MP3 uploads are converted to WAV.

    Returns:
        An EmbeddingResponse with the embeddings per modality and the
        corresponding file names / texts. Both dicts are empty when no
        usable input was supplied.
    """
    temp_files: List[str] = []

    try:
        # Process images if provided
        image_paths = []
        image_names = []
        for img in images or []:
            image_paths.append(await _save_upload_to_temp(img, temp_files))
            image_names.append(img.filename)

        # Process audio files if provided
        audio_paths = []
        audio_names = []
        for audio in audio_files or []:
            saved_path = await _save_upload_to_temp(audio, temp_files)
            # Convert only after the upload has been fully written and closed,
            # so the converter never reads a partially flushed file.
            audio_path = convert_audio_to_wav(saved_path)
            if audio_path != saved_path:
                temp_files.append(audio_path)
            audio_paths.append(audio_path)
            audio_names.append(audio.filename)

        # Process texts if provided
        text_list = []
        if texts:
            text_list = [text.strip() for text in texts.split('\n') if text.strip()]

        # Compute embeddings only if we have any input
        if not any([image_paths, audio_paths, text_list]):
            return EmbeddingResponse(
                embeddings={},
                file_names={}
            )

        embeddings = embedding_manager.compute_embeddings(
            image_paths if image_paths else None,
            audio_paths if audio_paths else None,
            text_list if text_list else None
        )

        file_names = {}
        if image_names:
            file_names['images'] = image_names
        if audio_names:
            file_names['audio'] = audio_names
        if text_list:
            file_names['texts'] = text_list

        return EmbeddingResponse(
            embeddings=embeddings,
            file_names=file_names
        )

    finally:
        # Best-effort cleanup: a file that is already gone or cannot be
        # removed must not mask the real result or error of the request.
        for temp_file in temp_files:
            try:
                os.unlink(temp_file)
            except OSError:
                pass
185
+
186
@app.post("/compute_similarities", response_model=SimilarityResponse)
async def compute_similarities(embeddings: Dict[str, List[List[float]]]):
    """Return similarity matrices for the embeddings posted in the request body.

    The request body maps modality names to lists of embedding vectors; the
    computation is delegated to the shared embedding manager.
    """
    return SimilarityResponse(
        similarities=embedding_manager.compute_similarities(embeddings)
    )
191
+
192
+ if __name__ == "__main__":
193
+ uvicorn.run(app, host="0.0.0.0", port=8000)
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
torch>=2.0.0,<2.1.0
torchvision>=0.15.0,<0.16.0
torchaudio>=2.0.0,<2.1.0
pytorchvideo @ git+https://github.com/facebookresearch/pytorchvideo.git@main
timm>=0.9.0,<0.10.0
ftfy
regex
einops
fvcore
eva-decord>=0.6.1
iopath
numpy>=1.24.0,<2.0.0
matplotlib
types-regex
pydub
fastapi
uvicorn
python-multipart
# Imported directly by setup_imagebind.py; declared explicitly instead of
# relying on another package pulling it in transitively.
requests
setup_imagebind.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import shutil
2
+ import requests
3
+ from pathlib import Path
4
+ from zipfile import ZipFile
5
+ from io import BytesIO
6
+
7
def download_file(url: str, destination: Path, timeout: float = 60.0) -> None:
    """Download a file from URL to the specified destination.

    Args:
        url: Address of the file to fetch.
        destination: Path the response body is written to (overwritten if
            it already exists).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    destination.write_bytes(response.content)
12
+
13
def download_github_folder(
    repo_owner: str,
    repo_name: str,
    folder_path: str,
    destination: Path,
    branch: str = "main",
    timeout: float = 60.0,
) -> None:
    """Download a specific folder from a GitHub repository using the ZIP download feature.

    The whole branch archive is fetched into memory and only the files under
    *folder_path* are written below *destination*, keeping their path
    relative to the repository root.

    Args:
        repo_owner: GitHub user or organisation owning the repository.
        repo_name: Repository name.
        folder_path: Folder inside the repository to extract. An empty
            string extracts the whole repository.
        destination: Directory the files are written into.
        branch: Branch whose archive is downloaded. The archive's top-level
            directory is assumed to be ``<repo_name>-<branch>``; branch
            names containing ``/`` are not handled here.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the archive download returns an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # Download the whole repository as a ZIP file
    zip_url = f"https://github.com/{repo_owner}/{repo_name}/archive/refs/heads/{branch}.zip"
    response = requests.get(zip_url, timeout=timeout)
    response.raise_for_status()

    archive_root = f"{repo_name}-{branch}/"
    folder = folder_path.strip('/')
    # The trailing slash keeps "imagebind" from also matching a sibling such
    # as "imagebind_extra/".
    folder_prefix = f"{archive_root}{folder}/" if folder else archive_root

    # Extract only the needed folder from the ZIP
    with ZipFile(BytesIO(response.content)) as zip_file:
        for member in zip_file.namelist():
            if not member.startswith(folder_prefix):
                continue
            if member.endswith('/'):  # Skip directory entries
                continue

            # Remove the repository name and branch prefix from the path
            relative_path = member[len(archive_root):]

            output_path = destination / relative_path
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_bytes(zip_file.read(member))
40
+
41
def setup_imagebind():
    """Fetch the ImageBind package sources and its setup.py into the current directory.

    Any existing ``imagebind`` directory is removed first so the download
    starts from a clean tree.
    """
    package_dir = Path("imagebind")
    if package_dir.exists():
        shutil.rmtree(package_dir)

    # Only the "imagebind" folder of the repository is needed.
    download_github_folder(
        "facebookresearch",
        "ImageBind",
        "imagebind",
        Path("."),
    )

    # setup.py lives at the repository root, outside the extracted folder.
    download_file(
        "https://raw.githubusercontent.com/facebookresearch/ImageBind/main/setup.py",
        Path("setup.py"),
    )
60
+
61
+ if __name__ == "__main__":
62
+ setup_imagebind()