imagebind2

Runtime error

App Files Files Community

fcastrovilli committed on Nov 6, 2024

Commit

64e55d6

1 Parent(s): 5f1fdf7

feat: imagebind

Browse files

Files changed (6) hide show

.gitignore +11 -0
Dockerfile +67 -0
README.md +50 -0
main.py +193 -0
requirements.txt +18 -0
setup_imagebind.py +62 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,11 @@

+**__pycache__
+.vscode
+.idea/
+.python-version
+build/
+imagebind.egg-info
+.DS_Store
+venv/
+.checkpoints/
+imagebind/
+setup.py

Dockerfile ADDED Viewed

	@@ -0,0 +1,67 @@

+# Stage 1: Build stage
+FROM python:3.10-slim AS builder
+# Install build dependencies.
+# git is needed because requirements.txt installs pytorchvideo from a
+# git+https URL and the slim base image does not ship a git client.
+RUN apt-get update && apt-get install -y \
+    git \
+    ffmpeg \
+    libsndfile1 \
+    && rm -rf /var/lib/apt/lists/*
+# Create a non-root user
+RUN useradd -m -u 1000 user
+# Create the working directory while still root and hand it to the non-root
+# user: a directory created implicitly by WORKDIR is owned by root, which
+# would block the venv creation and the downloads performed below.
+RUN mkdir -p /app && chown user /app
+# Switch to non-root user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+# Set working directory
+WORKDIR /app
+# Copy requirements and setup scripts
+COPY --chown=user requirements.txt setup_imagebind.py ./
+# Install dependencies into a virtual environment
+RUN python -m venv /app/venv
+ENV PATH="/app/venv/bin:$PATH"
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+# Run setup script to download ImageBind (writes ./imagebind and ./setup.py)
+RUN python setup_imagebind.py
+# Install ImageBind
+RUN pip install --no-cache-dir .
+# Stage 2: Runtime stage
+FROM python:3.10-slim
+# Install runtime dependencies only
+RUN apt-get update && apt-get install -y \
+    ffmpeg \
+    libsndfile1 \
+    && rm -rf /var/lib/apt/lists/*
+# Create a non-root user
+RUN useradd -m -u 1000 user
+# Make /app writable by the non-root user before switching to it.
+# NOTE(review): ImageBind appears to download its checkpoint into the current
+# working directory on first start - confirm against the imagebind package.
+RUN mkdir -p /app && chown user /app
+# Switch to non-root user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+# Set working directory
+WORKDIR /app
+# Copy virtual environment from builder
+COPY --from=builder --chown=user /app/venv /app/venv
+ENV PATH="/app/venv/bin:$PATH"
+# Copy ImageBind from builder
+COPY --from=builder --chown=user /app/imagebind /app/imagebind
+# Copy application code
+COPY --chown=user main.py .
+# Expose the port
+EXPOSE 8000
+# Command to run the application
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

README.md CHANGED Viewed

@@ -9,4 +9,54 @@ license: mit
 short_description: Small imagebind api implementation
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 short_description: Small imagebind api implementation
 ---
+# ImageBind API Implementation
+A FastAPI implementation of Facebook's ImageBind model for cross-modal embeddings.
+## Local Setup
+1. Install system dependencies:
+```bash
+sudo apt-get update && sudo apt-get install -y ffmpeg libsndfile1
+```
+2. Create and activate a virtual environment:
+```bash
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+```
+3. Install Python dependencies:
+```bash
+pip install --no-cache-dir --upgrade -r requirements.txt
+```
+4. Download and setup ImageBind:
+```bash
+python setup_imagebind.py
+pip install --no-cache-dir .
+```
+## Docker Setup
+Build and run the container:
+```bash
+docker build -t imagebind-api .
+docker run -p 8000:8000 imagebind-api
+```
+## API Endpoints
+The API will be available at `http://localhost:8000` with the following endpoints:
+- POST `/compute_embeddings`: Generate embeddings for images, audio files, and text
+- POST `/compute_similarities`: Compute similarities between embeddings
+For detailed API documentation, visit `http://localhost:8000/docs`
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

main.py ADDED Viewed

	@@ -0,0 +1,193 @@

+import os
+import torch
+from imagebind import data
+from imagebind.models import imagebind_model
+from imagebind.models.imagebind_model import ModalityType
+from pydub import AudioSegment
+from fastapi import FastAPI, UploadFile, File, Form
+from typing import List, Dict
+import tempfile
+from pydantic import BaseModel
+import uvicorn
+import numpy as np
+# ASGI application object; the @app.post decorators in this module register
+# their handlers on it, and uvicorn serves it.
+app = FastAPI()
+def convert_audio_to_wav(audio_path: str) -> str:
+    """Convert MP3 to WAV if necessary."""
+    if audio_path.lower().endswith('.mp3'):
+        wav_path = audio_path.rsplit('.', 1)[0] + '.wav'
+        if not os.path.exists(wav_path):
+            audio = AudioSegment.from_mp3(audio_path)
+            audio.export(wav_path, format='wav')
+        return wav_path
+    return audio_path
+class EmbeddingManager:
+    def __init__(self):
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        self.model = imagebind_model.imagebind_huge(pretrained=True)
+        self.model.eval()
+        self.model.to(self.device)
+    def compute_embeddings(self,
+                          images: List[str] = None,
+                          audio_files: List[str] = None,
+                          texts: List[str] = None) -> dict:
+        """Compute embeddings for provided modalities only."""
+        with torch.no_grad():
+            inputs = {}
+            if texts:
+                inputs[ModalityType.TEXT] = data.load_and_transform_text(texts, self.device)
+            if images:
+                inputs[ModalityType.VISION] = data.load_and_transform_vision_data(images, self.device)
+            if audio_files:
+                inputs[ModalityType.AUDIO] = data.load_and_transform_audio_data(audio_files, self.device)
+            if not inputs:
+                return {}
+            embeddings = self.model(inputs)
+            result = {}
+            if ModalityType.VISION in inputs:
+                result['vision'] = embeddings[ModalityType.VISION].cpu().numpy().tolist()
+            if ModalityType.AUDIO in inputs:
+                result['audio'] = embeddings[ModalityType.AUDIO].cpu().numpy().tolist()
+            if ModalityType.TEXT in inputs:
+                result['text'] = embeddings[ModalityType.TEXT].cpu().numpy().tolist()
+            return result
+    @staticmethod
+    def compute_similarities(embeddings: Dict[str, List[List[float]]]) -> dict:
+        """Compute similarities between available embeddings."""
+        similarities = {}
+        # Convert available embeddings to tensors
+        tensors = {
+            k: torch.tensor(v) for k, v in embeddings.items()
+            if isinstance(v, (list, np.ndarray)) and len(v) > 0
+        }
+        # Compute cross-modal similarities
+        modality_pairs = [
+            ('vision', 'audio', 'vision_audio'),
+            ('vision', 'text', 'vision_text'),
+            ('audio', 'text', 'audio_text')
+        ]
+        for mod1, mod2, key in modality_pairs:
+            if mod1 in tensors and mod2 in tensors:
+                similarities[key] = torch.softmax(
+                    tensors[mod1] @ tensors[mod2].T,
+                    dim=-1
+                ).numpy().tolist()
+        # Compute same-modality similarities
+        for modality in ['vision', 'audio', 'text']:
+            if modality in tensors:
+                key = f'{modality}_{modality}'
+                similarities[key] = torch.softmax(
+                    tensors[modality] @ tensors[modality].T,
+                    dim=-1
+                ).numpy().tolist()
+        return similarities
+# Initialize the embedding manager
+# Built once at import time; the route handlers in this module share this
+# single instance instead of loading the model per request.
+embedding_manager = EmbeddingManager()
+class EmbeddingResponse(BaseModel):
+    """Response body of the ``/compute_embeddings`` endpoint."""
+    # Embedding vectors keyed by modality name ('vision', 'audio', 'text').
+    embeddings: dict
+    # Inputs echoed back: uploaded file names under 'images' / 'audio' and
+    # the parsed text lines under 'texts'.
+    file_names: dict
+class SimilarityResponse(BaseModel):
+    """Response body of the ``/compute_similarities`` endpoint."""
+    # Similarity matrices keyed by modality pair, e.g. 'vision_text'.
+    similarities: dict
+@app.post("/compute_embeddings", response_model=EmbeddingResponse)
+async def generate_embeddings(
+    texts: str | None = Form(None),
+    images: List[UploadFile] | None = File(default=None),
+    audio_files: List[UploadFile] | None = File(default=None)
+):
+    """Generate embeddings for any provided files and texts."""
+    temp_files = []
+    try:
+        image_paths = []
+        image_names = []
+        audio_paths = []
+        audio_names = []
+        text_list = []
+        # Process images if provided
+        if images:
+            for img in images:
+                with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(img.filename)[1]) as tmp:
+                    content = await img.read()
+                    tmp.write(content)
+                    image_paths.append(tmp.name)
+                    image_names.append(img.filename)
+                    temp_files.append(tmp.name)
+        # Process audio files if provided
+        if audio_files:
+            for audio in audio_files:
+                with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(audio.filename)[1]) as tmp:
+                    content = await audio.read()
+                    tmp.write(content)
+                    audio_path = convert_audio_to_wav(tmp.name)
+                    audio_paths.append(audio_path)
+                    audio_names.append(audio.filename)
+                    temp_files.append(tmp.name)
+                    if audio_path != tmp.name:
+                        temp_files.append(audio_path)
+        # Process texts if provided
+        if texts:
+            text_list = [text.strip() for text in texts.split('\n') if text.strip()]
+        # Compute embeddings only if we have any input
+        if not any([image_paths, audio_paths, text_list]):
+            return EmbeddingResponse(
+                embeddings={},
+                file_names={}
+            )
+        embeddings = embedding_manager.compute_embeddings(
+            image_paths if image_paths else None,
+            audio_paths if audio_paths else None,
+            text_list if text_list else None
+        )
+        file_names = {}
+        if image_names:
+            file_names['images'] = image_names
+        if audio_names:
+            file_names['audio'] = audio_names
+        if text_list:
+            file_names['texts'] = text_list
+        return EmbeddingResponse(
+            embeddings=embeddings,
+            file_names=file_names
+        )
+    finally:
+        # Clean up temporary files
+        for temp_file in temp_files:
+            try:
+                os.unlink(temp_file)
+            except:
+                pass
+@app.post("/compute_similarities", response_model=SimilarityResponse)
+async def compute_similarities(embeddings: Dict[str, List[List[float]]]):
+    """Compute similarities from provided embeddings."""
+    similarities = embedding_manager.compute_similarities(embeddings)
+    return SimilarityResponse(similarities=similarities)
+# When executed directly (not imported by an ASGI server), serve the app with
+# uvicorn on all interfaces, port 8000.
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)

requirements.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+torch>=2.0.0,<2.1.0
+torchvision>=0.15.0,<0.16.0
+torchaudio>=2.0.0,<2.1.0
+pytorchvideo @ git+https://github.com/facebookresearch/pytorchvideo.git@main
+timm>=0.9.0,<0.10.0
+ftfy
+regex
+einops
+fvcore
+eva-decord>=0.6.1
+iopath
+numpy>=1.24.0,<2.0.0
+matplotlib
+types-regex
+pydub
+fastapi
+uvicorn
+python-multipart

setup_imagebind.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import shutil
+import requests
+from pathlib import Path
+from zipfile import ZipFile
+from io import BytesIO
+def download_file(url: str, destination: Path) -> None:
+    """Download a file from URL to the specified destination."""
+    response = requests.get(url)
+    response.raise_for_status()
+    destination.write_bytes(response.content)
+def download_github_folder(repo_owner: str, repo_name: str, folder_path: str, destination: Path) -> None:
+    """Download a specific folder from a GitHub repository using the ZIP download feature."""
+    # Download the whole repository as a ZIP file
+    zip_url = f"https://github.com/{repo_owner}/{repo_name}/archive/refs/heads/main.zip"
+    response = requests.get(zip_url)
+    response.raise_for_status()
+    # Extract only the needed folder from the ZIP
+    with ZipFile(BytesIO(response.content)) as zip_file:
+        folder_prefix = f"{repo_name}-main/{folder_path}"
+        # Extract only files from the specified folder
+        for file in zip_file.namelist():
+            if file.startswith(folder_prefix):
+                # Remove the repository name and branch prefix from the path
+                relative_path = file.replace(f"{repo_name}-main/", "", 1)
+                if relative_path.endswith('/'):  # Skip directory entries
+                    continue
+                # Read the file content from ZIP
+                content = zip_file.read(file)
+                # Create the file path
+                output_path = destination / relative_path
+                output_path.parent.mkdir(parents=True, exist_ok=True)
+                # Write the file
+                output_path.write_bytes(content)
+def setup_imagebind():
+    """Setup ImageBind by downloading only required files."""
+    # Create clean imagebind directory if needed
+    imagebind_dir = Path("imagebind")
+    if imagebind_dir.exists():
+        shutil.rmtree(imagebind_dir)
+    # Download the imagebind folder
+    download_github_folder(
+        repo_owner="facebookresearch",
+        repo_name="ImageBind",
+        folder_path="imagebind",
+        destination=Path(".")
+    )
+    # Download setup.py file
+    setup_py_url = "https://raw.githubusercontent.com/facebookresearch/ImageBind/main/setup.py"
+    setup_py_path = Path("setup.py")
+    download_file(setup_py_url, setup_py_path)
+# Script entry point: run the download when executed directly.
+if __name__ == "__main__":
+    setup_imagebind()