imagebind2

Runtime error

App Files Files Community

opex792 committed on May 8, 2025

Commit

5bab2e7

verified ·

1 Parent(s): e6d5f01

Update main.py

Browse files

Files changed (1) hide show

main.py +298 -307

main.py CHANGED Viewed

@@ -2,26 +2,119 @@ import os
 import torch
 from imagebind import data
 from imagebind.models import imagebind_model
-from imagebind.models.imagebind_model import ModalityType
 from pydub import AudioSegment
-from fastapi import FastAPI, UploadFile, File, Form
-from typing import List, Dict
 import tempfile
-from pydantic import BaseModel
 import uvicorn
 import numpy as np
-from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
-from fastapi import Depends, HTTPException, status
-app = FastAPI()
-# Add these lines after the app initialization
 security = HTTPBearer()
-API_TOKEN = os.getenv("API_TOKEN", "your-default-token-here")  # Set a default token or use environment variable
-# Add this function for token verification
 async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
-    if credentials.credentials != API_TOKEN:
         raise HTTPException(
             status_code=status.HTTP_401_UNAUTHORIZED,
             detail="Invalid authentication token",
@@ -29,333 +122,231 @@ async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(secur
         )
     return credentials.credentials
-def convert_audio_to_wav(audio_path: str) -> str:
-    """Convert MP3 to WAV if necessary."""
-    if audio_path.lower().endswith('.mp3'):
         wav_path = audio_path.rsplit('.', 1)[0] + '.wav'
-        if not os.path.exists(wav_path):
-            audio = AudioSegment.from_mp3(audio_path)
             audio.export(wav_path, format='wav')
-        return wav_path
     return audio_path
-class EmbeddingManager:
-    def __init__(self):
-        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
-        self.model = imagebind_model.imagebind_huge(pretrained=True)
-        self.model.eval()
-        self.model.to(self.device)
-    def compute_embeddings(self,
-                          images: List[str] = None,
-                          audio_files: List[str] = None,
-                          texts: List[str] = None) -> dict:
-        """Compute embeddings for provided modalities only."""
-        with torch.no_grad():
-            inputs = {}
-            if texts:
-                inputs[ModalityType.TEXT] = data.load_and_transform_text(texts, self.device)
-            if images:
-                inputs[ModalityType.VISION] = data.load_and_transform_vision_data(images, self.device)
-            if audio_files:
-                inputs[ModalityType.AUDIO] = data.load_and_transform_audio_data(audio_files, self.device)
-            if not inputs:
-                return {}
-            embeddings = self.model(inputs)
-            result = {}
-            if ModalityType.VISION in inputs:
-                result['vision'] = embeddings[ModalityType.VISION].cpu().numpy().tolist()
-            if ModalityType.AUDIO in inputs:
-                result['audio'] = embeddings[ModalityType.AUDIO].cpu().numpy().tolist()
-            if ModalityType.TEXT in inputs:
-                result['text'] = embeddings[ModalityType.TEXT].cpu().numpy().tolist()
-            return result
-    @staticmethod
-    def compute_similarities(embeddings: Dict[str, List[List[float]]]) -> dict:
-        """Compute similarities between available embeddings."""
-        similarities = {}
-        # Convert available embeddings to tensors
-        tensors = {
-            k: torch.tensor(v) for k, v in embeddings.items()
-            if isinstance(v, (list, np.ndarray)) and len(v) > 0
-        }
-        # Compute cross-modal similarities
-        modality_pairs = [
-            ('vision', 'audio', 'vision_audio'),
-            ('vision', 'text', 'vision_text'),
-            ('audio', 'text', 'audio_text')
-        ]
-        for mod1, mod2, key in modality_pairs:
-            if mod1 in tensors and mod2 in tensors:
-                similarities[key] = torch.softmax(
-                    tensors[mod1] @ tensors[mod2].T,
-                    dim=-1
-                ).numpy().tolist()
-        # Compute same-modality similarities
-        for modality in ['vision', 'audio', 'text']:
-            if modality in tensors:
-                key = f'{modality}_{modality}'
-                similarities[key] = torch.softmax(
-                    tensors[modality] @ tensors[modality].T,
-                    dim=-1
-                ).numpy().tolist()
-        return similarities
-# Initialize the embedding manager
-embedding_manager = EmbeddingManager()
-class EmbeddingResponse(BaseModel):
-    embeddings: dict
-    file_names: dict
-class SimilarityRequest(BaseModel):
-    embeddings: Dict[str, List[List[float]]]
-    threshold: float = 0.5
-    top_k: int | None = None
-    include_self_similarity: bool = False
-    normalize_scores: bool = True
 class SimilarityMatch(BaseModel):
-    index_a: int
-    index_b: int
-    score: float
-    modality_a: str
-    modality_b: str
-    item_a: str  # Original item identifier (filename or text)
-    item_b: str  # Original item identifier (filename or text)
 class SimilarityResponse(BaseModel):
     matches: List[SimilarityMatch]
-    statistics: Dict[str, float]  # Contains avg_score, max_score, etc.
-    modality_pairs: List[str]  # Lists which modality comparisons were performed
-class ModalityPair:
-    def __init__(self, mod1: str, mod2: str):
-        self.mod1 = min(mod1, mod2)  # Ensure consistent ordering
-        self.mod2 = max(mod1, mod2)
-    def __str__(self):
-        return f"{self.mod1}_to_{self.mod2}"
-def compute_similarity_matrix(tensor1: torch.Tensor, tensor2: torch.Tensor, normalize: bool = True) -> torch.Tensor:
-    """Compute cosine similarity between two sets of embeddings."""
-    # Normalize embeddings if requested
-    if normalize:
-        tensor1 = torch.nn.functional.normalize(tensor1, dim=1)
-        tensor2 = torch.nn.functional.normalize(tensor2, dim=1)
-    # Compute similarity matrix
-    similarity = torch.matmul(tensor1, tensor2.T)
-    return similarity
-def get_top_k_matches(similarity_matrix: torch.Tensor, top_k: int | None = None) -> List[tuple]:
-    """Get top-k matches from a similarity matrix."""
-    if top_k is None:
-        top_k = similarity_matrix.numel()
-    # Flatten and get top-k indices
-    flat_sim = similarity_matrix.flatten()
-    top_k = min(top_k, flat_sim.numel())
-    values, indices = torch.topk(flat_sim, k=top_k)
-    # Convert flat indices to 2D indices
-    rows = indices // similarity_matrix.size(1)
-    cols = indices % similarity_matrix.size(1)
-    return [(r.item(), c.item(), v.item()) for r, c, v in zip(rows, cols, values)]
-@app.post("/compute_embeddings", response_model=EmbeddingResponse)
-async def generate_embeddings(
-    credentials: HTTPAuthorizationCredentials = Depends(verify_token),
-    texts: str | None = Form(None),
-    images: List[UploadFile] | None = File(default=None),
-    audio_files: List[UploadFile] | None = File(default=None)
 ):
-    """Generate embeddings for any provided files and texts."""
-    temp_files = []
     try:
-        image_paths = []
-        image_names = []
-        audio_paths = []
-        audio_names = []
-        text_list = []
-        # Process images if provided
         if images:
-            for img in images:
-                with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(img.filename)[1]) as tmp:
-                    content = await img.read()
-                    tmp.write(content)
-                    image_paths.append(tmp.name)
-                    image_names.append(img.filename)
-                    temp_files.append(tmp.name)
-        # Process audio files if provided
         if audio_files:
-            for audio in audio_files:
-                with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(audio.filename)[1]) as tmp:
-                    content = await audio.read()
-                    tmp.write(content)
-                    audio_path = convert_audio_to_wav(tmp.name)
-                    audio_paths.append(audio_path)
-                    audio_names.append(audio.filename)
-                    temp_files.append(tmp.name)
-                    if audio_path != tmp.name:
-                        temp_files.append(audio_path)
-        # Process texts if provided
-        if texts:
-            text_list = [text.strip() for text in texts.split('\n') if text.strip()]
-        # Compute embeddings only if we have any input
-        if not any([image_paths, audio_paths, text_list]):
-            return EmbeddingResponse(
-                embeddings={},
-                file_names={}
-            )
-        embeddings = embedding_manager.compute_embeddings(
-            image_paths if image_paths else None,
-            audio_paths if audio_paths else None,
-            text_list if text_list else None
         )
-        file_names = {}
-        if image_names:
-            file_names['images'] = image_names
-        if audio_names:
-            file_names['audio'] = audio_names
-        if text_list:
-            file_names['texts'] = text_list
-        return EmbeddingResponse(
-            embeddings=embeddings,
-            file_names=file_names
-        )
     finally:
-        # Clean up temporary files
-        for temp_file in temp_files:
             try:
-                os.unlink(temp_file)
-            except:
-                pass
-@app.post("/compute_similarities", response_model=SimilarityResponse)
-async def compute_similarities(
-    request: SimilarityRequest,
-    file_names: Dict[str, List[str]],  # Maps modality to list of file/text names
-    credentials: HTTPAuthorizationCredentials = Depends(verify_token)
-):
-    """
-    Compute cross-modal similarities with advanced filtering and matching options.
-    Parameters:
-    - embeddings: Dict mapping modality to embedding tensors
-    - threshold: Minimum similarity score to include in results
-    - top_k: Maximum number of matches to return (per modality pair)
-    - include_self_similarity: Whether to include same-item comparisons
-    - normalize_scores: Whether to normalize embeddings before comparison
-    - file_names: Dict mapping modality to list of original file/text names
-    """
-    matches = []
-    statistics = {
-        "avg_score": 0.0,
-        "max_score": 0.0,
-        "min_score": 1.0,
-        "total_comparisons": 0
-    }
-    # Convert embeddings to tensors
-    tensors = {
-        k: torch.tensor(v) for k, v in request.embeddings.items()
-        if isinstance(v, (list, np.ndarray)) and len(v) > 0
     }
-    modality_pairs = []
-    all_scores = []
-    # Get all possible modality pairs
-    modalities = list(tensors.keys())
-    for i, mod1 in enumerate(modalities):
-        for mod2 in modalities[i:]:  # Include self-comparisons if requested
-            if mod1 == mod2 and not request.include_self_similarity:
-                continue
-            pair = ModalityPair(mod1, mod2)
-            modality_pairs.append(str(pair))
-            # Compute similarity matrix
-            sim_matrix = compute_similarity_matrix(
-                tensors[mod1],
-                tensors[mod2],
-                normalize=request.normalize_scores
-            )
-            # Get top matches
-            top_matches = get_top_k_matches(sim_matrix, request.top_k)
-            # Filter by threshold and create match objects
-            for idx_a, idx_b, score in top_matches:
-                if score < request.threshold:
-                    continue
-                # Skip self-matches if not requested
-                if mod1 == mod2 and idx_a == idx_b and not request.include_self_similarity:
-                    continue
-                matches.append(SimilarityMatch(
-                    index_a=idx_a,
-                    index_b=idx_b,
-                    score=float(score),
-                    modality_a=mod1,
-                    modality_b=mod2,
-                    item_a=file_names[mod1][idx_a],
-                    item_b=file_names[mod2][idx_b]
-                ))
-                all_scores.append(score)
-    # Compute statistics
-    if all_scores:
-        statistics.update({
-            "avg_score": float(np.mean(all_scores)),
-            "max_score": float(np.max(all_scores)),
-            "min_score": float(np.min(all_scores)),
-            "total_comparisons": len(all_scores)
-        })
-    # Sort matches by score in descending order
-    matches.sort(key=lambda x: x.score, reverse=True)
     return SimilarityResponse(
-        matches=matches,
-        statistics=statistics,
-        modality_pairs=modality_pairs
     )
-@app.get("/health")
-async def health_check(
-    credentials: HTTPAuthorizationCredentials = Depends(verify_token)
-):
-    """Basic healthcheck endpoint that returns the status of the service."""
     return {
         "status": "healthy",
-        "model_device": embedding_manager.device
     }
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)

 import torch
 from imagebind import data
 from imagebind.models import imagebind_model
+from imagebind.models.imagebind_model import ModalityType as ImageBindModalityType
 from pydub import AudioSegment
+from fastapi import FastAPI, UploadFile, File, Form, Depends, HTTPException, status
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+from fastapi.concurrency import run_in_threadpool
+from pydantic import BaseModel, Field, BaseSettings
+from typing import List, Dict, Optional, Tuple, Any
 import tempfile
 import uvicorn
 import numpy as np
+import logging
+import secrets
+from contextlib import asynccontextmanager
+from enum import Enum
+class Settings(BaseSettings):
+    """Runtime configuration for the service.
+
+    NOTE(review): ``BaseSettings`` is imported from ``pydantic`` at the top of
+    the file; pydantic 2.x moved it to the separate ``pydantic-settings``
+    package, so confirm which pydantic version this deployment pins.
+    """
+    # Bearer token that ``verify_token`` compares incoming credentials against.
+    # NOTE(review): the default is a placeholder literal -- confirm it is
+    # always overridden in a real deployment.
+    api_token: str = "your-default-token-here"
+    # Optional device override; when unset, EmbeddingManager picks
+    # "cuda:0" if CUDA is available and "cpu" otherwise.
+    model_device: Optional[str] = None
+    # Level name passed to logging.basicConfig and to uvicorn.
+    log_level: str = "INFO"
+    class Config:
+        # Values may also be supplied through a local ``.env`` file.
+        env_file = ".env"
+        env_file_encoding = 'utf-8'
+# Module-wide settings instance, built once at import time.
+settings = Settings()
+logging.basicConfig(level=settings.log_level.upper())
+logger = logging.getLogger(__name__)
+class EmbeddingManager:
+    """Process-wide singleton that owns the ImageBind model and computes embeddings.
+
+    ``__new__`` always hands back the same instance and ``__init__`` loads the
+    model only the first time, so constructing ``EmbeddingManager()`` again
+    does not reload the weights.
+    """
+    _instance = None
+    def __new__(cls, *args, **kwargs):
+        # object.__new__ rejects extra arguments once __new__ is overridden,
+        # so the arguments are accepted here but deliberately not forwarded.
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+    def __init__(self):
+        """Pick the device and load the pretrained model (first construction only).
+
+        Raises:
+            RuntimeError: if the model cannot be created or moved to the device.
+        """
+        if getattr(self, 'initialized', False):
+            return
+        self.device = settings.model_device or ("cuda:0" if torch.cuda.is_available() else "cpu")
+        logger.info(f"Initializing EmbeddingManager on device: {self.device}")
+        try:
+            self.model = imagebind_model.imagebind_huge(pretrained=True)
+            self.model.eval()
+            self.model.to(self.device)
+        except Exception as e:
+            logger.error(f"Failed to load ImageBind model: {e}")
+            # Chain the cause so the original traceback is preserved.
+            raise RuntimeError(f"Failed to load ImageBind model: {e}") from e
+        self.initialized = True
+        logger.info("ImageBind model loaded successfully.")
+    def _forward(self, inputs):
+        """Run the model synchronously with autograd disabled.
+
+        Grad mode is thread-local in PyTorch, so ``torch.no_grad()`` has to be
+        entered inside the worker thread that executes the forward pass rather
+        than in the coroutine that schedules it.
+        """
+        with torch.no_grad():
+            return self.model(inputs)
+    async def compute_embeddings(self,
+                                 image_inputs: Optional[List[Tuple[str, str]]] = None,
+                                 audio_inputs: Optional[List[Tuple[str, str]]] = None,
+                                 text_inputs: Optional[List[str]] = None,
+                                 depth_inputs: Optional[List[Tuple[str, str]]] = None,
+                                 thermal_inputs: Optional[List[Tuple[str, str]]] = None,
+                                 imu_inputs: Optional[List[Tuple[str, str]]] = None
+                                 ) -> Dict[str, List[Dict[str, Any]]]:
+        """Embed the supplied inputs and return them grouped by modality.
+
+        Args:
+            image_inputs: ``(path, id)`` pairs for image files.
+            audio_inputs: ``(path, id)`` pairs for audio files.
+            text_inputs: Raw strings; each string is also used as its own id.
+            depth_inputs, thermal_inputs, imu_inputs: Accepted but only logged
+                as not implemented; they produce no embeddings.
+
+        Returns:
+            ``{modality: [{"id": ..., "embedding": [...]}, ...]}``, or ``{}``
+            when no text, image or audio input was given.
+        """
+        inputs = {}
+        input_ids = {}
+        if text_inputs:
+            inputs[ImageBindModalityType.TEXT] = data.load_and_transform_text(text_inputs, self.device)
+            input_ids[ImageBindModalityType.TEXT] = text_inputs
+        if image_inputs:
+            paths = [item[0] for item in image_inputs]
+            inputs[ImageBindModalityType.VISION] = data.load_and_transform_vision_data(paths, self.device)
+            input_ids[ImageBindModalityType.VISION] = [item[1] for item in image_inputs]
+        if audio_inputs:
+            paths = [item[0] for item in audio_inputs]
+            inputs[ImageBindModalityType.AUDIO] = data.load_and_transform_audio_data(paths, self.device)
+            input_ids[ImageBindModalityType.AUDIO] = [item[1] for item in audio_inputs]
+        if depth_inputs:
+            logger.warning("Depth modality processing is not yet fully implemented.")
+        if thermal_inputs:
+            logger.warning("Thermal modality processing is not yet fully implemented.")
+        if imu_inputs:
+            logger.warning("IMU modality processing is not yet fully implemented.")
+        if not inputs:
+            return {}
+        raw_embeddings = await run_in_threadpool(self._forward, inputs)
+        result_embeddings = {}
+        for modality_type, embeddings_tensor in raw_embeddings.items():
+            # NOTE(review): upstream ImageBind appears to define ModalityType
+            # with plain-string values, which have no ``.name``; accept either
+            # an Enum-like key or a bare string. Confirm against the installed
+            # imagebind version.
+            modality_key = str(getattr(modality_type, "name", modality_type)).lower()
+            result_embeddings[modality_key] = []
+            ids_for_modality = input_ids.get(modality_type, [])
+            for i, emb in enumerate(embeddings_tensor.cpu().numpy().tolist()):
+                # Fall back to a positional id if the model returned more rows than ids.
+                item_id = ids_for_modality[i] if i < len(ids_for_modality) else f"item_{i}"
+                result_embeddings[modality_key].append({"id": item_id, "embedding": emb})
+        return result_embeddings
+# Set by ``lifespan`` at application startup; remains None until then.
+embedding_manager: Optional[EmbeddingManager] = None
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Create the EmbeddingManager when the app starts and log on shutdown.
+
+    The statements before ``yield`` run at startup; the one after it runs
+    at shutdown.
+    """
+    global embedding_manager
+    logger.info("Application startup...")
+    embedding_manager = EmbeddingManager()
+    # Record the device actually selected so /health can report it.
+    settings.model_device = embedding_manager.device
+    yield
+    logger.info("Application shutdown...")
+app = FastAPI(lifespan=lifespan, title="ImageBind API", version="0.2.0")
+# Bearer-credential extractor that ``verify_token`` depends on.
 security = HTTPBearer()
 async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
+    """Check the request's bearer credentials against ``settings.api_token``.
+
+    Returns:
+        The presented token string when it is valid.
+
+    Raises:
+        HTTPException: 401 when the scheme is not Bearer or the token differs.
+    """
+    # Auth scheme names are case-insensitive, so "bearer" must be accepted too.
+    scheme_ok = credentials.scheme.lower() == "bearer"
+    # Constant-time comparison avoids leaking how much of the token matched;
+    # comparing bytes also allows non-ASCII tokens.
+    token_ok = secrets.compare_digest(
+        credentials.credentials.encode("utf-8"),
+        settings.api_token.encode("utf-8"),
+    )
+    if not (scheme_ok and token_ok):
+        logger.warning(f"Invalid authentication attempt. Scheme: {credentials.scheme}")
         raise HTTPException(
             status_code=status.HTTP_401_UNAUTHORIZED,
             detail="Invalid authentication token",
         )
     return credentials.credentials
+async def _save_upload_file_tmp(upload_file: UploadFile) -> Tuple[str, str]:
+    """Write an uploaded file to a temporary path.
+
+    Returns:
+        ``(temp_path, original_filename)``. The temp file is created with
+        ``delete=False``, so the caller is responsible for removing it.
+
+    Raises:
+        HTTPException: 500 if the upload cannot be read or written.
+    """
+    # NOTE(review): the upload's filename can apparently be missing; fall back
+    # to a fixed name so the suffix lookup and the returned id stay strings.
+    original_name = upload_file.filename or "upload"
+    tmp_path = None
+    try:
+        suffix = os.path.splitext(original_name)[1]
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+            tmp_path = tmp.name
+            tmp.write(await upload_file.read())
+        return tmp_path, original_name
+    except Exception as e:
+        logger.error(f"Error saving uploaded file {original_name}: {e}")
+        # The caller never learns this path on failure, so remove the
+        # half-written file here instead of leaking it.
+        if tmp_path is not None:
+            try:
+                os.unlink(tmp_path)
+            except OSError:
+                pass
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Could not save file: {original_name}") from e
+def convert_audio_to_wav(audio_path: str, original_filename: str) -> str:
+    """Convert an audio file to WAV unless its path already ends in ``.wav``.
+
+    Args:
+        audio_path: Path of the file on disk.
+        original_filename: Name used only in log and error messages.
+
+    Returns:
+        ``audio_path`` unchanged for WAV input; otherwise the path of the new
+        WAV file (the source file is removed after a successful conversion).
+
+    Raises:
+        HTTPException: 400 if the file cannot be decoded or exported.
+    """
+    # "ends with .mp3 or does not end with .wav" reduces to this single test.
+    if audio_path.lower().endswith('.wav'):
+        return audio_path
+    # splitext only looks at the final path component, so a dot in a
+    # directory name cannot truncate the path.
+    wav_path = os.path.splitext(audio_path)[0] + '.wav'
+    try:
+        logger.info(f"Converting {original_filename} to WAV format.")
+        audio = AudioSegment.from_file(audio_path)
+        audio.export(wav_path, format='wav')
+    except Exception as e:
+        logger.error(f"Error converting audio file {original_filename} to WAV: {e}")
+        # Best effort: do not leave a partially written WAV behind.
+        try:
+            os.unlink(wav_path)
+        except OSError:
+            pass
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Could not process audio file {original_filename}: {e}") from e
+    # The source is no longer needed once the WAV exists; failure to delete
+    # it is not an error for the request.
+    if audio_path != wav_path and os.path.exists(audio_path):
+        try:
+            os.unlink(audio_path)
+        except OSError:
+            pass
+    return wav_path
+class ModalityType(str, Enum):
+    """Closed set of modality names used in request and response models.
+
+    Members mix in ``str``, so they compare and hash equal to their plain
+    string values (``ModalityType.VISION == "vision"``) and can be used
+    interchangeably with those strings as dict keys, while also providing
+    the ``.value`` attribute that the similarity endpoint reads.
+    """
+    VISION = "vision"
+    AUDIO = "audio"
+    TEXT = "text"
+    DEPTH = "depth"
+    THERMAL = "thermal"
+    IMU = "imu"
+class EmbeddingItem(BaseModel):
+    """One embedded item: its identifier together with its embedding vector."""
+    id: str = Field(..., description="Identifier of the item (e.g., filename or text content)")
+    embedding: List[float] = Field(..., description="The computed embedding vector")
+class EmbeddingPayload(BaseModel):
+    """Embeddings grouped by modality; every group is optional."""
+    vision: Optional[List[EmbeddingItem]] = Field(None, description="List of vision embeddings")
+    audio: Optional[List[EmbeddingItem]] = Field(None, description="List of audio embeddings")
+    text: Optional[List[EmbeddingItem]] = Field(None, description="List of text embeddings")
+    # The three fields below are declared for future use only.
+    depth: Optional[List[EmbeddingItem]] = Field(None, description="List of depth embeddings (future support)")
+    thermal: Optional[List[EmbeddingItem]] = Field(None, description="List of thermal embeddings (future support)")
+    imu: Optional[List[EmbeddingItem]] = Field(None, description="List of IMU embeddings (future support)")
+class EmbeddingResponse(BaseModel):
+    """Response body of /compute_embeddings."""
+    embeddings: EmbeddingPayload
+    message: str = "Embeddings computed successfully"
 class SimilarityMatch(BaseModel):
+    """A single pair of items whose similarity score passed the threshold."""
+    item_a_id: str
+    item_b_id: str
+    modality_a: ModalityType
+    modality_b: ModalityType
+    # NOTE(review): the upper bound leaves a small margin above 1.0,
+    # presumably for floating-point rounding. When a request sets
+    # normalize_scores=False the raw dot products are not bounded by 1, so
+    # this constraint can reject such scores -- confirm that is intended.
+    score: float = Field(..., ge=0.0, le=1.0001)
+class SimilarityRequest(BaseModel):
+    """Request body of /compute_similarities: embeddings plus filtering options."""
+    embeddings_payload: EmbeddingPayload = Field(..., description="Payload containing embeddings from the /compute_embeddings endpoint")
+    threshold: float = Field(0.5, ge=0.0, le=1.0, description="Minimum similarity score to include in results")
+    top_k: Optional[int] = Field(None, gt=0, description="Maximum number of matches to return per modality pair comparison. If None, all matches above threshold are returned.")
+    normalize_scores: bool = Field(True, description="Whether to normalize embeddings before computing cosine similarity (recommended)")
+    compare_within_modalities: bool = Field(True, description="Compare items within the same modality (e.g., image1 vs image2)")
+    compare_across_modalities: bool = Field(True, description="Compare items across different modalities (e.g., image1 vs text1)")
 class SimilarityResponse(BaseModel):
+    """Response body of /compute_similarities."""
     matches: List[SimilarityMatch]
+    # Aggregate numbers computed over the returned matches.
+    statistics: Dict[str, float]
+    # Labels of the form "<modality>_vs_<modality>" for each comparison performed.
+    modality_pairs_compared: List[str]
+@app.post("/compute_embeddings", response_model=EmbeddingResponse, dependencies=[Depends(verify_token)])
+async def generate_embeddings_endpoint(
+    texts: Optional[List[str]] = Form(None, description="List of text strings to embed."),
+    images: Optional[List[UploadFile]] = File(default=None, description="List of image files."),
+    audio_files: Optional[List[UploadFile]] = File(default=None, description="List of audio files (MP3, WAV, etc.).")
 ):
+    """Embed uploaded texts, images and audio files.
+
+    Uploads are written to temporary files, audio is converted to WAV,
+    embeddings are computed, and the temporary files are removed again in
+    the ``finally`` clause.
+
+    Raises:
+        HTTPException: 503 if the embedding manager has not been created,
+            400 if no usable input was supplied, 500 for unexpected errors.
+    """
+    if embedding_manager is None:
+        raise HTTPException(status_code=503, detail="Embedding manager not initialized.")
+    # Every temporary path created for this request, removed in ``finally``.
+    temp_files_to_clean = []
     try:
+        image_inputs: List[Tuple[str, str]] = []
+        audio_inputs: List[Tuple[str, str]] = []
         if images:
+            for img_file in images:
+                path, name = await _save_upload_file_tmp(img_file)
+                image_inputs.append((path, name))
+                temp_files_to_clean.append(path)
         if audio_files:
+            for audio_file_in in audio_files:
+                path, name = await _save_upload_file_tmp(audio_file_in)
+                # Registered before conversion so the upload is cleaned up
+                # even if the conversion raises.
+                temp_files_to_clean.append(path)
+                wav_path = convert_audio_to_wav(path, name)
+                audio_inputs.append((wav_path, name))
+                if wav_path != path:
+                    temp_files_to_clean.append(wav_path)
+        # Drop empty / whitespace-only strings.
+        text_inputs_processed = [t.strip() for t in texts if t.strip()] if texts else None
+        if not any([image_inputs, audio_inputs, text_inputs_processed]):
+            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="No valid inputs provided for embedding.")
+        computed_data = await embedding_manager.compute_embeddings(
+            image_inputs=image_inputs if image_inputs else None,
+            audio_inputs=audio_inputs if audio_inputs else None,
+            text_inputs=text_inputs_processed if text_inputs_processed else None
         )
+        # Modalities without results become empty lists in the payload.
+        payload_data = {
+            ModalityType.VISION: computed_data.get(ModalityType.VISION, []),
+            ModalityType.AUDIO: computed_data.get(ModalityType.AUDIO, []),
+            ModalityType.TEXT: computed_data.get(ModalityType.TEXT, []),
+        }
+        embedding_payload = EmbeddingPayload(**payload_data)
+        return EmbeddingResponse(embeddings=embedding_payload)
+    except HTTPException:
+        # Deliberate HTTP errors pass through unchanged.
+        raise
+    except Exception as e:
+        logger.error(f"Error in /compute_embeddings: {e}", exc_info=True)
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An unexpected error occurred: {str(e)}")
     finally:
+        for temp_file in temp_files_to_clean:
             try:
+                # A path may already be gone: the audio conversion removes
+                # its source file after a successful export.
+                if os.path.exists(temp_file):
+                    os.unlink(temp_file)
+            except Exception as e_clean:
+                logger.warning(f"Could not clean up temporary file {temp_file}: {e_clean}")
+def _compute_similarity_matrix(tensor1: torch.Tensor, tensor2: torch.Tensor, normalize: bool) -> torch.Tensor:
+    """Return ``tensor1 @ tensor2.T``, optionally L2-normalizing both inputs first.
+
+    Normalization is applied along ``dim=1``, so with ``normalize=True`` the
+    result holds cosine similarities between the rows of the two inputs;
+    otherwise it holds raw dot products.
+    """
+    if normalize:
+        tensor1 = torch.nn.functional.normalize(tensor1, p=2, dim=1)
+        tensor2 = torch.nn.functional.normalize(tensor2, p=2, dim=1)
+    return torch.matmul(tensor1, tensor2.T)
+def _modality_label(modality) -> str:
+    """Return the plain string for a modality, whether it is an Enum member or a bare str."""
+    return str(getattr(modality, "value", modality))
+def _matches_for_pair(items_a, items_b, mod_a, mod_b, sim_matrix, threshold, top_k, upper_triangle_only):
+    """Build the matches of one modality pair, best first, limited to ``top_k``.
+
+    ``upper_triangle_only`` is used for same-modality comparisons so each
+    unordered pair is reported once and items are never matched with themselves.
+    """
+    # One bulk transfer instead of a tensor ``.item()`` call per cell.
+    rows = sim_matrix.tolist()
+    scored = []
+    for r_idx, row in enumerate(rows):
+        first_col = r_idx + 1 if upper_triangle_only else 0
+        for c_idx in range(first_col, len(row)):
+            score = float(row[c_idx])
+            if score >= threshold:
+                scored.append((score, r_idx, c_idx))
+    scored.sort(key=lambda entry: entry[0], reverse=True)
+    if top_k:
+        scored = scored[:top_k]
+    return [
+        SimilarityMatch(
+            item_a_id=items_a[r_idx].id, item_b_id=items_b[c_idx].id,
+            modality_a=mod_a, modality_b=mod_b, score=score
+        )
+        for score, r_idx, c_idx in scored
+    ]
+@app.post("/compute_similarities", response_model=SimilarityResponse, dependencies=[Depends(verify_token)])
+async def compute_similarities_endpoint(request: SimilarityRequest):
+    """Compare the supplied embeddings within and/or across modalities.
+
+    Only the vision, audio and text groups of the payload are compared. For
+    every modality pair, matches scoring at least ``request.threshold`` are
+    kept, and ``request.top_k`` (when set) limits the matches of each pair,
+    as the request model documents. The combined list is returned sorted by
+    descending score together with summary statistics.
+    """
+    payload = request.embeddings_payload
+    embeddings_by_modality: Dict[ModalityType, List[EmbeddingItem]] = {}
+    for modality, items in (
+        (ModalityType.VISION, payload.vision),
+        (ModalityType.AUDIO, payload.audio),
+        (ModalityType.TEXT, payload.text),
+    ):
+        if items:
+            embeddings_by_modality[modality] = items
+    current_device = embedding_manager.device if embedding_manager else "cpu"
+    # Build each modality's tensor once rather than once per comparison.
+    tensors = {
+        modality: torch.tensor([item.embedding for item in items], device=current_device)
+        for modality, items in embeddings_by_modality.items()
+    }
+    modalities_present = list(embeddings_by_modality)
+    all_matches: List[SimilarityMatch] = []
+    modality_pairs_compared_set = set()
+    for i, mod1_type in enumerate(modalities_present):
+        items1 = embeddings_by_modality[mod1_type]
+        tensor1 = tensors[mod1_type]
+        if request.compare_within_modalities:
+            sim_matrix_intra = _compute_similarity_matrix(tensor1, tensor1, request.normalize_scores)
+            modality_pairs_compared_set.add(f"{_modality_label(mod1_type)}_vs_{_modality_label(mod1_type)}")
+            all_matches.extend(_matches_for_pair(
+                items1, items1, mod1_type, mod1_type, sim_matrix_intra,
+                request.threshold, request.top_k, upper_triangle_only=True
+            ))
+        if request.compare_across_modalities:
+            for mod2_type in modalities_present[i + 1:]:
+                items2 = embeddings_by_modality[mod2_type]
+                sim_matrix_inter = _compute_similarity_matrix(tensor1, tensors[mod2_type], request.normalize_scores)
+                modality_pairs_compared_set.add(f"{_modality_label(mod1_type)}_vs_{_modality_label(mod2_type)}")
+                all_matches.extend(_matches_for_pair(
+                    items1, items2, mod1_type, mod2_type, sim_matrix_inter,
+                    request.threshold, request.top_k, upper_triangle_only=False
+                ))
+    all_matches.sort(key=lambda x: x.score, reverse=True)
+    all_scores = [match.score for match in all_matches]
+    stats = {
+        "total_matches_found_above_threshold": len(all_matches),
+        "avg_score": float(np.mean(all_scores)) if all_scores else 0.0,
+        "max_score": float(np.max(all_scores)) if all_scores else 0.0,
+        "min_score": float(np.min(all_scores)) if all_scores else 0.0,
     }
     return SimilarityResponse(
+        matches=all_matches,
+        statistics=stats,
+        modality_pairs_compared=sorted(modality_pairs_compared_set)
     )
+@app.get("/health", status_code=status.HTTP_200_OK, dependencies=[Depends(verify_token)])
+async def health_check():
+    """Report service status, the configured model device and basic torch info.
+
+    The route depends on ``verify_token``, so it requires a valid bearer token.
+    """
     return {
         "status": "healthy",
+        "model_device": settings.model_device,
+        "torch_version": torch.__version__,
+        "cuda_available": torch.cuda.is_available()
     }
 if __name__ == "__main__":
+    # Serve on all interfaces, port 7860, using the configured log level.
+    uvicorn.run(app, host="0.0.0.0", port=7860, log_level=settings.log_level.lower())