"""
FusionLayer - Combines multimodal inputs into unified context.

NEW MODULE - Central integration point for all vision/text data.
"""


class FusionLayer:
    """Merge per-modality outputs into one context dictionary.

    The layer itself is stateless; it only reshapes the values handed to
    :meth:`fuse` and derives a plain-text summary from them.
    """

    def __init__(self):
        """
        Fusion layer combines:
        - Caption (from BLIP)
        - OCR text (from OCRAgent)
        - Visual embedding (from MobileCLIP)
        - Object detections (from YOLO/SSD)
        """
        print("[FusionLayer] Initialized")

    def fuse(self, caption=None, ocr_text=None, objects=None, embedding=None) -> dict:
        """
        Combine all available modalities into structured context.

        Args:
            caption: Scene description; falsy values become "".
            ocr_text: Extracted text; falsy values become "".
            objects: List of detected objects; falsy values become [].
            embedding: Image embedding vector, stored unchanged (may be None).

        Returns:
            dict: Unified multimodal context with the keys "caption",
            "ocr_text", "objects", "embedding", "has_text", "has_objects"
            and "full_description".
        """
        context = {
            "caption": caption or "",
            "ocr_text": ocr_text or "",
            "objects": objects or [],
            "embedding": embedding,
            # Truthiness already covers both "is None" and "is empty".
            "has_text": bool(ocr_text),
            "has_objects": bool(objects),
        }

        # Generate combined description
        context["full_description"] = self._build_description(context)
        return context

    def _build_description(self, context: dict) -> str:
        """
        Create a natural language description from all modalities.

        Args:
            context: Mapping with the "caption", "objects" and "ocr_text"
                keys, as assembled by :meth:`fuse`.

        Returns:
            str: The non-empty parts joined with ". ", or
            "No description available" when every modality is empty.
        """
        parts = []

        # Add caption
        if context["caption"]:
            parts.append(context["caption"])

        # Add the distinct detected objects (no filtering against the caption).
        if context["objects"]:
            # dict.fromkeys de-duplicates while keeping first-seen order, so the
            # description is identical across runs (a set of strings iterates in
            # a hash-randomized order). Stringifying first also tolerates
            # unhashable or non-string detections.
            unique_labels = dict.fromkeys(str(obj) for obj in context["objects"])
            obj_str = ", ".join(unique_labels)
            parts.append(f"Objects detected: {obj_str}")

        # Add OCR text
        if context["ocr_text"]:
            parts.append(f"Text visible: {context['ocr_text']}")

        return ". ".join(parts) if parts else "No description available"

    def extract_for_storage(self, fused_context: dict) -> tuple:
        """
        Extract fields needed for memory storage.

        Args:
            fused_context: Context dictionary as returned by :meth:`fuse`.

        Returns:
            tuple: (description, embedding); embedding is None when the
            context has no "embedding" key.

        Raises:
            KeyError: If "full_description" is missing from fused_context.
        """
        description = fused_context["full_description"]
        embedding = fused_context.get("embedding")
        return description, embedding