"""
FusionLayer - Combines multimodal inputs into unified context
NEW MODULE - Central integration point for all vision/text data
"""


class FusionLayer:
    """Merge per-image outputs of several models into one context dict.

    The layer itself is stateless: it only reshapes the values handed to
    ``fuse`` and derives a human-readable summary from them.
    """

    def __init__(self):
        """
        Fusion layer combines:
        - Caption (from BLIP)
        - OCR text (from OCRAgent)
        - Visual embedding (from MobileCLIP)
        - Object detections (from YOLO/SSD)
        """
        print("[FusionLayer] Initialized")

    def fuse(self, caption=None, ocr_text=None, objects=None, embedding=None):
        """
        Combine all available modalities into structured context

        Every argument is optional; missing (or falsy) text/object inputs
        are normalised to an empty string / empty list so downstream code
        can index the result without ``None`` checks.

        Args:
            caption: Scene description from BLIP
            ocr_text: Extracted text from OCR
            objects: List of detected objects
            embedding: Image embedding vector (for FAISS); stored as-is

        Returns:
            dict: Unified multimodal context with keys ``caption``,
            ``ocr_text``, ``objects``, ``embedding``, ``has_text``,
            ``has_objects`` and ``full_description``.
        """
        context = {
            "caption": caption or "",
            "ocr_text": ocr_text or "",
            "objects": objects or [],
            "embedding": embedding,
            "has_text": bool(ocr_text),
            "has_objects": bool(objects),
        }

        # Generate combined description
        context["full_description"] = self._build_description(context)

        return context

    def _build_description(self, context):
        """
        Create natural language description from all modalities

        Sections are emitted in a fixed order (caption, objects, OCR text)
        and joined with ". ". Empty sections are skipped.

        Args:
            context: dict providing the ``caption``, ``objects`` and
                ``ocr_text`` keys.

        Returns:
            str: The combined description, or "No description available"
            when every section is empty.
        """
        parts = []

        # Add caption
        if context["caption"]:
            parts.append(context["caption"])

        # Add each distinct object label once. dict.fromkeys keeps the
        # first-seen order, so the description is reproducible across runs
        # (a set would reorder string labels per process). str() lets
        # non-string or unhashable detections be listed instead of raising.
        if context["objects"]:
            unique_labels = dict.fromkeys(str(obj) for obj in context["objects"])
            obj_str = ", ".join(unique_labels)
            parts.append(f"Objects detected: {obj_str}")

        # Add OCR text
        if context["ocr_text"]:
            parts.append(f"Text visible: {context['ocr_text']}")

        return ". ".join(parts) if parts else "No description available"

    def extract_for_storage(self, fused_context):
        """
        Extract fields needed for memory storage

        Args:
            fused_context: dict as returned by ``fuse``. It must contain
                ``full_description``; ``embedding`` may be absent.

        Returns:
            tuple: (description, embedding) where embedding is ``None``
            when the context has no ``embedding`` key.

        Raises:
            KeyError: if ``full_description`` is missing from the context.
        """
        description = fused_context["full_description"]
        embedding = fused_context.get("embedding")

        return description, embedding