| """ |
| FusionLayer - Combines multimodal inputs into unified context |
| NEW MODULE - Central integration point for all vision/text data |
| """ |
|
|
|
|
class FusionLayer:
    """Merge per-image modality outputs into a single context dictionary.

    The layer holds no state and no models; it only combines the values that
    callers pass in. Per the original module notes, those values are expected
    to be:
    - Caption (from BLIP)
    - OCR text (from OCRAgent)
    - Visual embedding (from MobileCLIP)
    - Object detections (from YOLO/SSD)
    """

    def __init__(self) -> None:
        """Print an initialization banner; no attributes are set."""
        print("[FusionLayer] Initialized")

    def fuse(self, caption=None, ocr_text=None, objects=None, embedding=None) -> dict:
        """Combine all available modalities into a structured context.

        Args:
            caption: Scene description; a falsy value is stored as "".
            ocr_text: Extracted text; a falsy value is stored as "".
            objects: Detected objects; a falsy value is stored as [].
            embedding: Image embedding vector, stored unchanged (may be None).

        Returns:
            dict with keys "caption", "ocr_text", "objects", "embedding",
            "has_text", "has_objects" and "full_description".
        """
        context = {
            "caption": caption or "",
            "ocr_text": ocr_text or "",
            "objects": objects or [],
            "embedding": embedding,
            # Plain truthiness already covers None and empty containers.
            "has_text": bool(ocr_text),
            "has_objects": bool(objects),
        }
        context["full_description"] = self._build_description(context)
        return context

    def _build_description(self, context) -> str:
        """Create a natural-language description from all modalities.

        Non-empty parts are joined with ". " in the order caption, objects,
        OCR text. Returns "No description available" when every part is empty.

        Args:
            context: Mapping providing "caption", "objects" and "ocr_text".
        """
        parts = []

        if context["caption"]:
            parts.append(context["caption"])

        if context["objects"]:
            # De-duplicate on the string form while keeping first-seen order.
            # A set would make the order differ between runs (string hash
            # randomization) and would reject unhashable or non-str items.
            labels = dict.fromkeys(str(obj) for obj in context["objects"])
            obj_str = ", ".join(labels)
            parts.append(f"Objects detected: {obj_str}")

        if context["ocr_text"]:
            parts.append(f"Text visible: {context['ocr_text']}")

        return ". ".join(parts) if parts else "No description available"

    def extract_for_storage(self, fused_context) -> tuple:
        """Extract the fields needed for memory storage.

        Args:
            fused_context: Context dict as produced by ``fuse``.

        Returns:
            tuple: (description, embedding); embedding is None when the key
            is absent.

        Raises:
            KeyError: If "full_description" is missing from ``fused_context``.
        """
        description = fused_context["full_description"]
        embedding = fused_context.get("embedding")
        return description, embedding
|
|