visionq / core /fusion_layer.py
NanG01's picture
architectural change: restructure project and update documentation
bc3cab1
"""
FusionLayer - Combines multimodal inputs into a unified context.
New module: the central integration point for all vision and text data.
"""
class FusionLayer:
    """Merge per-modality outputs into a single context dictionary.

    The layer itself is stateless: every method works only on the
    arguments it is given.
    """

    def __init__(self):
        """
        Fusion layer combines:
        - Caption (from BLIP)
        - OCR text (from OCRAgent)
        - Visual embedding (from MobileCLIP)
        - Object detections (from YOLO/SSD)
        """
        print("[FusionLayer] Initialized")

    def fuse(self, caption=None, ocr_text=None, objects=None, embedding=None) -> dict:
        """
        Combine all available modalities into structured context.

        Args:
            caption: Scene description; a falsy value is stored as "".
            ocr_text: Extracted text; a falsy value is stored as "".
            objects: Detected objects; a falsy value is stored as [].
            embedding: Image embedding, stored unchanged (may be None).

        Returns:
            dict: Keys "caption", "ocr_text", "objects", "embedding",
            "has_text", "has_objects" and "full_description".
        """
        context = {
            "caption": caption or "",
            "ocr_text": ocr_text or "",
            "objects": objects or [],
            "embedding": embedding,
            # Truthiness already covers both None and empty values.
            "has_text": bool(ocr_text),
            "has_objects": bool(objects),
        }
        # The description is derived from the normalized fields above.
        context["full_description"] = self._build_description(context)
        return context

    def _build_description(self, context) -> str:
        """
        Create a natural language description from all modalities.

        Sections appear in a fixed order (caption, objects, OCR text) and
        are joined with ". ". Empty sections are skipped.

        Args:
            context: Mapping with "caption", "objects" and "ocr_text".

        Returns:
            str: The combined description, or "No description available"
            when every section is empty.
        """
        parts = []

        if context["caption"]:
            parts.append(context["caption"])

        if context["objects"]:
            # dict.fromkeys removes duplicates while keeping first-seen
            # order, so the text is identical from run to run (iterating a
            # set of strings is not). str() lets non-string or unhashable
            # detections be listed instead of raising TypeError.
            labels = dict.fromkeys(str(obj) for obj in context["objects"])
            parts.append(f"Objects detected: {', '.join(labels)}")

        if context["ocr_text"]:
            parts.append(f"Text visible: {context['ocr_text']}")

        return ". ".join(parts) if parts else "No description available"

    def extract_for_storage(self, fused_context) -> tuple:
        """
        Extract the fields needed for memory storage.

        Args:
            fused_context: Mapping as returned by ``fuse``.

        Returns:
            tuple: (description, embedding); embedding is None when the
            key is absent.

        Raises:
            KeyError: If "full_description" is missing.
        """
        return fused_context["full_description"], fused_context.get("embedding")