Spaces:

NanG01
/

visionq

Running

App Files Files Community

visionq / core /fusion_layer.py

NanG01

architectural change: restructure project and update documentation

bc3cab1 2 months ago

raw

history blame contribute delete

2.37 kB

	"""
	FusionLayer - Combines multimodal inputs into a unified context
	NEW MODULE - Central integration point for all vision/text data
	"""


	class FusionLayer:
	    """Merge per-modality vision/text outputs into a single context dict.

	    The class holds no instance state: each call to ``fuse`` works only on
	    the arguments it is given and returns a fresh dictionary.
	    """

	    def __init__(self):
	        """Create the layer and print a one-line initialization notice.

	        Modalities the layer is meant to combine (per the module notes):
	        - Caption (from BLIP)
	        - OCR text (from OCRAgent)
	        - Visual embedding (from MobileCLIP)
	        - Object detections (from YOLO/SSD)
	        """
	        print("[FusionLayer] Initialized")

	    def fuse(self, caption=None, ocr_text=None, objects=None, embedding=None):
	        """Combine all available modalities into a structured context.

	        Every argument is optional; a missing (or falsy) caption/OCR text is
	        stored as ``""`` and missing objects as ``[]``.

	        Args:
	            caption: Scene description (from BLIP).
	            ocr_text: Text extracted by OCR.
	            objects: List of detected objects. Duplicates are kept in the
	                ``"objects"`` entry and only collapsed in the description.
	            embedding: Image embedding vector (for FAISS); stored untouched.

	        Returns:
	            dict: Unified context with the keys ``caption``, ``ocr_text``,
	            ``objects``, ``embedding``, ``has_text``, ``has_objects`` and
	            ``full_description``.
	        """
	        context = {
	            "caption": caption or "",
	            "ocr_text": ocr_text or "",
	            "objects": objects or [],
	            "embedding": embedding,
	            # Truthiness already covers both None and empty values.
	            "has_text": bool(ocr_text),
	            "has_objects": bool(objects),
	        }

	        # The description is derived from the normalized fields above.
	        context["full_description"] = self._build_description(context)

	        return context

	    def _build_description(self, context):
	        """Create a natural-language description from all modalities.

	        Parts are emitted in a fixed order (caption, objects, OCR text) and
	        joined with ``". "``.

	        Args:
	            context: Dict providing the ``caption``, ``objects`` and
	                ``ocr_text`` keys, as built by ``fuse``.

	        Returns:
	            str: The combined description, or ``"No description available"``
	            when none of the three fields has content.
	        """
	        parts = []

	        if context["caption"]:
	            parts.append(context["caption"])

	        if context["objects"]:
	            # De-duplicate while keeping first-seen order. A set() would
	            # yield an arbitrary order that changes between interpreter runs
	            # (string hash randomization), making the description
	            # non-reproducible. Keying on str() also tolerates non-string or
	            # unhashable detections, which previously raised TypeError.
	            unique_labels = dict.fromkeys(str(obj) for obj in context["objects"])
	            obj_str = ", ".join(unique_labels)
	            parts.append(f"Objects detected: {obj_str}")

	        if context["ocr_text"]:
	            parts.append(f"Text visible: {context['ocr_text']}")

	        return ". ".join(parts) if parts else "No description available"

	    def extract_for_storage(self, fused_context):
	        """Extract the fields needed for memory storage.

	        Args:
	            fused_context: Dict as returned by ``fuse``.

	        Returns:
	            tuple: ``(description, embedding)``; ``embedding`` is ``None``
	            when the context has no ``"embedding"`` key.

	        Raises:
	            KeyError: If ``fused_context`` lacks ``"full_description"``.
	        """
	        description = fused_context["full_description"]
	        embedding = fused_context.get("embedding")

	        return description, embedding