| from typing import List, Dict |
| import numpy as np |
|
|
class DetectionFusionManager:
    """Integrate and prioritize detection results with intelligent lighting fusion"""

    def __init__(self, clip_manager):
        # Classifier manager; must expose classify_hierarchical(image) — used
        # in fuse_detections() to label unknown salient regions.
        self.clip_manager = clip_manager
|
| def fuse_lighting_analysis(self, cv_lighting: Dict, clip_scene: Dict) -> Dict: |
| """Intelligently fuse CV+Places365 lighting with CLIP scene understanding""" |
|
|
| cv_lighting_type = cv_lighting.get('lighting_type', 'soft diffused light') |
| cv_confidence = cv_lighting.get('confidence', 0.7) |
| cv_features = cv_lighting.get('cv_features', {}) |
|
|
| |
| clip_lighting_data = clip_scene.get('lighting', {}) |
| clip_lighting_type = clip_lighting_data.get('top', 'natural light') |
| clip_confidence = clip_lighting_data.get('confidence', 0.5) |
|
|
| |
| |
| |
| |
|
|
| if cv_confidence > 0.85: |
| |
| final_lighting = cv_lighting_type |
| final_confidence = cv_confidence |
| fusion_method = 'cv_dominant' |
|
|
| elif self._lighting_semantically_similar(cv_lighting_type, clip_lighting_type): |
| |
| final_lighting = cv_lighting_type |
| |
| final_confidence = min(cv_confidence * 1.15, 0.95) |
| fusion_method = 'consensus' |
|
|
| else: |
| |
| cv_weight = cv_confidence / (cv_confidence + clip_confidence) |
| clip_weight = 1.0 - cv_weight |
|
|
| |
| if cv_weight > 0.6: |
| final_lighting = cv_lighting_type |
| final_confidence = cv_confidence * 0.9 |
| fusion_method = 'cv_weighted' |
| else: |
| |
| final_lighting = self._generalize_lighting_description( |
| cv_lighting_type, clip_lighting_type, cv_features |
| ) |
| final_confidence = (cv_confidence * cv_weight + clip_confidence * clip_weight) * 0.85 |
| fusion_method = 'generalized' |
|
|
| return { |
| 'lighting_type': final_lighting, |
| 'confidence': min(final_confidence, 0.95), |
| 'cv_analysis': cv_lighting_type, |
| 'clip_prediction': clip_lighting_type, |
| 'fusion_method': fusion_method, |
| 'cv_confidence': cv_confidence, |
| 'clip_confidence': clip_confidence |
| } |
|
|
| def _lighting_semantically_similar(self, cv_type: str, clip_type: str) -> bool: |
| """Check if two lighting descriptions are semantically similar""" |
| |
| similarity_groups = [ |
| {'soft', 'diffused', 'overcast', 'cloudy'}, |
| {'bright', 'sunny', 'sunlight', 'clear'}, |
| {'warm', 'golden', 'amber', 'evening'}, |
| {'natural', 'daylight', 'outdoor'}, |
| {'cool', 'blue', 'twilight'}, |
| ] |
|
|
| cv_words = set(cv_type.lower().split()) |
| clip_words = set(clip_type.lower().split()) |
|
|
| |
| for group in similarity_groups: |
| cv_match = cv_words & group |
| clip_match = clip_words & group |
| if cv_match and clip_match: |
| return True |
|
|
| |
| common_words = cv_words & clip_words |
| return len(common_words) >= 1 |
|
|
| def _generalize_lighting_description(self, cv_type: str, clip_type: str, |
| cv_features: Dict) -> str: |
| """Generate a generalized lighting description when CV and CLIP disagree""" |
|
|
| brightness = cv_features.get('brightness', 128) |
| contrast = cv_features.get('contrast', 50) |
| color_temp = cv_features.get('color_temp', 1.0) |
|
|
| |
| brightness_norm = brightness / 255.0 |
| contrast_norm = min(contrast / 100.0, 1.0) |
|
|
| |
| if contrast_norm < 0.5: |
| |
| if color_temp < 1.0: |
| return 'soft diffused light' |
| else: |
| return 'warm ambient light' |
| elif brightness_norm > 0.7: |
| |
| return 'natural daylight' |
| elif color_temp > 1.1: |
| |
| return 'warm ambient light' |
| else: |
| |
| return 'soft diffused light' |
|
|
| def analyze_composition(self, image, detections: List[Dict]) -> Dict: |
| """Analyze image composition""" |
| if not detections: |
| return {'composition_type': 'empty', 'vertical_ratio': 0.0} |
|
|
| |
| vertical_objects = [ |
| d for d in detections |
| if (d['bbox'][3] - d['bbox'][1]) > (d['bbox'][2] - d['bbox'][0]) |
| ] |
| vertical_ratio = len(vertical_objects) / max(len(detections), 1) |
|
|
| |
| if vertical_ratio > 0.6: |
| composition_type = 'urban canyon' |
| elif vertical_ratio > 0.4: |
| composition_type = 'vertical emphasis' |
| else: |
| composition_type = 'standard street view' |
|
|
| return { |
| 'composition_type': composition_type, |
| 'vertical_ratio': vertical_ratio, |
| 'vertical_objects_count': len(vertical_objects), |
| 'total_objects': len(detections) |
| } |
|
|
| def fuse_detections(self, yolo_results: List[Dict], unknown_regions: List[Dict], |
| scene_info: Dict, image=None, cv_lighting: Dict = None) -> Dict: |
| """Fuse all detection results with intelligent lighting fusion""" |
| all_detections = [] |
|
|
| |
| for det in yolo_results: |
| attention_score = self._calculate_attention_score(det) |
| det['attention_score'] = attention_score |
| all_detections.append(det) |
|
|
| |
| for region in unknown_regions: |
| if 'image' not in region: |
| continue |
|
|
| classification = self.clip_manager.classify_hierarchical(region['image']) |
|
|
| detection = { |
| 'class_name': classification['top_prediction'], |
| 'bbox': region['bbox'], |
| 'confidence': classification.get('confidence', 0.5), |
| 'attention_score': region.get('saliency_score', 0.5), |
| 'source': 'openclip' |
| } |
| all_detections.append(detection) |
|
|
| |
| ranked_detections = sorted( |
| all_detections, |
| key=lambda x: x['attention_score'], |
| reverse=True |
| ) |
|
|
| |
| filtered = [] |
| for det in ranked_detections: |
| if len(filtered) >= 15: |
| if det.get('brand') and det.get('brand_confidence', 0) > 0.45: |
| filtered.append(det) |
| else: |
| break |
| else: |
| filtered.append(det) |
|
|
| |
| composition = self.analyze_composition(image, filtered) if image else {} |
|
|
| |
| if cv_lighting: |
| fused_lighting = self.fuse_lighting_analysis(cv_lighting, scene_info) |
| |
| scene_info['lighting'] = { |
| 'top': fused_lighting['lighting_type'], |
| 'confidence': fused_lighting['confidence'], |
| 'fusion_details': fused_lighting |
| } |
|
|
| return { |
| 'detections': filtered, |
| 'scene_info': scene_info, |
| 'composition': composition, |
| 'total_objects': len(all_detections) |
| } |
|
|
| def _calculate_attention_score(self, detection: Dict) -> float: |
| """Calculate attention score based on position, size, and confidence""" |
| bbox = detection['bbox'] |
| x1, y1, x2, y2 = bbox |
|
|
| center_x = (x1 + x2) / 2 |
| center_y = (y1 + y2) / 2 |
|
|
| if x2 > 100: |
| position_score = 0.5 |
| else: |
| position_score = 1.0 - (abs(center_x - 0.5) + abs(center_y - 0.5)) |
|
|
| area = abs((x2 - x1) * (y2 - y1)) |
| if x2 > 100: |
| area = area / (1000 * 1000) |
| size_score = min(area, 0.5) |
|
|
| conf_score = detection.get('confidence', 0.5) |
|
|
| attention = ( |
| 0.3 * position_score + |
| 0.3 * size_score + |
| 0.4 * conf_score |
| ) |
|
|
| return attention |
|
|
# Import-time confirmation message — presumably for interactive/notebook use;
# note this runs as a side effect whenever the module is imported.
print("✓ DetectionFusionManager (V2 with intelligent fusion) defined")
|
|