| from typing import List, Dict |
| import numpy as np |
|
|
class DetectionFusionManager:
    """Integrate and prioritize detection results with intelligent lighting fusion"""

    def __init__(self, clip_manager):
        # Classifier manager; must expose classify_hierarchical(image) — used
        # in fuse_detections() to label unknown salient regions.
        self.clip_manager = clip_manager
|
| def fuse_lighting_analysis(self, cv_lighting: Dict, clip_scene: Dict) -> Dict: |
| """Intelligently fuse CV+Places365 lighting with CLIP scene understanding""" |
|
|
| cv_lighting_type = cv_lighting.get('lighting_type', 'soft diffused light') |
| cv_confidence = cv_lighting.get('confidence', 0.7) |
| cv_features = cv_lighting.get('cv_features', {}) |
|
|
| |
| clip_lighting_data = clip_scene.get('lighting', {}) |
| clip_lighting_type = clip_lighting_data.get('top', 'natural light') |
| clip_confidence = clip_lighting_data.get('confidence', 0.5) |
|
|
| |
| |
| |
| |
|
|
| if cv_confidence > 0.85: |
| |
| final_lighting = cv_lighting_type |
| final_confidence = cv_confidence |
| fusion_method = 'cv_dominant' |
|
|
| elif self._lighting_semantically_similar(cv_lighting_type, clip_lighting_type): |
| |
| final_lighting = cv_lighting_type |
| |
| final_confidence = min(cv_confidence * 1.15, 0.95) |
| fusion_method = 'consensus' |
|
|
| else: |
| |
| cv_weight = cv_confidence / (cv_confidence + clip_confidence) |
| clip_weight = 1.0 - cv_weight |
|
|
| |
| if cv_weight > 0.6: |
| final_lighting = cv_lighting_type |
| final_confidence = cv_confidence * 0.9 |
| fusion_method = 'cv_weighted' |
| else: |
| |
| final_lighting = self._generalize_lighting_description( |
| cv_lighting_type, clip_lighting_type, cv_features |
| ) |
| final_confidence = (cv_confidence * cv_weight + clip_confidence * clip_weight) * 0.85 |
| fusion_method = 'generalized' |
|
|
| return { |
| 'lighting_type': final_lighting, |
| 'confidence': min(final_confidence, 0.95), |
| 'cv_analysis': cv_lighting_type, |
| 'clip_prediction': clip_lighting_type, |
| 'fusion_method': fusion_method, |
| 'cv_confidence': cv_confidence, |
| 'clip_confidence': clip_confidence |
| } |
|
|
| def _lighting_semantically_similar(self, cv_type: str, clip_type: str) -> bool: |
| """Check if two lighting descriptions are semantically similar""" |
| |
| similarity_groups = [ |
| {'soft', 'diffused', 'overcast', 'cloudy'}, |
| {'bright', 'sunny', 'sunlight', 'clear'}, |
| {'warm', 'golden', 'amber', 'evening'}, |
| {'natural', 'daylight', 'outdoor'}, |
| {'cool', 'blue', 'twilight'}, |
| ] |
|
|
| cv_words = set(cv_type.lower().split()) |
| clip_words = set(clip_type.lower().split()) |
|
|
| |
| for group in similarity_groups: |
| cv_match = cv_words & group |
| clip_match = clip_words & group |
| if cv_match and clip_match: |
| return True |
|
|
| |
| common_words = cv_words & clip_words |
| return len(common_words) >= 1 |
|
|
| def _generalize_lighting_description(self, cv_type: str, clip_type: str, |
| cv_features: Dict) -> str: |
| """Generate a generalized lighting description when CV and CLIP disagree""" |
|
|
| brightness = cv_features.get('brightness', 128) |
| contrast = cv_features.get('contrast', 50) |
| color_temp = cv_features.get('color_temp', 1.0) |
|
|
| |
| brightness_norm = brightness / 255.0 |
| contrast_norm = min(contrast / 100.0, 1.0) |
|
|
| |
| if contrast_norm < 0.5: |
| |
| if color_temp < 1.0: |
| return 'soft diffused light' |
| else: |
| return 'warm ambient light' |
| elif brightness_norm > 0.7: |
| |
| return 'natural daylight' |
| elif color_temp > 1.1: |
| |
| return 'warm ambient light' |
| else: |
| |
| return 'soft diffused light' |
|
|
| def analyze_composition(self, image, detections: List[Dict]) -> Dict: |
| """Analyze image composition""" |
| if not detections: |
| return {'composition_type': 'empty', 'vertical_ratio': 0.0} |
|
|
| |
| vertical_objects = [ |
| d for d in detections |
| if (d['bbox'][3] - d['bbox'][1]) > (d['bbox'][2] - d['bbox'][0]) |
| ] |
| vertical_ratio = len(vertical_objects) / max(len(detections), 1) |
|
|
| |
| if vertical_ratio > 0.6: |
| composition_type = 'urban canyon' |
| elif vertical_ratio > 0.4: |
| composition_type = 'vertical emphasis' |
| else: |
| composition_type = 'standard street view' |
|
|
| return { |
| 'composition_type': composition_type, |
| 'vertical_ratio': vertical_ratio, |
| 'vertical_objects_count': len(vertical_objects), |
| 'total_objects': len(detections) |
| } |
|
|
| def fuse_detections(self, yolo_results: List[Dict], unknown_regions: List[Dict], |
| scene_info: Dict, image=None, cv_lighting: Dict = None) -> Dict: |
| """Fuse all detection results with intelligent lighting fusion""" |
| all_detections = [] |
|
|
| |
| for det in yolo_results: |
| attention_score = self._calculate_attention_score(det) |
| det['attention_score'] = attention_score |
| all_detections.append(det) |
|
|
| |
| for region in unknown_regions: |
| if 'image' not in region: |
| continue |
|
|
| classification = self.clip_manager.classify_hierarchical(region['image']) |
|
|
| detection = { |
| 'class_name': classification['top_prediction'], |
| 'bbox': region['bbox'], |
| 'confidence': classification.get('confidence', 0.5), |
| 'attention_score': region.get('saliency_score', 0.5), |
| 'source': 'openclip' |
| } |
| all_detections.append(detection) |
|
|
| |
| ranked_detections = sorted( |
| all_detections, |
| key=lambda x: x['attention_score'], |
| reverse=True |
| ) |
|
|
| |
| filtered = [] |
| for det in ranked_detections: |
| if len(filtered) >= 15: |
| if det.get('brand') and det.get('brand_confidence', 0) > 0.45: |
| filtered.append(det) |
| else: |
| break |
| else: |
| filtered.append(det) |
|
|
| |
| composition = self.analyze_composition(image, filtered) if image else {} |
|
|
| |
| if cv_lighting: |
| fused_lighting = self.fuse_lighting_analysis(cv_lighting, scene_info) |
| |
| scene_info['lighting'] = { |
| 'top': fused_lighting['lighting_type'], |
| 'confidence': fused_lighting['confidence'], |
| 'fusion_details': fused_lighting |
| } |
|
|
| return { |
| 'detections': filtered, |
| 'scene_info': scene_info, |
| 'composition': composition, |
| 'total_objects': len(all_detections) |
| } |
|
|
| def _calculate_attention_score(self, detection: Dict) -> float: |
| """Calculate attention score based on position, size, and confidence""" |
| bbox = detection['bbox'] |
| x1, y1, x2, y2 = bbox |
|
|
| center_x = (x1 + x2) / 2 |
| center_y = (y1 + y2) / 2 |
|
|
| if x2 > 100: |
| position_score = 0.5 |
| else: |
| position_score = 1.0 - (abs(center_x - 0.5) + abs(center_y - 0.5)) |
|
|
| area = abs((x2 - x1) * (y2 - y1)) |
| if x2 > 100: |
| area = area / (1000 * 1000) |
| size_score = min(area, 0.5) |
|
|
| conf_score = detection.get('confidence', 0.5) |
|
|
| attention = ( |
| 0.3 * position_score + |
| 0.3 * size_score + |
| 0.4 * conf_score |
| ) |
|
|
| return attention |
|
|
# Import-time confirmation message — presumably for interactive/notebook use;
# note this runs as a side effect whenever the module is imported.
print("✓ DetectionFusionManager (V2 with intelligent fusion) defined")
|
|