File size: 22,063 Bytes
b69e9e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
"""Qualitative analysis tools for Anton's pipeline."""

import asyncio
import logging
from pathlib import Path
from typing import Dict, List, Optional, Any

logger = logging.getLogger(__name__)

class QualitativeAnalyzer:
    """Multi-stage qualitative analysis with CMPO ontology integration.

    Orchestrates four stages over a microscopy image: global scene
    understanding, segmentation guidance, per-region feature extraction,
    and population-level insights.  The textual output of each stage is
    mapped onto CMPO terms and merged into a de-duplicated summary.
    """

    def __init__(self, vlm_interface, cmpo_mapper):
        """Initialize the analyzer.

        Args:
            vlm_interface: Async VLM client exposing ``analyze_global_scene``,
                ``detect_objects_and_guide`` and ``generate_population_insights``.
            cmpo_mapper: Ontology mapper handed through to ``map_to_cmpo``.
        """
        self.vlm = vlm_interface
        self.cmpo_mapper = cmpo_mapper
        # Per-region feature cache keyed by region label (see _cache_features).
        self.cache = {}

    async def extract_qualitative_features(self, image_path, regions, config):
        """Main qualitative analysis pipeline with multi-stage CMPO integration.

        Args:
            image_path: Path to the image handed to the VLM.
            regions: Sequence of regionprops-like objects (skimage-style
                ``label``/``bbox``/``area`` attributes — TODO confirm).
            config: Dict; reads ``channels`` and ``batch_size``.

        Returns:
            Dict with per-stage results, per-stage CMPO mappings, and a
            combined ``cmpo_summary``.
        """
        # Stage 1: Global scene understanding + CMPO mapping
        global_context = await self.vlm.analyze_global_scene(image_path, config.get('channels'))
        global_cmpo = await self._map_global_context_to_cmpo(global_context)

        # Stage 2: Object-level guidance (if needed)
        segmentation_guidance = await self._get_segmentation_guidance(image_path, global_context)

        # Stage 3: Feature extraction from regions + CMPO mapping
        region_features = await self._analyze_region_features(regions, config)
        region_cmpo = await self._map_region_features_to_cmpo(region_features)

        # Stage 4: Population-level insights + CMPO mapping
        population_insights = await self._generate_population_insights(region_features, global_context)
        population_cmpo = await self._map_population_insights_to_cmpo(population_insights)

        return {
            'global_context': global_context,
            'global_cmpo': global_cmpo,
            'segmentation_guidance': segmentation_guidance,
            'region_features': region_features,
            'region_cmpo': region_cmpo,
            'population_insights': population_insights,
            'population_cmpo': population_cmpo,
            'cmpo_summary': self._create_cmpo_summary(global_cmpo, region_cmpo, population_cmpo)
        }

    async def _get_segmentation_guidance(self, image_path, global_context):
        """Get guidance for segmentation based on global context.

        Falls back to a conservative threshold-based default when the
        VLM call fails, so the pipeline never aborts at this stage.
        """
        try:
            # Use VLM to provide segmentation guidance based on global context
            guidance = await self.vlm.detect_objects_and_guide(image_path, global_context)

            return {
                'recommended_method': guidance.get('segmentation_guidance', 'threshold'),
                'object_types': [obj.get('type', 'unknown') for obj in guidance.get('detected_objects', [])],
                'confidence': guidance.get('object_count_estimate', 0),
                'guidance_details': guidance
            }
        except Exception as e:
            logger.error(f"Segmentation guidance failed: {e}")
            # Safe default: generic threshold segmentation of cells.
            return {
                'recommended_method': 'threshold',
                'object_types': ['cell'],
                'confidence': 0.5,
                'guidance_details': {}
            }

    async def _analyze_region_features(self, regions, config):
        """Analyze individual regions for texture-based features.

        Regions are processed in batches (``config['batch_size']``,
        default 10) and each batch's results are cached by region label.

        Returns:
            List of per-region feature dicts.
        """
        batch_size = config.get('batch_size', 10)
        features = []

        # Process regions in batches for efficiency
        for i in range(0, len(regions), batch_size):
            batch = regions[i:i + batch_size]
            batch_patches = [self._extract_patch(region) for region in batch]

            # Convert patches to VLM-analyzable format and analyze
            batch_features = []
            for patch in batch_patches:
                # For now, create mock feature analysis since we don't have actual image patches
                feature = {
                    'patch_id': patch.get('patch_id', 0),
                    'features': self._extract_texture_features_from_patch(patch),
                    'confidence': 0.7,
                    'type': 'region_analysis',
                    'properties': patch.get('properties', {})
                }
                batch_features.append(feature)

            features.extend(batch_features)

            # Cache results to avoid re-analysis
            self._cache_features(batch, batch_features)

        return features

    def _extract_patch(self, region, padding=10):
        """Extract a patch descriptor from a region.

        Args:
            region: regionprops-like object; missing attributes are
                tolerated and replaced with mock defaults.
            padding: Pixels added around the bounding box.  NOTE(review):
                only the min corner is clamped to 0; the max corner is not
                clamped to the image shape (shape is unknown here).

        Returns:
            Dict describing the patch (``patch_data`` is currently None).
        """
        try:
            if not hasattr(region, 'bbox') or not hasattr(region, 'image'):
                # If region doesn't have proper properties, return a mock patch
                return {
                    'patch_id': getattr(region, 'label', 0),
                    'bbox': getattr(region, 'bbox', (0, 0, 50, 50)),
                    'area': getattr(region, 'area', 100),
                    'centroid': getattr(region, 'centroid', (25, 25)),
                    'patch_data': None  # Would normally contain image data
                }

            # Extract bounding box with padding
            minr, minc, maxr, maxc = region.bbox
            minr = max(0, minr - padding)
            minc = max(0, minc - padding)

            # Create patch info
            patch_info = {
                'patch_id': region.label,
                'bbox': (minr, minc, maxr + padding, maxc + padding),
                'area': region.area,
                'centroid': region.centroid,
                'patch_data': None,  # Could store actual image patch here
                'properties': {
                    'eccentricity': getattr(region, 'eccentricity', 0),
                    'solidity': getattr(region, 'solidity', 0),
                    'extent': getattr(region, 'extent', 0)
                }
            }

            return patch_info

        except Exception as e:
            logger.error(f"Patch extraction failed: {e}")
            return {
                'patch_id': 0,
                'bbox': (0, 0, 50, 50),
                'area': 100,
                'centroid': (25, 25),
                'patch_data': None
            }

    def _cache_features(self, regions, features):
        """Cache features for regions to avoid re-analysis.

        Regions without a ``label`` attribute are skipped rather than
        crashing the batch (mirrors the defensive getattr in
        _extract_patch).
        """
        for region, feature in zip(regions, features):
            label = getattr(region, 'label', None)
            if label is not None:
                self.cache[label] = feature

    async def _generate_population_insights(self, region_features, global_context):
        """Generate insights at the population level.

        Combines a quantitative summary (counts, distributions, mean
        confidence) with VLM-generated insights.  Returns a minimal
        error payload instead of raising when the VLM call fails.
        """
        try:
            # Aggregate feature data for population analysis
            population_data = {
                'total_regions': len(region_features),
                'feature_distribution': self._analyze_feature_distribution(region_features),
                'global_context': global_context
            }

            # Use VLM to generate population-level insights
            insights = await self.vlm.generate_population_insights(region_features)

            # Combine with quantitative summary
            population_summary = {
                'total_objects': population_data['total_regions'],
                'feature_summary': population_data['feature_distribution'],
                'vlm_insights': insights,
                'quality_metrics': {
                    'confidence_mean': self._calculate_mean_confidence(region_features),
                    'feature_diversity': len(set([f.get('type', 'unknown') for f in region_features]))
                }
            }

            return population_summary

        except Exception as e:
            logger.error(f"Population insights generation failed: {e}")
            return {
                'total_objects': len(region_features),
                'summary': f'Detected {len(region_features)} regions',
                'error': str(e)
            }

    async def _map_global_context_to_cmpo(self, global_context):
        """Map global scene context to population-level and general CMPO terms."""
        try:
            from ..cmpo.mapping import map_to_cmpo, validate_mappings_with_vlm

            if not global_context or not isinstance(global_context, dict):
                return []

            # Extract description for mapping
            description = global_context.get('description', '')
            if not description:
                return []

            # Stage 1: Ontology-aware mapping
            mappings = map_to_cmpo(description, self.cmpo_mapper, context='cell_population')

            # Stage 2: VLM biological reasoning validation (always apply)
            if mappings:
                try:
                    validated_mappings = await validate_mappings_with_vlm(
                        description, mappings, self.vlm, max_candidates=5
                    )
                    mappings = validated_mappings if validated_mappings else mappings
                    logger.info(f"VLM biological reasoning applied to global context mappings")
                except Exception as vlm_error:
                    logger.warning(f"VLM validation failed, using ontology mappings: {vlm_error}")

            # Add stage information
            for mapping in mappings:
                mapping['stage'] = 'global_context'
                mapping['source'] = 'global_scene_analysis'
                mapping['validated'] = True  # Mark as VLM-validated

            logger.info(f"Global context mapped to {len(mappings)} CMPO terms")
            return mappings

        except Exception as e:
            logger.error(f"Global context CMPO mapping failed: {e}")
            return []

    async def _map_region_features_to_cmpo(self, region_features):
        """Map individual region features to cellular phenotype CMPO terms."""
        try:
            # BUGFIX: validate_mappings_with_vlm was called below but never
            # imported, raising NameError whenever mappings were non-empty.
            from ..cmpo.mapping import map_to_cmpo, validate_mappings_with_vlm

            cmpo_mappings = []

            for i, feature in enumerate(region_features):
                if isinstance(feature, dict):
                    # Extract meaningful descriptions from region features
                    descriptions = self._extract_region_descriptions(feature)

                    for desc_type, description in descriptions.items():
                        if description:
                            # Stage 1: Map with cellular phenotype context
                            mappings = map_to_cmpo(description, self.cmpo_mapper, context='cellular_phenotype')

                            # Stage 2: VLM biological reasoning validation (always apply)
                            if mappings:
                                try:
                                    validated_mappings = await validate_mappings_with_vlm(
                                        description, mappings, self.vlm, max_candidates=3
                                    )
                                    mappings = validated_mappings if validated_mappings else mappings
                                except Exception as vlm_error:
                                    logger.warning(f"VLM validation failed for region {i}: {vlm_error}")

                            # Add region and stage information
                            for mapping in mappings:
                                mapping['stage'] = 'region_features'
                                mapping['source'] = f'region_{i}_{desc_type}'
                                mapping['region_id'] = i
                                mapping['validated'] = True

                            cmpo_mappings.extend(mappings)

            logger.info(f"Region features mapped to {len(cmpo_mappings)} CMPO terms")
            return cmpo_mappings

        except Exception as e:
            logger.error(f"Region features CMPO mapping failed: {e}")
            return []

    async def _map_population_insights_to_cmpo(self, population_insights):
        """Map population-level insights to cell population phenotype CMPO terms."""
        try:
            # BUGFIX: validate_mappings_with_vlm was called below but never
            # imported, raising NameError whenever mappings were non-empty.
            from ..cmpo.mapping import map_to_cmpo, validate_mappings_with_vlm

            if not population_insights or not isinstance(population_insights, dict):
                return []

            cmpo_mappings = []

            # Map different aspects of population insights
            insight_aspects = {
                'summary': population_insights.get('summary', ''),
                'phenotypes': ', '.join(population_insights.get('phenotypes', [])),
                'characteristics': population_insights.get('characteristics', ''),
                'technical_notes': population_insights.get('technical_notes', '')
            }

            for aspect_type, description in insight_aspects.items():
                if description:
                    # Stage 1: Map with appropriate context
                    context = 'cell_population' if aspect_type in ['summary', 'characteristics'] else 'cellular_phenotype'
                    mappings = map_to_cmpo(description, self.cmpo_mapper, context=context)

                    # Stage 2: VLM biological reasoning validation (always apply)
                    if mappings:
                        try:
                            validated_mappings = await validate_mappings_with_vlm(
                                description, mappings, self.vlm, max_candidates=3
                            )
                            mappings = validated_mappings if validated_mappings else mappings
                        except Exception as vlm_error:
                            logger.warning(f"VLM validation failed for population {aspect_type}: {vlm_error}")

                    # Add population and stage information
                    for mapping in mappings:
                        mapping['stage'] = 'population_insights'
                        mapping['source'] = f'population_{aspect_type}'
                        mapping['validated'] = True

                    cmpo_mappings.extend(mappings)

            logger.info(f"Population insights mapped to {len(cmpo_mappings)} CMPO terms")
            return cmpo_mappings

        except Exception as e:
            logger.error(f"Population insights CMPO mapping failed: {e}")
            return []

    def _extract_region_descriptions(self, feature):
        """Extract meaningful descriptions from region features for CMPO mapping.

        Returns:
            Dict mapping a description type (``morphology``, ``phenotype``,
            ``characteristics``, ``cell_type``, ``features``) to a text
            snippet suitable for ontology mapping.
        """
        descriptions = {}

        # Extract different types of descriptive information
        if 'properties' in feature:
            props = feature['properties']

            # Morphological descriptions
            if 'morphology' in props:
                descriptions['morphology'] = props['morphology']

            # Phenotypic characteristics
            if 'phenotype' in props:
                descriptions['phenotype'] = props['phenotype']

            # General characteristics
            if 'characteristics' in props:
                descriptions['characteristics'] = props['characteristics']

        # Extract from feature type/classification
        if 'type' in feature:
            descriptions['cell_type'] = f"{feature['type']} cell"

        # Extract from confidence-based features
        if 'features' in feature:
            feat_list = feature['features']
            if isinstance(feat_list, list) and feat_list:
                descriptions['features'] = ', '.join(str(f) for f in feat_list[:3])  # Top 3 features

        return descriptions

    def _create_cmpo_summary(self, global_cmpo, region_cmpo, population_cmpo):
        """Create a comprehensive CMPO summary across all stages.

        De-duplicates mappings by ``CMPO_ID``, keeps the highest
        confidence seen for each term, tracks which stages contributed,
        and reports the top five terms.
        """
        try:
            all_mappings = []

            # Collect all mappings
            if global_cmpo:
                all_mappings.extend(global_cmpo)
            if region_cmpo:
                all_mappings.extend(region_cmpo)
            if population_cmpo:
                all_mappings.extend(population_cmpo)

            if not all_mappings:
                return {'summary': 'No CMPO mappings found', 'mappings': []}

            # Group by CMPO ID to avoid duplicates
            unique_mappings = {}
            for mapping in all_mappings:
                cmpo_id = mapping.get('CMPO_ID')
                if cmpo_id:
                    if cmpo_id not in unique_mappings:
                        unique_mappings[cmpo_id] = mapping.copy()
                        unique_mappings[cmpo_id]['sources'] = []

                    # Track which stages contributed to this mapping
                    source_info = {
                        'stage': mapping.get('stage'),
                        'source': mapping.get('source'),
                        'confidence': mapping.get('confidence', 0)
                    }
                    unique_mappings[cmpo_id]['sources'].append(source_info)

                    # Update confidence to highest across stages
                    current_conf = unique_mappings[cmpo_id].get('confidence', 0)
                    new_conf = mapping.get('confidence', 0)
                    if new_conf > current_conf:
                        unique_mappings[cmpo_id]['confidence'] = new_conf

            # Sort by confidence
            sorted_mappings = sorted(unique_mappings.values(),
                                     key=lambda x: x.get('confidence', 0), reverse=True)

            # Create summary statistics
            stage_counts = {}
            for mapping in all_mappings:
                stage = mapping.get('stage', 'unknown')
                stage_counts[stage] = stage_counts.get(stage, 0) + 1

            summary = {
                'total_unique_terms': len(unique_mappings),
                'total_mappings': len(all_mappings),
                'stage_breakdown': stage_counts,
                'top_terms': [
                    {
                        'term': mapping.get('term_name'),
                        'cmpo_id': mapping.get('CMPO_ID'),
                        'confidence': mapping.get('confidence', 0),
                        'stages': [s['stage'] for s in mapping.get('sources', [])]
                    }
                    for mapping in sorted_mappings[:5]
                ],
                'mappings': sorted_mappings
            }

            return summary

        except Exception as e:
            logger.error(f"CMPO summary creation failed: {e}")
            return {'summary': f'Error creating CMPO summary: {str(e)}', 'mappings': []}

    def _extract_mappable_features(self, feature):
        """Extract features that can be mapped to CMPO terms (legacy function)."""
        mappable = {}

        # Extract common feature types
        if 'features' in feature:
            for feat in feature['features']:
                mappable[feat] = feature.get('confidence', 0.5)

        if 'type' in feature:
            mappable[feature['type']] = feature.get('confidence', 0.5)

        # Extract morphological features if present
        for key in ['shape', 'texture', 'intensity', 'size']:
            if key in feature:
                mappable[key] = feature[key]

        return mappable

    def _deduplicate_mappings(self, mappings):
        """Remove duplicate CMPO mappings and sort by confidence.

        NOTE(review): keys on lowercase 'cmpo_id', while _create_cmpo_summary
        keys on 'CMPO_ID' — confirm which casing upstream mappers emit.
        """
        seen = set()
        unique = []

        for mapping in mappings:
            if isinstance(mapping, dict):
                cmpo_id = mapping.get('cmpo_id', '')
                if cmpo_id and cmpo_id not in seen:
                    seen.add(cmpo_id)
                    unique.append(mapping)

        # Sort by confidence score
        return sorted(unique, key=lambda x: x.get('confidence', 0), reverse=True)

    def _analyze_feature_distribution(self, features):
        """Analyze the distribution of feature types across regions."""
        distribution = {}

        for feature in features:
            if isinstance(feature, dict):
                feat_type = feature.get('type', 'unknown')
                distribution[feat_type] = distribution.get(feat_type, 0) + 1

        return distribution

    def _calculate_mean_confidence(self, features):
        """Calculate mean confidence across all features (0.0 when none)."""
        confidences = []

        for feature in features:
            if isinstance(feature, dict) and 'confidence' in feature:
                confidences.append(feature['confidence'])

        return sum(confidences) / len(confidences) if confidences else 0.0

    def _extract_texture_features_from_patch(self, patch):
        """Extract basic texture features from a patch descriptor.

        Classifies shape from eccentricity, boundary from solidity, and
        size from area; thresholds are heuristic defaults.
        """
        features = []

        # Extract features based on patch properties
        properties = patch.get('properties', {})
        area = patch.get('area', 0)

        # Classify based on morphological properties
        if properties.get('eccentricity', 0) > 0.8:
            features.append('elongated')
        elif properties.get('eccentricity', 0) < 0.3:
            features.append('round')
        else:
            features.append('oval')

        if properties.get('solidity', 0) > 0.9:
            features.append('smooth_boundary')
        elif properties.get('solidity', 0) < 0.7:
            features.append('irregular_boundary')

        if area > 2000:
            features.append('large')
        elif area < 500:
            features.append('small')
        else:
            features.append('medium')

        # Add texture descriptors (would normally come from image analysis)
        features.extend(['textured', 'cellular'])

        return features