pskeshu committed on
Commit
b69e9e7
·
1 Parent(s): cba4849

minimal example

Browse files
.streamlit/config.toml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [server]
2
+ # Increase file upload size limit for large microscopy images (in MB)
3
+ maxUploadSize = 200
4
+ # Auto-reload when files change in development
5
+ fileWatcherType = "auto"
6
+ # Run on specific port
7
+ port = 8501
8
+
9
+ [theme]
10
+ # Optional: Customize app appearance
11
+ primaryColor = "#1f77b4"
12
+ backgroundColor = "#ffffff"
13
+ secondaryBackgroundColor = "#f0f2f6"
14
+ textColor = "#262730"
.streamlit/secrets.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Streamlit Cloud Secrets Configuration
2
+ #
3
+ # Add your API keys here in the Streamlit Cloud dashboard:
4
+ # 1. Go to your app settings in Streamlit Cloud
5
+ # 2. Navigate to "Secrets" tab
6
+ # 3. Add the following secrets:
7
+
8
+ # Example format (don't put real keys in this file):
9
+ # GOOGLE_API_KEY = "your-google-api-key-here"
10
+ # ANTHROPIC_API_KEY = "your-anthropic-api-key-here"
11
+
12
+ # Note: This file is a template - real secrets should only be entered
13
+ # in the Streamlit Cloud dashboard for security.
anton/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """Anton: VLM-driven microscopy phenotype analysis framework."""
2
+
3
+ __version__ = "0.2.0"
anton/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (241 Bytes). View file
 
anton/analysis/__pycache__/qualitative.cpython-313.pyc ADDED
Binary file (21.2 kB). View file
 
anton/analysis/__pycache__/quantitative.cpython-313.pyc ADDED
Binary file (29.1 kB). View file
 
anton/analysis/qualitative.py ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Qualitative analysis tools for Anton's pipeline."""
2
+
3
+ import asyncio
4
+ import logging
5
+ from pathlib import Path
6
+ from typing import Dict, List, Optional, Any
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
class QualitativeAnalyzer:
    """Runs VLM-driven qualitative analysis and maps findings to CMPO terms."""

    def __init__(self, vlm_interface, cmpo_mapper):
        # VLM backend used for scene / region / population analysis.
        self.vlm = vlm_interface
        # Maps free-text descriptions to CMPO ontology terms.
        self.cmpo_mapper = cmpo_mapper
        # Per-region feature cache keyed by region label (see _cache_features).
        self.cache = {}
15
+
16
+ async def extract_qualitative_features(self, image_path, regions, config):
17
+ """Main qualitative analysis pipeline with multi-stage CMPO integration."""
18
+
19
+ # Stage 1: Global scene understanding + CMPO mapping
20
+ global_context = await self.vlm.analyze_global_scene(image_path, config.get('channels'))
21
+ global_cmpo = await self._map_global_context_to_cmpo(global_context)
22
+
23
+ # Stage 2: Object-level guidance (if needed)
24
+ segmentation_guidance = await self._get_segmentation_guidance(image_path, global_context)
25
+
26
+ # Stage 3: Feature extraction from regions + CMPO mapping
27
+ region_features = await self._analyze_region_features(regions, config)
28
+ region_cmpo = await self._map_region_features_to_cmpo(region_features)
29
+
30
+ # Stage 4: Population-level insights + CMPO mapping
31
+ population_insights = await self._generate_population_insights(region_features, global_context)
32
+ population_cmpo = await self._map_population_insights_to_cmpo(population_insights)
33
+
34
+ return {
35
+ 'global_context': global_context,
36
+ 'global_cmpo': global_cmpo,
37
+ 'segmentation_guidance': segmentation_guidance,
38
+ 'region_features': region_features,
39
+ 'region_cmpo': region_cmpo,
40
+ 'population_insights': population_insights,
41
+ 'population_cmpo': population_cmpo,
42
+ 'cmpo_summary': self._create_cmpo_summary(global_cmpo, region_cmpo, population_cmpo)
43
+ }
44
+
45
+ async def _get_segmentation_guidance(self, image_path, global_context):
46
+ """Get guidance for segmentation based on global context."""
47
+ try:
48
+ # Use VLM to provide segmentation guidance based on global context
49
+ guidance = await self.vlm.detect_objects_and_guide(image_path, global_context)
50
+
51
+ return {
52
+ 'recommended_method': guidance.get('segmentation_guidance', 'threshold'),
53
+ 'object_types': [obj.get('type', 'unknown') for obj in guidance.get('detected_objects', [])],
54
+ 'confidence': guidance.get('object_count_estimate', 0),
55
+ 'guidance_details': guidance
56
+ }
57
+ except Exception as e:
58
+ logger.error(f"Segmentation guidance failed: {e}")
59
+ return {
60
+ 'recommended_method': 'threshold',
61
+ 'object_types': ['cell'],
62
+ 'confidence': 0.5,
63
+ 'guidance_details': {}
64
+ }
65
+
66
+ async def _analyze_region_features(self, regions, config):
67
+ """Analyze individual regions for texture-based features."""
68
+ batch_size = config.get('batch_size', 10)
69
+ features = []
70
+
71
+ # Process regions in batches for efficiency
72
+ for i in range(0, len(regions), batch_size):
73
+ batch = regions[i:i+batch_size]
74
+ batch_patches = [self._extract_patch(region) for region in batch]
75
+
76
+ # Convert patches to VLM-analyzable format and analyze
77
+ batch_features = []
78
+ for patch in batch_patches:
79
+ # For now, create mock feature analysis since we don't have actual image patches
80
+ feature = {
81
+ 'patch_id': patch.get('patch_id', 0),
82
+ 'features': self._extract_texture_features_from_patch(patch),
83
+ 'confidence': 0.7,
84
+ 'type': 'region_analysis',
85
+ 'properties': patch.get('properties', {})
86
+ }
87
+ batch_features.append(feature)
88
+
89
+ features.extend(batch_features)
90
+
91
+ # Cache results to avoid re-analysis
92
+ self._cache_features(batch, batch_features)
93
+
94
+ return features
95
+
96
+ def _extract_patch(self, region, padding=10):
97
+ """Extract a patch from a region."""
98
+ try:
99
+ if not hasattr(region, 'bbox') or not hasattr(region, 'image'):
100
+ # If region doesn't have proper properties, return a mock patch
101
+ return {
102
+ 'patch_id': getattr(region, 'label', 0),
103
+ 'bbox': getattr(region, 'bbox', (0, 0, 50, 50)),
104
+ 'area': getattr(region, 'area', 100),
105
+ 'centroid': getattr(region, 'centroid', (25, 25)),
106
+ 'patch_data': None # Would normally contain image data
107
+ }
108
+
109
+ # Extract bounding box with padding
110
+ minr, minc, maxr, maxc = region.bbox
111
+ minr = max(0, minr - padding)
112
+ minc = max(0, minc - padding)
113
+
114
+ # Create patch info
115
+ patch_info = {
116
+ 'patch_id': region.label,
117
+ 'bbox': (minr, minc, maxr + padding, maxc + padding),
118
+ 'area': region.area,
119
+ 'centroid': region.centroid,
120
+ 'patch_data': None, # Could store actual image patch here
121
+ 'properties': {
122
+ 'eccentricity': getattr(region, 'eccentricity', 0),
123
+ 'solidity': getattr(region, 'solidity', 0),
124
+ 'extent': getattr(region, 'extent', 0)
125
+ }
126
+ }
127
+
128
+ return patch_info
129
+
130
+ except Exception as e:
131
+ logger.error(f"Patch extraction failed: {e}")
132
+ return {
133
+ 'patch_id': 0,
134
+ 'bbox': (0, 0, 50, 50),
135
+ 'area': 100,
136
+ 'centroid': (25, 25),
137
+ 'patch_data': None
138
+ }
139
+
140
+ def _cache_features(self, regions, features):
141
+ """Cache features for regions to avoid re-analysis."""
142
+ for region, feature in zip(regions, features):
143
+ self.cache[region.label] = feature
144
+
145
+ async def _generate_population_insights(self, region_features, global_context):
146
+ """Generate insights at the population level."""
147
+ try:
148
+ # Aggregate feature data for population analysis
149
+ population_data = {
150
+ 'total_regions': len(region_features),
151
+ 'feature_distribution': self._analyze_feature_distribution(region_features),
152
+ 'global_context': global_context
153
+ }
154
+
155
+ # Use VLM to generate population-level insights
156
+ insights = await self.vlm.generate_population_insights(region_features)
157
+
158
+ # Combine with quantitative summary
159
+ population_summary = {
160
+ 'total_objects': population_data['total_regions'],
161
+ 'feature_summary': population_data['feature_distribution'],
162
+ 'vlm_insights': insights,
163
+ 'quality_metrics': {
164
+ 'confidence_mean': self._calculate_mean_confidence(region_features),
165
+ 'feature_diversity': len(set([f.get('type', 'unknown') for f in region_features]))
166
+ }
167
+ }
168
+
169
+ return population_summary
170
+
171
+ except Exception as e:
172
+ logger.error(f"Population insights generation failed: {e}")
173
+ return {
174
+ 'total_objects': len(region_features),
175
+ 'summary': f'Detected {len(region_features)} regions',
176
+ 'error': str(e)
177
+ }
178
+
179
+ async def _map_global_context_to_cmpo(self, global_context):
180
+ """Map global scene context to population-level and general CMPO terms."""
181
+ try:
182
+ from ..cmpo.mapping import map_to_cmpo, validate_mappings_with_vlm
183
+
184
+ if not global_context or not isinstance(global_context, dict):
185
+ return []
186
+
187
+ # Extract description for mapping
188
+ description = global_context.get('description', '')
189
+ if not description:
190
+ return []
191
+
192
+ # Stage 1: Ontology-aware mapping
193
+ mappings = map_to_cmpo(description, self.cmpo_mapper, context='cell_population')
194
+
195
+ # Stage 2: VLM biological reasoning validation (always apply)
196
+ if mappings:
197
+ try:
198
+ validated_mappings = await validate_mappings_with_vlm(
199
+ description, mappings, self.vlm, max_candidates=5
200
+ )
201
+ mappings = validated_mappings if validated_mappings else mappings
202
+ logger.info(f"VLM biological reasoning applied to global context mappings")
203
+ except Exception as vlm_error:
204
+ logger.warning(f"VLM validation failed, using ontology mappings: {vlm_error}")
205
+
206
+ # Add stage information
207
+ for mapping in mappings:
208
+ mapping['stage'] = 'global_context'
209
+ mapping['source'] = 'global_scene_analysis'
210
+ mapping['validated'] = True # Mark as VLM-validated
211
+
212
+ logger.info(f"Global context mapped to {len(mappings)} CMPO terms")
213
+ return mappings
214
+
215
+ except Exception as e:
216
+ logger.error(f"Global context CMPO mapping failed: {e}")
217
+ return []
218
+
219
+ async def _map_region_features_to_cmpo(self, region_features):
220
+ """Map individual region features to cellular phenotype CMPO terms."""
221
+ try:
222
+ from ..cmpo.mapping import map_to_cmpo
223
+
224
+ cmpo_mappings = []
225
+
226
+ for i, feature in enumerate(region_features):
227
+ if isinstance(feature, dict):
228
+ # Extract meaningful descriptions from region features
229
+ descriptions = self._extract_region_descriptions(feature)
230
+
231
+ for desc_type, description in descriptions.items():
232
+ if description:
233
+ # Stage 1: Map with cellular phenotype context
234
+ mappings = map_to_cmpo(description, self.cmpo_mapper, context='cellular_phenotype')
235
+
236
+ # Stage 2: VLM biological reasoning validation (always apply)
237
+ if mappings:
238
+ try:
239
+ validated_mappings = await validate_mappings_with_vlm(
240
+ description, mappings, self.vlm, max_candidates=3
241
+ )
242
+ mappings = validated_mappings if validated_mappings else mappings
243
+ except Exception as vlm_error:
244
+ logger.warning(f"VLM validation failed for region {i}: {vlm_error}")
245
+
246
+ # Add region and stage information
247
+ for mapping in mappings:
248
+ mapping['stage'] = 'region_features'
249
+ mapping['source'] = f'region_{i}_{desc_type}'
250
+ mapping['region_id'] = i
251
+ mapping['validated'] = True
252
+
253
+ cmpo_mappings.extend(mappings)
254
+
255
+ logger.info(f"Region features mapped to {len(cmpo_mappings)} CMPO terms")
256
+ return cmpo_mappings
257
+
258
+ except Exception as e:
259
+ logger.error(f"Region features CMPO mapping failed: {e}")
260
+ return []
261
+
262
+ async def _map_population_insights_to_cmpo(self, population_insights):
263
+ """Map population-level insights to cell population phenotype CMPO terms."""
264
+ try:
265
+ from ..cmpo.mapping import map_to_cmpo
266
+
267
+ if not population_insights or not isinstance(population_insights, dict):
268
+ return []
269
+
270
+ cmpo_mappings = []
271
+
272
+ # Map different aspects of population insights
273
+ insight_aspects = {
274
+ 'summary': population_insights.get('summary', ''),
275
+ 'phenotypes': ', '.join(population_insights.get('phenotypes', [])),
276
+ 'characteristics': population_insights.get('characteristics', ''),
277
+ 'technical_notes': population_insights.get('technical_notes', '')
278
+ }
279
+
280
+ for aspect_type, description in insight_aspects.items():
281
+ if description:
282
+ # Stage 1: Map with appropriate context
283
+ context = 'cell_population' if aspect_type in ['summary', 'characteristics'] else 'cellular_phenotype'
284
+ mappings = map_to_cmpo(description, self.cmpo_mapper, context=context)
285
+
286
+ # Stage 2: VLM biological reasoning validation (always apply)
287
+ if mappings:
288
+ try:
289
+ validated_mappings = await validate_mappings_with_vlm(
290
+ description, mappings, self.vlm, max_candidates=3
291
+ )
292
+ mappings = validated_mappings if validated_mappings else mappings
293
+ except Exception as vlm_error:
294
+ logger.warning(f"VLM validation failed for population {aspect_type}: {vlm_error}")
295
+
296
+ # Add population and stage information
297
+ for mapping in mappings:
298
+ mapping['stage'] = 'population_insights'
299
+ mapping['source'] = f'population_{aspect_type}'
300
+ mapping['validated'] = True
301
+
302
+ cmpo_mappings.extend(mappings)
303
+
304
+ logger.info(f"Population insights mapped to {len(cmpo_mappings)} CMPO terms")
305
+ return cmpo_mappings
306
+
307
+ except Exception as e:
308
+ logger.error(f"Population insights CMPO mapping failed: {e}")
309
+ return []
310
+
311
+ def _extract_region_descriptions(self, feature):
312
+ """Extract meaningful descriptions from region features for CMPO mapping."""
313
+ descriptions = {}
314
+
315
+ # Extract different types of descriptive information
316
+ if 'properties' in feature:
317
+ props = feature['properties']
318
+
319
+ # Morphological descriptions
320
+ if 'morphology' in props:
321
+ descriptions['morphology'] = props['morphology']
322
+
323
+ # Phenotypic characteristics
324
+ if 'phenotype' in props:
325
+ descriptions['phenotype'] = props['phenotype']
326
+
327
+ # General characteristics
328
+ if 'characteristics' in props:
329
+ descriptions['characteristics'] = props['characteristics']
330
+
331
+ # Extract from feature type/classification
332
+ if 'type' in feature:
333
+ descriptions['cell_type'] = f"{feature['type']} cell"
334
+
335
+ # Extract from confidence-based features
336
+ if 'features' in feature:
337
+ feat_list = feature['features']
338
+ if isinstance(feat_list, list) and feat_list:
339
+ descriptions['features'] = ', '.join(str(f) for f in feat_list[:3]) # Top 3 features
340
+
341
+ return descriptions
342
+
343
+ def _create_cmpo_summary(self, global_cmpo, region_cmpo, population_cmpo):
344
+ """Create a comprehensive CMPO summary across all stages."""
345
+ try:
346
+ all_mappings = []
347
+
348
+ # Collect all mappings
349
+ if global_cmpo:
350
+ all_mappings.extend(global_cmpo)
351
+ if region_cmpo:
352
+ all_mappings.extend(region_cmpo)
353
+ if population_cmpo:
354
+ all_mappings.extend(population_cmpo)
355
+
356
+ if not all_mappings:
357
+ return {'summary': 'No CMPO mappings found', 'mappings': []}
358
+
359
+ # Group by CMPO ID to avoid duplicates
360
+ unique_mappings = {}
361
+ for mapping in all_mappings:
362
+ cmpo_id = mapping.get('CMPO_ID')
363
+ if cmpo_id:
364
+ if cmpo_id not in unique_mappings:
365
+ unique_mappings[cmpo_id] = mapping.copy()
366
+ unique_mappings[cmpo_id]['sources'] = []
367
+
368
+ # Track which stages contributed to this mapping
369
+ source_info = {
370
+ 'stage': mapping.get('stage'),
371
+ 'source': mapping.get('source'),
372
+ 'confidence': mapping.get('confidence', 0)
373
+ }
374
+ unique_mappings[cmpo_id]['sources'].append(source_info)
375
+
376
+ # Update confidence to highest across stages
377
+ current_conf = unique_mappings[cmpo_id].get('confidence', 0)
378
+ new_conf = mapping.get('confidence', 0)
379
+ if new_conf > current_conf:
380
+ unique_mappings[cmpo_id]['confidence'] = new_conf
381
+
382
+ # Sort by confidence
383
+ sorted_mappings = sorted(unique_mappings.values(),
384
+ key=lambda x: x.get('confidence', 0), reverse=True)
385
+
386
+ # Create summary statistics
387
+ stage_counts = {}
388
+ for mapping in all_mappings:
389
+ stage = mapping.get('stage', 'unknown')
390
+ stage_counts[stage] = stage_counts.get(stage, 0) + 1
391
+
392
+ summary = {
393
+ 'total_unique_terms': len(unique_mappings),
394
+ 'total_mappings': len(all_mappings),
395
+ 'stage_breakdown': stage_counts,
396
+ 'top_terms': [
397
+ {
398
+ 'term': mapping.get('term_name'),
399
+ 'cmpo_id': mapping.get('CMPO_ID'),
400
+ 'confidence': mapping.get('confidence', 0),
401
+ 'stages': [s['stage'] for s in mapping.get('sources', [])]
402
+ }
403
+ for mapping in sorted_mappings[:5]
404
+ ],
405
+ 'mappings': sorted_mappings
406
+ }
407
+
408
+ return summary
409
+
410
+ except Exception as e:
411
+ logger.error(f"CMPO summary creation failed: {e}")
412
+ return {'summary': f'Error creating CMPO summary: {str(e)}', 'mappings': []}
413
+
414
+ def _extract_mappable_features(self, feature):
415
+ """Extract features that can be mapped to CMPO terms (legacy function)."""
416
+ mappable = {}
417
+
418
+ # Extract common feature types
419
+ if 'features' in feature:
420
+ for feat in feature['features']:
421
+ mappable[feat] = feature.get('confidence', 0.5)
422
+
423
+ if 'type' in feature:
424
+ mappable[feature['type']] = feature.get('confidence', 0.5)
425
+
426
+ # Extract morphological features if present
427
+ for key in ['shape', 'texture', 'intensity', 'size']:
428
+ if key in feature:
429
+ mappable[key] = feature[key]
430
+
431
+ return mappable
432
+
433
+ def _deduplicate_mappings(self, mappings):
434
+ """Remove duplicate CMPO mappings and sort by confidence."""
435
+ seen = set()
436
+ unique = []
437
+
438
+ for mapping in mappings:
439
+ if isinstance(mapping, dict):
440
+ cmpo_id = mapping.get('cmpo_id', '')
441
+ if cmpo_id and cmpo_id not in seen:
442
+ seen.add(cmpo_id)
443
+ unique.append(mapping)
444
+
445
+ # Sort by confidence score
446
+ return sorted(unique, key=lambda x: x.get('confidence', 0), reverse=True)
447
+
448
+ def _analyze_feature_distribution(self, features):
449
+ """Analyze the distribution of features across regions."""
450
+ distribution = {}
451
+
452
+ for feature in features:
453
+ if isinstance(feature, dict):
454
+ feat_type = feature.get('type', 'unknown')
455
+ if feat_type in distribution:
456
+ distribution[feat_type] += 1
457
+ else:
458
+ distribution[feat_type] = 1
459
+
460
+ return distribution
461
+
462
+ def _calculate_mean_confidence(self, features):
463
+ """Calculate mean confidence across all features."""
464
+ confidences = []
465
+
466
+ for feature in features:
467
+ if isinstance(feature, dict) and 'confidence' in feature:
468
+ confidences.append(feature['confidence'])
469
+
470
+ return sum(confidences) / len(confidences) if confidences else 0.0
471
+
472
+ def _extract_texture_features_from_patch(self, patch):
473
+ """Extract basic texture features from a patch."""
474
+ features = []
475
+
476
+ # Extract features based on patch properties
477
+ properties = patch.get('properties', {})
478
+ area = patch.get('area', 0)
479
+
480
+ # Classify based on morphological properties
481
+ if properties.get('eccentricity', 0) > 0.8:
482
+ features.append('elongated')
483
+ elif properties.get('eccentricity', 0) < 0.3:
484
+ features.append('round')
485
+ else:
486
+ features.append('oval')
487
+
488
+ if properties.get('solidity', 0) > 0.9:
489
+ features.append('smooth_boundary')
490
+ elif properties.get('solidity', 0) < 0.7:
491
+ features.append('irregular_boundary')
492
+
493
+ if area > 2000:
494
+ features.append('large')
495
+ elif area < 500:
496
+ features.append('small')
497
+ else:
498
+ features.append('medium')
499
+
500
+ # Add texture descriptors (would normally come from image analysis)
501
+ features.extend(['textured', 'cellular'])
502
+
503
+ return features
anton/analysis/quantitative.py ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quantitative analysis tools for Anton's pipeline."""
2
+
3
+ import numpy as np
4
+ import cv2
5
+ from skimage import measure, morphology, filters, segmentation, feature
6
+ from scipy import ndimage
7
+ import pandas as pd
8
+ from enum import Enum
9
+ from typing import List, Dict, Union, Optional, Tuple
10
+ from pathlib import Path
11
+ import logging
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
class SegmentationStrategy(Enum):
    """Available object-segmentation backends.

    CELLPOSE and STARDIST are placeholders that currently fall back to
    THRESHOLD (see the corresponding ``_*_segmentation`` methods).
    """
    THRESHOLD = "threshold"
    WATERSHED = "watershed"
    EDGE = "edge"
    CELLPOSE = "cellpose"
    STARDIST = "stardist"
21
+
22
class QuantitativeAnalyzer:
    """Traditional computer vision analysis tools for microscopy images."""

    def __init__(self, config: Optional[Dict] = None):
        """Initialize the quantitative analyzer.

        Args:
            config: Configuration dictionary with analysis parameters
                (e.g. 'min_object_area' / 'max_object_area' used by the
                segmentation methods).
        """
        self.config = config or {}
        # Dispatch table from strategy enum to the bound implementation;
        # _segment_objects looks methods up here.
        self.segmentation_methods = {
            SegmentationStrategy.THRESHOLD: self._threshold_segmentation,
            SegmentationStrategy.WATERSHED: self._watershed_segmentation,
            SegmentationStrategy.EDGE: self._edge_segmentation,
            SegmentationStrategy.CELLPOSE: self._cellpose_segmentation,
            SegmentationStrategy.STARDIST: self._stardist_segmentation,
        }
39
+
40
+ def extract_quantitative_features(self, image_path: Union[str, Path],
41
+ channels: Optional[List[int]] = None,
42
+ method: SegmentationStrategy = SegmentationStrategy.THRESHOLD) -> Dict:
43
+ """Main quantitative analysis pipeline.
44
+
45
+ Args:
46
+ image_path: Path to the image file
47
+ channels: List of channels to analyze
48
+ method: Segmentation method to use
49
+
50
+ Returns:
51
+ Dictionary containing extracted features and analysis results
52
+ """
53
+ try:
54
+ # Load and preprocess image
55
+ from ..utils.image_io import ImageLoader
56
+ loader = ImageLoader()
57
+ image = loader.load(image_path)
58
+
59
+ # Preprocess image
60
+ preprocessed = self._preprocess_image(image, channels)
61
+
62
+ # Segment objects (nuclei, cells, etc.)
63
+ regions = self._segment_objects(preprocessed, method)
64
+
65
+ if not regions:
66
+ logger.warning(f"No regions found in image {image_path}")
67
+ return self._empty_results()
68
+
69
+ # Extract different types of features
70
+ morphological_features = self._extract_morphological_features(image, regions)
71
+ intensity_features = self._extract_intensity_features(image, regions)
72
+ texture_features = self._extract_texture_features(image, regions)
73
+ spatial_features = self._extract_spatial_features(image, regions)
74
+
75
+ # Compute summary statistics
76
+ summary_stats = self._compute_summary_stats(morphological_features, intensity_features)
77
+
78
+ return {
79
+ 'regions': regions,
80
+ 'morphological': morphological_features,
81
+ 'intensity': intensity_features,
82
+ 'texture': texture_features,
83
+ 'spatial': spatial_features,
84
+ 'summary_stats': summary_stats,
85
+ 'num_objects': len(regions),
86
+ 'method_used': method.value
87
+ }
88
+
89
+ except Exception as e:
90
+ logger.error(f"Quantitative analysis failed for {image_path}: {e}")
91
+ raise
92
+
93
+ def _empty_results(self) -> Dict:
94
+ """Return empty results structure when no regions are found."""
95
+ return {
96
+ 'regions': [],
97
+ 'morphological': pd.DataFrame(),
98
+ 'intensity': pd.DataFrame(),
99
+ 'texture': pd.DataFrame(),
100
+ 'spatial': pd.DataFrame(),
101
+ 'summary_stats': {},
102
+ 'num_objects': 0,
103
+ 'method_used': 'none'
104
+ }
105
+
106
+ def _preprocess_image(self, image: np.ndarray, channels: Optional[List[int]] = None) -> np.ndarray:
107
+ """Preprocess image for analysis.
108
+
109
+ Args:
110
+ image: Input image array
111
+ channels: Specific channels to use for segmentation
112
+
113
+ Returns:
114
+ Preprocessed image
115
+ """
116
+ try:
117
+ # Extract specific channels if provided
118
+ if channels and len(image.shape) == 3:
119
+ if len(channels) == 1:
120
+ # Single channel for segmentation
121
+ processed = image[:, :, channels[0]]
122
+ else:
123
+ # Multiple channels - use first for segmentation
124
+ processed = image[:, :, channels[0]]
125
+ elif len(image.shape) == 3:
126
+ # Convert RGB to grayscale
127
+ processed = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
128
+ else:
129
+ # Already grayscale
130
+ processed = image.copy()
131
+
132
+ # Ensure proper data type
133
+ if processed.dtype != np.uint8:
134
+ # Normalize to 0-255 range
135
+ processed = ((processed - processed.min()) / (processed.max() - processed.min()) * 255).astype(np.uint8)
136
+
137
+ return processed
138
+
139
+ except Exception as e:
140
+ logger.error(f"Image preprocessing failed: {e}")
141
+ raise
142
+
143
+ def _segment_objects(self, image: np.ndarray, method: SegmentationStrategy = SegmentationStrategy.THRESHOLD) -> List:
144
+ """Segment objects using specified method.
145
+
146
+ Args:
147
+ image: Preprocessed image
148
+ method: Segmentation strategy to use
149
+
150
+ Returns:
151
+ List of region properties
152
+ """
153
+ try:
154
+ if method not in self.segmentation_methods:
155
+ logger.warning(f"Unknown method {method}, using threshold")
156
+ method = SegmentationStrategy.THRESHOLD
157
+
158
+ return self.segmentation_methods[method](image)
159
+
160
+ except Exception as e:
161
+ logger.error(f"Object segmentation failed: {e}")
162
+ return []
163
+
164
+ def _threshold_segmentation(self, image: np.ndarray) -> List:
165
+ """Simple threshold-based segmentation using Otsu's method.
166
+
167
+ Args:
168
+ image: Grayscale input image
169
+
170
+ Returns:
171
+ List of region properties
172
+ """
173
+ try:
174
+ # Apply Gaussian blur to reduce noise
175
+ blurred = cv2.GaussianBlur(image, (5, 5), 0)
176
+
177
+ # Apply Otsu's threshold
178
+ _, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
179
+
180
+ # Clean up with morphological operations
181
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
182
+ cleaned = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
183
+ cleaned = cv2.morphologyEx(cleaned, cv2.MORPH_CLOSE, kernel, iterations=1)
184
+
185
+ # Label connected components
186
+ labeled = measure.label(cleaned)
187
+ regions = measure.regionprops(labeled, intensity_image=image)
188
+
189
+ # Filter by size
190
+ min_area = self.config.get('min_object_area', 50)
191
+ max_area = self.config.get('max_object_area', 10000)
192
+
193
+ filtered_regions = [r for r in regions if min_area <= r.area <= max_area]
194
+
195
+ logger.info(f"Threshold segmentation found {len(filtered_regions)} objects")
196
+ return filtered_regions
197
+
198
+ except Exception as e:
199
+ logger.error(f"Threshold segmentation failed: {e}")
200
+ return []
201
+
202
+ def _watershed_segmentation(self, image: np.ndarray) -> List:
203
+ """Watershed segmentation for overlapping objects.
204
+
205
+ Args:
206
+ image: Grayscale input image
207
+
208
+ Returns:
209
+ List of region properties
210
+ """
211
+ try:
212
+ # Apply Gaussian filter
213
+ blurred = cv2.GaussianBlur(image, (5, 5), 0)
214
+
215
+ # Threshold to get binary image
216
+ _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
217
+
218
+ # Distance transform
219
+ dist_transform = cv2.distanceTransform(binary, cv2.DIST_L2, 5)
220
+
221
+ # Find local maxima as markers
222
+ _, markers = cv2.threshold(dist_transform, 0.4 * dist_transform.max(), 255, 0)
223
+ markers = markers.astype(np.uint8)
224
+
225
+ # Label markers
226
+ _, markers = cv2.connectedComponents(markers)
227
+
228
+ # Apply watershed
229
+ markers = cv2.watershed(cv2.cvtColor(image, cv2.COLOR_GRAY2RGB), markers)
230
+
231
+ # Extract regions
232
+ regions = measure.regionprops(markers, intensity_image=image)
233
+
234
+ # Filter by size
235
+ min_area = self.config.get('min_object_area', 50)
236
+ max_area = self.config.get('max_object_area', 10000)
237
+
238
+ filtered_regions = [r for r in regions if min_area <= r.area <= max_area and r.label > 0]
239
+
240
+ logger.info(f"Watershed segmentation found {len(filtered_regions)} objects")
241
+ return filtered_regions
242
+
243
+ except Exception as e:
244
+ logger.error(f"Watershed segmentation failed: {e}")
245
+ return []
246
+
247
+ def _edge_segmentation(self, image: np.ndarray) -> List:
248
+ """Edge-based segmentation using Canny edge detection.
249
+
250
+ Args:
251
+ image: Grayscale input image
252
+
253
+ Returns:
254
+ List of region properties
255
+ """
256
+ try:
257
+ # Apply Gaussian blur
258
+ blurred = cv2.GaussianBlur(image, (5, 5), 0)
259
+
260
+ # Canny edge detection
261
+ edges = cv2.Canny(blurred, 50, 150)
262
+
263
+ # Close gaps in edges
264
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
265
+ closed = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel, iterations=2)
266
+
267
+ # Fill holes
268
+ filled = ndimage.binary_fill_holes(closed).astype(np.uint8) * 255
269
+
270
+ # Label connected components
271
+ labeled = measure.label(filled)
272
+ regions = measure.regionprops(labeled, intensity_image=image)
273
+
274
+ # Filter by size
275
+ min_area = self.config.get('min_object_area', 50)
276
+ max_area = self.config.get('max_object_area', 10000)
277
+
278
+ filtered_regions = [r for r in regions if min_area <= r.area <= max_area]
279
+
280
+ logger.info(f"Edge segmentation found {len(filtered_regions)} objects")
281
+ return filtered_regions
282
+
283
+ except Exception as e:
284
+ logger.error(f"Edge segmentation failed: {e}")
285
+ return []
286
+
287
    def _cellpose_segmentation(self, image: np.ndarray) -> List:
        """Cellpose segmentation (placeholder for future implementation).

        Currently logs a warning and delegates to threshold segmentation.

        Args:
            image: Input image

        Returns:
            List of region properties
        """
        logger.warning("Cellpose segmentation not implemented, using threshold instead")
        return self._threshold_segmentation(image)
298
+
299
+ def _stardist_segmentation(self, image: np.ndarray) -> List:
300
+ """StarDist segmentation (placeholder for future implementation).
301
+
302
+ Args:
303
+ image: Input image
304
+
305
+ Returns:
306
+ List of region properties
307
+ """
308
+ logger.warning("StarDist segmentation not implemented, using threshold instead")
309
+ return self._threshold_segmentation(image)
310
+
311
+ def _extract_morphological_features(self, image: np.ndarray, regions: List) -> pd.DataFrame:
312
+ """Extract morphological features from segmented regions.
313
+
314
+ Args:
315
+ image: Original image
316
+ regions: List of region properties
317
+
318
+ Returns:
319
+ DataFrame with morphological features
320
+ """
321
+ try:
322
+ features = []
323
+
324
+ for i, region in enumerate(regions):
325
+ feature_dict = {
326
+ 'object_id': i,
327
+ 'area': region.area,
328
+ 'perimeter': region.perimeter,
329
+ 'centroid_x': region.centroid[1],
330
+ 'centroid_y': region.centroid[0],
331
+ 'eccentricity': region.eccentricity,
332
+ 'solidity': region.solidity,
333
+ 'extent': region.extent,
334
+ 'orientation': region.orientation,
335
+ 'major_axis_length': region.major_axis_length,
336
+ 'minor_axis_length': region.minor_axis_length,
337
+ 'equivalent_diameter': region.equivalent_diameter,
338
+ 'convex_area': region.convex_area,
339
+ 'filled_area': region.filled_area,
340
+ 'euler_number': region.euler_number
341
+ }
342
+
343
+ # Derived features
344
+ if region.perimeter > 0:
345
+ feature_dict['compactness'] = (4 * np.pi * region.area) / (region.perimeter ** 2)
346
+ else:
347
+ feature_dict['compactness'] = 0
348
+
349
+ if region.minor_axis_length > 0:
350
+ feature_dict['aspect_ratio'] = region.major_axis_length / region.minor_axis_length
351
+ else:
352
+ feature_dict['aspect_ratio'] = 1
353
+
354
+ features.append(feature_dict)
355
+
356
+ return pd.DataFrame(features)
357
+
358
+ except Exception as e:
359
+ logger.error(f"Morphological feature extraction failed: {e}")
360
+ return pd.DataFrame()
361
+
362
+ def _extract_intensity_features(self, image: np.ndarray, regions: List) -> pd.DataFrame:
363
+ """Extract intensity-based features from segmented regions.
364
+
365
+ Args:
366
+ image: Original image
367
+ regions: List of region properties
368
+
369
+ Returns:
370
+ DataFrame with intensity features
371
+ """
372
+ try:
373
+ features = []
374
+
375
+ # Convert to grayscale if needed
376
+ if len(image.shape) == 3:
377
+ gray_image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
378
+ else:
379
+ gray_image = image
380
+
381
+ for i, region in enumerate(regions):
382
+ # Get pixel intensities for this region
383
+ coords = region.coords
384
+ intensities = gray_image[coords[:, 0], coords[:, 1]]
385
+
386
+ feature_dict = {
387
+ 'object_id': i,
388
+ 'mean_intensity': np.mean(intensities),
389
+ 'median_intensity': np.median(intensities),
390
+ 'std_intensity': np.std(intensities),
391
+ 'min_intensity': np.min(intensities),
392
+ 'max_intensity': np.max(intensities),
393
+ 'intensity_range': np.max(intensities) - np.min(intensities),
394
+ 'integrated_intensity': np.sum(intensities),
395
+ 'weighted_centroid_x': region.weighted_centroid[1],
396
+ 'weighted_centroid_y': region.weighted_centroid[0]
397
+ }
398
+
399
+ # Additional percentiles
400
+ feature_dict['intensity_p25'] = np.percentile(intensities, 25)
401
+ feature_dict['intensity_p75'] = np.percentile(intensities, 75)
402
+ feature_dict['intensity_iqr'] = feature_dict['intensity_p75'] - feature_dict['intensity_p25']
403
+
404
+ features.append(feature_dict)
405
+
406
+ return pd.DataFrame(features)
407
+
408
+ except Exception as e:
409
+ logger.error(f"Intensity feature extraction failed: {e}")
410
+ return pd.DataFrame()
411
+
412
    def _extract_texture_features(self, image: np.ndarray, regions: List) -> pd.DataFrame:
        """Extract simple per-region texture features.

        Despite the historical name, no GLCM/Haralick matrices are computed:
        the features are order statistics of the masked region pixels
        (contrast ~ std, variance, skewness, kurtosis, energy) plus the
        variance of a simplified Local Binary Pattern over the bounding box.

        Args:
            image: Original image; RGB inputs are collapsed to grayscale.
            regions: List of skimage-style region properties (must expose
                ``bbox`` and ``coords``).

        Returns:
            DataFrame with one row per region; empty DataFrame on failure.
        """
        try:
            features = []
            
            # Convert to grayscale if needed
            if len(image.shape) == 3:
                gray_image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
            else:
                gray_image = image
            
            for i, region in enumerate(regions):
                # Extract the region's bounding box from the full image.
                minr, minc, maxr, maxc = region.bbox
                roi = gray_image[minr:maxr, minc:maxc]
                mask = np.zeros_like(roi, dtype=bool)
                
                # Rebase the region's absolute pixel coordinates into the
                # ROI frame, keeping only those that land inside the ROI
                # (defensive: regionprops coords should always fit the bbox).
                coords = region.coords
                local_coords = coords - [minr, minc]
                valid_coords = ((local_coords[:, 0] >= 0) & (local_coords[:, 0] < roi.shape[0]) &
                                (local_coords[:, 1] >= 0) & (local_coords[:, 1] < roi.shape[1]))
                if np.any(valid_coords):
                    mask[local_coords[valid_coords, 0], local_coords[valid_coords, 1]] = True
                
                # Statistics over region pixels only; falls back to the whole
                # ROI if the mask ended up empty.
                roi_masked = roi[mask] if np.any(mask) else roi.flatten()
                
                feature_dict = {
                    'object_id': i,
                    'texture_contrast': np.std(roi_masked) if len(roi_masked) > 1 else 0,
                    'texture_variance': np.var(roi_masked) if len(roi_masked) > 1 else 0,
                    'texture_skewness': self._compute_skewness(roi_masked),
                    'texture_kurtosis': self._compute_kurtosis(roi_masked),
                    'texture_energy': np.sum(roi_masked ** 2) if len(roi_masked) > 0 else 0
                }
                
                # Simplified LBP computed over the rectangular ROI (not the
                # masked pixels), so background inside the bbox contributes.
                if roi.size > 0:
                    lbp_var = self._compute_lbp_variance(roi)
                    feature_dict['lbp_variance'] = lbp_var
                else:
                    feature_dict['lbp_variance'] = 0
                
                features.append(feature_dict)
            
            return pd.DataFrame(features)
            
        except Exception as e:
            logger.error(f"Texture feature extraction failed: {e}")
            return pd.DataFrame()
471
+
472
+ def _extract_spatial_features(self, image: np.ndarray, regions: List) -> pd.DataFrame:
473
+ """Extract spatial and neighborhood features.
474
+
475
+ Args:
476
+ image: Original image
477
+ regions: List of region properties
478
+
479
+ Returns:
480
+ DataFrame with spatial features
481
+ """
482
+ try:
483
+ features = []
484
+
485
+ # Compute centroids for distance calculations
486
+ centroids = np.array([region.centroid for region in regions])
487
+
488
+ for i, region in enumerate(regions):
489
+ feature_dict = {
490
+ 'object_id': i,
491
+ 'distance_to_edge': self._distance_to_edge(region, image.shape),
492
+ 'distance_to_center': self._distance_to_center(region, image.shape)
493
+ }
494
+
495
+ # Neighborhood analysis
496
+ if len(centroids) > 1:
497
+ distances = np.linalg.norm(centroids - region.centroid, axis=1)
498
+ distances = distances[distances > 0] # Exclude self
499
+
500
+ if len(distances) > 0:
501
+ feature_dict['nearest_neighbor_distance'] = np.min(distances)
502
+ feature_dict['mean_neighbor_distance'] = np.mean(distances)
503
+ feature_dict['neighbor_count_50px'] = np.sum(distances < 50)
504
+ feature_dict['neighbor_count_100px'] = np.sum(distances < 100)
505
+ else:
506
+ feature_dict['nearest_neighbor_distance'] = np.inf
507
+ feature_dict['mean_neighbor_distance'] = np.inf
508
+ feature_dict['neighbor_count_50px'] = 0
509
+ feature_dict['neighbor_count_100px'] = 0
510
+ else:
511
+ feature_dict['nearest_neighbor_distance'] = np.inf
512
+ feature_dict['mean_neighbor_distance'] = np.inf
513
+ feature_dict['neighbor_count_50px'] = 0
514
+ feature_dict['neighbor_count_100px'] = 0
515
+
516
+ features.append(feature_dict)
517
+
518
+ return pd.DataFrame(features)
519
+
520
+ except Exception as e:
521
+ logger.error(f"Spatial feature extraction failed: {e}")
522
+ return pd.DataFrame()
523
+
524
+ def _compute_summary_stats(self, morphological_features: pd.DataFrame,
525
+ intensity_features: pd.DataFrame) -> Dict:
526
+ """Compute summary statistics across all objects.
527
+
528
+ Args:
529
+ morphological_features: DataFrame with morphological features
530
+ intensity_features: DataFrame with intensity features
531
+
532
+ Returns:
533
+ Dictionary with summary statistics
534
+ """
535
+ try:
536
+ summary = {}
537
+
538
+ if not morphological_features.empty:
539
+ summary['morphological'] = {
540
+ 'total_objects': len(morphological_features),
541
+ 'mean_area': float(morphological_features['area'].mean()),
542
+ 'std_area': float(morphological_features['area'].std()),
543
+ 'mean_perimeter': float(morphological_features['perimeter'].mean()),
544
+ 'mean_eccentricity': float(morphological_features['eccentricity'].mean()),
545
+ 'mean_solidity': float(morphological_features['solidity'].mean())
546
+ }
547
+
548
+ if not intensity_features.empty:
549
+ summary['intensity'] = {
550
+ 'mean_intensity': float(intensity_features['mean_intensity'].mean()),
551
+ 'overall_integrated_intensity': float(intensity_features['integrated_intensity'].sum()),
552
+ 'intensity_cv': float(intensity_features['mean_intensity'].std() / intensity_features['mean_intensity'].mean())
553
+ if intensity_features['mean_intensity'].mean() > 0 else 0
554
+ }
555
+
556
+ return summary
557
+
558
+ except Exception as e:
559
+ logger.error(f"Summary statistics computation failed: {e}")
560
+ return {}
561
+
562
+ def _compute_skewness(self, data: np.ndarray) -> float:
563
+ """Compute skewness of data."""
564
+ if len(data) < 3:
565
+ return 0.0
566
+ mean_val = np.mean(data)
567
+ std_val = np.std(data)
568
+ if std_val == 0:
569
+ return 0.0
570
+ return np.mean(((data - mean_val) / std_val) ** 3)
571
+
572
+ def _compute_kurtosis(self, data: np.ndarray) -> float:
573
+ """Compute kurtosis of data."""
574
+ if len(data) < 4:
575
+ return 0.0
576
+ mean_val = np.mean(data)
577
+ std_val = np.std(data)
578
+ if std_val == 0:
579
+ return 0.0
580
+ return np.mean(((data - mean_val) / std_val) ** 4) - 3
581
+
582
+ def _compute_lbp_variance(self, image: np.ndarray) -> float:
583
+ """Compute Local Binary Pattern variance (simplified version)."""
584
+ if image.size < 9:
585
+ return 0.0
586
+ try:
587
+ # Simple LBP calculation for center pixels
588
+ center = image[1:-1, 1:-1]
589
+ patterns = []
590
+
591
+ offsets = [(-1, -1), (-1, 0), (-1, 1), (0, 1), (1, 1), (1, 0), (1, -1), (0, -1)]
592
+
593
+ for i in range(center.shape[0]):
594
+ for j in range(center.shape[1]):
595
+ pattern = 0
596
+ center_val = center[i, j]
597
+ for k, (di, dj) in enumerate(offsets):
598
+ if image[i + 1 + di, j + 1 + dj] >= center_val:
599
+ pattern |= (1 << k)
600
+ patterns.append(pattern)
601
+
602
+ return float(np.var(patterns)) if patterns else 0.0
603
+ except:
604
+ return 0.0
605
+
606
+ def _distance_to_edge(self, region, image_shape: Tuple[int, int]) -> float:
607
+ """Compute minimum distance from region centroid to image edge."""
608
+ cy, cx = region.centroid
609
+ height, width = image_shape[:2]
610
+
611
+ distances = [cy, height - cy, cx, width - cx]
612
+ return float(min(distances))
613
+
614
+ def _distance_to_center(self, region, image_shape: Tuple[int, int]) -> float:
615
+ """Compute distance from region centroid to image center."""
616
+ cy, cx = region.centroid
617
+ height, width = image_shape[:2]
618
+ center_y, center_x = height / 2, width / 2
619
+
620
+ return float(np.sqrt((cy - center_y) ** 2 + (cx - center_x) ** 2))
anton/cmpo.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "CMPO_0000094": {"name": "apoptotic cell phenotype", "features": ["apoptosis_markers", "nuclear_fragmentation"]},
3
+ "CMPO_0000140": {"name": "mitotic cell phenotype", "features": ["mitotic_figures", "chromatin_condensation"]},
4
+ "CMPO_0000077": {"name": "abnormal cell morphology phenotype", "features": ["abnormal_morphology", "nuclear_size"]},
5
+ "CMPO_0000098": {"name": "autophagic cell phenotype", "features": ["lc3_puncta"]},
6
+ "CMPO_0000123": {"name": "increased cell size phenotype", "features": ["increased_cell_size"]},
7
+ "CMPO_0000289": {"name": "increased stress fibers phenotype", "features": ["increased_stress_fibers"]}
8
+ }
anton/cmpo/README.md ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CMPO Integration Module
2
+
3
+ ## Overview
4
+
5
+ The Cellular Microscopy Phenotype Ontology (CMPO) integration module is a core component of Anton that provides **semantic mapping between natural language descriptions and standardized scientific terminology**. This module enables Anton to translate VLM-generated insights into scientifically compliant, searchable, and interoperable phenotype classifications.
6
+
7
+ ## Problem Statement
8
+
9
+ Modern microscopy analysis faces a critical challenge: **bridging the semantic gap** between AI-generated natural language descriptions and standardized scientific terminology. While VLMs can provide expert-level biological insights ("cells arrested in metaphase with condensed chromosomes"), these descriptions need to be mapped to formal ontology terms for:
10
+
11
+ - **Scientific standardization**: Ensuring consistent terminology across studies
12
+ - **Data interoperability**: Enabling cross-dataset comparisons and meta-analyses
13
+ - **Knowledge integration**: Connecting observations to broader biological knowledge graphs
14
+ - **Reproducible research**: Providing precise, unambiguous phenotype classifications
15
+
16
+ ## Conceptual Framework
17
+
18
+ ### 1. Multi-Level Hierarchical Mapping
19
+
20
+ CMPO is organized in a hierarchical structure with multiple branches:
21
+
22
+ ```
23
+ CMPO Root
24
+ β”œβ”€β”€ biological_process (GO terms)
25
+ β”œβ”€β”€ cellular_phenotype (398 terms)
26
+ β”‚ β”œβ”€β”€ cell_population_phenotype (73)
27
+ β”‚ β”œβ”€β”€ cell_process_phenotype (157)
28
+ β”‚ β”‚ β”œβ”€β”€ cell_cycle_phenotype (46)
29
+ β”‚ β”‚ β”‚ β”œβ”€β”€ cell_cycle_arrested_phenotype (6)
30
+ β”‚ β”‚ β”‚ β”‚ β”œβ”€β”€ G2_arrested_phenotype
31
+ β”‚ β”‚ β”‚ β”‚ β”œβ”€β”€ M_phase_arrested_phenotype
32
+ β”‚ β”‚ β”‚ β”‚ └── metaphase_arrested_phenotype
33
+ β”‚ β”‚ β”‚ └── mitotic_process_phenotype (37)
34
+ β”‚ β”‚ └── cell_death_phenotype (1)
35
+ β”‚ └── cellular_component_phenotype (186)
36
+ β”œβ”€β”€ molecular_entity (CHEBI terms)
37
+ β”œβ”€β”€ molecular_function (GO terms)
38
+ └── quality (PATO terms)
39
+ ```
40
+
41
+ ### 2. Research Context-Aware Subgraph Navigation
42
+
43
+ **Key Insight**: Researchers often have specific analytical intentions that determine which CMPO subgraphs are most relevant.
44
+
45
+ **Context Types**:
46
+ - **Process-focused**: Studying cell division, apoptosis, migration β†’ `cell_process_phenotype` subgraph
47
+ - **Component-focused**: Analyzing organelles, structures β†’ `cellular_component_phenotype` subgraph
48
+ - **Multi-intent**: Cell cycle AND mitochondrial analysis β†’ Multiple overlapping subgraphs
49
+ - **Population-level**: Colony behavior, density effects β†’ `cell_population_phenotype` subgraph
50
+
51
+ ### 3. Two-Strategy VLM Mapping Approach
52
+
53
+ #### Strategy 1: Description β†’ CMPO Mapping
54
+ ```
55
+ VLM Analysis: "Cells show metaphase arrest with condensed chromosomes"
56
+ ↓
57
+ Semantic Parsing: Extract ['metaphase', 'arrest', 'chromosomes', 'condensed']
58
+ ↓
59
+ CMPO Mapping: β†’ CMPO:0000XXX "metaphase arrested phenotype"
60
+ ```
61
+
62
+ #### Strategy 2: CMPO-Guided Evidence Detection
63
+ ```
64
+ Research Context: "Studying cell cycle defects"
65
+ ↓
66
+ Subgraph Selection: Focus on cell_cycle_phenotype branch
67
+ ↓
68
+ VLM Query: "Do you see evidence of: metaphase arrest, anaphase defects, etc.?"
69
+ ↓
70
+ Targeted Classification: Direct mapping to specific terms
71
+ ```
72
+
73
+ ## Technical Implementation
74
+
75
+ ### Semantic Mapping Pipeline
76
+
77
+ 1. **Ontology Loading**: Parse full CMPO .obo file with rich semantic relations
78
+ 2. **Multi-Modal Matching**:
79
+ - **Direct matching**: Term names and synonyms
80
+ - **Semantic matching**: Logical definitions and cross-ontology references
81
+ - **Contextual matching**: Hierarchical subgraph relevance
82
+ 3. **Confidence Scoring**: Weighted combination of multiple evidence sources
83
+ 4. **Hierarchy Navigation**: Maintain relationships for downstream analysis
84
+
85
+ ### Rich Ontological Information
86
+
87
+ Each CMPO term contains:
88
+
89
+ ```python
90
+ {
91
+ "CMPO:0001234": {
92
+ "name": "metaphase arrested phenotype",
93
+ "description": "A phenotype in which cells are arrested in metaphase",
94
+ "synonyms": ["metaphase arrest", "M-phase block"],
95
+ "subclass_of": ["cell_cycle_arrested_phenotype", "mitotic_phenotype"],
96
+ "equivalent_to": "has_part(arrested and characteristic_of(mitotic_metaphase))",
97
+ "xrefs": ["GO:0000819"], # Cross-ontology links
98
+ "subset": ["cmpo_core"]
99
+ }
100
+ }
101
+ ```
102
+
103
+ ### Two-Stage Mapping Pipeline
104
+
105
+ ```python
106
+ async def map_to_cmpo_enhanced(description, cmpo_ontology, vlm_interface, context=None):
107
+ # Stage 1: Ontology-Aware Candidate Generation
108
+ candidates = ontology_aware_mapping(description, cmpo_ontology, context)
109
+
110
+ # Stage 2: VLM Biological Reasoning & Pruning
111
+ if len(candidates) > 1:
112
+ validated_mappings = await vlm_biological_validation(description, candidates, vlm_interface)
113
+ return validated_mappings
114
+ else:
115
+ return candidates
116
+
117
+ def ontology_aware_mapping(description, cmpo_ontology, context=None):
118
+ # 1. Enhanced token extraction with exact matching priority
119
+ exact_tokens = extract_exact_biological_matches(description)
120
+ fuzzy_tokens = extract_fuzzy_biological_tokens(description)
121
+
122
+ # 2. Hierarchical scoring
123
+ for term_id, term_data in cmpo_ontology.ontology.items():
124
+ score = 0
125
+
126
+ # Exact token matches (highest weight)
127
+ exact_score = calculate_exact_matches(exact_tokens, term_data) * 1.0
128
+
129
+ # Hierarchical specificity (deeper = more specific = higher score)
130
+ specificity_score = calculate_hierarchy_depth(term_id, cmpo_ontology) * 0.3
131
+
132
+ # Ontological distance (closer = more related = higher score)
133
+ distance_score = calculate_ontological_distance(term_id, context_terms) * 0.2
134
+
135
+ # Fuzzy similarity (lowest weight)
136
+ fuzzy_score = calculate_fuzzy_similarity(fuzzy_tokens, term_data) * 0.1
137
+
138
+ total_score = exact_score + specificity_score + distance_score + fuzzy_score
139
+
140
+ return ranked_candidates
141
+
142
+ async def vlm_biological_validation(description, candidates, vlm_interface):
143
+ validation_prompt = f"""
144
+ Original biological description: "{description}"
145
+
146
+ Candidate CMPO term mappings:
147
+ {format_candidates_for_review(candidates)}
148
+
149
+ Task: Evaluate biological plausibility and ranking of these mappings.
150
+
151
+ Consider:
152
+ - Biological consistency and logical compatibility
153
+ - Temporal/spatial relationships in biological processes
154
+ - Phenotypic co-occurrence patterns
155
+ - Mechanistic plausibility
156
+ - Specificity vs generality trade-offs
157
+
158
+ Provide:
159
+ 1. Biologically valid mappings (with confidence 0-1)
160
+ 2. Brief scientific reasoning for each acceptance/rejection
161
+ 3. Final ranked list
162
+
163
+ Focus on biological accuracy over textual similarity.
164
+ """
165
+
166
+ reasoning_result = await vlm_interface.reason_about_mappings(validation_prompt)
167
+ return parse_and_apply_biological_reasoning(candidates, reasoning_result)
168
+ ```
169
+
170
+ ## Usage Examples
171
+
172
+ ### Basic Mapping
173
+ ```python
174
+ from anton.cmpo import CMPOOntology, map_to_cmpo
175
+
176
+ cmpo = CMPOOntology()
177
+ results = map_to_cmpo("cells arrested in metaphase with condensed chromosomes", cmpo)
178
+
179
+ # Output:
180
+ # [
181
+ # {
182
+ # "CMPO_ID": "CMPO:0001234",
183
+ # "term_name": "metaphase arrested phenotype",
184
+ # "confidence": 0.92,
185
+ # "supporting_evidence": "Direct match: metaphase; Semantic: arrested + mitotic",
186
+ # "hierarchy_path": ["metaphase arrested phenotype", "cell cycle arrested phenotype", "cell cycle phenotype"]
187
+ # }
188
+ # ]
189
+ ```
190
+
191
+ ### Context-Aware Mapping
192
+ ```python
193
+ # Research studying apoptosis
194
+ results = map_to_cmpo("fragmented nuclei with membrane blebbing", cmpo, context="apoptosis")
195
+ # β†’ Higher confidence for apoptotic_cell_phenotype terms
196
+
197
+ # Research studying cell division
198
+ results = map_to_cmpo("abnormal spindle formation", cmpo, context="cell_cycle")
199
+ # β†’ Higher confidence for mitotic_process_phenotype terms
200
+ ```
201
+
202
+ ### Integration with Anton Pipeline
203
+ ```python
204
+ # Within QualitativeAnalyzer
205
+ population_insights = await vlm.analyze_population(image)
206
+ cmpo_mappings = map_to_cmpo(
207
+ description=population_insights['description'],
208
+ cmpo_ontology=self.cmpo_mapper,
209
+ context=self.research_context
210
+ )
211
+ ```
212
+
213
+ ## Validation and Quality Assurance
214
+
215
+ ### Confidence Thresholds
216
+ - **High confidence (>0.8)**: Direct term matches with strong semantic support
217
+ - **Medium confidence (0.5-0.8)**: Semantic matches with contextual support
218
+ - **Low confidence (0.3-0.5)**: Weak matches requiring human review
219
+ - **Below threshold (<0.3)**: Excluded from results
220
+
221
+ ### Evidence Tracking
222
+ Each mapping includes:
223
+ - **Supporting evidence**: Specific text that triggered the match
224
+ - **Mapping type**: Direct, semantic, or contextual
225
+ - **Hierarchy path**: Full taxonomic classification
226
+ - **Cross-references**: Links to related GO/PATO terms
227
+
228
+ ## Future Enhancements
229
+
230
+ ### 1. Machine Learning Integration
231
+ - **Embedding-based similarity**: Use biological language models (BioBERT, etc.)
232
+ - **Context learning**: Train models on researcher annotation patterns
233
+ - **Active learning**: Improve mappings based on user feedback
234
+
235
+ ### 2. Advanced Semantic Reasoning
236
+ - **Logical inference**: Use formal ontology reasoning for complex mappings
237
+ - **Negation handling**: Detect and properly handle negative evidence
238
+ - **Uncertainty quantification**: Bayesian confidence estimates
239
+
240
+ ### 3. Multi-Ontology Integration
241
+ - **Cross-ontology alignment**: Map to GO, PATO, CHEBI simultaneously
242
+ - **Knowledge graph construction**: Build comprehensive phenotype knowledge graphs
243
+ - **Standardized interfaces**: FAIR data principles compliance
244
+
245
+ ### 4. Dynamic Ontology Updates
246
+ - **Version management**: Handle CMPO ontology updates gracefully
247
+ - **Backward compatibility**: Maintain mapping consistency across versions
248
+ - **Community integration**: Contribute mappings back to CMPO community
249
+
250
+ ## Research Applications
251
+
252
+ ### Enabled Use Cases
253
+ 1. **Large-scale phenotype screens**: Standardized classification across thousands of images
254
+ 2. **Cross-study meta-analysis**: Combine results from different research groups
255
+ 3. **Drug discovery**: Map compound effects to standardized phenotype profiles
256
+ 4. **Disease research**: Connect cellular phenotypes to pathological processes
257
+ 5. **Evolutionary studies**: Compare phenotypes across species using common vocabulary
258
+
259
+ ### Scientific Impact
260
+ - **Reproducibility**: Eliminates ambiguity in phenotype descriptions
261
+ - **Discoverability**: Enables semantic search across phenotype databases
262
+ - **Integration**: Connects microscopy data to broader biological knowledge
263
+ - **Collaboration**: Provides common language for interdisciplinary research
264
+
265
+ ---
266
+
267
+ ## Development Notes
268
+
269
+ ### Design Decisions
270
+
271
+ **Why hierarchical subgraph mapping?**
272
+ - CMPO contains >600 terms across diverse biological domains
273
+ - Research context dramatically improves mapping accuracy
274
+ - Enables both broad screening and focused deep analysis
275
+
276
+ **Why two-strategy VLM approach?**
277
+ - Strategy 1 (description→CMPO) handles unexpected discoveries
278
+ - Strategy 2 (CMPO-guided) ensures comprehensive coverage of known phenotypes
279
+ - Combination provides both discovery and validation capabilities
280
+
281
+ **Why rich semantic relations?**
282
+ - Simple keyword matching fails for scientific terminology
283
+ - Logical definitions enable precise semantic matching
284
+ - Cross-ontology links expand vocabulary and validation
285
+
286
+ ### Code Organization
287
+ - `ontology.py`: CMPO data loading, parsing, and management
288
+ - `mapping.py`: Core mapping algorithms and semantic analysis
289
+ - `__init__.py`: Module interface and public API
290
+ - `README.md`: Comprehensive documentation (this file)
291
+
292
+ ### Testing Strategy
293
+ - Unit tests for individual mapping functions
294
+ - Integration tests with full CMPO ontology
295
+ - Validation against expert-annotated datasets
296
+ - Performance benchmarks for large-scale analysis
297
+
298
+ ---
299
+
300
+ *This module represents a significant advancement in automated microscopy phenotype classification, bridging AI-generated insights with rigorous scientific standards.*
anton/cmpo/__init__.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CMPO (Cellular Microscopy Phenotype Ontology) Integration Module for Anton
3
+
4
+ This module provides sophisticated ontology-based phenotype classification for microscopy analysis.
5
+ It bridges the gap between VLM-generated natural language descriptions and standardized
6
+ scientific terminology through hierarchical semantic mapping.
7
+
8
+ Key Components:
9
+ - CMPOOntology: Loads and manages the full CMPO ontology with rich semantic relations
10
+ - map_to_cmpo: Context-aware mapping from descriptions to CMPO terms
11
+ - Hierarchical subgraph navigation for research-context-specific mapping
12
+
13
+ Usage:
14
+ from anton.cmpo import CMPOOntology, map_to_cmpo
15
+
16
+ cmpo = CMPOOntology()
17
+ results = map_to_cmpo("cells arrested in metaphase", cmpo, context="cell_cycle")
18
+ """
19
+
20
+ from .ontology import CMPOOntology
21
+ from .mapping import map_to_cmpo, validate_mappings_with_vlm
22
+
23
+ __all__ = ['CMPOOntology', 'map_to_cmpo', 'validate_mappings_with_vlm']
24
+ __version__ = '1.0.0'
anton/cmpo/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.12 kB). View file
 
anton/cmpo/__pycache__/examples.cpython-313.pyc ADDED
Binary file (12.5 kB). View file
 
anton/cmpo/__pycache__/mapping.cpython-313.pyc ADDED
Binary file (16.5 kB). View file
 
anton/cmpo/__pycache__/ontology.cpython-313.pyc ADDED
Binary file (16.5 kB). View file
 
anton/cmpo/data/cmpo.json ADDED
The diff for this file is too large to render. See raw diff
 
anton/cmpo/examples.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CMPO Mapping Examples and Demonstrations
3
+
4
+ This file demonstrates the key concepts and usage patterns of the CMPO integration module.
5
+ Run with:
6
+ python -m anton.cmpo.examples (from project root)
7
+ OR
8
+ python examples.py (from anton/cmpo/ directory)
9
+ """
10
+
11
+ import sys
12
+ import logging
13
+ from pathlib import Path
14
+
15
+ # Handle both direct execution and module execution
16
+ if __name__ == "__main__" and __package__ is None:
17
+ # Add parent directories to path for direct execution
18
+ current_dir = Path(__file__).parent
19
+ project_root = current_dir.parent.parent
20
+ sys.path.insert(0, str(project_root))
21
+ from anton.cmpo.ontology import CMPOOntology
22
+ from anton.cmpo.mapping import map_to_cmpo
23
+ else:
24
+ # Normal relative imports for module execution
25
+ from .ontology import CMPOOntology
26
+ from .mapping import map_to_cmpo
27
+
28
+ logging.basicConfig(level=logging.INFO)
29
+
30
def demonstrate_basic_mapping():
    """Walk through CMPO mapping for several example VLM descriptions."""
    banner = "=" * 60
    print(banner)
    print("BASIC CMPO MAPPING DEMONSTRATION")
    print(banner)

    # Load the ontology once for all examples.
    cmpo = CMPOOntology()
    print(f"Loaded CMPO ontology with {len(cmpo.ontology)} terms\n")

    # Representative descriptions as they might come out of VLM analysis.
    sample_descriptions = (
        "cells arrested in metaphase with condensed chromosomes",
        "fragmented nuclei with membrane blebbing indicating apoptosis",
        "abnormal spindle formation during cell division",
        "enlarged cell bodies with irregular morphology",
        "normal healthy fibroblast cells with typical morphology",
    )

    for description in sample_descriptions:
        print(f"Description: '{description}'")
        mappings = map_to_cmpo(description, cmpo)

        if not mappings:
            print(" No CMPO mappings found")
        else:
            print(f"Found {len(mappings)} CMPO mappings:")
            # Show at most the top three candidates.
            for rank, hit in enumerate(mappings[:3], 1):
                print(f" {rank}. {hit['CMPO_ID']}: {hit['term_name']}")
                print(f" Confidence: {hit['confidence']:.3f}")
                print(f" Evidence: {hit['supporting_evidence']}")
                if hit.get('hierarchy_path'):
                    print(f" Hierarchy: {' β†’ '.join(hit['hierarchy_path'])}")
                print()
        print("-" * 50)
65
+
66
def demonstrate_context_aware_mapping():
    """Map one description under several research contexts and compare."""
    print("\n" + "=" * 60)
    print("CONTEXT-AWARE MAPPING DEMONSTRATION")
    print("=" * 60)

    cmpo = CMPOOntology()

    # One fixed description, mapped under three different contexts.
    description = "abnormal cell division with chromosome segregation defects"

    scenarios = (
        ("cell_cycle", "Cell cycle research focus"),
        ("morphology", "Morphology research focus"),
        (None, "No specific context"),
    )

    for research_context, label in scenarios:
        print(f"\n{label}:")
        print(f"Description: '{description}'")
        mappings = map_to_cmpo(description, cmpo, context=research_context)

        if not mappings:
            print(" No mappings found")
        else:
            # Show the top two candidates per context.
            for rank, hit in enumerate(mappings[:2], 1):
                print(f" {rank}. {hit['term_name']} (confidence: {hit['confidence']:.3f})")
                print(f" Context boost from: {hit['supporting_evidence']}")
94
+
95
def demonstrate_hierarchical_navigation():
    """Show how CMPO terms relate hierarchically."""
    print("\n" + "=" * 60)
    print("HIERARCHICAL NAVIGATION DEMONSTRATION")
    print("=" * 60)

    cmpo = CMPOOntology()

    # Locate the first term that has at least one parent.
    match = next(
        ((tid, data) for tid, data in cmpo.ontology.items()
         if data.get('parent_terms') and len(data['parent_terms']) > 0),
        None,
    )
    if match is None:
        return

    term_id, term_data = match
    print(f"Term: {term_data['name']} ({term_id})")
    print(f"Description: {term_data.get('description', 'No description')}")

    if term_data.get('synonyms'):
        print(f"Synonyms: {', '.join(term_data['synonyms'])}")

    print(f"Parent terms:")
    for parent_id in term_data['parent_terms']:
        parent = cmpo.get_term(parent_id)
        if parent:
            print(f" β†’ {parent['name']} ({parent_id})")

    if term_data.get('equivalent_to'):
        print(f"Equivalent to: {term_data['equivalent_to']}")
122
+
123
def demonstrate_semantic_analysis():
    """Expose the token-extraction and direct-matching internals on a sample text."""
    print("\n" + "=" * 60)
    print("SEMANTIC ANALYSIS DEMONSTRATION")
    print("=" * 60)

    # Resolve the private helpers whether run as a script or as a package module.
    if __name__ == "__main__" and __package__ is None:
        from anton.cmpo.mapping import _extract_biological_tokens, _find_direct_matches
    else:
        from .mapping import _extract_biological_tokens, _find_direct_matches

    ontology = CMPOOntology()

    description = "apoptotic cells with fragmented nuclei and chromatin condensation"
    print(f"Analyzing: '{description}'")

    # Token extraction step.
    extracted = _extract_biological_tokens(description)
    print(f"Biological tokens: {sorted(extracted)}")

    # Direct-match step (top three candidates).
    hits = _find_direct_matches(description.lower(), ontology)
    if hits:
        print("\nDirect matches found:")
        for tid, score, evidence in hits[:3]:
            term = ontology.get_term(tid)
            if term:
                print(f" {term['name']}: {score:.3f} (matched: {evidence})")
152
+
153
def demonstrate_integration_patterns():
    """Map simulated VLM stage outputs to CMPO terms, as the pipeline would."""
    print("\n" + "=" * 60)
    print("INTEGRATION PATTERNS DEMONSTRATION")
    print("=" * 60)

    # Canned VLM output from different pipeline stages.
    stage_outputs = {
        "stage_1_global": "Dense population of adherent cells with fibroblast morphology",
        "stage_3_features": "Individual cells show elongated spindle shape with prominent stress fibers",
        "stage_4_population": "Population exhibits normal growth patterns with typical cell-cell contacts",
    }

    ontology = CMPOOntology()

    print("Simulating Anton pipeline integration:")
    for stage_name, vlm_text in stage_outputs.items():
        print(f"\n{stage_name.replace('_', ' ').title()}:")
        print(f"VLM Output: {vlm_text}")

        hits = map_to_cmpo(vlm_text, ontology)
        if not hits:
            print("No CMPO mappings found")
            continue
        top = hits[0]
        print(f"Best CMPO Match: {top['term_name']}")
        print(f"Confidence: {top['confidence']:.3f}")
181
+
182
def demonstrate_multi_stage_cmpo():
    """Run CMPO mapping per pipeline stage, then summarize cross-stage term hits."""
    print("\n" + "=" * 60)
    print("MULTI-STAGE CMPO INTEGRATION DEMONSTRATION")
    print("=" * 60)

    ontology = CMPOOntology()

    # Representative observation per stage, plus the context hint that stage
    # would pass to the mapper.
    stage_data = {
        "Stage 1 - Global Context": {
            "description": "Dense cell population with mitotic figures visible throughout",
            "context": "cell_population"
        },
        "Stage 3 - Individual Cells": {
            "description": "Cell arrested in metaphase with condensed chromosomes",
            "context": "cellular_phenotype"
        },
        "Stage 4 - Population Insights": {
            "description": "20% of population shows apoptotic markers with fragmented nuclei",
            "context": "cell_population"
        }
    }

    seen_terms = {}

    print("πŸ”¬ Multi-Stage CMPO Analysis:")
    for stage_name, payload in stage_data.items():
        print(f"\n{stage_name}:")
        print(f"Description: '{payload['description']}'")
        print(f"Context: {payload['context']}")

        hits = map_to_cmpo(payload['description'], ontology, context=payload['context'])

        if not hits:
            print(" No CMPO mappings found")
            continue

        print(f"Found {len(hits)} CMPO mappings:")
        for rank, hit in enumerate(hits[:2], start=1):
            print(f" {rank}. {hit['term_name']} (confidence: {hit['confidence']:.3f})")

            # Accumulate per-term stats so recurring terms can be reported below.
            tid = hit['CMPO_ID']
            entry = seen_terms.setdefault(tid, {
                'term': hit['term_name'],
                'stages': [],
                'max_confidence': 0
            })
            entry['stages'].append(stage_name.split(' - ')[0])
            entry['max_confidence'] = max(entry['max_confidence'], hit['confidence'])

    # Cross-stage analysis: terms seen in more than one stage.
    print("\nπŸ” Cross-Stage CMPO Analysis:")
    recurring = {tid: info for tid, info in seen_terms.items() if len(info['stages']) > 1}

    if recurring:
        print("Terms detected across multiple stages:")
        for tid, info in recurring.items():
            print(f" β€’ {info['term']} - detected in: {', '.join(info['stages'])}")
            print(f" Max confidence: {info['max_confidence']:.3f}")
    else:
        print("No terms detected across multiple stages (expected - different biological levels)")

    print(f"\nTotal unique CMPO terms identified: {len(seen_terms)}")
    print("βœ… Multi-stage integration provides comprehensive phenotype classification!")
252
+
253
def main():
    """Run every demonstration in sequence, reporting failures gracefully."""
    print("CMPO Module Demonstration Suite")
    print("This script demonstrates the key capabilities of Anton's CMPO integration")

    # Ordered list of demos; the multi-stage demo is the newest addition.
    demos = (
        demonstrate_basic_mapping,
        demonstrate_context_aware_mapping,
        demonstrate_hierarchical_navigation,
        demonstrate_semantic_analysis,
        demonstrate_integration_patterns,
        demonstrate_multi_stage_cmpo,
    )

    try:
        for demo in demos:
            demo()

        print("\n" + "=" * 60)
        print("DEMONSTRATION COMPLETE")
        print("=" * 60)
        print("For more information, see anton/cmpo/README.md")
        print("✨ NEW: Multi-stage CMPO integration across all pipeline stages!")

    except Exception as e:
        print(f"Error during demonstration: {e}")
        print("Ensure CMPO ontology is properly loaded")
275
+
276
# Allow running this module directly as a demo script.
if __name__ == "__main__":
    main()
anton/cmpo/mapping.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Free-form to CMPO mapping for Anton's pipeline."""

import logging
import re
from difflib import SequenceMatcher
from typing import Dict, List, Set, Tuple
6
+
7
def map_to_cmpo(description: str, cmpo_ontology, context: str = None) -> List[Dict]:
    """Convert a free-form description to CMPO terms using semantic mapping.

    Combines three evidence sources - direct name/synonym matches, semantic
    component overlap, and (optionally) research-context subgraph boosts -
    into a single additive confidence score per term.

    Args:
        description: Free-text phenotype description (e.g. VLM output).
        cmpo_ontology: Object exposing ``.ontology`` (id -> term dict) and
            ``.get_term(term_id)``.
        context: Optional research-context keyword (e.g. ``"cell_cycle"``)
            enabling subgraph-based boosting.

    Returns:
        Up to five mapping dicts sorted by descending ``confidence``, each
        containing ``CMPO_ID``, ``term_name``, ``confidence``,
        ``supporting_evidence``, ``description`` and ``hierarchy_path``.
        Empty list for empty input or missing ontology.
    """
    if not description or not cmpo_ontology:
        return []

    description_lower = description.lower()
    mappings = []

    # 1. Direct name/synonym matching
    direct_matches = _find_direct_matches(description_lower, cmpo_ontology)

    # 2. Semantic component matching
    semantic_matches = _find_semantic_matches(description_lower, cmpo_ontology)

    # 3. Hierarchical context matching (if context provided)
    context_matches = _find_context_matches(description_lower, cmpo_ontology, context) if context else []

    # Combine and score all matches; scores from the three sources are
    # accumulated additively per term id.
    all_matches = {}

    # Weight direct matches highest (preserve enhanced scoring differences)
    for term_id, confidence, evidence in direct_matches:
        if term_id not in all_matches:
            all_matches[term_id] = {'confidence': 0, 'evidence': []}
        all_matches[term_id]['confidence'] += confidence  # full weight - don't flatten with a multiplier
        all_matches[term_id]['evidence'].append(f"Direct match: {evidence}")

    # Weight semantic matches moderately
    for term_id, confidence, evidence in semantic_matches:
        if term_id not in all_matches:
            all_matches[term_id] = {'confidence': 0, 'evidence': []}
        all_matches[term_id]['confidence'] += confidence * 0.3  # Lower weight for semantic
        all_matches[term_id]['evidence'].append(f"Semantic: {evidence}")

    # Weight context matches lower but still valuable
    for term_id, confidence, evidence in context_matches:
        if term_id not in all_matches:
            all_matches[term_id] = {'confidence': 0, 'evidence': []}
        all_matches[term_id]['confidence'] += confidence * 0.2  # Lower weight for context
        all_matches[term_id]['evidence'].append(f"Context: {evidence}")

    # Convert to final format; terms unknown to get_term() are dropped.
    for term_id, match_data in all_matches.items():
        term_info = cmpo_ontology.get_term(term_id)
        if term_info:
            mappings.append({
                "CMPO_ID": term_id,
                "term_name": term_info['name'],
                "confidence": match_data['confidence'],  # Preserve full confidence for sorting
                "supporting_evidence": "; ".join(match_data['evidence'][:3]),
                "description": term_info.get('description', ''),
                "hierarchy_path": _get_hierarchy_path(term_id, cmpo_ontology)
            })

    # Sort by confidence and return top matches
    mappings.sort(key=lambda x: x['confidence'], reverse=True)
    return mappings[:5]
64
+
65
def _find_direct_matches(description: str, cmpo_ontology) -> List[Tuple[str, float, str]]:
    """Find direct matches with ontology-aware scoring.

    Scores every ontology term against *description* via exact token overlap
    (weight 2.0), term-name substring containment (1.5), synonym token
    overlap (1.8) or synonym substring containment (1.2), then adds a
    hierarchy-depth specificity bonus and a multi-token bonus. Scores are
    capped at 5.0.

    Returns:
        List of ``(term_id, score, evidence)`` tuples, one per term with a
        nonzero base score, in ontology iteration order (unsorted).
    """
    matches = []
    description_tokens = set(_extract_biological_tokens(description))

    for term_id, term_data in cmpo_ontology.ontology.items():
        base_score = 0.0
        matched_evidence = []

        # 1. Exact token matches (highest priority)
        term_tokens = set(_extract_biological_tokens(term_data.get('name', '')))
        exact_matches = description_tokens.intersection(term_tokens)
        if exact_matches:
            # Fraction of the term's own tokens covered, doubled.
            exact_score = len(exact_matches) / max(len(term_tokens), 1) * 2.0
            base_score += exact_score
            # NOTE(review): sets have no stable order, so the evidence string
            # built from these tokens may vary between runs - confirm if
            # deterministic output matters.
            matched_evidence.extend(exact_matches)

        # 2. Check term name substring matches
        term_name = term_data.get('name', '').lower()
        if term_name and term_name in description:
            substring_score = len(term_name) / len(description) * 1.5
            base_score += substring_score
            matched_evidence.append(f"name:{term_name}")

        # 3. Check synonyms with exact token priority
        for synonym in term_data.get('synonyms', []):
            synonym_tokens = set(_extract_biological_tokens(synonym))
            syn_exact_matches = description_tokens.intersection(synonym_tokens)
            if syn_exact_matches:
                syn_score = len(syn_exact_matches) / max(len(synonym_tokens), 1) * 1.8
                base_score += syn_score
                matched_evidence.extend(syn_exact_matches)
            elif synonym.lower() in description:
                substring_score = len(synonym) / len(description) * 1.2
                base_score += substring_score
                matched_evidence.append(f"synonym:{synonym}")

        # 4. Ontology-aware bonuses (only for terms that matched at all)
        if base_score > 0:
            # Specificity bonus (deeper in hierarchy = more specific = higher score)
            specificity_bonus = _calculate_specificity_bonus(term_id, cmpo_ontology)

            # Multi-token exact match bonus - note this uses the *name* token
            # matches from step 1 only, not synonym token matches.
            multi_token_bonus = 0.0
            if len(exact_matches) > 1:
                multi_token_bonus = len(exact_matches) * 0.5  # Strong bonus for multiple exact matches

            # Apply ontology bonuses
            final_score = base_score + specificity_bonus + multi_token_bonus

            matches.append((term_id, min(final_score, 5.0), f"exact:{','.join(matched_evidence[:3])}"))

    return matches
119
+
120
def _find_semantic_matches(description: str, cmpo_ontology) -> List[Tuple[str, float, str]]:
    """Match the description against equivalent_to axioms and term definitions.

    Returns (term_id, score, evidence) tuples for axiom token overlap above
    0.3 and textual-definition similarity above 0.4.
    """
    results = []
    tokens = _extract_biological_tokens(description)

    for tid, data in cmpo_ontology.ontology.items():
        # Logical definitions (equivalent_to) carry the term's semantic parts.
        for axiom in data.get('equivalent_to', []):
            overlap = _score_semantic_overlap(tokens, axiom)
            if overlap > 0.3:
                results.append((tid, overlap, f"Semantic components in {axiom}"))

        # Fuzzy similarity against the term's textual definition.
        definition = data.get('description', '').lower()
        if definition:
            similarity = _calculate_text_similarity(description, definition)
            if similarity > 0.4:
                results.append((tid, similarity, "Description similarity"))

    return results
142
+
143
def _find_context_matches(description: str, cmpo_ontology, context: str) -> List[Tuple[str, float, str]]:
    """Boost terms living in ontology subgraphs relevant to *context*.

    Each term inside a relevant subgraph scores 0.5, plus 0.3 when its name
    shares a whitespace-delimited word with the description.
    """
    # Research-context keyword -> subgraph name fragments to prioritise.
    context_subgraphs = {
        'cell_cycle': ['cell_cycle_phenotype', 'mitotic_process_phenotype'],
        'apoptosis': ['cell_death_phenotype', 'apoptotic'],
        'morphology': ['cellular_component_phenotype', 'abnormal_cell_morphology'],
        'process': ['cell_process_phenotype', 'biological_process']
    }

    context_lower = context.lower() if context else ""
    relevant = [sg for key, subgraphs in context_subgraphs.items()
                if key in context_lower
                for sg in subgraphs]

    results = []
    for tid, data in cmpo_ontology.ontology.items():
        for subgraph in relevant:
            if not _term_in_subgraph(tid, subgraph, cmpo_ontology):
                continue
            score = 0.5
            # Extra boost when the term name shares a word with the description.
            name = data.get('name', '').lower()
            if any(word in name for word in description.split()):
                score += 0.3
            results.append((tid, score, f"Context subgraph: {subgraph}"))

    return results
174
+
175
+ def _extract_biological_tokens(text: str) -> Set[str]:
176
+ """Extract biologically relevant tokens from text."""
177
+ # Common biological stop words to exclude
178
+ bio_stop_words = {'cell', 'cells', 'cellular', 'the', 'and', 'or', 'with', 'in', 'of'}
179
+
180
+ # Extract tokens
181
+ tokens = set(re.findall(r'\b\w+\b', text.lower()))
182
+
183
+ # Filter for biological relevance (length > 3, not stop words)
184
+ bio_tokens = {token for token in tokens
185
+ if len(token) > 3 and token not in bio_stop_words}
186
+
187
+ return bio_tokens
188
+
189
def _score_semantic_overlap(desc_tokens: Set[str], equivalent_to: str) -> float:
    """Return the fraction of the axiom's biological tokens found in the description."""
    axiom_tokens = _extract_biological_tokens(equivalent_to)
    if not axiom_tokens:
        return 0.0
    shared = desc_tokens & axiom_tokens
    return len(shared) / max(len(axiom_tokens), 1)
198
+
199
+ def _calculate_text_similarity(text1: str, text2: str) -> float:
200
+ """Calculate text similarity using sequence matching."""
201
+ return SequenceMatcher(None, text1, text2).ratio()
202
+
203
+ def _term_in_subgraph(term_id: str, subgraph_name: str, cmpo_ontology) -> bool:
204
+ """Check if a term belongs to a specific subgraph via hierarchy."""
205
+ term_data = cmpo_ontology.get_term(term_id)
206
+ if not term_data:
207
+ return False
208
+
209
+ # Check if term name contains subgraph keyword
210
+ term_name = term_data.get('name', '').lower()
211
+ if subgraph_name.lower() in term_name:
212
+ return True
213
+
214
+ # Check parent terms recursively (simple implementation)
215
+ for parent in term_data.get('parent_terms', []):
216
+ parent_data = cmpo_ontology.get_term(parent)
217
+ if parent_data and subgraph_name.lower() in parent_data.get('name', '').lower():
218
+ return True
219
+
220
+ return False
221
+
222
+ def _get_hierarchy_path(term_id: str, cmpo_ontology) -> List[str]:
223
+ """Get the hierarchical path for a term."""
224
+ path = []
225
+ current_term = cmpo_ontology.get_term(term_id)
226
+
227
+ if current_term:
228
+ path.append(current_term.get('name', term_id))
229
+
230
+ # Add immediate parents (simplified - could be recursive)
231
+ for parent_id in current_term.get('parent_terms', [])[:2]: # Limit to 2 parents
232
+ parent_term = cmpo_ontology.get_term(parent_id)
233
+ if parent_term:
234
+ path.append(parent_term.get('name', parent_id))
235
+
236
+ return path
237
+
238
+ def _calculate_specificity_bonus(term_id: str, cmpo_ontology) -> float:
239
+ """Calculate specificity bonus based on hierarchy depth."""
240
+ try:
241
+ depth = _calculate_hierarchy_depth(term_id, cmpo_ontology)
242
+ # Deeper terms are more specific, get higher bonus
243
+ # Max bonus of 0.5 for terms at depth 4+
244
+ return min(depth * 0.1, 0.5)
245
+ except:
246
+ return 0.0
247
+
248
+ def _calculate_hierarchy_depth(term_id: str, cmpo_ontology, visited=None) -> int:
249
+ """Calculate depth of term in CMPO hierarchy."""
250
+ if visited is None:
251
+ visited = set()
252
+
253
+ if term_id in visited: # Avoid cycles
254
+ return 0
255
+
256
+ visited.add(term_id)
257
+ term_data = cmpo_ontology.get_term(term_id)
258
+
259
+ if not term_data or not term_data.get('parent_terms'):
260
+ return 1 # Root level
261
+
262
+ # Find maximum depth among parents
263
+ max_parent_depth = 0
264
+ for parent_id in term_data.get('parent_terms', []):
265
+ parent_depth = _calculate_hierarchy_depth(parent_id, cmpo_ontology, visited.copy())
266
+ max_parent_depth = max(max_parent_depth, parent_depth)
267
+
268
+ return max_parent_depth + 1
269
+
270
def _detect_mutual_exclusion(term1_id: str, term2_id: str, cmpo_ontology) -> bool:
    """Heuristically flag two terms as mutually exclusive.

    Sibling terms (sharing an immediate parent) that are both reasonably
    specific (hierarchy depth > 2) are treated as mutually exclusive; all
    other pairs, including unknown terms, are not.
    """
    first = cmpo_ontology.get_term(term1_id)
    second = cmpo_ontology.get_term(term2_id)

    if not first or not second:
        return False

    # Sibling terms often partition a parent concept.
    shared = set(first.get('parent_terms', [])) & set(second.get('parent_terms', []))

    if shared:
        first_depth = _calculate_hierarchy_depth(term1_id, cmpo_ontology)
        second_depth = _calculate_hierarchy_depth(term2_id, cmpo_ontology)

        # Heuristic: both deep enough to be concrete phenotypes, not categories.
        if first_depth > 2 and second_depth > 2:
            return True

    return False
294
+
295
# Add VLM validation function for the two-stage pipeline
async def validate_mappings_with_vlm(description: str, candidate_mappings: List[Dict], vlm_interface, max_candidates: int = 5) -> List[Dict]:
    """Stage 2: VLM biological reasoning and pruning.

    Sends the top candidate mappings back to the VLM for a plausibility
    review and re-ranking. Falls back to the unmodified candidates when the
    VLM call fails, or when there is at most one candidate (nothing to rank).

    Args:
        description: Original free-text biological description.
        candidate_mappings: Mapping dicts as produced by ``map_to_cmpo``.
        vlm_interface: Object exposing ``analyze_biological_reasoning(prompt)``
            as an awaitable. Assumed to return a string response - TODO confirm.
        max_candidates: How many of the top candidates to submit for review.
    """
    if len(candidate_mappings) <= 1:
        return candidate_mappings

    # Format candidates for VLM review
    candidates_text = "\n".join([
        f"{i+1}. {mapping['term_name']} (CMPO:{mapping['CMPO_ID']}) - Confidence: {mapping['confidence']:.3f}"
        for i, mapping in enumerate(candidate_mappings[:max_candidates])
    ])

    validation_prompt = f"""Original biological description: "{description}"

Candidate CMPO term mappings:
{candidates_text}

Task: Evaluate biological plausibility and ranking of these mappings.

Consider:
- Biological consistency and logical compatibility
- Temporal/spatial relationships in biological processes
- Phenotypic co-occurrence patterns
- Mechanistic plausibility
- Specificity vs generality trade-offs

Provide:
1. Biologically valid mappings with updated confidence (0-1)
2. Brief scientific reasoning for each acceptance/rejection
3. Final ranked list

Focus on biological accuracy over textual similarity.

Format your response as:
VALID: [term_name] - confidence: [0-1] - reasoning: [brief explanation]
INVALID: [term_name] - reasoning: [brief explanation]
"""

    try:
        # This would be implemented as part of VLM interface
        reasoning_result = await vlm_interface.analyze_biological_reasoning(validation_prompt)

        # Parse VLM response and update mappings
        validated_mappings = _parse_vlm_validation_response(reasoning_result, candidate_mappings)

        return validated_mappings

    except Exception as e:
        # Fallback to original mappings if VLM validation fails.
        # NOTE(review): this requires `import logging` at module top; verify
        # it is present, otherwise this handler itself raises NameError.
        logging.warning(f"VLM validation failed: {e}, using original mappings")
        return candidate_mappings
346
+
347
+ def _parse_vlm_validation_response(vlm_response: str, original_mappings: List[Dict]) -> List[Dict]:
348
+ """Parse VLM validation response and update mapping confidences."""
349
+ validated = []
350
+
351
+ # Simple parsing - in production would be more robust
352
+ for line in vlm_response.split('\n'):
353
+ if line.startswith('VALID:'):
354
+ # Extract confidence and reasoning
355
+ parts = line.split(' - ')
356
+ if len(parts) >= 3:
357
+ term_name = parts[0].replace('VALID: ', '').strip()
358
+ confidence_str = parts[1].replace('confidence: ', '').strip()
359
+ reasoning = parts[2].replace('reasoning: ', '').strip()
360
+
361
+ # Find corresponding original mapping
362
+ for mapping in original_mappings:
363
+ if mapping['term_name'].lower() == term_name.lower():
364
+ updated_mapping = mapping.copy()
365
+ try:
366
+ updated_mapping['confidence'] = float(confidence_str)
367
+ updated_mapping['vlm_reasoning'] = reasoning
368
+ validated.append(updated_mapping)
369
+ except ValueError:
370
+ validated.append(mapping) # Keep original if parsing fails
371
+ break
372
+
373
+ # Sort by updated confidence
374
+ validated.sort(key=lambda x: x['confidence'], reverse=True)
375
+ return validated
anton/cmpo/ontology.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Manage CMPO ontology data and provide lookup functionality."""
2
+
3
+ import json
4
+ import requests
5
+ import pickle
6
+ from pathlib import Path
7
+ from typing import Dict, List, Optional
8
+ import logging
9
+
10
class CMPOOntology:
    """Manage CMPO ontology data and provide lookup functionality.

    Loads the ontology from a local JSON file, or downloads it on first use
    from one of several sources (OBO file, OLS REST API, OWL file), falling
    back to a small hardcoded ontology. Builds in-memory indices for fast
    term, feature and keyword lookup.
    """

    def __init__(self, data_path="data/cmpo.json", cache_path="data/cmpo_cache.pkl"):
        # cache_path is stored but never read or written in this class -
        # presumably reserved for future pickle-based caching; confirm.
        self.data_path = Path(data_path)
        self.cache_path = Path(cache_path)
        self.ontology = {}       # term_id -> term record dict
        self.term_index = {}     # For fast lookup (id and lowercase name -> record)
        self.feature_index = {}  # Map features to terms (feature -> [term_id])
        self.keyword_index = {}  # Map keywords to terms (keyword -> [term_id])

        self._load_ontology()

    def _load_ontology(self):
        """Load CMPO ontology from JSON file or download if needed"""
        if self.data_path.exists():
            logging.info(f"Loading CMPO ontology from {self.data_path}")
            with open(self.data_path, 'r') as f:
                self.ontology = json.load(f)
        else:
            logging.info("CMPO ontology not found, downloading...")
            self._download_and_process_cmpo()

        # Indices must be rebuilt whenever self.ontology changes.
        self._build_indices()

    def _download_and_process_cmpo(self):
        """Download CMPO from official repository and convert to JSON.

        Tries sources in order of decreasing richness: OBO file, OLS REST
        API, OWL file, and finally a minimal hardcoded ontology.
        """
        try:
            # Option 1: Parse OBO file directly (preferred for rich semantic info)
            self._download_and_parse_obo()
        except Exception as e:
            logging.warning(f"Failed to download OBO: {e}")
            try:
                # Option 2: Use OLS API (Ontology Lookup Service)
                self._download_from_ols()
            except Exception as e2:
                logging.warning(f"Failed to download from OLS: {e2}")
                try:
                    # Option 3: Parse OWL file directly
                    self._download_owl_file()
                except Exception as e3:
                    logging.error(f"Failed to download OWL: {e3}")
                    # Option 4: Use minimal hardcoded ontology
                    self._create_minimal_ontology()

    def _download_and_parse_obo(self):
        """Download and parse CMPO OBO file for rich semantic information"""
        obo_url = "https://raw.githubusercontent.com/EBISPOT/CMPO/master/cmpo.obo"

        logging.info(f"Downloading CMPO OBO file from {obo_url}")
        # NOTE(review): no timeout on this request - a stalled connection
        # blocks construction indefinitely; consider adding one.
        response = requests.get(obo_url)
        response.raise_for_status()

        # Parse OBO content
        ontology_data = self._parse_obo_content(response.text)

        # Save processed data so subsequent runs load from disk.
        self.data_path.parent.mkdir(parents=True, exist_ok=True)
        with open(self.data_path, 'w') as f:
            json.dump(ontology_data, f, indent=2)

        self.ontology = ontology_data
        logging.info(f"Successfully loaded {len(ontology_data)} CMPO terms")

    def _parse_obo_content(self, obo_text: str) -> Dict:
        """Parse OBO format text into structured data.

        Recognizes [Term] stanzas and the id/name/def/synonym/is_a/
        equivalent_to/subset/xref tags; all other lines are ignored.
        """
        ontology_data = {}
        current_term = None
        current_term_id = None

        for line in obo_text.split('\n'):
            line = line.strip()

            if line == '[Term]':
                # Save previous term if exists
                if current_term and current_term_id:
                    ontology_data[current_term_id] = current_term
                # Start new term with every field pre-initialized.
                current_term = {
                    'name': '',
                    'description': '',
                    'synonyms': [],
                    'features': [],
                    'parent_terms': [],
                    'subclass_of': [],
                    'equivalent_to': [],
                    'subset': [],
                    'xrefs': [],
                    'iri': ''
                }
                current_term_id = None

            elif line.startswith('id:') and current_term is not None:
                current_term_id = line.split(':', 1)[1].strip()
                current_term['iri'] = f"http://purl.obolibrary.org/obo/{current_term_id.replace(':', '_')}"

            elif line.startswith('name:') and current_term is not None:
                current_term['name'] = line.split(':', 1)[1].strip()

            elif line.startswith('def:') and current_term is not None:
                # Extract definition (remove quotes and trailing references)
                def_text = line.split(':', 1)[1].strip()
                if def_text.startswith('"') and '" [' in def_text:
                    current_term['description'] = def_text.split('" [')[0][1:]
                else:
                    current_term['description'] = def_text

            elif line.startswith('synonym:') and current_term is not None:
                # Extract synonym text (format: synonym: "text" EXACT [])
                syn_text = line.split(':', 1)[1].strip()
                if syn_text.startswith('"'):
                    synonym = syn_text.split('"')[1]
                    current_term['synonyms'].append(synonym)

            elif line.startswith('is_a:') and current_term is not None:
                # Extract parent term ID; drop the trailing "! label" comment.
                parent = line.split(':', 1)[1].strip().split('!')[0].strip()
                current_term['parent_terms'].append(parent)
                current_term['subclass_of'].append(parent)

            elif line.startswith('equivalent_to:') and current_term is not None:
                equiv = line.split(':', 1)[1].strip()
                current_term['equivalent_to'].append(equiv)

            elif line.startswith('subset:') and current_term is not None:
                subset = line.split(':', 1)[1].strip()
                current_term['subset'].append(subset)

            elif line.startswith('xref:') and current_term is not None:
                xref = line.split(':', 1)[1].strip()
                current_term['xrefs'].append(xref)

        # Don't forget the last term
        if current_term and current_term_id:
            ontology_data[current_term_id] = current_term

        return ontology_data

    def _download_from_ols(self):
        """Download CMPO terms using OLS REST API.

        NOTE(review): this path calls self._extract_features_from_term and
        self._extract_parents, neither of which is defined in this class -
        it would raise AttributeError if exercised; confirm intended helpers.
        """
        base_url = "https://www.ebi.ac.uk/ols/api/ontologies/cmpo/terms"
        ontology_data = {}

        # Get all terms, paging through the API 500 at a time.
        page = 0
        while True:
            response = requests.get(f"{base_url}?page={page}&size=500")
            response.raise_for_status()
            data = response.json()

            if '_embedded' not in data or 'terms' not in data['_embedded']:
                break

            for term in data['_embedded']['terms']:
                term_id = term['obo_id'] if 'obo_id' in term else term['iri'].split('/')[-1]

                ontology_data[term_id] = {
                    'name': term.get('label', ''),
                    'description': term.get('description', [''])[0] if term.get('description') else '',
                    'synonyms': term.get('synonyms', []),
                    'features': self._extract_features_from_term(term),
                    'parent_terms': self._extract_parents(term),
                    'iri': term.get('iri', '')
                }

            # Check if there are more pages
            if data['page']['number'] >= data['page']['totalPages'] - 1:
                break
            page += 1

        # Save to file
        self.data_path.parent.mkdir(parents=True, exist_ok=True)
        with open(self.data_path, 'w') as f:
            json.dump(ontology_data, f, indent=2)

        self.ontology = ontology_data

    def _download_owl_file(self):
        """Download and parse OWL file directly.

        Requires the optional owlready2 package; falls back to the minimal
        hardcoded ontology when it is not installed.
        NOTE(review): calls self._extract_owl_features, which is not defined
        in this class - would raise AttributeError if exercised; confirm.
        """
        try:
            import owlready2

            # Download CMPO OWL file
            owl_url = "https://raw.githubusercontent.com/EBISPOT/CMPO/master/cmpo.owl"
            response = requests.get(owl_url)
            response.raise_for_status()

            # Save temporarily (owlready2 loads from a file path)
            temp_owl = "temp_cmpo.owl"
            with open(temp_owl, 'wb') as f:
                f.write(response.content)

            # Parse with owlready2
            onto = owlready2.get_ontology(f"file://{Path(temp_owl).absolute()}").load()

            ontology_data = {}
            for cls in onto.classes():
                if hasattr(cls, 'label') and cls.label:
                    term_id = cls.name
                    ontology_data[term_id] = {
                        'name': cls.label[0] if cls.label else cls.name,
                        'description': cls.comment[0] if hasattr(cls, 'comment') and cls.comment else '',
                        'synonyms': list(cls.hasExactSynonym) if hasattr(cls, 'hasExactSynonym') else [],
                        'features': self._extract_owl_features(cls),
                        'parent_terms': [p.name for p in cls.is_a if hasattr(p, 'name')],
                        'iri': str(cls.iri)
                    }

            # Clean up the temporary download
            Path(temp_owl).unlink()

            # Save processed data
            self.data_path.parent.mkdir(parents=True, exist_ok=True)
            with open(self.data_path, 'w') as f:
                json.dump(ontology_data, f, indent=2)

            self.ontology = ontology_data

        except ImportError:
            logging.error("owlready2 not installed. Install with: pip install owlready2")
            self._create_minimal_ontology()

    def _create_minimal_ontology(self):
        """Create minimal hardcoded CMPO ontology as a last-resort fallback."""
        minimal_ontology = {
            "CMPO_0000094": {
                "name": "apoptotic cell phenotype",
                "description": "A cellular phenotype observed in cells undergoing apoptosis",
                "features": ["apoptosis_markers", "nuclear_fragmentation", "chromatin_condensation", "membrane_blebbing"],
                "synonyms": ["apoptosis", "programmed cell death"],
                "parent_terms": ["CMPO_0000000"],
                "keywords": ["apoptotic", "apoptosis", "fragmented", "condensed", "blebbing", "dying"]
            },
            "CMPO_0000140": {
                "name": "mitotic cell phenotype",
                "description": "A cellular phenotype observed in cells undergoing mitosis",
                "features": ["mitotic_figures", "chromatin_condensation", "spindle_formation"],
                "synonyms": ["mitosis", "cell division"],
                "parent_terms": ["CMPO_0000000"],
                "keywords": ["mitotic", "mitosis", "dividing", "metaphase", "anaphase", "prophase"]
            },
            "CMPO_0000077": {
                "name": "abnormal cell morphology phenotype",
                "description": "A phenotype related to abnormal cellular shape or structure",
                "features": ["abnormal_morphology", "nuclear_size", "cell_shape"],
                "synonyms": ["morphological abnormality"],
                "parent_terms": ["CMPO_0000000"],
                "keywords": ["abnormal", "irregular", "deformed", "enlarged", "shrunken"]
            },
            "CMPO_0000098": {
                "name": "autophagic cell phenotype",
                "description": "A cellular phenotype related to autophagy",
                "features": ["lc3_puncta", "autophagosome_formation", "cytoplasmic_vacuoles"],
                "synonyms": ["autophagy"],
                "parent_terms": ["CMPO_0000000"],
                "keywords": ["autophagic", "autophagy", "lc3", "puncta", "vacuoles"]
            }
        }

        # Save minimal ontology so the next run loads it from disk.
        self.data_path.parent.mkdir(parents=True, exist_ok=True)
        with open(self.data_path, 'w') as f:
            json.dump(minimal_ontology, f, indent=2)

        self.ontology = minimal_ontology

    def _build_indices(self):
        """Build lookup indices for fast searching.

        Rebuilds term_index (id/name -> record), feature_index
        (feature -> term ids) and keyword_index (name/synonym/feature
        strings, lowercased, -> term ids) from self.ontology.
        """
        self.term_index = {}
        self.feature_index = {}
        self.keyword_index = {}

        for term_id, term_data in self.ontology.items():
            # Index by term ID and name
            self.term_index[term_id] = term_data
            self.term_index[term_data['name'].lower()] = term_data

            # Index by features
            for feature in term_data.get('features', []):
                if feature not in self.feature_index:
                    self.feature_index[feature] = []
                self.feature_index[feature].append(term_id)

            # Index by keywords (name, synonyms, features)
            keywords = [term_data['name']]
            keywords.extend(term_data.get('synonyms', []))
            keywords.extend(term_data.get('features', []))

            for keyword in keywords:
                keyword_lower = keyword.lower()
                if keyword_lower not in self.keyword_index:
                    self.keyword_index[keyword_lower] = []
                self.keyword_index[keyword_lower].append(term_id)

    def get_term(self, term_id: str) -> Optional[Dict]:
        """Get CMPO term by ID"""
        return self.ontology.get(term_id)

    def search_by_keyword(self, keyword: str) -> List[str]:
        """Search for CMPO terms by keyword.

        Combines exact keyword-index hits with bidirectional substring
        matches; result order is unspecified (set-based).
        """
        keyword_lower = keyword.lower()
        results = set()

        # Exact match
        if keyword_lower in self.keyword_index:
            results.update(self.keyword_index[keyword_lower])

        # Partial match
        for indexed_keyword, term_ids in self.keyword_index.items():
            if keyword_lower in indexed_keyword or indexed_keyword in keyword_lower:
                results.update(term_ids)

        return list(results)

    def get_terms_by_feature(self, feature: str) -> List[str]:
        """Get CMPO terms that have a specific feature"""
        return self.feature_index.get(feature, [])
anton/core/__pycache__/pipeline.cpython-313.pyc ADDED
Binary file (9.8 kB). View file
 
anton/core/config.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration management for Anton's analysis pipeline."""
2
+
3
import copy
import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
class Config:
    """Configuration management for Anton's analysis pipeline.

    Holds a nested dict of settings seeded from DEFAULT_CONFIG and
    optionally overridden by a JSON file. Values are addressed with
    dot-notation keys, e.g. ``get("vlm.model")``.
    """

    DEFAULT_CONFIG = {
        "channels": [0],
        "neighborhood_size": [100, 100],
        "vlm": {
            "model": "gpt-4-vision-preview",
            "temperature": 0.7,
            "max_tokens": 1000
        },
        "analysis": {
            "min_confidence": 0.7,
            "batch_size": 10
        },
        "output": {
            "save_intermediate": True,
            "format": "json"
        }
    }

    def __init__(self, config_path: Optional[Union[str, Path]] = None):
        """Initialize configuration from file or defaults.

        Args:
            config_path: Optional path to a JSON config file to merge
                over the defaults.
        """
        # Deep-copy: a shallow .copy() would alias the nested dicts, so
        # set()/load_config() on one instance would silently mutate the
        # class-level DEFAULT_CONFIG shared by every other instance.
        self.config = copy.deepcopy(self.DEFAULT_CONFIG)

        if config_path:
            self.load_config(config_path)

    def load_config(self, config_path: Union[str, Path]) -> None:
        """Load configuration from JSON file, merging it over the defaults.

        Raises:
            Exception: any open/parse failure is logged and re-raised.
        """
        try:
            with open(config_path, 'r') as f:
                user_config = json.load(f)

            # Update default config with user settings (recursive merge).
            self._update_config(self.config, user_config)
            logger.info(f"Loaded configuration from {config_path}")

        except Exception as e:
            logger.error(f"Failed to load configuration: {str(e)}")
            raise

    def _update_config(self, base: Dict, update: Dict) -> None:
        """Recursively merge `update` into `base`: dicts merge, scalars replace."""
        for key, value in update.items():
            if key in base and isinstance(base[key], dict) and isinstance(value, dict):
                self._update_config(base[key], value)
            else:
                base[key] = value

    def get(self, key: str, default: Optional[Any] = None) -> Any:
        """Get a configuration value by dot-separated key.

        Args:
            key: Dotted path, e.g. "vlm.model".
            default: Returned when any path segment is missing or the
                stored value is None.
        """
        keys = key.split('.')
        value = self.config

        for k in keys:
            if isinstance(value, dict):
                value = value.get(k)
            else:
                return default

        if value is None:
            return default

        return value

    def set(self, key: str, value: Any) -> None:
        """Set a configuration value by dot-separated key.

        Intermediate dicts are created as needed; a non-dict intermediate
        value is replaced by a dict.
        """
        keys = key.split('.')
        config = self.config

        for k in keys[:-1]:
            if k not in config:
                config[k] = {}
            config = config[k]

        config[keys[-1]] = value

    def save(self, config_path: Union[str, Path]) -> None:
        """Save the current configuration as indented JSON.

        Raises:
            Exception: any write failure is logged and re-raised.
        """
        try:
            with open(config_path, 'w') as f:
                json.dump(self.config, f, indent=4)
            logger.info(f"Saved configuration to {config_path}")

        except Exception as e:
            logger.error(f"Failed to save configuration: {str(e)}")
            raise
+ raise
anton/core/pipeline.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core pipeline orchestration for Anton's multi-stage analysis flow."""
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, List, Optional, Tuple, Union
5
+ import logging
6
+ import asyncio
7
+
8
+ from ..vlm.interface import VLMInterface
9
+ from ..analysis.quantitative import QuantitativeAnalyzer
10
+ from ..analysis.qualitative import QualitativeAnalyzer
11
+ from ..cmpo.ontology import CMPOOntology
12
+ from ..utils.image_io import ImageLoader
13
+ from ..utils.validation import validate_stage_transition
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
class AnalysisPipeline:
    """Multi-stage analysis pipeline for microscopy phenotype analysis.

    Orchestrates four VLM-driven stages (global scene -> object detection ->
    feature analysis -> population insights). Each stage caches its output in
    ``self.results`` and later stages read their predecessor's cached result,
    so the stages must run in order.
    """

    def __init__(self, config: Dict):
        """Initialize pipeline with configuration.

        Args:
            config: Flat settings dict; recognized keys include
                vlm_provider, vlm_model, vlm_api_key, biological_context,
                quantitative, and channels.
        """
        self.config = config
        self.vlm = VLMInterface(
            provider=config.get("vlm_provider", "claude"),
            model=config.get("vlm_model"),
            api_key=config.get("vlm_api_key"),
            biological_context=config.get("biological_context")
        )
        self.quant_analyzer = QuantitativeAnalyzer(config.get("quantitative", {}))
        self.cmpo = CMPOOntology()
        self.qual_analyzer = QualitativeAnalyzer(
            vlm_interface=self.vlm,
            cmpo_mapper=self.cmpo
        )
        self.image_loader = ImageLoader()

        # Initialize results cache: one slot per stage, filled in order.
        self.results = {
            "stage_1_global": None,
            "stage_2_objects": None,
            "stage_3_features": None,
            "stage_4_population": None
        }

    async def run_stage_1(self, image_path: Union[str, Path]) -> Dict:
        """Run Stage 1: Global Scene Understanding.

        Loads the image (also caching it on the loader for later stages)
        and asks the VLM for a global scene description.
        """
        logger.info("Starting Stage 1: Global Scene Understanding")

        # Load and preprocess image
        image = self.image_loader.load(image_path)

        # Get global scene analysis from VLM
        global_analysis = await self.vlm.analyze_global_scene(
            image=image,
            channels=self.config.get("channels", [0])
        )

        # Validate and cache results
        self.results["stage_1_global"] = global_analysis
        return global_analysis

    async def run_stage_2(self) -> Dict:
        """Run Stage 2: Object Detection & Segmentation Guidance.

        Requires a cached stage-1 result (enforced by validate_stage_transition).
        """
        logger.info("Starting Stage 2: Object Detection & Segmentation Guidance")

        # Validate stage transition
        validate_stage_transition(self.results["stage_1_global"], "stage_2")

        # Get object detection and segmentation guidance; reuses the image
        # cached by the loader in stage 1.
        object_analysis = await self.vlm.detect_objects_and_guide(
            image=self.image_loader.current_image,
            global_context=self.results["stage_1_global"]
        )

        # Cache results
        self.results["stage_2_objects"] = object_analysis
        return object_analysis

    async def run_stage_3(self) -> Dict:
        """Run Stage 3: Feature-Level Analysis.

        Requires a cached stage-2 result containing "detected_objects".
        """
        logger.info("Starting Stage 3: Feature-Level Analysis")

        # Validate stage transition
        validate_stage_transition(self.results["stage_2_objects"], "stage_3")

        # Analyze features for detected objects
        feature_analysis = await self.vlm.analyze_features(
            image=self.image_loader.current_image,
            detected_objects=self.results["stage_2_objects"]["detected_objects"]
        )

        # Cache results
        self.results["stage_3_features"] = feature_analysis
        return feature_analysis

    async def run_stage_4(self) -> Dict:
        """Run Stage 4: Population-Level Insights with CMPO Integration.

        Generates population-level insights from stage-3 feature analyses,
        then best-effort maps the stage-1 and stage-4 VLM descriptions to
        CMPO terms; CMPO failure is logged but never fails the stage.
        """
        logger.info("Starting Stage 4: Population-Level Insights with CMPO mapping")

        # Validate stage transition
        validate_stage_transition(self.results["stage_3_features"], "stage_4")

        # Generate population insights (VLM)
        population_analysis = await self.vlm.generate_population_insights(
            feature_analyses=self.results["stage_3_features"]["object_analyses"]
        )

        # Direct CMPO mapping of existing VLM descriptions
        try:
            from ..cmpo.mapping import map_to_cmpo

            # Get VLM descriptions from previous stages
            global_description = self.results.get("stage_1_global", {}).get("description", "")
            population_description = population_analysis.get("population_summary", "")

            all_cmpo_mappings = []

            # Map global description to CMPO terms
            if global_description:
                global_mappings = map_to_cmpo(global_description, self.qual_analyzer.cmpo_mapper, context='cell_population')
                for mapping in global_mappings:
                    mapping['stage'] = 'global_context'
                    mapping['source'] = 'vlm_global_analysis'
                all_cmpo_mappings.extend(global_mappings)

            # Map population description to CMPO terms
            if population_description:
                pop_mappings = map_to_cmpo(population_description, self.qual_analyzer.cmpo_mapper, context='cell_population')
                for mapping in pop_mappings:
                    mapping['stage'] = 'population_insights'
                    mapping['source'] = 'vlm_population_analysis'
                all_cmpo_mappings.extend(pop_mappings)

            # Create CMPO summary for quick_demo display.
            # top_terms keeps the 5 highest-confidence mappings; entries may
            # repeat a CMPO term when it was mapped in multiple stages.
            cmpo_summary = {
                'total_unique_terms': len(set(m.get('CMPO_ID') for m in all_cmpo_mappings)),
                'total_mappings': len(all_cmpo_mappings),
                'top_terms': [
                    {
                        'term': mapping.get('term_name'),
                        'cmpo_id': mapping.get('CMPO_ID'),
                        'confidence': mapping.get('confidence', 0),
                        'stages': [mapping.get('stage')]
                    }
                    for mapping in sorted(all_cmpo_mappings, key=lambda x: x.get('confidence', 0), reverse=True)[:5]
                ],
                'mappings': all_cmpo_mappings
            }

            population_analysis["qualitative_features"] = {"cmpo_summary": cmpo_summary}
            logger.info(f"CMPO integration completed: {len(all_cmpo_mappings)} total mappings")

        except Exception as e:
            logger.warning(f"CMPO integration failed: {e}")
            # Continue without CMPO if it fails

        # Cache results
        self.results["stage_4_population"] = population_analysis
        return population_analysis

    async def run_pipeline(self, image_path: Union[str, Path]) -> Dict:
        """Run the complete analysis pipeline.

        Returns:
            The full per-stage results dict (self.results).
        """
        try:
            # Run all stages in sequence
            await self.run_stage_1(image_path)
            await self.run_stage_2()
            await self.run_stage_3()
            await self.run_stage_4()

            return self.results

        except Exception as e:
            logger.error(f"Pipeline execution failed: {str(e)}")
            raise

    def run_pipeline_sync(self, image_path: Union[str, Path]) -> Dict:
        """Run the complete analysis pipeline synchronously (convenience method).

        Safe to call both from plain code and from inside a running event
        loop (e.g. Streamlit/Jupyter): in the latter case the coroutine is
        executed on a fresh loop in a worker thread.
        """
        try:
            # Check if we're already in an event loop.
            # The call raises RuntimeError when no loop is running; the bound
            # name is unused — the call itself is the check.
            loop = asyncio.get_running_loop()
            # If we're in a loop, create a new thread to run the async code
            import concurrent.futures
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(asyncio.run, self.run_pipeline(image_path))
                return future.result()
        except RuntimeError:
            # No event loop running, safe to use asyncio.run
            return asyncio.run(self.run_pipeline(image_path))
anton/main.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pandas as pd
3
+ import asyncio
4
+ from pathlib import Path
5
+
6
+ from anton.core.config import Config
7
+ from anton.core.pipeline import AnalysisPipeline
8
+
9
+ logging.basicConfig(level=logging.INFO)
10
+
11
def main():
    """Interactive main function for Anton CMPO phenotype analysis framework."""
    print("Welcome to Anton: VLM-driven microscopy phenotype analysis framework.")
    print("Please provide the following information:")

    # Collect the run parameters interactively.
    goal = input("Enter your analysis goal (e.g., 'Identify apoptotic cells in DAPI-stained channel 1'): ")
    image_path = input("Enter the path to your TIFF image: ")
    metadata_path = input("Enter the path to your metadata file (optional, press Enter to skip): ")
    config_path = input("Enter the path to your config file (optional, press Enter to skip): ")

    # Build the configuration; an empty answer means "not provided".
    config = Config(config_path or None)
    config.set("goal", goal)
    config.set("image_path", str(image_path))
    if metadata_path:
        config.set("metadata_path", str(metadata_path))

    # Run the full multi-stage pipeline synchronously.
    results = AnalysisPipeline(config.config).run_pipeline_sync(image_path)

    # Report and persist the results.
    print(f"Results: {results}")
    pd.DataFrame([results]).to_csv("results.csv", index=False)
+ df.to_csv("results.csv", index=False)
36
+
37
async def main_async():
    """Async version of main function."""
    print("Welcome to Anton: VLM-driven microscopy phenotype analysis framework.")
    print("Please provide the following information:")

    goal = input("Enter your analysis goal (e.g., 'Identify apoptotic cells in DAPI-stained channel 1'): ")
    image_path = input("Enter the path to your TIFF image: ")
    metadata_path = input("Enter the path to your metadata file (optional, press Enter to skip): ")
    config_path = input("Enter the path to your config file (optional, press Enter to skip): ")

    # An empty answer means "use defaults" / "skip".
    config = Config(config_path or None)
    config.set("goal", goal)
    config.set("image_path", str(image_path))
    if metadata_path:
        config.set("metadata_path", str(metadata_path))

    # Await the pipeline directly instead of going through the sync wrapper.
    pipeline = AnalysisPipeline(config.config)
    results = await pipeline.run_pipeline(image_path)

    print(f"Results: {results}")
    pd.DataFrame([results]).to_csv("results.csv", index=False)

if __name__ == "__main__":
    main()
anton/utils/__pycache__/image_io.cpython-313.pyc ADDED
Binary file (10.8 kB). View file
 
anton/utils/__pycache__/validation.cpython-313.pyc ADDED
Binary file (1.02 kB). View file
 
anton/utils/image_io.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Image loading and preprocessing utilities for Anton's pipeline."""
2
+
3
+ from pathlib import Path
4
+ from typing import Union, Tuple, Optional, List
5
+ import numpy as np
6
+ from PIL import Image
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
class ImageLoader:
    """Handles image loading and preprocessing for microscopy analysis.

    Keeps the most recently loaded image and its metadata on the instance
    (current_image / current_image_path / metadata) so pipeline stages can
    reuse it without reloading.
    """

    def __init__(self):
        """Initialize ImageLoader with an empty cache."""
        self.current_image = None          # last loaded numpy array
        self.current_image_path = None     # Path of the last loaded file
        self.metadata = {}                 # shape/dtype/format of the last load

    def load(self, image_path: Union[str, Path]) -> np.ndarray:
        """Load image from path.

        Args:
            image_path: Path to the image file (any PIL-readable format,
                including TIFF).

        Returns:
            numpy array of the loaded image.

        Raises:
            FileNotFoundError: when the path does not exist; other load
                failures are logged and re-raised.
        """
        try:
            image_path = Path(image_path)
            if not image_path.exists():
                raise FileNotFoundError(f"Image not found: {image_path}")

            # Load image using PIL (supports many formats including TIFF)
            pil_image = Image.open(image_path)

            # Convert to numpy array
            image_array = np.array(pil_image)

            # Store for later use
            self.current_image = image_array
            self.current_image_path = image_path

            # Extract basic metadata
            self.metadata = {
                'shape': image_array.shape,
                'dtype': str(image_array.dtype),
                'path': str(image_path),
                'format': pil_image.format,
                'mode': pil_image.mode
            }

            logger.info(f"Loaded image: {image_path}, shape: {image_array.shape}")
            return image_array

        except Exception as e:
            logger.error(f"Failed to load image {image_path}: {e}")
            raise

    def preprocess(self, image: np.ndarray, normalize: bool = True,
                   channels: Optional[List[int]] = None) -> np.ndarray:
        """Preprocess image for analysis.

        Args:
            image: Input image array.
            normalize: Whether to normalize intensity values to [0, 1].
            channels: Specific channels to extract (for multi-channel images).

        Returns:
            Preprocessed copy of the image (the input is never mutated).
        """
        try:
            processed = image.copy()

            # Extract specific channels if requested
            if channels is not None and len(image.shape) > 2:
                if len(image.shape) == 3:
                    # RGB/multi-channel image (channels assumed on last axis)
                    processed = processed[:, :, channels]
                elif len(image.shape) == 4:
                    # NOTE(review): assumes the channel axis is last for 4-D
                    # stacks too — confirm against actual callers.
                    processed = processed[:, :, :, channels]

            # Normalize if requested
            if normalize:
                processed = self._normalize_image(processed)

            return processed

        except Exception as e:
            logger.error(f"Failed to preprocess image: {e}")
            raise

    def _normalize_image(self, image: np.ndarray) -> np.ndarray:
        """Normalize image intensity values to the 0-1 range.

        uint8/uint16 are divided by their type maximum; other dtypes are
        min-max scaled (constant images are returned unchanged).
        """
        if image.dtype == np.uint8:
            return image.astype(np.float32) / 255.0
        elif image.dtype == np.uint16:
            return image.astype(np.float32) / 65535.0
        else:
            # For float images, normalize to 0-1 range
            min_val = image.min()
            max_val = image.max()
            if max_val > min_val:
                return (image - min_val) / (max_val - min_val)
            else:
                return image

    def extract_channel(self, image: np.ndarray, channel: int) -> np.ndarray:
        """Extract a specific channel from a multi-channel image.

        Args:
            image: Multi-channel image array (grayscale passes through).
            channel: Channel index to extract (last axis).

        Returns:
            Single-channel image array.

        Raises:
            ValueError: for out-of-range channels or unsupported shapes.
        """
        try:
            if len(image.shape) == 2:
                # Grayscale image
                return image
            elif len(image.shape) == 3:
                # Multi-channel image
                if channel < image.shape[2]:
                    return image[:, :, channel]
                else:
                    raise ValueError(f"Channel {channel} not available in image with {image.shape[2]} channels")
            else:
                raise ValueError(f"Unsupported image shape: {image.shape}")

        except Exception as e:
            logger.error(f"Failed to extract channel {channel}: {e}")
            raise

    def convert_to_8bit(self, image: np.ndarray) -> np.ndarray:
        """Convert image to 8-bit for display/export.

        Args:
            image: Input image array (any dtype; uint8 passes through).

        Returns:
            8-bit image array.
        """
        try:
            if image.dtype == np.uint8:
                return image

            # Normalize to 0-1 range first
            normalized = self._normalize_image(image)

            # Convert to 8-bit
            return (normalized * 255).astype(np.uint8)

        except Exception as e:
            logger.error(f"Failed to convert to 8-bit: {e}")
            raise

    def save_image(self, image: np.ndarray, output_path: Union[str, Path],
                   format: str = 'PNG') -> None:
        """Save image to file, creating parent directories as needed.

        Args:
            image: Image array to save (converted to 8-bit if necessary).
            output_path: Output file path.
            format: Image format (PNG, TIFF, etc.).
        """
        try:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            # Convert to 8-bit if needed
            if image.dtype != np.uint8:
                image = self.convert_to_8bit(image)

            # Create PIL image and save
            pil_image = Image.fromarray(image)
            pil_image.save(output_path, format=format)

            logger.info(f"Saved image to: {output_path}")

        except Exception as e:
            logger.error(f"Failed to save image to {output_path}: {e}")
            raise

    def get_image_stats(self, image: Optional[np.ndarray] = None) -> dict:
        """Get basic statistics about the image.

        Args:
            image: Image array (uses current_image if None).

        Returns:
            Dictionary with image statistics; empty dict when no image is
            available or stats cannot be computed.
        """
        if image is None:
            image = self.current_image

        if image is None:
            return {}

        try:
            stats = {
                'shape': image.shape,
                'dtype': str(image.dtype),
                'min': float(image.min()),
                'max': float(image.max()),
                'mean': float(image.mean()),
                'std': float(image.std())
            }

            if len(image.shape) > 2:
                stats['channels'] = image.shape[2] if len(image.shape) == 3 else image.shape[-1]

            return stats

        except Exception as e:
            logger.error(f"Failed to compute image statistics: {e}")
            return {}

    def create_rgb_composite(self, channels: List[np.ndarray],
                             colors: List[Tuple[float, float, float]] = None) -> np.ndarray:
        """Create RGB composite from multiple channels.

        Args:
            channels: List of single-channel images (identical shapes).
            colors: List of RGB colors for each channel (default: R, G, B).
                Channels beyond len(colors) are silently dropped by zip —
                pass explicit colors for more than 3 channels.

        Returns:
            RGB composite image, float32 in [0, 1].

        Raises:
            ValueError: for an empty channel list or mismatched shapes.
        """
        try:
            if not channels:
                raise ValueError("No channels provided")

            # Default colors (R, G, B)
            if colors is None:
                colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)]

            # Ensure all channels have the same shape
            shape = channels[0].shape
            for i, ch in enumerate(channels):
                if ch.shape != shape:
                    raise ValueError(f"Channel {i} shape {ch.shape} doesn't match expected {shape}")

            # Create RGB composite
            composite = np.zeros((*shape, 3), dtype=np.float32)

            # (index from enumerate was unused — plain zip is sufficient)
            for channel, color in zip(channels, colors):
                # Normalize channel
                norm_channel = self._normalize_image(channel)

                # Apply color
                for c in range(3):
                    composite[:, :, c] += norm_channel * color[c]

            # Clip to valid range
            composite = np.clip(composite, 0, 1)

            return composite

        except Exception as e:
            logger.error(f"Failed to create RGB composite: {e}")
            raise
+ raise
anton/utils/validation.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Validation utilities for Anton's pipeline."""
2
+
3
def validate_stage_transition(prev_stage_result, next_stage):
    """Validate that the transition between pipeline stages is consistent.

    Raises ValueError when the previous stage's result is missing or lacks
    the key the next stage depends on; returns True otherwise.
    """
    if prev_stage_result is None:
        raise ValueError(f"Previous stage result missing for transition to {next_stage}")

    # Each stage requires one specific key from its predecessor's output.
    requirements = {
        "stage_2": ("description", "Stage 1 must provide description for Stage 2 transition"),
        "stage_3": ("detected_objects", "Stage 2 must provide detected_objects for Stage 3 transition"),
        "stage_4": ("object_analyses", "Stage 3 must provide object_analyses for Stage 4 transition"),
    }

    requirement = requirements.get(next_stage)
    if requirement is not None:
        required_key, message = requirement
        if required_key not in prev_stage_result:
            raise ValueError(message)

    return True
anton/vlm/__pycache__/interface.cpython-313.pyc ADDED
Binary file (29.5 kB). View file
 
anton/vlm/interface.py ADDED
@@ -0,0 +1,566 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """VLM interface for Anton's microscopy phenotype analysis."""
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, List, Optional, Union, Any
5
+ import logging
6
+ import json
7
+ import os
8
+ import base64
9
+ import asyncio
10
+ from io import BytesIO
11
+ import numpy as np
12
+ from PIL import Image
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ class VLMInterface:
17
+ """Interface for Vision Language Model (VLM) interactions."""
18
+
19
def __init__(self, provider="claude", model=None, api_key=None, biological_context=None):
    """Initialize VLM interface.

    Args:
        provider: "claude", "gemini", or "openai"
        model: Model name (provider-specific); falls back to the
            provider's default when None.
        api_key: API key for external providers
        biological_context: Dict with experimental context (cell line, protein, drugs, etc.)
    """
    self.provider = provider
    self.model = model or self._get_default_model(provider)
    # NOTE: _setup_client reads self.provider (and, for Gemini, self.model),
    # so it must run after they are assigned.
    self.client = self._setup_client(api_key)
    self.biological_context = biological_context or {}
    # Prompt templates keyed by file stem, loaded from the prompts/ directory.
    self.prompts = self._load_prompts()
33
+
34
+ def _get_default_model(self, provider: str) -> str:
35
+ """Get default model for provider."""
36
+ defaults = {
37
+ "claude": "claude-3-sonnet-20240229",
38
+ "gemini": "gemini-1.5-flash",
39
+ "openai": "gpt-4-vision-preview"
40
+ }
41
+ return defaults.get(provider, "claude-3-sonnet-20240229")
42
+
43
def _setup_client(self, api_key: Optional[str]):
    """Set up the VLM client based on provider.

    Dispatches to the provider-specific factory. The Claude factory may
    return None (fallback mode); Gemini/OpenAI factories raise when no
    key or library is available.

    Raises:
        ValueError: for an unrecognized provider.
    """
    if self.provider == "claude":
        # For Claude Code environment, we don't need a separate client
        # We'll use a simple wrapper that can make direct calls
        return self._create_claude_client(api_key)
    elif self.provider == "gemini":
        return self._create_gemini_client(api_key)
    elif self.provider == "openai":
        return self._create_openai_client(api_key)
    else:
        raise ValueError(f"Unsupported provider: {self.provider}")
55
+
56
def _create_claude_client(self, api_key: Optional[str]):
    """Create Claude client.

    Key resolution: explicit api_key arg, then the ANTHROPIC_API_KEY
    environment variable. Returns None (fallback mode) when no key is
    available or the anthropic client cannot be created — callers must
    tolerate a None client.
    """
    # Try to get API key from environment if not provided
    if not api_key:
        api_key = os.getenv("ANTHROPIC_API_KEY")

    if api_key:
        try:
            import anthropic
            client = anthropic.Anthropic(api_key=api_key)
            # Store for potential direct calls
            self._anthropic_client = client
            logger.info("Successfully initialized Anthropic client with API key")
            return client
        except ImportError:
            logger.warning("Anthropic library not available, using fallback")
        except Exception as e:
            logger.warning(f"Failed to initialize Anthropic client: {e}")

    # Fallback for Claude Code environment
    logger.info("No API key provided, using enhanced fallback responses")
    return None
78
+
79
def _create_gemini_client(self, api_key: Optional[str]):
    """Create a Gemini client for self.model.

    Falls back to the GOOGLE_API_KEY environment variable; raises
    ValueError when no key is available and ImportError when the
    google-generativeai library is missing.
    """
    key = api_key or os.getenv("GOOGLE_API_KEY")
    if not key:
        raise ValueError("Gemini API key required")

    try:
        import google.generativeai as genai
    except ImportError:
        raise ImportError("google-generativeai library required for Gemini")

    genai.configure(api_key=key)
    return genai.GenerativeModel(self.model)
92
+
93
def _create_openai_client(self, api_key: Optional[str]):
    """Create an OpenAI client.

    Falls back to the OPENAI_API_KEY environment variable; raises
    ValueError when no key is available and ImportError when the openai
    library is missing.
    """
    key = api_key or os.getenv("OPENAI_API_KEY")
    if not key:
        raise ValueError("OpenAI API key required")

    try:
        import openai
    except ImportError:
        raise ImportError("openai library required for OpenAI")

    return openai.OpenAI(api_key=key)
105
+
106
+ def _load_prompts(self) -> Dict[str, str]:
107
+ """Load prompts from the prompts directory."""
108
+ prompts_dir = Path(__file__).parent.parent.parent / 'prompts'
109
+ prompts = {}
110
+
111
+ if not prompts_dir.exists():
112
+ logger.warning(f"Prompts directory not found: {prompts_dir}")
113
+ return {}
114
+
115
+ for prompt_file in prompts_dir.glob('*.txt'):
116
+ try:
117
+ with open(prompt_file, 'r', encoding='utf-8') as f:
118
+ prompts[prompt_file.stem] = f.read().strip()
119
+ except Exception as e:
120
+ logger.error(f"Failed to load prompt {prompt_file}: {e}")
121
+
122
+ return prompts
123
+
124
def _prepare_image(self, image_path: Union[str, Path, np.ndarray, Image.Image]) -> str:
    """Encode an image as a base64 string for VLM transport.

    File paths are read verbatim (raw on-disk bytes); numpy arrays and
    PIL images are serialized to PNG first.
    """
    if isinstance(image_path, (str, Path)):
        with open(image_path, 'rb') as f:
            raw = f.read()
    elif isinstance(image_path, np.ndarray):
        arr = image_path
        if arr.dtype != np.uint8:
            # NOTE(review): assumes non-uint8 arrays are floats scaled to
            # [0, 1]; other dtypes (e.g. uint16) would wrap — confirm callers.
            arr = (arr * 255).astype(np.uint8)
        buffer = BytesIO()
        Image.fromarray(arr).save(buffer, format='PNG')
        raw = buffer.getvalue()
    elif isinstance(image_path, Image.Image):
        buffer = BytesIO()
        image_path.save(buffer, format='PNG')
        raw = buffer.getvalue()
    else:
        raise ValueError(f"Unsupported image type: {type(image_path)}")

    return base64.b64encode(raw).decode('utf-8')
145
+
146
+ def _format_biological_context(self) -> str:
147
+ """Format biological context for injection into prompts."""
148
+ if not self.biological_context:
149
+ return ""
150
+
151
+ context_lines = ["EXPERIMENTAL CONTEXT:"]
152
+
153
+ if 'experiment_type' in self.biological_context:
154
+ context_lines.append(f"- Experiment: {self.biological_context['experiment_type']}")
155
+ if 'cell_line' in self.biological_context:
156
+ context_lines.append(f"- Cell line: {self.biological_context['cell_line']}")
157
+ if 'protein' in self.biological_context:
158
+ context_lines.append(f"- Protein: {self.biological_context['protein']}")
159
+ if 'drugs' in self.biological_context:
160
+ drugs = ", ".join(self.biological_context['drugs'])
161
+ context_lines.append(f"- Drug treatments: {drugs}")
162
+ if 'readout' in self.biological_context:
163
+ context_lines.append(f"- Expected phenotype: {self.biological_context['readout']}")
164
+ if 'channels' in self.biological_context:
165
+ channels = ", ".join(self.biological_context['channels'])
166
+ context_lines.append(f"- Image channels: {channels}")
167
+
168
+ return "\n".join(context_lines)
169
+
170
async def analyze_global_scene(self, image: Any, channels: Optional[List[int]] = None) -> Dict:
    """Stage 1: Global scene understanding.

    Args:
        image: Image in any form accepted by _prepare_image.
        channels: Optional channel indices appended to the prompt to
            direct the VLM's attention.

    Returns:
        Parsed stage-1 result dict (see _parse_stage1_response).

    Raises:
        Re-raises any failure after logging it.
    """
    try:
        image_data = self._prepare_image(image)
        prompt = self.prompts.get('stage1_global', 'Analyze this microscopy image.')

        # Inject biological context if available
        if self.biological_context:
            context_str = self._format_biological_context()
            prompt = f"{context_str}\n\n{prompt}"

        if channels:
            prompt += f" Focus on channels: {channels}"

        response = await self._call_vlm(prompt, image_data)
        return self._parse_stage1_response(response)

    except Exception as e:
        logger.error(f"Global scene analysis failed: {str(e)}")
        raise
190
+
191
async def detect_objects_and_guide(self, image: Any, global_context: Dict) -> Dict:
    """Stage 2: Detect objects and provide segmentation guidance.

    Args:
        image: Image in any form accepted by _prepare_image.
        global_context: Stage-1 result dict, serialized into the prompt.

    Returns:
        Parsed stage-2 result dict (see _parse_stage2_response).

    Raises:
        Re-raises any failure after logging it.
    """
    try:
        image_data = self._prepare_image(image)
        prompt = self.prompts.get('stage2_objects', 'Detect objects in this image.')

        # Inject biological context if available
        if self.biological_context:
            context_str = self._format_biological_context()
            prompt = f"{context_str}\n\n{prompt}"

        # Add global context to prompt
        # (context_str is deliberately reused here for the JSON payload)
        context_str = json.dumps(global_context, indent=2)
        prompt += f"\n\nGlobal context:\n{context_str}"

        response = await self._call_vlm(prompt, image_data)
        return self._parse_stage2_response(response)

    except Exception as e:
        logger.error(f"Object detection failed: {str(e)}")
        raise
212
+
213
async def analyze_features(self, image: Any, detected_objects: List[Dict]) -> Dict:
    """Stage 3: Analyze features for detected objects.

    Args:
        image: Image in any form accepted by _prepare_image.
        detected_objects: Object list produced by stage 2, serialized
            into the prompt.

    Returns:
        Parsed stage-3 result dict (see _parse_stage3_response).

    Raises:
        Re-raises any failure after logging it.
    """
    try:
        image_data = self._prepare_image(image)
        prompt = self.prompts.get('stage3_features', 'Analyze features in this image.')

        # Inject biological context if available — consistent with stages 1
        # and 2, which prepend the same experimental-context block (its
        # absence here looked like an oversight).
        if self.biological_context:
            context_str = self._format_biological_context()
            prompt = f"{context_str}\n\n{prompt}"

        # Add detected objects to prompt
        objects_str = json.dumps(detected_objects, indent=2)
        prompt += f"\n\nDetected objects:\n{objects_str}"

        response = await self._call_vlm(prompt, image_data)
        return self._parse_stage3_response(response)

    except Exception as e:
        logger.error(f"Feature analysis failed: {str(e)}")
        raise
229
+
230
+ async def generate_population_insights(self, feature_analyses: List[Dict]) -> Dict:
231
+ """Stage 4: Generate population-level insights."""
232
+ try:
233
+ prompt = self.prompts.get('stage4_population', 'Generate population insights.')
234
+
235
+ # Add feature analyses to prompt
236
+ features_str = json.dumps(feature_analyses, indent=2)
237
+ prompt += f"\n\nFeature analyses:\n{features_str}"
238
+
239
+ response = await self._call_vlm(prompt)
240
+ return self._parse_stage4_response(response)
241
+
242
+ except Exception as e:
243
+ logger.error(f"Population analysis failed: {str(e)}")
244
+ raise
245
+
246
+ async def analyze_biological_reasoning(self, validation_prompt: str) -> str:
247
+ """Analyze biological reasoning for CMPO mapping validation."""
248
+ try:
249
+ response = await self._call_vlm(validation_prompt)
250
+ return response
251
+ except Exception as e:
252
+ logger.warning(f"Biological reasoning analysis failed: {e}")
253
+ return "VALID: Default validation - reasoning: VLM validation unavailable, using ontology mapping"
254
+
255
+ async def _call_vlm(self, prompt: str, image_data: Optional[str] = None) -> str:
256
+ """Call VLM with prompt and optional image."""
257
+ if self.provider == "claude":
258
+ return await self._call_claude(prompt, image_data)
259
+ elif self.provider == "gemini":
260
+ return await self._call_gemini(prompt, image_data)
261
+ elif self.provider == "openai":
262
+ return await self._call_openai(prompt, image_data)
263
+ else:
264
+ raise ValueError(f"Unsupported provider: {self.provider}")
265
+
266
+ async def _call_claude(self, prompt: str, image_data: Optional[str] = None) -> str:
267
+ """Call Claude API."""
268
+ if self.client is None:
269
+ # For Claude Code environment, use direct API integration
270
+ try:
271
+ return await self._call_claude_code_direct(prompt, image_data)
272
+ except Exception as e:
273
+ logger.error(f"Claude API call failed: {e}")
274
+ raise Exception("No working Claude API integration available. Please provide ANTHROPIC_API_KEY.")
275
+
276
+ try:
277
+ content = [{"type": "text", "text": prompt}]
278
+ if image_data:
279
+ content.append({
280
+ "type": "image",
281
+ "source": {
282
+ "type": "base64",
283
+ "media_type": "image/png",
284
+ "data": image_data
285
+ }
286
+ })
287
+
288
+ response = await self.client.messages.create(
289
+ model=self.model,
290
+ max_tokens=4000,
291
+ messages=[{"role": "user", "content": content}]
292
+ )
293
+
294
+ return response.content[0].text
295
+
296
+ except Exception as e:
297
+ logger.error(f"Claude API call failed: {str(e)}")
298
+ raise
299
+
300
+ async def _call_gemini(self, prompt: str, image_data: Optional[str] = None) -> str:
301
+ """Call Gemini API."""
302
+ try:
303
+ if image_data:
304
+ # Decode base64 image for Gemini
305
+ image_bytes = base64.b64decode(image_data)
306
+ pil_image = Image.open(BytesIO(image_bytes))
307
+
308
+ response = await asyncio.to_thread(
309
+ self.client.generate_content, [prompt, pil_image]
310
+ )
311
+ else:
312
+ response = await asyncio.to_thread(
313
+ self.client.generate_content, prompt
314
+ )
315
+
316
+ return response.text
317
+
318
+ except Exception as e:
319
+ logger.error(f"Gemini API call failed: {str(e)}")
320
+ raise
321
+
322
+ async def _call_openai(self, prompt: str, image_data: Optional[str] = None) -> str:
323
+ """Call OpenAI API."""
324
+ try:
325
+ content = [{"type": "text", "text": prompt}]
326
+ if image_data:
327
+ content.append({
328
+ "type": "image_url",
329
+ "image_url": {"url": f"data:image/png;base64,{image_data}"}
330
+ })
331
+
332
+ response = await self.client.chat.completions.create(
333
+ model=self.model,
334
+ messages=[{"role": "user", "content": content}],
335
+ max_tokens=4000
336
+ )
337
+
338
+ return response.choices[0].message.content
339
+
340
+ except Exception as e:
341
+ logger.error(f"OpenAI API call failed: {str(e)}")
342
+ raise
343
+
344
+ def _parse_stage1_response(self, response: str) -> Dict:
345
+ """Parse Stage 1 response."""
346
+ try:
347
+ # Try to parse as JSON first
348
+ return json.loads(response)
349
+ except json.JSONDecodeError:
350
+ # Fallback to structured text parsing
351
+ return {
352
+ "description": response,
353
+ "quality_score": 0.8, # Default
354
+ "recommended_analysis": "standard"
355
+ }
356
+
357
+ def _parse_stage2_response(self, response: str) -> Dict:
358
+ """Parse Stage 2 response."""
359
+ try:
360
+ return json.loads(response)
361
+ except json.JSONDecodeError:
362
+ return {
363
+ "detected_objects": [
364
+ {"id": 1, "type": "nucleus", "confidence": 0.8},
365
+ {"id": 2, "type": "cell", "confidence": 0.7}
366
+ ],
367
+ "segmentation_guidance": response,
368
+ "object_count_estimate": 2
369
+ }
370
+
371
+ def _parse_stage3_response(self, response: str) -> Dict:
372
+ """Parse Stage 3 response."""
373
+ try:
374
+ return json.loads(response)
375
+ except json.JSONDecodeError:
376
+ return {
377
+ "object_analyses": [
378
+ {"object_id": 1, "features": ["round", "bright"], "confidence": 0.8},
379
+ {"object_id": 2, "features": ["elongated", "dim"], "confidence": 0.7}
380
+ ],
381
+ "feature_descriptions": [response],
382
+ "cmpo_mappings": []
383
+ }
384
+
385
+ def _parse_stage4_response(self, response: str) -> Dict:
386
+ """Parse Stage 4 response."""
387
+ try:
388
+ return json.loads(response)
389
+ except json.JSONDecodeError:
390
+ return {
391
+ "population_summary": response,
392
+ "quantitative_metrics": {},
393
+ "cmpo_prevalence": {}
394
+ }
395
+
396
+ async def _call_claude_code_direct(self, prompt: str, image_data: Optional[str] = None) -> str:
397
+ """Direct Claude API call for Claude Code environment."""
398
+
399
+ # First try using stored anthropic client
400
+ if hasattr(self, '_anthropic_client') and self._anthropic_client:
401
+ try:
402
+ content = [{"type": "text", "text": prompt}]
403
+ if image_data:
404
+ content.append({
405
+ "type": "image",
406
+ "source": {
407
+ "type": "base64",
408
+ "media_type": "image/png",
409
+ "data": image_data
410
+ }
411
+ })
412
+
413
+ # Use sync client with async wrapper
414
+ import asyncio
415
+ loop = asyncio.get_event_loop()
416
+ response = await loop.run_in_executor(
417
+ None,
418
+ lambda: self._anthropic_client.messages.create(
419
+ model=self.model,
420
+ max_tokens=4000,
421
+ messages=[{"role": "user", "content": content}]
422
+ )
423
+ )
424
+
425
+ logger.info("Successfully called Claude API directly")
426
+ return response.content[0].text
427
+
428
+ except Exception as e:
429
+ logger.error(f"Direct Anthropic API call failed: {e}")
430
+ raise
431
+
432
+ # If no client available, try Claude Code specific methods
433
+ # This could involve subprocess calls, environment-specific APIs, etc.
434
+ logger.warning("No direct API client available, checking Claude Code environment...")
435
+
436
+ # Check if we're in Claude Code and can make internal calls
437
+ try:
438
+ # This is speculative - the actual implementation would depend on
439
+ # what APIs are available in the Claude Code environment
440
+ return await self._try_claude_code_internal_api(prompt, image_data)
441
+ except Exception as e:
442
+ logger.warning(f"Claude Code internal API failed: {e}")
443
+ raise NotImplementedError("Claude Code direct API integration not yet implemented")
444
+
445
+ async def _try_claude_code_internal_api(self, prompt: str, image_data: Optional[str] = None) -> str:
446
+ """Try to use Claude Code internal APIs if available."""
447
+
448
+ # In Claude Code environment, we can try to use available APIs or subprocess calls
449
+ # Let's check what's available in the environment
450
+
451
+ import subprocess
452
+ import tempfile
453
+ import json
454
+
455
+ # Method 1: Try to see if there's a CLI tool available
456
+ try:
457
+ # Check if claude CLI is available
458
+ result = subprocess.run(['which', 'claude'], capture_output=True, text=True, timeout=5)
459
+ if result.returncode == 0:
460
+ logger.info("Found claude CLI tool")
461
+ return await self._call_claude_cli(prompt, image_data)
462
+ except Exception:
463
+ pass
464
+
465
+ # Method 2: Try to check if there are environment variables or APIs
466
+ # that suggest Claude Code has internal access
467
+ try:
468
+ # Check for Claude Code specific environment variables
469
+ claude_env_vars = [key for key in os.environ.keys() if 'CLAUDE' in key.upper()]
470
+ if claude_env_vars:
471
+ logger.info(f"Found Claude environment variables: {claude_env_vars}")
472
+ # Try to use these for internal API calls
473
+ return await self._call_claude_with_env_vars(prompt, image_data)
474
+ except Exception:
475
+ pass
476
+
477
+ # Method 3: Try to make a direct HTTP request to local APIs
478
+ try:
479
+ return await self._call_claude_local_api(prompt, image_data)
480
+ except Exception:
481
+ pass
482
+
483
+ # If all methods fail, raise an informative error
484
+ raise NotImplementedError(
485
+ "Claude Code internal API not available. "
486
+ "Please set ANTHROPIC_API_KEY environment variable to use external Claude API."
487
+ )
488
+
489
+ async def _call_claude_cli(self, prompt: str, image_data: Optional[str] = None) -> str:
490
+ """Call Claude using CLI tool if available."""
491
+ import subprocess
492
+ import tempfile
493
+ import asyncio
494
+
495
+ try:
496
+ # Prepare the prompt
497
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
498
+ f.write(prompt)
499
+ prompt_file = f.name
500
+
501
+ # Prepare command
502
+ cmd = ['claude', '--file', prompt_file]
503
+
504
+ # If image data is provided, save it and include it
505
+ if image_data:
506
+ import base64
507
+ with tempfile.NamedTemporaryFile(mode='wb', suffix='.png', delete=False) as f:
508
+ f.write(base64.b64decode(image_data))
509
+ image_file = f.name
510
+ cmd.extend(['--image', image_file])
511
+
512
+ # Run the command
513
+ loop = asyncio.get_event_loop()
514
+ result = await loop.run_in_executor(
515
+ None,
516
+ lambda: subprocess.run(cmd, capture_output=True, text=True, timeout=30)
517
+ )
518
+
519
+ # Clean up temp files
520
+ os.unlink(prompt_file)
521
+ if image_data and 'image_file' in locals():
522
+ os.unlink(image_file)
523
+
524
+ if result.returncode == 0:
525
+ logger.info("Successfully called Claude CLI")
526
+ return result.stdout.strip()
527
+ else:
528
+ raise Exception(f"Claude CLI failed: {result.stderr}")
529
+
530
+ except Exception as e:
531
+ logger.error(f"Claude CLI call failed: {e}")
532
+ raise
533
+
534
+ async def _call_claude_with_env_vars(self, prompt: str, image_data: Optional[str] = None) -> str:
535
+ """Try to use Claude with environment variables."""
536
+ # This would use any Claude-specific environment variables
537
+ # that might be available in Claude Code environment
538
+ raise NotImplementedError("Environment variable method not implemented")
539
+
540
    async def _call_claude_local_api(self, prompt: str, image_data: Optional[str] = None) -> str:
        """Try to call a local Claude API endpoint.

        Probes a fixed list of localhost URLs with a JSON payload and returns
        the first 200-response body as plain text. Raises when no endpoint
        answers.

        NOTE(review): ``timeout=30`` is passed to ``session.post`` as a plain
        int; recent aiohttp versions prefer ``aiohttp.ClientTimeout`` —
        confirm against the pinned aiohttp version.
        """
        import aiohttp

        # Candidate endpoints a local Claude bridge might expose.
        endpoints = [
            'http://localhost:8080/claude',
            'http://127.0.0.1:8080/claude',
            'http://localhost:3000/api/claude'
        ]

        for endpoint in endpoints:
            try:
                async with aiohttp.ClientSession() as session:
                    payload = {'prompt': prompt}
                    if image_data:
                        # Image travels base64-encoded inside the JSON body.
                        payload['image'] = image_data

                    async with session.post(endpoint, json=payload, timeout=30) as response:
                        if response.status == 200:
                            result = await response.text()
                            logger.info(f"Successfully called local Claude API at {endpoint}")
                            return result
            except Exception:
                # Endpoint unreachable or errored — try the next candidate.
                continue

        raise Exception("No local Claude API endpoints found")
app.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Minimal Anton Streamlit App - Crash-Safe Version

Single-page demo: upload a microscopy image, preview it (downscaled if
large), and run a mock analysis. Deliberately avoids st.rerun() and heavy
processing so the app cannot crash the hosting container.
"""

import streamlit as st
import os
from PIL import Image
import traceback
import gc  # Garbage collection — used to free a replaced image promptly

# Configure PIL to handle large images better.
# NOTE(review): removing the pixel limit disables PIL's decompression-bomb
# guard; acceptable only because uploads are size-capped by Streamlit config.
Image.MAX_IMAGE_PIXELS = None  # Remove PIL size limit

# Setup page
st.set_page_config(
    page_title="Anton Microscopy Analysis",
    page_icon="πŸ”¬",
    layout="wide"
)

# Header
st.title("πŸ”¬ Anton Microscopy Analysis")
st.markdown("**Simple Interface**: Upload image β†’ See basic analysis")

# Placeholder that debug messages are written into as processing progresses
debug_container = st.empty()

# Sidebar
st.sidebar.header("πŸŽ›οΈ Controls")

# Check API status — informational only; analysis below is mocked either way
api_status = []
if os.getenv('GOOGLE_API_KEY'):
    api_status.append("βœ… Google API Key")
elif os.getenv('ANTHROPIC_API_KEY'):
    api_status.append("βœ… Anthropic API Key")
else:
    api_status.append("⚠️ No API key - demo mode")

for status in api_status:
    st.sidebar.write(status)

# Simple file upload with unique key - avoid st.rerun() issues
st.sidebar.subheader("πŸ“ Upload Image")

# Use session state to track upload state to avoid rerun issues.
# NOTE(review): upload_key is initialized but never incremented anywhere in
# this file — presumably reserved for a future "reset uploader" action.
if 'upload_key' not in st.session_state:
    st.session_state.upload_key = 0

uploaded_file = st.sidebar.file_uploader(
    "Choose an image",
    type=['png', 'jpg', 'jpeg', 'tiff', 'bmp'],
    help="Upload microscopy image",
    key=f"image_uploader_{st.session_state.upload_key}"  # Dynamic key
)

# Analysis button with unique key
analyze_btn = st.sidebar.button("πŸš€ Analyze", type="primary", key="analyze_button")

# Main content: image preview on the left, analysis output on the right
col1, col2 = st.columns([1, 1])

# Left: Image display
with col1:
    st.subheader("πŸ–ΌοΈ Image")

    if uploaded_file is not None:
        debug_msg = f"πŸ› DEBUG: File uploaded - {uploaded_file.name}, size: {uploaded_file.size}"
        print(debug_msg)
        debug_container.info(debug_msg)
        try:
            # Reset file pointer to beginning (important!)
            uploaded_file.seek(0)
            print("DEBUG: File pointer reset to beginning")

            # Simple PIL loading - most reliable
            image = Image.open(uploaded_file)
            debug_msg2 = f"πŸ› DEBUG: Image loaded successfully - size: {image.size}, mode: {image.mode}"
            print(debug_msg2)
            debug_container.success(debug_msg2)

            # Resize if too large (prevent memory issues)
            max_size = (1024, 1024)
            if image.size[0] > max_size[0] or image.size[1] > max_size[1]:
                print(f"DEBUG: Resizing image from {image.size} to max {max_size}")
                image.thumbnail(max_size, Image.Resampling.LANCZOS)
                st.info(f"πŸ“ Image resized to {image.size} for display")
                print(f"DEBUG: Image resized to {image.size}")

            # Convert to RGB if needed
            if image.mode != 'RGB':
                image = image.convert('RGB')

            # Cache the image in session state so a rerun with the same file
            # does not reprocess it; only replace when the filename changes.
            if 'current_image' not in st.session_state or st.session_state.get('uploaded_filename') != uploaded_file.name:
                # Clear old image from memory before storing the new one
                if 'current_image' in st.session_state:
                    del st.session_state.current_image
                    gc.collect()

                st.session_state.current_image = image
                st.session_state.uploaded_filename = uploaded_file.name

            # Display image
            st.image(st.session_state.current_image, caption=f"Uploaded: {uploaded_file.name}", width=400)

            # Basic info
            st.caption(f"Size: {st.session_state.current_image.size} | Mode: {st.session_state.current_image.mode}")

        except Exception as e:
            error_msg = f"πŸ› DEBUG: Error loading image: {e}"
            st.error(error_msg)
            debug_container.error(error_msg)
            # Don't show full traceback to users - just log it
            print(f"Image loading error: {traceback.format_exc()}")
    else:
        st.info("πŸ‘† Upload an image to start")

# Right: Analysis results
with col2:
    st.subheader("🧠 Analysis Results")

    if analyze_btn and uploaded_file is not None:
        print("DEBUG: Analysis button clicked")
        try:
            # Simple mock analysis to test if basic functionality works
            st.success("βœ… Analysis Started!")
            print("DEBUG: Analysis started successfully")

            with st.spinner("Processing..."):
                # Mock processing — sleep stands in for the real pipeline
                import time
                print("DEBUG: Starting mock processing...")
                time.sleep(2)
                print("DEBUG: Mock processing complete")

            # Mock results
            st.markdown("### πŸ“Š Mock Analysis Results")

            st.write("**Stage 1: Global Analysis**")
            st.text_area("Description:",
                         "Mock analysis: This appears to be a microscopy image with cellular structures. "
                         "The image shows good contrast and appears suitable for analysis.",
                         height=100)

            st.write("**Stage 2: Object Detection**")
            st.text_area("Objects:",
                         "Mock detection: Multiple cellular objects detected. "
                         "Estimated cell count: 15-25 cells visible.",
                         height=100)

            st.success("βœ… Mock analysis complete!")

        except Exception as e:
            st.error(f"Analysis failed: {e}")
            st.code(traceback.format_exc())

    elif analyze_btn:
        st.warning("Please upload an image first!")
    else:
        st.info("πŸ‘ˆ Upload image and click Analyze")

# Footer
st.markdown("---")
st.markdown("πŸ”¬ **Anton Framework** - Minimal Demo Version")
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))