Thibaut committed on
Commit
6f98a26
·
1 Parent(s): 5e12a05

Implement metrics evaluation system - CVAT extraction, SAM3 inference, metrics calculation, visualization, and main pipeline

Browse files
metrics_evaluation/extraction/cvat_extractor.py ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CVAT data extraction for SAM3 evaluation."""
2
+
3
+ import json
4
+ import logging
5
+ import random
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from ..config.config_models import EvaluationConfig
10
+ from ..cvat_api.client import CVATClient
11
+ from ..schema.core.annotation.mask import Mask
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class CVATExtractor:
    """Extract annotated images and ground-truth masks from CVAT."""

    def __init__(self, config: EvaluationConfig):
        """Initialize extractor with configuration.

        Args:
            config: Evaluation configuration

        Raises:
            ValueError: If configuration is invalid
        """
        self.config = config
        # Populated lazily: connect() sets client,
        # find_training_project() sets project_id.
        self.client: CVATClient | None = None
        self.project_id: int | None = None

    def connect(self) -> None:
        """Connect to CVAT API.

        Credentials are read from CVAT_USERNAME / CVAT_PASSWORD in the
        environment (a .env file is loaded if present).

        Raises:
            ConnectionError: If connection fails
            ValueError: If credentials are invalid
        """
        # Imported locally so callers that never connect do not require
        # python-dotenv at import time.
        import os

        from dotenv import load_dotenv

        load_dotenv()

        username = os.getenv("CVAT_USERNAME")
        password = os.getenv("CVAT_PASSWORD")

        if not username or not password:
            raise ValueError(
                "CVAT credentials not found in .env file. "
                "Required: CVAT_USERNAME, CVAT_PASSWORD"
            )

        try:
            self.client = CVATClient(
                host=self.config.cvat.url,
                credentials=(username, password),
                organization=self.config.cvat.organization,
            )
            logger.info(f"Connected to CVAT at {self.config.cvat.url}")
        except Exception as e:
            raise ConnectionError(f"Failed to connect to CVAT: {e}") from e

    def find_training_project(self) -> int:
        """Find AI training project in CVAT.

        Selects the first project whose name contains the configured
        filter string (case-insensitive).

        Returns:
            Project ID

        Raises:
            ValueError: If no suitable project found
        """
        if not self.client:
            raise ValueError("Not connected to CVAT. Call connect() first.")

        projects = self.client.projects.list()
        filter_str = self.config.cvat.project_name_filter.lower()

        matching_projects = [
            p for p in projects if filter_str in p.name.lower()
        ]

        if not matching_projects:
            available = [p.name for p in projects]
            raise ValueError(
                f"No project found with '{filter_str}' in name.\n"
                f"Available projects: {available}"
            )

        if len(matching_projects) > 1:
            names = [p.name for p in matching_projects]
            logger.warning(
                f"Multiple matching projects found: {names}. Using first one."
            )

        project = matching_projects[0]
        self.project_id = project.id
        logger.info(f"Using project: {project.name} (ID: {project.id})")
        return project.id

    def discover_images(self) -> dict[str, list[dict[str, Any]]]:
        """Discover images with target labels.

        Only shapes of type 'mask' count towards a frame's label set.

        Returns:
            Dict mapping class names to lists of image metadata

        Raises:
            ValueError: If no tasks exist in the selected project
        """
        if not self.client or not self.project_id:
            raise ValueError("Must connect and find project first")

        tasks = self.client.tasks.list(project_id=self.project_id)

        if not tasks:
            raise ValueError(f"No tasks found in project {self.project_id}")

        logger.info(f"Found {len(tasks)} tasks in project")

        # Collect all images with annotations, keyed by target class name.
        class_images: dict[str, list[dict[str, Any]]] = {
            class_name: [] for class_name in self.config.classes.keys()
        }

        for task in tasks:
            try:
                jobs = self.client.jobs.list(task_id=task.id)

                for job in jobs:
                    annotations = self.client.annotations.get_job_annotations(job.id)

                    if not annotations or not hasattr(annotations, 'shapes'):
                        continue

                    # Group annotations by frame
                    frame_annotations: dict[int, list] = {}
                    for shape in annotations.shapes:
                        frame_annotations.setdefault(shape.frame, []).append(shape)

                    # Check which classes are present in each frame
                    for frame_id, shapes in frame_annotations.items():
                        labels_in_frame = {
                            shape.label_name
                            for shape in shapes
                            if hasattr(shape, 'type') and shape.type == 'mask'
                        }

                        for class_name in self.config.classes.keys():
                            if class_name in labels_in_frame:
                                class_images[class_name].append({
                                    "task_id": task.id,
                                    "job_id": job.id,
                                    "frame_id": frame_id,
                                    "task_name": task.name,
                                    "labels": list(labels_in_frame),
                                })

            except Exception as e:
                # A broken task must not abort discovery of the others.
                logger.warning(f"Error processing task {task.id}: {e}")
                continue

        # Log discovered counts
        for class_name, images in class_images.items():
            logger.info(f"Found {len(images)} images with label '{class_name}'")

        # Warn (do not fail) when fewer images than requested are available
        for class_name, images in class_images.items():
            requested = self.config.classes[class_name]
            if len(images) < requested:
                logger.warning(
                    f"Class '{class_name}': Requested {requested} images but only found {len(images)}"
                )

        return class_images

    def sample_images(
        self, class_images: dict[str, list[dict[str, Any]]]
    ) -> dict[str, list[dict[str, Any]]]:
        """Randomly sample images for each class.

        Args:
            class_images: Dict mapping class names to image lists

        Returns:
            Dict with sampled images (at most the configured count per class)
        """
        sampled = {}

        for class_name, images in class_images.items():
            requested = self.config.classes[class_name]
            available = len(images)

            if available == 0:
                logger.error(f"No images available for class '{class_name}'")
                sampled[class_name] = []
                continue

            # Never sample more than what exists.
            sample_size = min(requested, available)
            sampled[class_name] = random.sample(images, sample_size)

            logger.info(
                f"Sampled {sample_size}/{requested} images for class '{class_name}'"
            )

        return sampled

    def download_images_and_masks(
        self, sampled_images: dict[str, list[dict[str, Any]]]
    ) -> dict[str, list[Path]]:
        """Download images and extract ground truth masks.

        Individual per-image failures are logged and skipped; only images
        that were fully processed appear in the returned lists.

        Args:
            sampled_images: Dict of sampled image metadata

        Returns:
            Dict mapping class names to lists of image paths

        Raises:
            ValueError: If not connected to CVAT
        """
        if not self.client:
            raise ValueError("Not connected to CVAT")

        cache_dir = self.config.get_cache_path()
        cache_dir.mkdir(parents=True, exist_ok=True)

        downloaded_paths: dict[str, list[Path]] = {
            class_name: [] for class_name in self.config.classes.keys()
        }

        total_images = sum(len(images) for images in sampled_images.values())
        processed = 0

        for class_name, images in sampled_images.items():
            for img_meta in images:
                processed += 1
                logger.info(
                    f"Processing {processed}/{total_images}: "
                    f"{class_name} - Task {img_meta['task_id']} Frame {img_meta['frame_id']}"
                )

                try:
                    image_path = self._download_image(class_name, img_meta, cache_dir)
                    self._extract_masks(class_name, img_meta, image_path)
                    downloaded_paths[class_name].append(image_path)

                except Exception as e:
                    logger.error(
                        f"Failed to process {class_name} image "
                        f"(task={img_meta['task_id']}, frame={img_meta['frame_id']}): {e}"
                    )
                    continue

        # Log final counts
        for class_name, paths in downloaded_paths.items():
            logger.info(f"Successfully processed {len(paths)} images for '{class_name}'")

        return downloaded_paths

    def _download_image(
        self, class_name: str, img_meta: dict[str, Any], cache_dir: Path
    ) -> Path:
        """Download single image.

        Already-downloaded images (present in the cache) are not fetched
        again.

        Args:
            class_name: Class label
            img_meta: Image metadata
            cache_dir: Cache directory

        Returns:
            Path to downloaded image

        Raises:
            ValueError: If download fails
        """
        # Create output directory: <cache>/<class>/<task>_frame_<nnnnnn>/
        image_name = f"{img_meta['task_name']}_frame_{img_meta['frame_id']:06d}"
        output_dir = cache_dir / class_name / image_name
        output_dir.mkdir(parents=True, exist_ok=True)

        image_path = output_dir / "image.jpg"

        # Check cache
        if image_path.exists():
            logger.debug(f"Image already cached: {image_path}")
            return image_path

        # Download from CVAT
        if not self.client:
            raise ValueError("Client not initialized")

        try:
            image_data = self.client.tasks.get_frame(
                img_meta["task_id"], img_meta["frame_id"]
            )

            with open(image_path, "wb") as f:
                f.write(image_data)

            logger.debug(f"Downloaded image to {image_path}")
            return image_path

        except Exception as e:
            raise ValueError(
                f"Failed to download image from task {img_meta['task_id']} "
                f"frame {img_meta['frame_id']}: {e}"
            ) from e

    def _extract_masks(
        self, class_name: str, img_meta: dict[str, Any], image_path: Path
    ) -> None:
        """Extract ground truth masks for image.

        Writes one PNG per mask shape plus a metadata.json index into a
        'ground_truth' sibling directory of the image. A pre-existing
        metadata.json marks the extraction as already done.

        Args:
            class_name: Class label
            img_meta: Image metadata
            image_path: Path to image file

        Raises:
            ValueError: If mask extraction fails
        """
        output_dir = image_path.parent / "ground_truth"
        output_dir.mkdir(parents=True, exist_ok=True)

        # Check if already extracted
        metadata_path = output_dir / "metadata.json"
        if metadata_path.exists():
            logger.debug(f"Masks already extracted: {output_dir}")
            return

        if not self.client:
            raise ValueError("Client not initialized")

        # Get annotations for this job
        try:
            annotations = self.client.annotations.get_job_annotations(
                img_meta["job_id"]
            )

            if not annotations or not hasattr(annotations, 'shapes'):
                raise ValueError("No annotations found")

            # Filter masks for this frame
            frame_masks = [
                shape
                for shape in annotations.shapes
                if shape.frame == img_meta["frame_id"]
                and hasattr(shape, 'type')
                and shape.type == 'mask'
            ]

            if not frame_masks:
                logger.warning(
                    f"No mask annotations found for frame {img_meta['frame_id']}"
                )
                # Create empty metadata so the cache check skips next time
                with open(metadata_path, "w") as f:
                    json.dump({"masks": []}, f, indent=2)
                return

            # Get image dimensions
            from PIL import Image
            with Image.open(image_path) as img:
                width, height = img.size

            # Extract each mask
            mask_metadata = []
            label_counts: dict[str, int] = {}

            for shape in frame_masks:
                label = shape.label_name
                # Per-label instance counter for unique filenames
                instance_idx = label_counts.get(label, 0)
                label_counts[label] = instance_idx + 1

                # Convert CVAT RLE to mask
                if not hasattr(shape, 'points') or not shape.points:
                    logger.warning(
                        f"Shape for label {label} missing points data, skipping"
                    )
                    continue

                mask_filename = f"mask_{label}_{instance_idx}.png"
                mask_path = output_dir / mask_filename

                try:
                    # from_cvat_api_rle receives the destination path; the
                    # returned Mask object is not used here (presumably the
                    # PNG is written as a side effect — TODO confirm against
                    # the Mask implementation).
                    Mask.from_cvat_api_rle(
                        cvat_rle=shape.points,
                        width=width,
                        height=height,
                        file_path=str(mask_path),
                    )

                    mask_metadata.append({
                        "filename": mask_filename,
                        "label": label,
                        "instance_idx": instance_idx,
                    })

                    logger.debug(f"Extracted mask: {mask_filename}")

                except Exception as e:
                    logger.error(f"Failed to convert mask for label {label}: {e}")
                    continue

            # Save metadata
            with open(metadata_path, "w") as f:
                json.dump(
                    {
                        "image": str(image_path.name),
                        "width": width,
                        "height": height,
                        "masks": mask_metadata,
                    },
                    f,
                    indent=2,
                )

            logger.info(f"Extracted {len(mask_metadata)} masks to {output_dir}")

        except Exception as e:
            raise ValueError(f"Failed to extract masks: {e}") from e

    def run_extraction(self) -> dict[str, list[Path]]:
        """Run complete extraction pipeline.

        Returns:
            Dict mapping class names to lists of image paths

        Raises:
            Exception: If any critical step fails
        """
        logger.info("Starting CVAT extraction pipeline")

        # Connect
        self.connect()

        # Find project
        self.find_training_project()

        # Discover images
        class_images = self.discover_images()

        # Sample
        sampled = self.sample_images(class_images)

        # Download and extract
        paths = self.download_images_and_masks(sampled)

        logger.info("CVAT extraction complete")
        return paths
metrics_evaluation/inference/sam3_inference.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SAM3 inference for evaluation."""
2
+
3
+ import base64
4
+ import json
5
+ import logging
6
+ import time
7
+ from pathlib import Path
8
+
9
+ import requests
10
+ from PIL import Image
11
+
12
+ from ..config.config_models import EvaluationConfig
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class SAM3Inferencer:
    """Run SAM3 inference on images."""

    def __init__(self, config: EvaluationConfig):
        """Initialize inferencer with configuration.

        Args:
            config: Evaluation configuration
        """
        self.config = config
        self.endpoint = config.sam3.endpoint
        self.timeout = config.sam3.timeout
        self.retry_attempts = config.sam3.retry_attempts

    def infer_single_image(self, image_path: Path, classes: list[str]) -> list[dict]:
        """Run SAM3 inference on single image.

        Retries with exponential backoff (1s, 2s, 4s, ...) on HTTP errors,
        timeouts, and transport failures.

        Args:
            image_path: Path to image file
            classes: List of class names to detect

        Returns:
            List of detection results with masks

        Raises:
            ValueError: If inference fails after retries
        """
        # Load and encode image as base64 for the JSON payload
        with open(image_path, "rb") as f:
            image_data = f.read()

        image_b64 = base64.b64encode(image_data).decode()

        # Prepare request
        payload = {
            "inputs": image_b64,
            "parameters": {"classes": classes}
        }

        # Retry loop; last_error keeps the most recent failure for the
        # final exception message.
        last_error = None
        for attempt in range(self.retry_attempts):
            try:
                start_time = time.time()

                response = requests.post(
                    self.endpoint,
                    json=payload,
                    timeout=self.timeout
                )

                elapsed = time.time() - start_time

                if response.status_code == 200:
                    results = response.json()
                    logger.debug(
                        f"Inference successful for {image_path.name} "
                        f"({elapsed:.2f}s, {len(results)} detections)"
                    )
                    return results

                else:
                    last_error = f"HTTP {response.status_code}: {response.text}"
                    logger.warning(f"Inference failed (attempt {attempt + 1}): {last_error}")

            except requests.Timeout:
                last_error = f"Request timeout after {self.timeout}s"
                logger.warning(f"Inference timeout (attempt {attempt + 1})")

            except Exception as e:
                last_error = str(e)
                logger.warning(f"Inference error (attempt {attempt + 1}): {e}")

            # Exponential backoff before the next attempt (not after the last)
            if attempt < self.retry_attempts - 1:
                sleep_time = 2 ** attempt
                logger.debug(f"Retrying in {sleep_time}s...")
                time.sleep(sleep_time)

        raise ValueError(
            f"Inference failed after {self.retry_attempts} attempts for {image_path}: {last_error}"
        )

    def save_inference_results(
        self,
        results: list[dict],
        output_dir: Path,
        image_width: int,
        image_height: int
    ) -> None:
        """Save inference results as masks.

        Each detection's base64-encoded mask is decoded, converted to
        grayscale, resized to the image dimensions if needed, and written
        as a PNG alongside a metadata.json index.

        Args:
            results: SAM3 detection results
            output_dir: Directory to save masks
            image_width: Image width
            image_height: Image height

        Raises:
            ValueError: If mask conversion fails
        """
        # BUGFIX: `io` was previously imported only inside
        # run_inference_batch(), a function-local import that is invisible
        # here — io.BytesIO below raised NameError at runtime.
        import io

        output_dir.mkdir(parents=True, exist_ok=True)

        mask_metadata = []
        label_counts: dict[str, int] = {}

        for result in results:
            label = result.get("label", "unknown")
            score = result.get("score", 0.0)
            mask_b64 = result.get("mask")

            if not mask_b64:
                logger.warning(f"Result missing mask data for label {label}")
                continue

            # Per-label instance counter for unique filenames
            if label not in label_counts:
                label_counts[label] = 0

            instance_idx = label_counts[label]
            label_counts[label] += 1

            # Decode mask
            try:
                mask_data = base64.b64decode(mask_b64)
                mask_img = Image.open(io.BytesIO(mask_data))

                # Convert to L mode (grayscale) if needed
                if mask_img.mode != 'L':
                    mask_img = mask_img.convert('L')

                # Validate dimensions
                if mask_img.size != (image_width, image_height):
                    logger.warning(
                        f"Mask dimension mismatch: expected {image_width}x{image_height}, "
                        f"got {mask_img.size}. Resizing."
                    )
                    # NEAREST keeps the mask binary (no interpolated values)
                    mask_img = mask_img.resize((image_width, image_height), Image.NEAREST)

                # Save as PNG
                mask_filename = f"mask_{label}_{instance_idx}.png"
                mask_path = output_dir / mask_filename
                mask_img.save(mask_path)

                mask_metadata.append({
                    "filename": mask_filename,
                    "label": label,
                    "instance_idx": instance_idx,
                    "score": score,
                })

                logger.debug(f"Saved inference mask: {mask_filename}")

            except Exception as e:
                logger.error(f"Failed to save mask for label {label}: {e}")
                continue

        # Save metadata
        metadata_path = output_dir / "metadata.json"
        with open(metadata_path, "w") as f:
            json.dump(
                {
                    "width": image_width,
                    "height": image_height,
                    "masks": mask_metadata,
                },
                f,
                indent=2,
            )

        logger.info(f"Saved {len(mask_metadata)} inference masks to {output_dir}")

    def run_inference_batch(
        self, image_paths: dict[str, list[Path]], force: bool = False
    ) -> dict[str, int]:
        """Run inference on batch of images.

        Results are written next to each image under an 'inference'
        directory; images with an existing metadata.json are skipped
        unless `force` is set.

        Args:
            image_paths: Dict mapping class names to image paths
            force: Force re-inference even if results exist

        Returns:
            Dict with inference statistics

        Raises:
            ValueError: If no images provided
        """
        total_images = sum(len(paths) for paths in image_paths.values())
        if total_images == 0:
            raise ValueError("No images provided for inference")

        logger.info(f"Starting inference on {total_images} images")

        stats = {
            "total": total_images,
            "successful": 0,
            "failed": 0,
            "skipped": 0,
        }

        processed = 0

        for class_name, paths in image_paths.items():
            for image_path in paths:
                processed += 1
                logger.info(
                    f"Inference {processed}/{total_images}: {image_path.parent.name}"
                )

                inference_dir = image_path.parent / "inference"
                metadata_path = inference_dir / "metadata.json"

                # Check cache
                if not force and metadata_path.exists():
                    logger.debug(f"Inference results already exist: {inference_dir}")
                    stats["skipped"] += 1
                    continue

                try:
                    # Get image dimensions
                    with Image.open(image_path) as img:
                        width, height = img.size

                    # Run inference
                    results = self.infer_single_image(
                        image_path,
                        list(self.config.classes.keys())
                    )

                    # Save results
                    self.save_inference_results(
                        results, inference_dir, width, height
                    )

                    stats["successful"] += 1

                except Exception as e:
                    logger.error(f"Inference failed for {image_path}: {e}")
                    stats["failed"] += 1
                    continue

        logger.info(
            f"Inference complete: {stats['successful']} successful, "
            f"{stats['failed']} failed, {stats['skipped']} skipped"
        )

        return stats
metrics_evaluation/metrics/metrics_calculator.py ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Metrics calculation for SAM3 evaluation."""
2
+
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+ from PIL import Image
10
+ from scipy.optimize import linear_sum_assignment
11
+
12
+ from ..config.config_models import EvaluationConfig
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class MetricsCalculator:
    """Calculate segmentation metrics."""

    def __init__(self, config: EvaluationConfig):
        """Initialize calculator with configuration.

        Args:
            config: Evaluation configuration
        """
        self.config = config
        self.iou_thresholds = config.metrics.iou_thresholds

    def calculate_iou(self, mask1: np.ndarray, mask2: np.ndarray) -> float:
        """Calculate IoU between two binary masks.

        Args:
            mask1: First binary mask
            mask2: Second binary mask

        Returns:
            IoU score between 0 and 1 (0.0 when both masks are empty)
        """
        intersection = np.logical_and(mask1, mask2).sum()
        union = np.logical_or(mask1, mask2).sum()

        # Two empty masks have an undefined ratio; treat as no overlap.
        if union == 0:
            return 0.0

        return float(intersection / union)

    def match_instances(
        self,
        gt_masks: list[np.ndarray],
        pred_masks: list[np.ndarray],
        iou_threshold: float
    ) -> dict[str, Any]:
        """Match predicted instances to ground truth.

        Uses the Hungarian algorithm for an IoU-optimal one-to-one
        assignment, then discards pairs below the threshold.

        Args:
            gt_masks: List of ground truth masks
            pred_masks: List of predicted masks
            iou_threshold: Minimum IoU for match

        Returns:
            Dict with matching results (matches, unmatched indices, TP/FP/FN)
        """
        # Degenerate cases: one or both sides empty
        if len(gt_masks) == 0 and len(pred_masks) == 0:
            return {
                "matches": [],
                "unmatched_gt": [],
                "unmatched_pred": [],
                "true_positives": 0,
                "false_positives": 0,
                "false_negatives": 0,
            }

        if len(gt_masks) == 0:
            return {
                "matches": [],
                "unmatched_gt": [],
                "unmatched_pred": list(range(len(pred_masks))),
                "true_positives": 0,
                "false_positives": len(pred_masks),
                "false_negatives": 0,
            }

        if len(pred_masks) == 0:
            return {
                "matches": [],
                "unmatched_gt": list(range(len(gt_masks))),
                "unmatched_pred": [],
                "true_positives": 0,
                "false_positives": 0,
                "false_negatives": len(gt_masks),
            }

        # Compute pairwise IoU matrix (gt x pred)
        iou_matrix = np.zeros((len(gt_masks), len(pred_masks)))

        for i, gt_mask in enumerate(gt_masks):
            for j, pred_mask in enumerate(pred_masks):
                iou_matrix[i, j] = self.calculate_iou(gt_mask, pred_mask)

        # Hungarian algorithm maximizes total IoU (negated for the
        # cost-minimizing solver)
        gt_indices, pred_indices = linear_sum_assignment(-iou_matrix)

        # Filter matches by threshold
        matches = []
        for gt_idx, pred_idx in zip(gt_indices, pred_indices):
            iou = iou_matrix[gt_idx, pred_idx]
            if iou >= iou_threshold:
                matches.append({
                    "gt_idx": int(gt_idx),
                    "pred_idx": int(pred_idx),
                    "iou": float(iou),
                })

        matched_gt = {m["gt_idx"] for m in matches}
        matched_pred = {m["pred_idx"] for m in matches}

        unmatched_gt = [i for i in range(len(gt_masks)) if i not in matched_gt]
        unmatched_pred = [i for i in range(len(pred_masks)) if i not in matched_pred]

        return {
            "matches": matches,
            "unmatched_gt": unmatched_gt,
            "unmatched_pred": unmatched_pred,
            "true_positives": len(matches),
            "false_positives": len(unmatched_pred),
            "false_negatives": len(unmatched_gt),
        }

    def calculate_image_metrics(
        self, image_dir: Path
    ) -> dict[str, Any] | None:
        """Calculate metrics for single image.

        Args:
            image_dir: Directory containing ground_truth and inference subdirs

        Returns:
            Dict with metrics for this image, or None when inference
            results are missing for the image

        Raises:
            ValueError: If ground truth metadata is missing
        """
        gt_dir = image_dir / "ground_truth"
        inf_dir = image_dir / "inference"

        # Load metadata
        gt_meta_path = gt_dir / "metadata.json"
        inf_meta_path = inf_dir / "metadata.json"

        if not gt_meta_path.exists():
            raise ValueError(f"Ground truth metadata not found: {gt_meta_path}")

        if not inf_meta_path.exists():
            logger.warning(f"Inference metadata not found: {inf_meta_path}")
            return None

        with open(gt_meta_path) as f:
            gt_meta = json.load(f)

        with open(inf_meta_path) as f:
            inf_meta = json.load(f)

        # Group masks by label
        gt_by_label: dict[str, list[np.ndarray]] = {}
        inf_by_label: dict[str, list[np.ndarray]] = {}

        # Load ground truth masks
        for mask_info in gt_meta.get("masks", []):
            label = mask_info["label"]
            mask_path = gt_dir / mask_info["filename"]

            if not mask_path.exists():
                logger.warning(f"Ground truth mask not found: {mask_path}")
                continue

            # Context manager closes the file handle (Image.open is lazy)
            with Image.open(mask_path) as mask_img:
                mask_array = np.array(mask_img) > 0  # Binarize

            gt_by_label.setdefault(label, []).append(mask_array)

        # Load inference masks
        for mask_info in inf_meta.get("masks", []):
            label = mask_info["label"]
            mask_path = inf_dir / mask_info["filename"]

            if not mask_path.exists():
                logger.warning(f"Inference mask not found: {mask_path}")
                continue

            with Image.open(mask_path) as mask_img:
                mask_array = np.array(mask_img) > 0  # Binarize

            inf_by_label.setdefault(label, []).append(mask_array)

        # Calculate metrics at each IoU threshold
        results = {
            "image_name": image_dir.name,
            "class": image_dir.parent.name,
            "by_threshold": {},
        }

        all_labels = set(gt_by_label.keys()) | set(inf_by_label.keys())

        for threshold in self.iou_thresholds:
            threshold_results = {
                "iou_threshold": threshold,
                "by_label": {},
                "total": {
                    "true_positives": 0,
                    "false_positives": 0,
                    "false_negatives": 0,
                },
            }

            for label in all_labels:
                gt_masks = gt_by_label.get(label, [])
                pred_masks = inf_by_label.get(label, [])

                matching = self.match_instances(gt_masks, pred_masks, threshold)

                threshold_results["by_label"][label] = {
                    "gt_count": len(gt_masks),
                    "pred_count": len(pred_masks),
                    "true_positives": matching["true_positives"],
                    "false_positives": matching["false_positives"],
                    "false_negatives": matching["false_negatives"],
                    "matches": matching["matches"],
                }

                # Add to totals
                threshold_results["total"]["true_positives"] += matching["true_positives"]
                threshold_results["total"]["false_positives"] += matching["false_positives"]
                threshold_results["total"]["false_negatives"] += matching["false_negatives"]

            results["by_threshold"][str(threshold)] = threshold_results

        return results

    def calculate_aggregate_metrics(
        self, image_results: list[dict[str, Any]]
    ) -> dict[str, Any]:
        """Calculate aggregate metrics across all images.

        Args:
            image_results: List of per-image metrics

        Returns:
            Dict with aggregate metrics (per-label and overall
            precision/recall/F1 for each IoU threshold)
        """
        aggregate = {
            "total_images": len(image_results),
            "by_threshold": {},
        }

        for threshold in self.iou_thresholds:
            threshold_str = str(threshold)

            # Aggregate counts
            label_stats: dict[str, dict] = {}
            total_tp = 0
            total_fp = 0
            total_fn = 0

            for img_result in image_results:
                threshold_data = img_result["by_threshold"][threshold_str]

                for label, label_data in threshold_data["by_label"].items():
                    if label not in label_stats:
                        label_stats[label] = {
                            "tp": 0,
                            "fp": 0,
                            "fn": 0,
                            "gt_total": 0,
                            "pred_total": 0,
                        }

                    label_stats[label]["tp"] += label_data["true_positives"]
                    label_stats[label]["fp"] += label_data["false_positives"]
                    label_stats[label]["fn"] += label_data["false_negatives"]
                    label_stats[label]["gt_total"] += label_data["gt_count"]
                    label_stats[label]["pred_total"] += label_data["pred_count"]

                total_tp += threshold_data["total"]["true_positives"]
                total_fp += threshold_data["total"]["false_positives"]
                total_fn += threshold_data["total"]["false_negatives"]

            # Calculate precision, recall, F1 per label (0.0 on empty denominators)
            for label, stats in label_stats.items():
                precision = stats["tp"] / (stats["tp"] + stats["fp"]) if (stats["tp"] + stats["fp"]) > 0 else 0.0
                recall = stats["tp"] / (stats["tp"] + stats["fn"]) if (stats["tp"] + stats["fn"]) > 0 else 0.0
                f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

                stats["precision"] = precision
                stats["recall"] = recall
                stats["f1"] = f1

            # Calculate overall metrics
            overall_precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
            overall_recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
            overall_f1 = 2 * overall_precision * overall_recall / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0.0

            # Build confusion matrix
            confusion_matrix = self._build_confusion_matrix(image_results, threshold_str)

            aggregate["by_threshold"][threshold_str] = {
                "iou_threshold": threshold,
                "by_label": label_stats,
                "overall": {
                    "true_positives": total_tp,
                    "false_positives": total_fp,
                    "false_negatives": total_fn,
                    "precision": overall_precision,
                    "recall": overall_recall,
                    "f1": overall_f1,
                    # NOTE: these are single-threshold proxies, not a true
                    # COCO-style mAP/mAR averaged over thresholds/recalls.
                    "map": overall_precision,  # Simplified mAP
                    "mar": overall_recall,  # Simplified mAR
                },
                "confusion_matrix": confusion_matrix,
            }

        return aggregate

    def _build_confusion_matrix(
        self, image_results: list[dict[str, Any]], threshold_str: str
    ) -> dict[str, Any]:
        """Build confusion matrix for given threshold.

        Simplified: only true positives are counted (on the diagonal);
        cross-class confusions are not tracked.

        Args:
            image_results: List of per-image metrics
            threshold_str: IoU threshold as string

        Returns:
            Confusion matrix data (sorted label list plus nested int lists)
        """
        # Collect all labels seen at this threshold
        all_labels = set()
        for img_result in image_results:
            threshold_data = img_result["by_threshold"][threshold_str]
            all_labels.update(threshold_data["by_label"].keys())

        labels = sorted(all_labels)
        n_labels = len(labels)

        # Initialize matrix
        matrix = np.zeros((n_labels, n_labels), dtype=int)

        # Fill matrix (simplified: just count matches)
        for img_result in image_results:
            threshold_data = img_result["by_threshold"][threshold_str]

            for i, gt_label in enumerate(labels):
                if gt_label not in threshold_data["by_label"]:
                    continue

                label_data = threshold_data["by_label"][gt_label]

                # True positives go on diagonal
                matrix[i, i] += label_data["true_positives"]

                # False negatives (missed) - simplified representation
                # In a full implementation, we'd track which class they were predicted as

        return {
            "labels": labels,
            "matrix": matrix.tolist(),
        }

    def run_evaluation(self, cache_dir: Path) -> dict[str, Any]:
        """Run complete metrics evaluation.

        Args:
            cache_dir: Cache directory with ground truth and inference results

        Returns:
            Complete metrics results

        Raises:
            ValueError: If cache directory is invalid or no image yields metrics
        """
        if not cache_dir.exists():
            raise ValueError(f"Cache directory not found: {cache_dir}")

        logger.info(f"Calculating metrics from {cache_dir}")

        # Find all image directories: <cache>/<class>/<image>/
        image_results = []

        for class_dir in cache_dir.iterdir():
            if not class_dir.is_dir():
                continue

            for image_dir in class_dir.iterdir():
                if not image_dir.is_dir():
                    continue

                try:
                    metrics = self.calculate_image_metrics(image_dir)
                    if metrics:
                        image_results.append(metrics)
                except Exception as e:
                    logger.error(f"Failed to calculate metrics for {image_dir}: {e}")
                    continue

        if not image_results:
            raise ValueError("No valid image results found for metrics calculation")

        logger.info(f"Calculated metrics for {len(image_results)} images")

        # Calculate aggregate
        aggregate = self.calculate_aggregate_metrics(image_results)

        return {
            "per_image": image_results,
            "aggregate": aggregate,
        }
metrics_evaluation/run_evaluation.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Main execution script for SAM3 metrics evaluation."""
3
+
4
+ import argparse
5
+ import json
6
+ import logging
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ from config.config_loader import load_config
11
+ from extraction.cvat_extractor import CVATExtractor
12
+ from inference.sam3_inference import SAM3Inferencer
13
+ from metrics.metrics_calculator import MetricsCalculator
14
+ from utils.logging_config import setup_logging
15
+ from visualization.visual_comparison import VisualComparator
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def write_metrics_summary(metrics: dict, output_path: Path) -> None:
    """Write human-readable metrics summary.

    Renders, for each configured IoU threshold: the overall counts and
    scores, a per-class table, and a confusion matrix (when labels exist).

    Args:
        metrics: Metrics dictionary
        output_path: Path to output file
    """
    bar = "=" * 80
    dash = "-" * 80
    aggregate = metrics["aggregate"]

    with open(output_path, "w") as out:
        out.write(bar + "\n")
        out.write("SAM3 EVALUATION METRICS SUMMARY\n")
        out.write(bar + "\n\n")
        out.write(f"Total Images Evaluated: {aggregate['total_images']}\n\n")

        # One section per IoU threshold (the dict key itself is not needed).
        for thr_data in aggregate["by_threshold"].values():
            iou = thr_data["iou_threshold"]
            out.write(f"\n{bar}\n")
            out.write(f"IoU Threshold: {iou:.0%}\n")
            out.write(f"{bar}\n\n")

            overall = thr_data["overall"]

            out.write("Overall Metrics:\n")
            for title, key in (
                ("True Positives", "true_positives"),
                ("False Positives", "false_positives"),
                ("False Negatives", "false_negatives"),
            ):
                out.write(f"  {title}: {overall[key]}\n")
            for title, key in (
                ("Precision", "precision"),
                ("Recall", "recall"),
                ("F1-Score", "f1"),
                ("mAP", "map"),
            ):
                out.write(f"  {title}: {overall[key]:.2%}\n")
            out.write(f"  mAR: {overall['mar']:.2%}\n\n")

            out.write("Per-Class Metrics:\n")
            out.write(dash + "\n")
            out.write(
                f"{'Class':<20} {'GT':>6} {'Pred':>6} {'TP':>6} {'FP':>6} "
                f"{'FN':>6} {'Prec':>8} {'Rec':>8} {'F1':>8}\n"
            )
            out.write(dash + "\n")

            # Sort classes alphabetically so the table is stable run-to-run.
            for name, stats in sorted(thr_data["by_label"].items()):
                columns = " ".join(
                    [
                        f"{name:<20}",
                        f"{stats['gt_total']:>6}",
                        f"{stats['pred_total']:>6}",
                        f"{stats['tp']:>6}",
                        f"{stats['fp']:>6}",
                        f"{stats['fn']:>6}",
                        f"{stats['precision']:>8.2%}",
                        f"{stats['recall']:>8.2%}",
                        f"{stats['f1']:>8.2%}",
                    ]
                )
                out.write(columns + "\n")

            out.write("\n")

            # Confusion Matrix
            cm = thr_data["confusion_matrix"]
            cm_labels = cm["labels"]
            cm_matrix = cm["matrix"]

            if cm_labels:
                out.write("Confusion Matrix:\n")
                out.write(dash + "\n")

                # Header row: label names truncated to 10 characters.
                header = "Actual \\ Pred |" + "".join(
                    f" {lbl[:10]:>10} |" for lbl in cm_labels
                )
                out.write(header + "\n")
                out.write("-" * len(header) + "\n")

                # One row per actual class.
                for actual, row_vals in zip(cm_labels, cm_matrix):
                    cells = "".join(
                        f" {row_vals[j]:>10} |" for j in range(len(cm_labels))
                    )
                    out.write(f"{actual[:13]:>13} |" + cells + "\n")

                out.write("\n")

        out.write(bar + "\n")
        out.write("END OF REPORT\n")
        out.write(bar + "\n")

    logger.info(f"Wrote metrics summary to {output_path}")
104
+
105
+
106
def main() -> int:
    """Main execution function.

    Pipeline: (1) extract annotated images from CVAT, (2) run SAM3
    inference, (3) compute metrics, (4) optionally render visual
    comparisons. Each phase aborts the run with a non-zero exit code
    if it produces nothing usable.

    Returns:
        Exit code (0 for success, non-zero for failure)
    """
    parser = argparse.ArgumentParser(
        description="Run SAM3 metrics evaluation against CVAT ground truth"
    )
    parser.add_argument(
        "--config",
        type=str,
        default="config/config.json",
        help="Path to configuration file"
    )
    parser.add_argument(
        "--force-download",
        action="store_true",
        help="Force re-download images from CVAT"
    )
    parser.add_argument(
        "--force-inference",
        action="store_true",
        help="Force re-run SAM3 inference"
    )
    parser.add_argument(
        "--skip-inference",
        action="store_true",
        help="Skip inference, use cached results"
    )
    parser.add_argument(
        "--visualize",
        action="store_true",
        help="Generate visual comparisons"
    )
    parser.add_argument(
        "--log-level",
        type=str,
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging level"
    )

    args = parser.parse_args()

    # Load configuration before logging is set up, so failures go to stderr.
    try:
        config = load_config(args.config)
    except Exception as e:
        print(f"ERROR: Failed to load configuration: {e}", file=sys.stderr)
        return 1

    # Setup logging — the log file lives inside the evaluation cache.
    cache_dir = config.get_cache_path()
    log_file = cache_dir / "evaluation_log.txt"
    # Safe getattr: --log-level choices are restricted to valid level names.
    setup_logging(log_file, getattr(logging, args.log_level))

    logger.info("=" * 80)
    logger.info("SAM3 METRICS EVALUATION")
    logger.info("=" * 80)

    try:
        # Phase 1: Extract from CVAT
        logger.info("\n" + "=" * 80)
        logger.info("PHASE 1: CVAT Data Extraction")
        logger.info("=" * 80)

        extractor = CVATExtractor(config)

        if args.force_download:
            # NOTE(review): the flag is only logged here — run_extraction()
            # is called without it. Confirm --force-download is honored
            # (e.g. read from config or passed as an argument).
            logger.info("Force download enabled - will re-download all images")

        image_paths = extractor.run_extraction()

        # image_paths maps class name -> list of extracted image paths.
        total_extracted = sum(len(paths) for paths in image_paths.values())
        logger.info(f"Extraction complete: {total_extracted} images extracted")

        if total_extracted == 0:
            logger.error("No images extracted. Aborting.")
            return 1

        # Phase 2: Run SAM3 Inference
        if not args.skip_inference:
            logger.info("\n" + "=" * 80)
            logger.info("PHASE 2: SAM3 Inference")
            logger.info("=" * 80)

            inferencer = SAM3Inferencer(config)
            stats = inferencer.run_inference_batch(image_paths, args.force_inference)

            logger.info(
                f"Inference complete: {stats['successful']} successful, "
                f"{stats['failed']} failed, {stats['skipped']} skipped"
            )

            # Skipped inferences count as usable cached results.
            if stats['successful'] == 0 and stats['skipped'] == 0:
                logger.error("No successful inferences. Aborting.")
                return 1
        else:
            logger.info("Skipping inference (--skip-inference)")

        # Phase 3: Calculate Metrics
        logger.info("\n" + "=" * 80)
        logger.info("PHASE 3: Metrics Calculation")
        logger.info("=" * 80)

        calculator = MetricsCalculator(config)
        metrics = calculator.run_evaluation(cache_dir)

        # Save detailed metrics (machine-readable JSON)
        metrics_json_path = cache_dir / "metrics_detailed.json"
        with open(metrics_json_path, "w") as f:
            json.dump(metrics, f, indent=2)
        logger.info(f"Saved detailed metrics to {metrics_json_path}")

        # Write summary (human-readable text report)
        metrics_summary_path = cache_dir / "metrics_summary.txt"
        write_metrics_summary(metrics, metrics_summary_path)

        # Phase 4: Visualization (optional; CLI flag OR config can enable it)
        if args.visualize or config.output.generate_visualizations:
            logger.info("\n" + "=" * 80)
            logger.info("PHASE 4: Visual Comparisons")
            logger.info("=" * 80)

            comparator = VisualComparator()
            comparison_paths = comparator.generate_all_comparisons(cache_dir)
            logger.info(f"Generated {len(comparison_paths)} visual comparisons")

        # Summary
        logger.info("\n" + "=" * 80)
        logger.info("EVALUATION COMPLETE")
        logger.info("=" * 80)

        aggregate = metrics["aggregate"]
        logger.info(f"Images evaluated: {aggregate['total_images']}")

        # Show metrics at 50% IoU
        # NOTE(review): assumes "0.5" is always among the configured IoU
        # thresholds — a KeyError here would be swallowed by the generic
        # except below and reported as a failure. TODO confirm.
        threshold_50 = aggregate["by_threshold"]["0.5"]
        overall = threshold_50["overall"]

        logger.info(f"\nMetrics at 50% IoU:")
        logger.info(f"  Precision: {overall['precision']:.2%}")
        logger.info(f"  Recall: {overall['recall']:.2%}")
        logger.info(f"  F1-Score: {overall['f1']:.2%}")
        logger.info(f"  mAP: {overall['map']:.2%}")
        logger.info(f"  mAR: {overall['mar']:.2%}")

        logger.info(f"\nResults saved to:")
        logger.info(f"  Metrics Summary: {metrics_summary_path}")
        logger.info(f"  Detailed JSON: {metrics_json_path}")
        logger.info(f"  Execution Log: {log_file}")

        return 0

    except KeyboardInterrupt:
        # 130 = conventional exit code for SIGINT (128 + 2).
        logger.warning("\nEvaluation interrupted by user")
        return 130

    except Exception as e:
        # Top-level boundary: log full traceback and fail the process.
        logger.error(f"\nEvaluation failed with error: {e}", exc_info=True)
        return 1


if __name__ == "__main__":
    sys.exit(main())
metrics_evaluation/utils/__init__.py ADDED
File without changes
metrics_evaluation/utils/logging_config.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Logging configuration for evaluation."""
2
+
3
+ import logging
4
+ import sys
5
+ from pathlib import Path
6
+
7
+
8
+ def setup_logging(log_file: Path | None = None, level: int = logging.INFO) -> None:
9
+ """Set up logging configuration.
10
+
11
+ Args:
12
+ log_file: Optional path to log file
13
+ level: Logging level
14
+ """
15
+ # Create formatters
16
+ detailed_formatter = logging.Formatter(
17
+ "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
18
+ datefmt="%Y-%m-%d %H:%M:%S"
19
+ )
20
+
21
+ simple_formatter = logging.Formatter(
22
+ "%(levelname)s: %(message)s"
23
+ )
24
+
25
+ # Get root logger
26
+ logger = logging.getLogger()
27
+ logger.setLevel(level)
28
+
29
+ # Remove existing handlers
30
+ logger.handlers.clear()
31
+
32
+ # Console handler
33
+ console_handler = logging.StreamHandler(sys.stdout)
34
+ console_handler.setLevel(level)
35
+ console_handler.setFormatter(simple_formatter)
36
+ logger.addHandler(console_handler)
37
+
38
+ # File handler
39
+ if log_file:
40
+ log_file.parent.mkdir(parents=True, exist_ok=True)
41
+ file_handler = logging.FileHandler(log_file, mode="w")
42
+ file_handler.setLevel(logging.DEBUG)
43
+ file_handler.setFormatter(detailed_formatter)
44
+ logger.addHandler(file_handler)
metrics_evaluation/visualization/visual_comparison.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Visual comparison generation for evaluation."""
2
+
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ from PIL import Image, ImageDraw, ImageFont
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class VisualComparator:
    """Generate visual comparisons between ground truth and predictions.

    Overlays ground-truth masks (green) and predicted masks (red) on the
    source image and appends a color legend. Works directly off the
    evaluation cache layout: ``<cache>/<class>/<image>/{image.jpg,
    ground_truth/, inference/}``.
    """

    def __init__(self):
        """Initialize comparator."""
        # RGBA overlay colors; the 4th component (128) suggests intended
        # 50% transparency. Only "ground_truth" and "prediction" are used
        # by this class — the TP/FP/FN entries are currently unreferenced.
        self.colors = {
            "ground_truth": (0, 255, 0, 128),  # Green
            "prediction": (255, 0, 0, 128),  # Red
            "true_positive": (255, 255, 0, 128),  # Yellow
            "false_positive": (255, 0, 0, 128),  # Red
            "false_negative": (0, 0, 255, 128),  # Blue
        }

    def create_comparison(
        self, image_dir: Path, output_path: Path | None = None
    ) -> Path:
        """Create visual comparison for image.

        Composites all ground-truth masks, then all prediction masks, over
        the original image, adds a legend strip, and saves as PNG.

        Args:
            image_dir: Directory containing image and masks
            output_path: Optional output path (default: image_dir/comparison.png)

        Returns:
            Path to generated comparison image

        Raises:
            ValueError: If required files are missing
        """
        # Load original image (RGBA so alpha compositing works)
        image_path = image_dir / "image.jpg"
        if not image_path.exists():
            raise ValueError(f"Image not found: {image_path}")

        original = Image.open(image_path).convert("RGBA")
        width, height = original.size

        # Create fully-transparent overlays to accumulate masks into
        gt_overlay = Image.new("RGBA", (width, height), (0, 0, 0, 0))
        pred_overlay = Image.new("RGBA", (width, height), (0, 0, 0, 0))

        # Load ground truth masks, listed in ground_truth/metadata.json;
        # missing metadata or mask files are silently skipped (best effort).
        gt_dir = image_dir / "ground_truth"
        if gt_dir.exists():
            gt_meta_path = gt_dir / "metadata.json"
            if gt_meta_path.exists():
                with open(gt_meta_path) as f:
                    gt_meta = json.load(f)

                for mask_info in gt_meta.get("masks", []):
                    mask_path = gt_dir / mask_info["filename"]
                    if not mask_path.exists():
                        continue

                    # NOTE(review): putalpha() *replaces* the overlay's alpha
                    # channel with the mask values — if masks are binary
                    # 0/255 the overlay is fully opaque where set, and the
                    # 128 alpha from self.colors is discarded. Also assumes
                    # mask size matches the image size. TODO confirm intended.
                    mask = Image.open(mask_path).convert("L")
                    colored_mask = Image.new("RGBA", (width, height), self.colors["ground_truth"])
                    colored_mask.putalpha(mask)
                    gt_overlay = Image.alpha_composite(gt_overlay, colored_mask)

        # Load prediction masks from inference/metadata.json (same scheme)
        pred_dir = image_dir / "inference"
        if pred_dir.exists():
            pred_meta_path = pred_dir / "metadata.json"
            if pred_meta_path.exists():
                with open(pred_meta_path) as f:
                    pred_meta = json.load(f)

                for mask_info in pred_meta.get("masks", []):
                    mask_path = pred_dir / mask_info["filename"]
                    if not mask_path.exists():
                        continue

                    mask = Image.open(mask_path).convert("L")
                    colored_mask = Image.new("RGBA", (width, height), self.colors["prediction"])
                    colored_mask.putalpha(mask)
                    pred_overlay = Image.alpha_composite(pred_overlay, colored_mask)

        # Composite images: ground truth first, predictions drawn on top
        result = Image.alpha_composite(original, gt_overlay)
        result = Image.alpha_composite(result, pred_overlay)

        # Add legend
        result = self._add_legend(result)

        # Save (converted to RGB — PNG output drops the alpha channel)
        if output_path is None:
            output_path = image_dir / "comparison.png"

        result.convert("RGB").save(output_path)
        logger.debug(f"Saved comparison to {output_path}")

        return output_path

    def _add_legend(self, image: Image.Image) -> Image.Image:
        """Add color legend to image.

        Extends the canvas downward by a white strip and draws one colored
        swatch + label per overlay category.

        Args:
            image: Input image

        Returns:
            Image with legend (height grows by 60 px)
        """
        # Create legend area below the image
        legend_height = 60
        legend_img = Image.new("RGB", (image.width, image.height + legend_height), (255, 255, 255))
        legend_img.paste(image, (0, 0))

        draw = ImageDraw.Draw(legend_img)

        # Draw legend items starting at the strip's top-left
        x_offset = 10
        y_offset = image.height + 10

        # Alpha is dropped ([:3]) — legend swatches are fully opaque
        items = [
            ("Ground Truth", self.colors["ground_truth"][:3]),
            ("Prediction", self.colors["prediction"][:3]),
        ]

        for label, color in items:
            # Draw color box
            draw.rectangle([x_offset, y_offset, x_offset + 30, y_offset + 30], fill=color)

            # Draw label (default PIL font, black text)
            draw.text((x_offset + 40, y_offset + 5), label, fill=(0, 0, 0))

            # Fixed 200 px spacing between legend entries
            x_offset += 200

        return legend_img

    def generate_all_comparisons(self, cache_dir: Path) -> list[Path]:
        """Generate comparisons for all images in cache.

        Walks ``cache_dir/<class>/<image>/`` and renders one comparison per
        image directory; failures are logged and skipped.

        Args:
            cache_dir: Cache directory

        Returns:
            List of paths to generated comparisons
        """
        comparison_paths = []

        for class_dir in cache_dir.iterdir():
            if not class_dir.is_dir():
                continue

            for image_dir in class_dir.iterdir():
                if not image_dir.is_dir():
                    continue

                # One broken image must not abort the whole batch
                try:
                    comparison_path = self.create_comparison(image_dir)
                    comparison_paths.append(comparison_path)
                except Exception as e:
                    logger.error(f"Failed to create comparison for {image_dir}: {e}")
                    continue

        logger.info(f"Generated {len(comparison_paths)} comparison images")
        return comparison_paths