""" CV Export Utilities Shared helper functions for computer vision export formats (COCO, YOLO, VOC). """ from typing import Dict, List, Tuple, Any, Optional import logging logger = logging.getLogger(__name__) def build_category_mapping(annotations: List[dict], schemas: List[dict]) -> Dict[str, int]: """ Build a mapping from label names to integer category IDs. Extracts labels from image_annotation schemas first (preserving config order), then discovers any additional labels from annotations. Args: annotations: List of annotation records schemas: List of annotation_scheme config dicts Returns: Dict mapping label name -> integer ID (starting from 1 for COCO, 0-indexed for YOLO) """ labels = [] seen = set() # First, collect labels from schema configs (preserves defined order) for schema in schemas: if schema.get("annotation_type") == "image_annotation": for label_def in schema.get("labels", []): name = label_def if isinstance(label_def, str) else label_def.get("name", "") if name and name not in seen: labels.append(name) seen.add(name) # Then discover any labels in annotation data not already in config for ann in annotations: for schema_name, img_annotations in ann.get("image_annotations", {}).items(): if not isinstance(img_annotations, list): continue for obj in img_annotations: label = obj.get("label", "") if label and label not in seen: labels.append(label) seen.add(label) return {name: idx for idx, name in enumerate(labels)} def polygon_to_bbox(points: List[List[float]]) -> Tuple[float, float, float, float]: """ Compute axis-aligned bounding box from a polygon. Args: points: List of [x, y] coordinate pairs Returns: Tuple of (x_min, y_min, width, height) """ if not points: return (0, 0, 0, 0) xs = [p[0] for p in points] ys = [p[1] for p in points] x_min = min(xs) y_min = min(ys) return (x_min, y_min, max(xs) - x_min, max(ys) - y_min) def polygon_area(points: List[List[float]]) -> float: """ Compute the area of a polygon using the shoelace formula. Args: points: List of [x, y] coordinate pairs Returns: Absolute area of the polygon """ n = len(points) if n < 3: return 0.0 area = 0.0 for i in range(n): j = (i + 1) % n area += points[i][0] * points[j][1] area -= points[j][0] * points[i][1] return abs(area) / 2.0 def normalize_bbox(x: float, y: float, w: float, h: float, img_w: float, img_h: float) -> Tuple[float, float, float, float]: """ Normalize bounding box coordinates to [0, 1] range. Args: x, y: Top-left corner coordinates w, h: Width and height img_w, img_h: Image dimensions Returns: Tuple of (center_x, center_y, width, height) normalized to [0, 1] """ if img_w <= 0 or img_h <= 0: return (0, 0, 0, 0) cx = max(0.0, min(1.0, (x + w / 2) / img_w)) cy = max(0.0, min(1.0, (y + h / 2) / img_h)) nw = max(0.0, min(1.0, w / img_w)) nh = max(0.0, min(1.0, h / img_h)) return (cx, cy, nw, nh) def flatten_polygon(points: List[List[float]]) -> List[float]: """ Flatten a list of [x, y] points into a flat coordinate list [x1, y1, x2, y2, ...]. This is the format used by COCO segmentation. Args: points: List of [x, y] coordinate pairs Returns: Flat list of coordinates """ result = [] for p in points: result.extend(p[:2]) return result def extract_image_annotations(annotation: dict) -> List[Tuple[str, List[dict]]]: """ Extract image annotation objects from an annotation record. Args: annotation: Single annotation record with image_annotations field Returns: List of (schema_name, annotation_objects) tuples """ results = [] for schema_name, objects in annotation.get("image_annotations", {}).items(): if isinstance(objects, list) and objects: results.append((schema_name, objects)) return results def get_image_dimensions(item: dict, default_width: int = 0, default_height: int = 0) -> Tuple[int, int]: """ Extract image dimensions from item metadata. Checks common field names for image width/height. Args: item: Item data dict default_width: Fallback width default_height: Fallback height Returns: Tuple of (width, height) """ # Check common field patterns width = default_width for w_key in ("image_width", "width", "img_width", "w"): if w_key in item: try: width = int(item[w_key]) except (ValueError, TypeError): pass break height = default_height for h_key in ("image_height", "height", "img_height", "h"): if h_key in item: try: height = int(item[h_key]) except (ValueError, TypeError): pass break return (width, height) def get_image_filename(item: dict) -> Optional[str]: """ Extract image filename from item data. Args: item: Item data dict Returns: Image filename/path string or None """ for key in ("image", "image_path", "image_url", "file_name", "filename", "img"): if key in item and item[key]: return str(item[key]) return None # --------------------------------------------------------------------------- # RLE mask utilities (Potato RLE <-> COCO RLE conversion) # --------------------------------------------------------------------------- def decode_rle(rle: dict, width: int, height: int) -> List[int]: """ Decode Potato RLE-encoded mask to a flat binary array (row-major order). Potato RLE stores counts alternating between 0-pixels and 1-pixels, starting with 0s, in row-major (left-to-right, top-to-bottom) order. Args: rle: Dict with 'counts' (list of ints) and 'size' [height, width] width: Image width height: Image height Returns: Flat list of 0/1 values in row-major order """ counts = rle.get("counts", []) total = width * height mask = [0] * total pos = 0 val = 0 for count in counts: for _ in range(count): if pos < total: mask[pos] = val pos += 1 val = 1 - val return mask def rle_bbox(mask: List[int], width: int, height: int) -> List[float]: """ Compute axis-aligned bounding box [x, y, w, h] from a flat binary mask. Args: mask: Flat list of 0/1 values (row-major) width: Image width height: Image height Returns: [x_min, y_min, bbox_width, bbox_height] or [0, 0, 0, 0] if empty """ x_min, y_min = width, height x_max, y_max = -1, -1 for i, val in enumerate(mask): if val: y = i // width x = i % width if x < x_min: x_min = x if x > x_max: x_max = x if y < y_min: y_min = y if y > y_max: y_max = y if x_max < 0: return [0, 0, 0, 0] return [float(x_min), float(y_min), float(x_max - x_min + 1), float(y_max - y_min + 1)] def rle_area(mask: List[int]) -> int: """ Compute mask area as the count of foreground pixels. Args: mask: Flat list of 0/1 values Returns: Number of 1-pixels """ return sum(mask) def _column_major_rle_counts(mask_2d: List[List[int]], height: int, width: int) -> List[int]: """ Read a 2D mask in column-major order and compute RLE counts. Counts alternate between 0-pixels and 1-pixels, starting with 0s. Args: mask_2d: 2D list [height][width] of 0/1 values height: Image height width: Image width Returns: List of integer run counts in column-major order """ counts: List[int] = [] current_val = 0 current_run = 0 for x in range(width): for y in range(height): pixel = mask_2d[y][x] if pixel == current_val: current_run += 1 else: counts.append(current_run) current_val = 1 - current_val current_run = 1 counts.append(current_run) return counts def _encode_coco_rle_string(counts: List[int]) -> str: """ Encode RLE integer counts as a COCO compressed ASCII string. Implements the exact algorithm from pycocotools maskApi.c rleToString(): - Delta encoding for i > 2: x = counts[i] - counts[i-2] - Each value encoded as 6-bit groups (5 data bits + 1 continuation bit) - Each group offset by 48 to produce printable ASCII - Signed values supported via arithmetic right shift Args: counts: List of integer run counts Returns: Encoded ASCII string """ chars = [] for i, cnt in enumerate(counts): # Delta encoding: for i > 2, encode difference from counts[i-2] x = cnt - counts[i - 2] if i > 2 else cnt while True: c = x & 0x1F x >>= 5 # If bit 4 set, sign bit is 1 → more groups unless x is all-ones (-1) # If bit 4 clear, sign bit is 0 → more groups unless x is all-zeros (0) if c & 0x10: more = (x != -1) else: more = (x != 0) if more: c |= 0x20 chars.append(chr(c + 48)) if not more: break return "".join(chars) def rle_to_coco_rle(rle: dict, width: int, height: int) -> Dict[str, Any]: """ Convert Potato RLE to COCO RLE format. Potato RLE is row-major; COCO RLE is column-major with compressed ASCII string encoding. Args: rle: Potato RLE dict with 'counts' and 'size' width: Image width height: Image height Returns: COCO RLE dict {"counts": "encoded_string", "size": [height, width]} """ # Decode to flat row-major mask flat = decode_rle(rle, width, height) # Reshape to 2D mask_2d = [] for y in range(height): row = flat[y * width:(y + 1) * width] mask_2d.append(row) # Compute column-major RLE counts col_counts = _column_major_rle_counts(mask_2d, height, width) # Encode as COCO compressed string encoded = _encode_coco_rle_string(col_counts) return {"counts": encoded, "size": [height, width]}