Spaces:

Blablablab
/

codebook

Paused

App Files Files Community

codebook / potato /export /cv_utils.py

davidjurgens

Deploy: Potato — Codebook Annotation

aceb1b2 verified 7 days ago

Raw

History Blame Contribute Delete

10.9 kB

	"""
	CV Export Utilities

	Shared helper functions for computer vision export formats (COCO, YOLO, VOC).
	"""

	from typing import Dict, List, Tuple, Any, Optional
	import logging

	logger = logging.getLogger(__name__)


	def build_category_mapping(annotations: List[dict], schemas: List[dict]) -> Dict[str, int]:
	"""
	Build a mapping from label names to integer category IDs.

	Extracts labels from image_annotation schemas first (preserving config order),
	then discovers any additional labels from annotations.

	Args:
	annotations: List of annotation records
	schemas: List of annotation_scheme config dicts

	Returns:
	Dict mapping label name -> integer ID (starting from 1 for COCO, 0-indexed for YOLO)
	"""
	labels = []
	seen = set()

	# First, collect labels from schema configs (preserves defined order)
	for schema in schemas:
	if schema.get("annotation_type") == "image_annotation":
	for label_def in schema.get("labels", []):
	name = label_def if isinstance(label_def, str) else label_def.get("name", "")
	if name and name not in seen:
	labels.append(name)
	seen.add(name)

	# Then discover any labels in annotation data not already in config
	for ann in annotations:
	for schema_name, img_annotations in ann.get("image_annotations", {}).items():
	if not isinstance(img_annotations, list):
	continue
	for obj in img_annotations:
	label = obj.get("label", "")
	if label and label not in seen:
	labels.append(label)
	seen.add(label)

	return {name: idx for idx, name in enumerate(labels)}


	def polygon_to_bbox(points: List[List[float]]) -> Tuple[float, float, float, float]:
	"""
	Compute axis-aligned bounding box from a polygon.

	Args:
	points: List of [x, y] coordinate pairs

	Returns:
	Tuple of (x_min, y_min, width, height)
	"""
	if not points:
	return (0, 0, 0, 0)

	xs = [p[0] for p in points]
	ys = [p[1] for p in points]
	x_min = min(xs)
	y_min = min(ys)
	return (x_min, y_min, max(xs) - x_min, max(ys) - y_min)


	def polygon_area(points: List[List[float]]) -> float:
	"""
	Compute the area of a polygon using the shoelace formula.

	Args:
	points: List of [x, y] coordinate pairs

	Returns:
	Absolute area of the polygon
	"""
	n = len(points)
	if n < 3:
	return 0.0
	area = 0.0
	for i in range(n):
	j = (i + 1) % n
	area += points[i][0] * points[j][1]
	area -= points[j][0] * points[i][1]
	return abs(area) / 2.0


	def normalize_bbox(x: float, y: float, w: float, h: float,
	img_w: float, img_h: float) -> Tuple[float, float, float, float]:
	"""
	Normalize bounding box coordinates to [0, 1] range.

	Args:
	x, y: Top-left corner coordinates
	w, h: Width and height
	img_w, img_h: Image dimensions

	Returns:
	Tuple of (center_x, center_y, width, height) normalized to [0, 1]
	"""
	if img_w <= 0 or img_h <= 0:
	return (0, 0, 0, 0)
	cx = max(0.0, min(1.0, (x + w / 2) / img_w))
	cy = max(0.0, min(1.0, (y + h / 2) / img_h))
	nw = max(0.0, min(1.0, w / img_w))
	nh = max(0.0, min(1.0, h / img_h))
	return (cx, cy, nw, nh)


	def flatten_polygon(points: List[List[float]]) -> List[float]:
	"""
	Flatten a list of [x, y] points into a flat coordinate list [x1, y1, x2, y2, ...].

	This is the format used by COCO segmentation.

	Args:
	points: List of [x, y] coordinate pairs

	Returns:
	Flat list of coordinates
	"""
	result = []
	for p in points:
	result.extend(p[:2])
	return result


	def extract_image_annotations(annotation: dict) -> List[Tuple[str, List[dict]]]:
	"""
	Extract image annotation objects from an annotation record.

	Args:
	annotation: Single annotation record with image_annotations field

	Returns:
	List of (schema_name, annotation_objects) tuples
	"""
	results = []
	for schema_name, objects in annotation.get("image_annotations", {}).items():
	if isinstance(objects, list) and objects:
	results.append((schema_name, objects))
	return results


	def get_image_dimensions(item: dict, default_width: int = 0,
	default_height: int = 0) -> Tuple[int, int]:
	"""
	Extract image dimensions from item metadata.

	Checks common field names for image width/height.

	Args:
	item: Item data dict
	default_width: Fallback width
	default_height: Fallback height

	Returns:
	Tuple of (width, height)
	"""
	# Check common field patterns
	width = default_width
	for w_key in ("image_width", "width", "img_width", "w"):
	if w_key in item:
	try:
	width = int(item[w_key])
	except (ValueError, TypeError):
	pass
	break

	height = default_height
	for h_key in ("image_height", "height", "img_height", "h"):
	if h_key in item:
	try:
	height = int(item[h_key])
	except (ValueError, TypeError):
	pass
	break

	return (width, height)


	def get_image_filename(item: dict) -> Optional[str]:
	"""
	Extract image filename from item data.

	Args:
	item: Item data dict

	Returns:
	Image filename/path string or None
	"""
	for key in ("image", "image_path", "image_url", "file_name", "filename", "img"):
	if key in item and item[key]:
	return str(item[key])
	return None


	# ---------------------------------------------------------------------------
	# RLE mask utilities (Potato RLE <-> COCO RLE conversion)
	# ---------------------------------------------------------------------------


	def decode_rle(rle: dict, width: int, height: int) -> List[int]:
	"""
	Decode Potato RLE-encoded mask to a flat binary array (row-major order).

	Potato RLE stores counts alternating between 0-pixels and 1-pixels,
	starting with 0s, in row-major (left-to-right, top-to-bottom) order.

	Args:
	rle: Dict with 'counts' (list of ints) and 'size' [height, width]
	width: Image width
	height: Image height

	Returns:
	Flat list of 0/1 values in row-major order
	"""
	counts = rle.get("counts", [])
	total = width * height
	mask = [0] * total
	pos = 0
	val = 0
	for count in counts:
	for _ in range(count):
	if pos < total:
	mask[pos] = val
	pos += 1
	val = 1 - val
	return mask


	def rle_bbox(mask: List[int], width: int, height: int) -> List[float]:
	"""
	Compute axis-aligned bounding box [x, y, w, h] from a flat binary mask.

	Args:
	mask: Flat list of 0/1 values (row-major)
	width: Image width
	height: Image height

	Returns:
	[x_min, y_min, bbox_width, bbox_height] or [0, 0, 0, 0] if empty
	"""
	x_min, y_min = width, height
	x_max, y_max = -1, -1
	for i, val in enumerate(mask):
	if val:
	y = i // width
	x = i % width
	if x < x_min:
	x_min = x
	if x > x_max:
	x_max = x
	if y < y_min:
	y_min = y
	if y > y_max:
	y_max = y
	if x_max < 0:
	return [0, 0, 0, 0]
	return [float(x_min), float(y_min),
	float(x_max - x_min + 1), float(y_max - y_min + 1)]


	def rle_area(mask: List[int]) -> int:
	"""
	Compute mask area as the count of foreground pixels.

	Args:
	mask: Flat list of 0/1 values

	Returns:
	Number of 1-pixels
	"""
	return sum(mask)


	def _column_major_rle_counts(mask_2d: List[List[int]], height: int,
	width: int) -> List[int]:
	"""
	Read a 2D mask in column-major order and compute RLE counts.

	Counts alternate between 0-pixels and 1-pixels, starting with 0s.

	Args:
	mask_2d: 2D list [height][width] of 0/1 values
	height: Image height
	width: Image width

	Returns:
	List of integer run counts in column-major order
	"""
	counts: List[int] = []
	current_val = 0
	current_run = 0

	for x in range(width):
	for y in range(height):
	pixel = mask_2d[y][x]
	if pixel == current_val:
	current_run += 1
	else:
	counts.append(current_run)
	current_val = 1 - current_val
	current_run = 1
	counts.append(current_run)
	return counts


	def _encode_coco_rle_string(counts: List[int]) -> str:
	"""
	Encode RLE integer counts as a COCO compressed ASCII string.

	Implements the exact algorithm from pycocotools maskApi.c rleToString():
	- Delta encoding for i > 2: x = counts[i] - counts[i-2]
	- Each value encoded as 6-bit groups (5 data bits + 1 continuation bit)
	- Each group offset by 48 to produce printable ASCII
	- Signed values supported via arithmetic right shift

	Args:
	counts: List of integer run counts

	Returns:
	Encoded ASCII string
	"""
	chars = []
	for i, cnt in enumerate(counts):
	# Delta encoding: for i > 2, encode difference from counts[i-2]
	x = cnt - counts[i - 2] if i > 2 else cnt
	while True:
	c = x & 0x1F
	x >>= 5
	# If bit 4 set, sign bit is 1 → more groups unless x is all-ones (-1)
	# If bit 4 clear, sign bit is 0 → more groups unless x is all-zeros (0)
	if c & 0x10:
	more = (x != -1)
	else:
	more = (x != 0)
	if more:
	c \|= 0x20
	chars.append(chr(c + 48))
	if not more:
	break
	return "".join(chars)


	def rle_to_coco_rle(rle: dict, width: int, height: int) -> Dict[str, Any]:
	"""
	Convert Potato RLE to COCO RLE format.

	Potato RLE is row-major; COCO RLE is column-major with compressed
	ASCII string encoding.

	Args:
	rle: Potato RLE dict with 'counts' and 'size'
	width: Image width
	height: Image height

	Returns:
	COCO RLE dict {"counts": "encoded_string", "size": [height, width]}
	"""
	# Decode to flat row-major mask
	flat = decode_rle(rle, width, height)

	# Reshape to 2D
	mask_2d = []
	for y in range(height):
	row = flat[y * width:(y + 1) * width]
	mask_2d.append(row)

	# Compute column-major RLE counts
	col_counts = _column_major_rle_counts(mask_2d, height, width)

	# Encode as COCO compressed string
	encoded = _encode_coco_rle_string(col_counts)

	return {"counts": encoded, "size": [height, width]}