""" Shared helpers for collection/curation pipeline. """ from __future__ import annotations import os import re from pathlib import Path from typing import List, Optional ALLOWED_LABEL_PRIORITY = ("strong", "weak", "user") def parse_label_priority(value: str) -> List[str]: """ Parse and validate comma-separated label priority list. Returns de-duplicated values while preserving order. """ raw_items = [item.strip() for item in str(value).split(",") if item.strip()] if not raw_items: raise ValueError("label priority cannot be empty") invalid = [item for item in raw_items if item not in ALLOWED_LABEL_PRIORITY] if invalid: raise ValueError(f"Invalid label priority values: {invalid}") deduped = [] seen = set() for item in raw_items: if item in seen: continue deduped.append(item) seen.add(item) return deduped def safe_resolve_in_dir(base_dir: Path, filename: str) -> Optional[Path]: """ Resolve a filename safely under base_dir. Reject nested paths and path traversal patterns. """ raw_name = str(filename).strip() if not raw_name: return None safe_name = Path(raw_name).name if safe_name != raw_name: return None root = base_dir.resolve() candidate = (base_dir / safe_name).resolve() if os.path.commonpath([str(root), str(candidate)]) != str(root): return None return candidate def sanitize_identifier(value: str, fallback: str, max_len: int = 64) -> str: """ Sanitize identifier for filesystem-safe filenames. """ clean = re.sub(r"[^A-Za-z0-9_-]", "_", str(value).strip()) clean = clean[:max_len] return clean if clean else fallback