# SoilTextureClassification / collection_common.py
# Iridium-193's picture
# Upload folder using huggingface_hub
# 49dd243 verified
"""
Shared helpers for collection/curation pipeline.
"""
from __future__ import annotations
import os
import re
from pathlib import Path
from typing import List, Optional
# Label priority names that parse_label_priority accepts; any other value is rejected.
ALLOWED_LABEL_PRIORITY = ("strong", "weak", "user")
def parse_label_priority(value: str) -> List[str]:
    """
    Turn a comma-separated label priority string into an ordered list.

    Each entry is stripped of surrounding whitespace and blank entries are
    dropped. Repeated entries are collapsed onto their first occurrence.

    Raises:
        ValueError: if no entries remain after stripping, or if any entry
            is not listed in ALLOWED_LABEL_PRIORITY.
    """
    tokens = []
    for piece in str(value).split(","):
        token = piece.strip()
        if token:
            tokens.append(token)
    if not tokens:
        raise ValueError("label priority cannot be empty")

    rejected = [token for token in tokens if token not in ALLOWED_LABEL_PRIORITY]
    if rejected:
        raise ValueError(f"Invalid label priority values: {rejected}")

    # dict keys are unique and keep insertion order, so this drops repeats
    # while leaving the first occurrence of each value in place.
    return list(dict.fromkeys(tokens))
def safe_resolve_in_dir(base_dir: Path, filename: str) -> Optional[Path]:
    """
    Resolve a bare filename to an absolute path inside base_dir.

    Args:
        base_dir: Directory the result must stay within.
        filename: Candidate file name; surrounding whitespace is ignored.

    Returns:
        The resolved path, or None when the name is empty, contains a NUL
        byte, carries any directory component, or resolves to a location
        outside base_dir (for example "..").
    """
    raw_name = str(filename).strip()
    # A NUL byte is never part of a real filename and can make path
    # resolution raise ValueError, so reject it up front like any other
    # unsafe name instead of letting the exception escape.
    if not raw_name or "\x00" in raw_name:
        return None
    # Path(...).name keeps only the final component; any difference means
    # the input contained a separator or was "."-like.
    if Path(raw_name).name != raw_name:
        return None
    root = base_dir.resolve()
    try:
        candidate = (base_dir / raw_name).resolve()
        # resolve() collapses ".." and follows symlinks, so the containment
        # check runs against the real target location.
        inside = os.path.commonpath([str(root), str(candidate)]) == str(root)
    except ValueError:
        # commonpath raises ValueError for paths it cannot compare (e.g.
        # different drives on Windows); treat that as "not inside base_dir".
        return None
    return candidate if inside else None
def sanitize_identifier(value: str, fallback: str, max_len: int = 64) -> str:
    """
    Produce a filesystem-safe identifier from an arbitrary value.

    The value is converted to a string and stripped of surrounding
    whitespace; every character other than ASCII letters, digits,
    underscore and hyphen becomes an underscore; the result is then cut to
    max_len characters. If nothing is left, fallback is returned as-is.
    """
    trimmed = str(value).strip()
    replaced = re.sub(r"[^A-Za-z0-9_-]", "_", trimmed)
    shortened = replaced[:max_len]
    if not shortened:
        return fallback
    return shortened