File size: 1,749 Bytes
49dd243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""
Shared helpers for collection/curation pipeline.
"""

from __future__ import annotations

import os
import re
from pathlib import Path
from typing import List, Optional


# Label names accepted by parse_label_priority(); any other value in the
# comma-separated input is rejected with ValueError.
ALLOWED_LABEL_PRIORITY = ("strong", "weak", "user")


def parse_label_priority(value: str) -> List[str]:
    """
    Turn a comma-separated label priority string into an ordered list.

    The input is converted with str(), split on commas, and each piece is
    stripped of surrounding whitespace; empty pieces are ignored.

    Returns:
        The labels in first-occurrence order with duplicates removed.

    Raises:
        ValueError: if no non-empty label remains, or if any label is not a
            member of ALLOWED_LABEL_PRIORITY.
    """
    tokens = []
    for piece in str(value).split(","):
        piece = piece.strip()
        if piece:
            tokens.append(piece)

    if len(tokens) == 0:
        raise ValueError("label priority cannot be empty")

    rejected = [token for token in tokens if token not in ALLOWED_LABEL_PRIORITY]
    if rejected:
        raise ValueError(f"Invalid label priority values: {rejected}")

    # dict keys keep insertion order, so this drops repeats while preserving
    # the position of each label's first appearance.
    return list(dict.fromkeys(tokens))


def safe_resolve_in_dir(base_dir: Path, filename: str) -> Optional[Path]:
    """
    Resolve a bare filename to an absolute path located under base_dir.

    Args:
        base_dir: Directory the result must stay inside.
        filename: Candidate file name; converted with str() and stripped of
            surrounding whitespace before validation.

    Returns:
        The resolved path, or None when the name is empty, contains a NUL
        character, contains any directory component, or resolves (for
        example via ".." or a symlink) to a location outside base_dir.
    """
    raw_name = str(filename).strip()
    if not raw_name:
        return None
    # A NUL can never be part of a valid file name, and it can make the
    # resolve() call below raise ValueError instead of producing the None
    # rejection callers expect for bad names.
    if "\x00" in raw_name:
        return None
    # Path.name keeps only the final component; any difference means the
    # caller supplied a directory part (or a name such as ".").
    safe_name = Path(raw_name).name
    if safe_name != raw_name:
        return None

    root = base_dir.resolve()
    candidate = (base_dir / safe_name).resolve()
    # After resolution ("..", symlinks) the candidate must still share the
    # resolved base directory as its common prefix.
    if os.path.commonpath([str(root), str(candidate)]) != str(root):
        return None
    return candidate


def sanitize_identifier(value: str, fallback: str, max_len: int = 64) -> str:
    """
    Produce a filesystem-safe identifier from an arbitrary value.

    The value is converted with str() and stripped of surrounding
    whitespace; every character other than ASCII letters, digits,
    underscore and hyphen is replaced by an underscore, and the result is
    cut to max_len characters.

    Returns:
        The sanitized text, or fallback (returned as given) when the
        sanitized text is empty.
    """
    unsafe_chars = re.compile(r"[^A-Za-z0-9_-]")
    stripped = str(value).strip()
    replaced = unsafe_chars.sub("_", stripped)
    truncated = replaced[:max_len]
    return truncated or fallback