File size: 2,290 Bytes
549c270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# src/data/registry.py
from __future__ import annotations

from pathlib import Path
from typing import Dict

# Canonical path helpers live in utils.paths
from utils.paths import (
    RAW_DIR,
    PROCESSED_DIR,
    get_dataset_paths as _get_dataset_paths,  # returns dict[str, Path]
    get_raw_path,
    get_processed_path,
)


def get_paths(dataset: str) -> Dict[str, Path]:
    """
    Resolve (and create on demand) the raw and processed folders of a dataset.

    The dataset name is lower-cased before it is appended to ``RAW_DIR`` and
    ``PROCESSED_DIR``; a falsy ``dataset`` is treated as the empty string.
    Both folders are created with ``parents=True, exist_ok=True``, so calling
    this repeatedly is safe.

    Args:
        dataset: Dataset name, e.g. ``"beauty"``.

    Returns:
        A dict with two ``Path`` values:
            ``"raw_dir"``       -> ``RAW_DIR / <dataset>``
            ``"processed_dir"`` -> ``PROCESSED_DIR / <dataset>``
    """
    folder = dataset.lower() if dataset else ""
    layout = {
        "raw_dir": RAW_DIR / folder,
        "processed_dir": PROCESSED_DIR / folder,
    }
    # Dicts preserve insertion order, so the raw folder is created first.
    for directory in layout.values():
        directory.mkdir(parents=True, exist_ok=True)
    return layout


def raw_file(dataset: str, filename: str) -> Path:
    """
    Build the path of ``filename`` inside the dataset's raw directory.

    Delegates to ``get_paths``, so the dataset directories are created as a
    side effect; the file itself is neither created nor checked for existence.
    """
    raw_root = get_paths(dataset)["raw_dir"]
    return raw_root / filename


def processed_file(dataset: str, filename: str) -> Path:
    """
    Build the path of ``filename`` inside the dataset's processed directory.

    Delegates to ``get_paths``, so the dataset directories are created as a
    side effect; the file itself is neither created nor checked for existence.
    """
    processed_root = get_paths(dataset)["processed_dir"]
    return processed_root / filename


# ---------------------------------------------------------------------
# Compatibility shim used by older code/tests:
# This now returns Path objects instead of strings.
# ---------------------------------------------------------------------
def get_dataset_paths(dataset: str) -> Dict[str, Path]:
    """
    Backward-compatible entry point that forwards to
    ``utils.paths.get_dataset_paths`` and returns its result unchanged.

    The mapping is built entirely by that helper. As documented for this
    shim, it is expected to hold absolute ``Path`` objects under these keys:
    {
      "raw": Path(.../data/raw/<dataset>),
      "processed": Path(.../data/processed/<dataset>),
      "cache": Path(.../data/cache/<dataset>),
      "logs": Path(.../logs),
      "meta_features_path": Path(.../meta_features.npy),
      "text_features_path": Path(.../text_features.npy),
      "image_features_path": Path(.../image_features.npy),
      "labels_path": Path(.../labels.json)
    }
    """
    dataset_paths = _get_dataset_paths(dataset)
    return dataset_paths


# Public API of this module: the names exported by a star-import.
__all__ = [
    # Helpers defined in this module.
    "get_paths",
    "raw_file",
    "processed_file",
    "get_dataset_paths",  # keep public for tests
    # Imported from utils.paths and re-exported here.
    "get_raw_path",
    "get_processed_path",
]