File size: 2,072 Bytes
8d8bf0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from pathlib import Path
from typing import Union, Dict

# --- project roots ---
# parents[2] is the third ancestor of this file's resolved path, i.e. two
# levels above the directory that contains this file; it is used as the root.
PROJECT_ROOT = Path(__file__).resolve().parents[2]

# Data tree: <root>/data/{raw,processed,cache}
DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
CACHE_DIR = DATA_DIR.joinpath("cache")

# Other top-level locations: <root>/logs and <root>/src/models
LOGS_DIR = PROJECT_ROOT.joinpath("logs")
MODELS_DIR = PROJECT_ROOT.joinpath("src", "models")


def ensure_dir(path: Union[str, Path]) -> Path:
    """
    Create the directory at ``path`` if it is missing, along with any
    missing parent directories. An already-existing directory is left as-is.

    Accepts either a str or a pathlib.Path.
    Returns a pathlib.Path (the very same object when a Path was passed in).
    """
    if isinstance(path, Path):
        directory = path
    else:
        directory = Path(path)

    directory.mkdir(parents=True, exist_ok=True)
    return directory


def get_raw_path(dataset: str) -> Path:
    """Return RAW_DIR/<dataset>, creating the directory if it is missing."""
    raw_dataset_dir = RAW_DIR.joinpath(dataset)
    return ensure_dir(raw_dataset_dir)


def get_processed_path(dataset: str) -> Path:
    """Return PROCESSED_DIR/<dataset>, creating the directory if it is missing."""
    processed_dataset_dir = PROCESSED_DIR.joinpath(dataset)
    return ensure_dir(processed_dataset_dir)


def get_logs_path() -> Path:
    """Return LOGS_DIR, creating the directory if it is missing."""
    logs_dir = ensure_dir(LOGS_DIR)
    return logs_dir


def get_dataset_paths(dataset: str) -> Dict[str, Path]:
    """
    Build a mapping of every path associated with ``dataset``.

    The dataset name is lower-cased before use. The "raw", "processed",
    "cache" and "logs" directories are created if missing; the remaining
    entries are file paths inside the processed directory and are only
    computed here, not created.

    All values are Path objects (not strings).
    """
    name = dataset.lower()

    # The processed directory is resolved first because every file entry
    # below is located inside it.
    processed = get_processed_path(name)

    paths: Dict[str, Path] = {
        "raw": get_raw_path(name),
        "processed": processed,
        "cache": ensure_dir(CACHE_DIR / name),
        "logs": get_logs_path(),
    }

    # (mapping key, file name within the processed directory)
    processed_files = (
        # Parquet input files
        ("item_meta_emb_path", "item_meta_emb.parquet"),
        ("item_image_emb_path", "item_image_emb.parquet"),
        ("item_text_emb_path", "item_text_emb.parquet"),
        # FAISS-related npy features and labels
        ("meta_features_path", "meta_features.npy"),
        ("text_features_path", "text_features.npy"),
        ("image_features_path", "image_features.npy"),
        ("labels_path", "labels.json"),
        # FAISS fusion index output
        ("faiss_fusion_path", "faiss_fusion.index"),
    )
    for key, filename in processed_files:
        paths[key] = processed / filename

    return paths