Spaces:
Sleeping
Sleeping
| from pathlib import Path | |
| from typing import Union, Dict | |
# --- project roots ---
# Project root: the directory two levels above the folder that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[2]

# Data tree: <root>/data with raw, processed and cache sub-directories.
DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
CACHE_DIR = DATA_DIR.joinpath("cache")

# Other locations that hang directly off the project root.
LOGS_DIR = PROJECT_ROOT.joinpath("logs")
MODELS_DIR = PROJECT_ROOT.joinpath("src", "models")
def ensure_dir(path: Union[str, Path]) -> Path:
    """
    Create the directory at ``path`` (and any missing parents) if needed.

    Accepts either a str or a pathlib.Path. A Path argument is returned
    as-is; anything else is converted with ``Path(path)`` first.
    Returns the directory as a pathlib.Path.
    """
    directory = path if isinstance(path, Path) else Path(path)
    # exist_ok=True makes repeated calls on an existing directory a no-op.
    directory.mkdir(parents=True, exist_ok=True)
    return directory
def get_raw_path(dataset: str) -> Path:
    """Return .../data/raw/<dataset>, creating the directory if it is missing."""
    raw_dataset_dir = RAW_DIR / dataset
    return ensure_dir(raw_dataset_dir)
def get_processed_path(dataset: str) -> Path:
    """Return .../data/processed/<dataset>, creating the directory if it is missing."""
    processed_dataset_dir = PROCESSED_DIR / dataset
    return ensure_dir(processed_dataset_dir)
def get_logs_path() -> Path:
    """Return .../logs, creating the directory if it is missing."""
    logs_dir = ensure_dir(LOGS_DIR)
    return logs_dir
def get_dataset_paths(dataset: str) -> Dict[str, Path]:
    """
    Bundle the paths associated with a dataset into one dictionary.

    The dataset name is lower-cased before any path is built. The "raw",
    "processed", "cache" and "logs" entries are directories and are created
    on disk if missing; the remaining entries are file paths inside the
    processed directory and are only computed, not created.

    NOTE: all values are Path objects (not strings) for consistency.
    """
    name = dataset.lower()
    processed = get_processed_path(name)

    # Directories (each one is created if it does not exist yet).
    paths: Dict[str, Path] = {
        "raw": get_raw_path(name),
        "processed": processed,
        "cache": ensure_dir(CACHE_DIR / name),
        "logs": get_logs_path(),
    }

    # Parquet input files
    paths.update(
        {
            "item_meta_emb_path": processed / "item_meta_emb.parquet",
            "item_image_emb_path": processed / "item_image_emb.parquet",
            "item_text_emb_path": processed / "item_text_emb.parquet",
        }
    )

    # FAISS-related npy features and their labels
    paths.update(
        {
            "meta_features_path": processed / "meta_features.npy",
            "text_features_path": processed / "text_features.npy",
            "image_features_path": processed / "image_features.npy",
            "labels_path": processed / "labels.json",
        }
    )

    # FAISS fusion index output
    paths["faiss_fusion_path"] = processed / "faiss_fusion.index"

    return paths