Spaces:
Sleeping
Sleeping
File size: 3,104 Bytes
549c270 65bcc59 24a5fa2 65bcc59 549c270 24a5fa2 549c270 24a5fa2 549c270 24a5fa2 65bcc59 24a5fa2 65bcc59 549c270 24a5fa2 549c270 65bcc59 549c270 24a5fa2 65bcc59 549c270 24a5fa2 549c270 24a5fa2 549c270 24a5fa2 65bcc59 24a5fa2 65bcc59 549c270 24a5fa2 549c270 24a5fa2 549c270 24a5fa2 549c270 24a5fa2 549c270 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
from pathlib import Path
from typing import Union, Dict
import os
from huggingface_hub import hf_hub_download
# --- Constants ---
# HF dataset repo that hosts the processed artifacts (parquet/json/npy/faiss/model files).
HF_REPO = "mickey1976/mayankc-amazon_beauty_subset"
# In-process memo of hf_hub_download results, keyed by repo-relative filename.
CACHE: Dict[str, Path] = {}
# --- project roots ---
# Two directory levels above this file — assumes src/<pkg>/ layout; TODO confirm.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
DATA_DIR = PROJECT_ROOT / "data"
RAW_DIR = DATA_DIR / "raw"              # per-dataset raw inputs (see get_raw_path)
PROCESSED_DIR = DATA_DIR / "processed"  # per-dataset derived artifacts (see get_dataset_paths)
CACHE_DIR = DATA_DIR / "cache"
LOGS_DIR = PROJECT_ROOT / "logs"
MODELS_DIR = PROJECT_ROOT / "src" / "models"
def ensure_dir(path: Union[str, Path]) -> Path:
    """Create *path* (including parents) if it is missing and return it as a Path."""
    target = Path(path)  # Path(Path) is a no-op, so no isinstance check needed
    target.mkdir(parents=True, exist_ok=True)
    return target
def get_raw_path(dataset: str) -> Path:
    """Return the raw-data directory for *dataset*, creating it if needed."""
    raw_dataset_dir = RAW_DIR / dataset
    return ensure_dir(raw_dataset_dir)
def _hf_download(filename: str) -> Path:
    """Fetch *filename* from the HF dataset repo, memoizing results in CACHE."""
    cached = CACHE.get(filename)
    if cached is not None:
        return cached
    downloaded = Path(
        hf_hub_download(repo_id=HF_REPO, filename=filename, repo_type="dataset")
    )
    CACHE[filename] = downloaded
    return downloaded
def get_processed_path(dataset: str) -> Path:
    """Return the processed-data directory for *dataset*.

    Prefers the local ``PROCESSED_DIR/<dataset>`` folder when it exists;
    otherwise downloads one known file from the HF repo and returns its
    parent directory as a stand-in processed root.
    """
    local_path = PROCESSED_DIR / dataset
    if local_path.exists():
        return local_path
    # fallback: download any known file to get a valid parent path
    # (plain string — the original f-string had no placeholders)
    fallback_file = "parquet/user_text_emb.parquet"
    fallback_path = _hf_download(fallback_file)
    return fallback_path.parent
def get_logs_path() -> Path:
    """Ensure the project logs directory exists and return its path."""
    logs_dir = ensure_dir(LOGS_DIR)
    return logs_dir
def get_dataset_paths(dataset: str) -> Dict[str, Path]:
    """Resolve every filesystem artifact needed for *dataset*.

    Returns a dict mapping logical names (e.g. ``"raw"``, ``"item_text_emb"``,
    ``"faiss_concat"``) to concrete paths. Files found under
    ``PROCESSED_DIR/<dataset>`` are used directly; anything missing is
    downloaded from the HF dataset repo via ``_hf_download``. Directory
    entries (``"raw"``, ``"cache"``, ``"logs"``) are created on demand.

    NOTE(review): the hub fallback key is ``"<subfolder>/<name>"`` and does
    not include *dataset* — this assumes a single-dataset repo; confirm if
    more datasets are ever added.
    """
    dataset = dataset.lower()  # local folder names are lower-cased

    def resolve_or_download(subfolder: str, name: str) -> Path:
        # Prefer the local processed copy; fall back to the hub repo.
        local = PROCESSED_DIR / dataset / name
        if local.exists():
            return local
        return _hf_download(f"{subfolder}/{name}")

    # Stray " |" after the closing brace (copy/paste artifact, a syntax
    # error in the original) has been removed.
    return {
        "raw": get_raw_path(dataset),
        "processed": get_processed_path(dataset),
        "cache": ensure_dir(CACHE_DIR / dataset),
        "logs": get_logs_path(),
        # JSON and config files
        "defaults": resolve_or_download("json", "defaults.json"),
        "item_ids": resolve_or_download("json", "item_ids.json"),
        "user_seq": resolve_or_download("json", "user_seq.json"),
        # Parquet files
        "item_meta_emb": resolve_or_download("parquet", "item_meta_emb.parquet"),
        "item_image_emb": resolve_or_download("parquet", "item_image_emb.parquet"),
        "item_text_emb": resolve_or_download("parquet", "item_text_emb.parquet"),
        "user_text_emb": resolve_or_download("parquet", "user_text_emb.parquet"),
        # NPY files
        "text": resolve_or_download("npy", "text.npy"),
        "image": resolve_or_download("npy", "image.npy"),
        "meta": resolve_or_download("npy", "meta.npy"),
        "cove": resolve_or_download("npy", "cove.npy"),
        # FAISS files
        "faiss_concat": resolve_or_download("faiss", "items_beauty_concat.faiss"),
        "faiss_weighted": resolve_or_download("faiss", "items_beauty_weighted.faiss"),
        # Model
        "adapter_model": resolve_or_download("model", "adapter_model.safetensors"),
        "full_model": resolve_or_download("model", "model.safetensors"),
    }