mickey1976 committed on
Commit
65bcc59
·
1 Parent(s): 8844635

Updated: paths.py to load data from HF Hub

Browse files
Files changed (1) hide show
  1. src/utils/paths.py +43 -17
src/utils/paths.py CHANGED
@@ -1,5 +1,12 @@
1
  from pathlib import Path
2
  from typing import Union, Dict
 
 
 
 
 
 
 
3
 
4
  # --- project roots ---
5
  PROJECT_ROOT = Path(__file__).resolve().parents[2]
@@ -10,7 +17,6 @@ CACHE_DIR = DATA_DIR / "cache"
10
  LOGS_DIR = PROJECT_ROOT / "logs"
11
  MODELS_DIR = PROJECT_ROOT / "src" / "models"
12
 
13
-
14
  def ensure_dir(path: Union[str, Path]) -> Path:
15
  """
16
  Ensure a directory exists. Accepts either a str or a pathlib.Path.
@@ -20,30 +26,50 @@ def ensure_dir(path: Union[str, Path]) -> Path:
20
  p.mkdir(parents=True, exist_ok=True)
21
  return p
22
 
23
-
24
  def get_raw_path(dataset: str) -> Path:
25
  """.../data/raw/<dataset>"""
26
  return ensure_dir(RAW_DIR / dataset)
27
 
 
 
 
 
 
 
28
 
29
  def get_processed_path(dataset: str) -> Path:
30
- """.../data/processed/<dataset>"""
31
- return ensure_dir(PROCESSED_DIR / dataset)
 
 
 
 
 
 
 
32
 
 
 
 
 
33
 
34
  def get_logs_path() -> Path:
35
- """.../logs"""
36
  return ensure_dir(LOGS_DIR)
37
 
38
-
39
  def get_dataset_paths(dataset: str) -> Dict[str, Path]:
40
  """
41
- Convenience bundle of dataset-related paths.
42
- NOTE: returns Path objects (not strings) for consistency.
43
  """
44
  dataset = dataset.lower()
45
  processed_dir = get_processed_path(dataset)
46
 
 
 
 
 
 
 
47
  return {
48
  "raw": get_raw_path(dataset),
49
  "processed": processed_dir,
@@ -51,16 +77,16 @@ def get_dataset_paths(dataset: str) -> Dict[str, Path]:
51
  "logs": get_logs_path(),
52
 
53
  # Parquet input files
54
- "item_meta_emb_path": processed_dir / "item_meta_emb.parquet",
55
- "item_image_emb_path": processed_dir / "item_image_emb.parquet",
56
- "item_text_emb_path": processed_dir / "item_text_emb.parquet",
57
 
58
  # FAISS-related npy features
59
- "meta_features_path": processed_dir / "meta_features.npy",
60
- "text_features_path": processed_dir / "text_features.npy",
61
- "image_features_path": processed_dir / "image_features.npy",
62
- "labels_path": processed_dir / "labels.json",
63
 
64
- # ✅ Add missing FAISS fusion output path
65
- "faiss_fusion_path": processed_dir / "faiss_fusion.index",
66
  }
 
1
  from pathlib import Path
2
  from typing import Union, Dict
3
+ import os
4
+
5
+ from huggingface_hub import hf_hub_download
6
+
7
+ # --- Constants ---
8
+ HF_REPO = "mayankc/mayankc-amazon_beauty_subset"
9
+ CACHE: Dict[str, Path] = {}
10
 
11
  # --- project roots ---
12
  PROJECT_ROOT = Path(__file__).resolve().parents[2]
 
17
  LOGS_DIR = PROJECT_ROOT / "logs"
18
  MODELS_DIR = PROJECT_ROOT / "src" / "models"
19
 
 
20
def ensure_dir(path: Union[str, Path]) -> Path:
    """
    Ensure a directory exists. Accepts either a str or a pathlib.Path.

    Creates the directory (with any missing parents) when absent and
    returns it as a ``Path`` so callers can chain further path math.
    """
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)
    return target
28
 
 
29
def get_raw_path(dataset: str) -> Path:
    """Return (creating it if missing) ``.../data/raw/<dataset>``."""
    raw_dataset_dir = RAW_DIR / dataset
    return ensure_dir(raw_dataset_dir)
32
 
33
def _hf_download(filename: str) -> Path:
    """
    Download *filename* from the HF_REPO dataset repo, memoizing the
    resolved local path in the module-level CACHE so repeat lookups
    skip the hub client entirely.
    """
    cached = CACHE.get(filename)
    if cached is not None:
        return cached
    resolved = Path(hf_hub_download(repo_id=HF_REPO, filename=filename))
    CACHE[filename] = resolved
    return resolved
39
 
40
def get_processed_path(dataset: str) -> Path:
    """
    Resolve the processed-data folder for *dataset*.

    The local ``data/processed/<dataset>`` directory wins whenever it
    exists (normal local development). Otherwise — e.g. on HF Spaces
    where no local data ships — a known file is pulled from the Hub and
    the parent of the downloaded file (the hub cache folder) is returned.
    """
    local_path = PROCESSED_DIR / dataset
    if local_path.exists():
        return local_path

    # No local copy: download a probe file and use its cache directory.
    # NOTE(review): the probe is hard-coded to user_text_emb.parquet —
    # confirm this file exists in the Hub repo for every dataset name.
    probe = _hf_download(f"{dataset}/user_text_emb.parquet")
    return probe.parent
55
 
56
def get_logs_path() -> Path:
    """Return (creating it if missing) the project ``logs`` directory."""
    return ensure_dir(LOGS_DIR)
58
 
 
59
def get_dataset_paths(dataset: str) -> Dict[str, Path]:
    """
    Return a bundle of Paths for the known assets of *dataset*.

    Input files (parquet embeddings, npy features, labels.json) resolve
    to the local processed folder when present, otherwise they are
    downloaded from the Hugging Face Hub via _hf_download.

    Returns:
        Dict mapping asset name -> pathlib.Path.
    """
    dataset = dataset.lower()
    processed_dir = get_processed_path(dataset)

    def resolve_or_download(name: str) -> Path:
        # Prefer an existing local copy; fall back to a Hub download.
        local = processed_dir / name
        if local.exists():
            return local
        return _hf_download(f"{dataset}/{name}")

    # faiss_fusion.index is an OUTPUT artifact: on a fresh environment it
    # may exist neither locally nor on the Hub, and routing it through
    # resolve_or_download would make hf_hub_download raise and break the
    # whole bundle. Prefer local, try the Hub, and finally fall back to
    # the local write target so producers always get a valid path.
    fusion_local = processed_dir / "faiss_fusion.index"
    if fusion_local.exists():
        fusion_path = fusion_local
    else:
        try:
            fusion_path = _hf_download(f"{dataset}/faiss_fusion.index")
        except Exception:
            # NOTE(review): hub raises EntryNotFoundError (an HfHubHTTPError
            # subclass); broad catch is deliberate best-effort here.
            fusion_path = fusion_local

    # NOTE(review): the diff view elided one unchanged dict entry between
    # "processed" and "logs" (new-file line 76) — restore it from the repo
    # (likely a cache/models dir) before merging this rewrite.
    return {
        "raw": get_raw_path(dataset),
        "processed": processed_dir,
        "logs": get_logs_path(),

        # Parquet input files
        "item_meta_emb_path": resolve_or_download("item_meta_emb.parquet"),
        "item_image_emb_path": resolve_or_download("item_image_emb.parquet"),
        "item_text_emb_path": resolve_or_download("item_text_emb.parquet"),

        # FAISS-related npy features
        "meta_features_path": resolve_or_download("meta_features.npy"),
        "text_features_path": resolve_or_download("text_features.npy"),
        "image_features_path": resolve_or_download("image_features.npy"),
        "labels_path": resolve_or_download("labels.json"),

        # ✅ FAISS fusion output path
        "faiss_fusion_path": fusion_path,
    }