07Codex07 committed on
Commit
2d31ac9
·
1 Parent(s): 19e0262

Fix: add repo_type=dataset and force_download=True for public dataset

Browse files
Files changed (1) hide show
  1. chatbot_retriever.py +9 -3
chatbot_retriever.py CHANGED
@@ -100,13 +100,19 @@ def ensure_data_dir():
100
  for f in files:
101
  local_path = os.path.join(data_dir, f.replace("/", "_"))
102
  if not os.path.exists(local_path):
103
- print(f"📥 Downloading {f} from Hugging Face...")
104
- downloaded = hf_hub_download(repo_id=DATASET_REPO, filename=f)
 
 
 
 
 
105
  os.rename(downloaded, local_path)
106
  local_paths.append(local_path)
107
  return local_paths
108
 
109
 
 
110
  def detect_subject(fname: str) -> Optional[str]:
111
  # light heuristic to guess subject code from filename
112
  t = (fname or "").lower()
@@ -191,7 +197,7 @@ def load_all_docs(base_dir: str = DATA_DIR) -> List:
191
  def build_or_load_indexes(force_reindex: bool = False):
192
  if os.getenv("FORCE_REINDEX", "0").lower() in ("1", "true", "yes"):
193
  force_reindex = True
194
-
195
  ensure_data_dir()
196
  docs = load_all_docs(DATA_DIR)
197
  if not docs:
 
100
  for f in files:
101
  local_path = os.path.join(data_dir, f.replace("/", "_"))
102
  if not os.path.exists(local_path):
103
+ print(f"📥 Downloading {f} from Hugging Face (public dataset)...")
104
+ downloaded = hf_hub_download(
105
+ repo_id=DATASET_REPO,
106
+ filename=f,
107
+ repo_type="dataset", # ✅ tells HF it's a dataset
108
+ force_download=True # ✅ bypass any bad cached 401s
109
+ )
110
  os.rename(downloaded, local_path)
111
  local_paths.append(local_path)
112
  return local_paths
113
 
114
 
115
+
116
  def detect_subject(fname: str) -> Optional[str]:
117
  # light heuristic to guess subject code from filename
118
  t = (fname or "").lower()
 
197
  def build_or_load_indexes(force_reindex: bool = False):
198
  if os.getenv("FORCE_REINDEX", "0").lower() in ("1", "true", "yes"):
199
  force_reindex = True
200
+
201
  ensure_data_dir()
202
  docs = load_all_docs(DATA_DIR)
203
  if not docs: