07Codex07 committed on
Commit
708635f
·
1 Parent(s): 2d31ac9

Fix: add repo_type=dataset and force_download=True for public dataset

Browse files
Files changed (1) hide show
  1. chatbot_retriever.py +12 -8
chatbot_retriever.py CHANGED
@@ -72,7 +72,7 @@ import os
72
  DATASET_REPO = "07Codex07/PrepGraph-Data"
73
 
74
  def ensure_data_dir():
75
- """Make sure data/ folder exists and contains downloaded PDFs from HF dataset."""
76
  data_dir = os.getenv("DATA_DIR", "data")
77
  os.makedirs(data_dir, exist_ok=True)
78
 
@@ -98,19 +98,23 @@ def ensure_data_dir():
98
 
99
  local_paths = []
100
  for f in files:
101
- local_path = os.path.join(data_dir, f.replace("/", "_"))
102
- if not os.path.exists(local_path):
 
 
 
103
  print(f"📥 Downloading {f} from Hugging Face (public dataset)...")
104
  downloaded = hf_hub_download(
105
  repo_id=DATASET_REPO,
106
  filename=f,
107
- repo_type="dataset", # ✅ tells HF it's a dataset
108
- force_download=True # ✅ bypass any bad cached 401s
109
  )
110
- os.rename(downloaded, local_path)
111
- local_paths.append(local_path)
112
- return local_paths
113
 
 
 
 
114
 
115
 
116
  def detect_subject(fname: str) -> Optional[str]:
 
72
  DATASET_REPO = "07Codex07/PrepGraph-Data"
73
 
74
  def ensure_data_dir():
75
+ """Ensure data/ folder exists and download PDFs from the Hugging Face dataset with correct subfolders."""
76
  data_dir = os.getenv("DATA_DIR", "data")
77
  os.makedirs(data_dir, exist_ok=True)
78
 
 
98
 
99
  local_paths = []
100
  for f in files:
101
+ # Keep the original folder structure (e.g., data/pyqs/...)
102
+ download_path = os.path.join(data_dir, f)
103
+ os.makedirs(os.path.dirname(download_path), exist_ok=True)
104
+
105
+ if not os.path.exists(download_path):
106
  print(f"📥 Downloading {f} from Hugging Face (public dataset)...")
107
  downloaded = hf_hub_download(
108
  repo_id=DATASET_REPO,
109
  filename=f,
110
+ repo_type="dataset", # ✅ specify dataset type
111
+ force_download=True # ✅ always re-download, bypassing any cached copy
112
  )
113
+ os.replace(downloaded, download_path)
 
 
114
 
115
+ local_paths.append(download_path)
116
+
117
+ return local_paths
118
 
119
 
120
  def detect_subject(fname: str) -> Optional[str]: