Spaces:
Sleeping
Sleeping
Fix: add repo_type=dataset and force_download=True for public dataset
Browse files - chatbot_retriever.py (+12 -8)
chatbot_retriever.py
CHANGED
|
@@ -72,7 +72,7 @@ import os
|
|
| 72 |
DATASET_REPO = "07Codex07/PrepGraph-Data"
|
| 73 |
|
| 74 |
def ensure_data_dir():
|
| 75 |
-
"""
|
| 76 |
data_dir = os.getenv("DATA_DIR", "data")
|
| 77 |
os.makedirs(data_dir, exist_ok=True)
|
| 78 |
|
|
@@ -98,19 +98,23 @@ def ensure_data_dir():
|
|
| 98 |
|
| 99 |
local_paths = []
|
| 100 |
for f in files:
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
| 103 |
print(f"📥 Downloading {f} from Hugging Face (public dataset)...")
|
| 104 |
downloaded = hf_hub_download(
|
| 105 |
repo_id=DATASET_REPO,
|
| 106 |
filename=f,
|
| 107 |
-
repo_type="dataset",
|
| 108 |
-
force_download=True
|
| 109 |
)
|
| 110 |
-
os.
|
| 111 |
-
local_paths.append(local_path)
|
| 112 |
-
return local_paths
|
| 113 |
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
|
| 116 |
def detect_subject(fname: str) -> Optional[str]:
|
|
|
|
| 72 |
DATASET_REPO = "07Codex07/PrepGraph-Data"
|
| 73 |
|
| 74 |
def ensure_data_dir():
|
| 75 |
+
"""Ensure data/ folder exists and download PDFs from the Hugging Face dataset with correct subfolders."""
|
| 76 |
data_dir = os.getenv("DATA_DIR", "data")
|
| 77 |
os.makedirs(data_dir, exist_ok=True)
|
| 78 |
|
|
|
|
| 98 |
|
| 99 |
local_paths = []
|
| 100 |
for f in files:
|
| 101 |
+
# ✅ Keep the original folder structure (e.g., data/pyqs/...)
|
| 102 |
+
download_path = os.path.join(data_dir, f)
|
| 103 |
+
os.makedirs(os.path.dirname(download_path), exist_ok=True)
|
| 104 |
+
|
| 105 |
+
if not os.path.exists(download_path):
|
| 106 |
print(f"📥 Downloading {f} from Hugging Face (public dataset)...")
|
| 107 |
downloaded = hf_hub_download(
|
| 108 |
repo_id=DATASET_REPO,
|
| 109 |
filename=f,
|
| 110 |
+
repo_type="dataset", # ✅ specify dataset type
|
| 111 |
+
force_download=True # ✅ force refresh cache if needed
|
| 112 |
)
|
| 113 |
+
os.replace(downloaded, download_path)
|
|
|
|
|
|
|
| 114 |
|
| 115 |
+
local_paths.append(download_path)
|
| 116 |
+
|
| 117 |
+
return local_paths
|
| 118 |
|
| 119 |
|
| 120 |
def detect_subject(fname: str) -> Optional[str]:
|