Try to fix loading markdown dataset
Browse files - backend/runner/config.py +52 -17
backend/runner/config.py
CHANGED
|
@@ -198,28 +198,63 @@ def load_markdown_dataset() -> Optional[Path]:
|
|
| 198 |
if DATASETS_AVAILABLE:
|
| 199 |
from datasets import load_dataset
|
| 200 |
print("Downloading markdown dataset...")
|
| 201 |
-
|
|
|
|
|
|
|
| 202 |
|
| 203 |
-
#
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
-
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
images_dir = work_dir / "images"
|
| 217 |
images_dir.mkdir(exist_ok=True)
|
| 218 |
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
print(f"✅ Successfully downloaded markdown dataset to {works_dir}")
|
| 225 |
return works_dir
|
|
|
|
| 198 |
if DATASETS_AVAILABLE:
|
| 199 |
from datasets import load_dataset
|
| 200 |
print("Downloading markdown dataset...")
|
| 201 |
+
# Use huggingface_hub to download files directly instead of datasets library
|
| 202 |
+
from huggingface_hub import list_repo_files
|
| 203 |
+
files = list_repo_files(repo_id=ARTEFACT_MARKDOWN_DATASET, repo_type="dataset")
|
| 204 |
|
| 205 |
+
# Filter for work directories and files
|
| 206 |
+
work_dirs = set()
|
| 207 |
+
for file_path in files:
|
| 208 |
+
if file_path.startswith("works/") and "/" in file_path[7:]:
|
| 209 |
+
work_id = file_path.split("/")[1]
|
| 210 |
+
work_dirs.add(work_id)
|
| 211 |
+
|
| 212 |
+
print(f" Found {len(work_dirs)} work directories to download")
|
| 213 |
+
|
| 214 |
+
# Download each work directory
|
| 215 |
+
for i, work_id in enumerate(work_dirs):
|
| 216 |
+
if i % 100 == 0:
|
| 217 |
+
print(f" Downloaded {i}/{len(work_dirs)} work directories...")
|
| 218 |
|
| 219 |
+
work_dir = works_dir / work_id
|
| 220 |
+
work_dir.mkdir(parents=True, exist_ok=True)
|
| 221 |
+
|
| 222 |
+
# Download markdown file
|
| 223 |
+
try:
|
| 224 |
+
md_file = hf_hub_download(
|
| 225 |
+
repo_id=ARTEFACT_MARKDOWN_DATASET,
|
| 226 |
+
filename=f"works/{work_id}/{work_id}.md",
|
| 227 |
+
repo_type="dataset"
|
| 228 |
+
)
|
| 229 |
+
# Copy to our cache
|
| 230 |
+
import shutil
|
| 231 |
+
shutil.copy2(md_file, work_dir / f"{work_id}.md")
|
| 232 |
+
except Exception as e:
|
| 233 |
+
print(f"⚠️ Could not download markdown for {work_id}: {e}")
|
| 234 |
+
|
| 235 |
+
# Download images
|
| 236 |
+
try:
|
| 237 |
images_dir = work_dir / "images"
|
| 238 |
images_dir.mkdir(exist_ok=True)
|
| 239 |
|
| 240 |
+
# Get list of image files for this work
|
| 241 |
+
work_files = [f for f in files if f.startswith(f"works/{work_id}/images/")]
|
| 242 |
+
|
| 243 |
+
for img_file in work_files:
|
| 244 |
+
try:
|
| 245 |
+
downloaded_file = hf_hub_download(
|
| 246 |
+
repo_id=ARTEFACT_MARKDOWN_DATASET,
|
| 247 |
+
filename=img_file,
|
| 248 |
+
repo_type="dataset"
|
| 249 |
+
)
|
| 250 |
+
# Copy to our cache
|
| 251 |
+
img_name = img_file.split("/")[-1]
|
| 252 |
+
shutil.copy2(downloaded_file, images_dir / img_name)
|
| 253 |
+
except Exception as e:
|
| 254 |
+
print(f"⚠️ Could not download image {img_file}: {e}")
|
| 255 |
+
|
| 256 |
+
except Exception as e:
|
| 257 |
+
print(f"⚠️ Could not download images for {work_id}: {e}")
|
| 258 |
|
| 259 |
print(f"✅ Successfully downloaded markdown dataset to {works_dir}")
|
| 260 |
return works_dir
|