Try to fix markdown loading
Browse files- backend/runner/config.py +25 -3
backend/runner/config.py
CHANGED
|
@@ -202,19 +202,36 @@ def load_markdown_dataset() -> Optional[Path]:
|
|
| 202 |
from huggingface_hub import list_repo_files
|
| 203 |
files = list_repo_files(repo_id=ARTEFACT_MARKDOWN_DATASET, repo_type="dataset")
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
# Filter for work directories and files
|
| 206 |
work_dirs = set()
|
| 207 |
for file_path in files:
|
| 208 |
-
if file_path.startswith("works/")
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
print(f" Found {len(work_dirs)} work directories to download")
|
| 213 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
# Download each work directory
|
| 215 |
for i, work_id in enumerate(work_dirs):
|
| 216 |
if i % 100 == 0:
|
| 217 |
print(f" Downloaded {i}/{len(work_dirs)} work directories...")
|
|
|
|
|
|
|
| 218 |
|
| 219 |
work_dir = works_dir / work_id
|
| 220 |
work_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -229,6 +246,8 @@ def load_markdown_dataset() -> Optional[Path]:
|
|
| 229 |
# Copy to our cache
|
| 230 |
import shutil
|
| 231 |
shutil.copy2(md_file, work_dir / f"{work_id}.md")
|
|
|
|
|
|
|
| 232 |
except Exception as e:
|
| 233 |
print(f"⚠️ Could not download markdown for {work_id}: {e}")
|
| 234 |
|
|
@@ -240,6 +259,9 @@ def load_markdown_dataset() -> Optional[Path]:
|
|
| 240 |
# Get list of image files for this work
|
| 241 |
work_files = [f for f in files if f.startswith(f"works/{work_id}/images/")]
|
| 242 |
|
|
|
|
|
|
|
|
|
|
| 243 |
for img_file in work_files:
|
| 244 |
try:
|
| 245 |
downloaded_file = hf_hub_download(
|
|
|
|
| 202 |
from huggingface_hub import list_repo_files
|
| 203 |
files = list_repo_files(repo_id=ARTEFACT_MARKDOWN_DATASET, repo_type="dataset")
|
| 204 |
|
| 205 |
+
# Debug: Show dataset structure
|
| 206 |
+
print(f"π Total files in dataset: {len(files)}")
|
| 207 |
+
works_files = [f for f in files if f.startswith("works/")]
|
| 208 |
+
print(f"π Files starting with 'works/': {len(works_files)}")
|
| 209 |
+
if works_files:
|
| 210 |
+
print(f"π Sample work files: {works_files[:5]}")
|
| 211 |
+
|
| 212 |
# Filter for work directories and files
|
| 213 |
work_dirs = set()
|
| 214 |
for file_path in files:
|
| 215 |
+
if file_path.startswith("works/"):
|
| 216 |
+
parts = file_path.split("/")
|
| 217 |
+
if len(parts) >= 2:
|
| 218 |
+
work_id = parts[1]
|
| 219 |
+
if work_id.startswith("W"): # Only include work IDs
|
| 220 |
+
work_dirs.add(work_id)
|
| 221 |
|
| 222 |
print(f" Found {len(work_dirs)} work directories to download")
|
| 223 |
|
| 224 |
+
# Debug: Show sample work IDs
|
| 225 |
+
work_list = sorted(list(work_dirs))
|
| 226 |
+
print(f"π Sample work IDs: {work_list[:10]}")
|
| 227 |
+
print(f"π Last few work IDs: {work_list[-5:]}")
|
| 228 |
+
|
| 229 |
# Download each work directory
|
| 230 |
for i, work_id in enumerate(work_dirs):
|
| 231 |
if i % 100 == 0:
|
| 232 |
print(f" Downloaded {i}/{len(work_dirs)} work directories...")
|
| 233 |
+
if i < 10: # Show first 10 work IDs being processed
|
| 234 |
+
print(f"π Processing work: {work_id}")
|
| 235 |
|
| 236 |
work_dir = works_dir / work_id
|
| 237 |
work_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 246 |
# Copy to our cache
|
| 247 |
import shutil
|
| 248 |
shutil.copy2(md_file, work_dir / f"{work_id}.md")
|
| 249 |
+
if i < 5: # Debug: Show first few successful downloads
|
| 250 |
+
print(f"✅ Downloaded markdown for {work_id}")
|
| 251 |
except Exception as e:
|
| 252 |
print(f"⚠️ Could not download markdown for {work_id}: {e}")
|
| 253 |
|
|
|
|
| 259 |
# Get list of image files for this work
|
| 260 |
work_files = [f for f in files if f.startswith(f"works/{work_id}/images/")]
|
| 261 |
|
| 262 |
+
if i < 3: # Debug: Show image count for first few works
|
| 263 |
+
print(f"π Found {len(work_files)} images for {work_id}")
|
| 264 |
+
|
| 265 |
for img_file in work_files:
|
| 266 |
try:
|
| 267 |
downloaded_file = hf_hub_download(
|