samwaugh committed on
Commit
e09d7cc
·
1 Parent(s): 9d2c440

Try to fix markdown loading

Browse files
Files changed (1) hide show
  1. backend/runner/config.py +25 -3
backend/runner/config.py CHANGED
@@ -202,19 +202,36 @@ def load_markdown_dataset() -> Optional[Path]:
202
  from huggingface_hub import list_repo_files
203
  files = list_repo_files(repo_id=ARTEFACT_MARKDOWN_DATASET, repo_type="dataset")
204
 
 
 
 
 
 
 
 
205
  # Filter for work directories and files
206
  work_dirs = set()
207
  for file_path in files:
208
- if file_path.startswith("works/") and "/" in file_path[7:]:
209
- work_id = file_path.split("/")[1]
210
- work_dirs.add(work_id)
 
 
 
211
 
212
  print(f" Found {len(work_dirs)} work directories to download")
213
 
 
 
 
 
 
214
  # Download each work directory
215
  for i, work_id in enumerate(work_dirs):
216
  if i % 100 == 0:
217
  print(f" Downloaded {i}/{len(work_dirs)} work directories...")
 
 
218
 
219
  work_dir = works_dir / work_id
220
  work_dir.mkdir(parents=True, exist_ok=True)
@@ -229,6 +246,8 @@ def load_markdown_dataset() -> Optional[Path]:
229
  # Copy to our cache
230
  import shutil
231
  shutil.copy2(md_file, work_dir / f"{work_id}.md")
 
 
232
  except Exception as e:
233
  print(f"⚠️ Could not download markdown for {work_id}: {e}")
234
 
@@ -240,6 +259,9 @@ def load_markdown_dataset() -> Optional[Path]:
240
  # Get list of image files for this work
241
  work_files = [f for f in files if f.startswith(f"works/{work_id}/images/")]
242
 
 
 
 
243
  for img_file in work_files:
244
  try:
245
  downloaded_file = hf_hub_download(
 
202
  from huggingface_hub import list_repo_files
203
  files = list_repo_files(repo_id=ARTEFACT_MARKDOWN_DATASET, repo_type="dataset")
204
 
205
+ # Debug: Show dataset structure
206
+ print(f"πŸ” Total files in dataset: {len(files)}")
207
+ works_files = [f for f in files if f.startswith("works/")]
208
+ print(f"πŸ” Files starting with 'works/': {len(works_files)}")
209
+ if works_files:
210
+ print(f"πŸ” Sample work files: {works_files[:5]}")
211
+
212
  # Filter for work directories and files
213
  work_dirs = set()
214
  for file_path in files:
215
+ if file_path.startswith("works/"):
216
+ parts = file_path.split("/")
217
+ if len(parts) >= 2:
218
+ work_id = parts[1]
219
+ if work_id.startswith("W"): # Only include work IDs
220
+ work_dirs.add(work_id)
221
 
222
  print(f" Found {len(work_dirs)} work directories to download")
223
 
224
+ # Debug: Show sample work IDs
225
+ work_list = sorted(list(work_dirs))
226
+ print(f"πŸ” Sample work IDs: {work_list[:10]}")
227
+ print(f"πŸ” Last few work IDs: {work_list[-5:]}")
228
+
229
  # Download each work directory
230
  for i, work_id in enumerate(work_dirs):
231
  if i % 100 == 0:
232
  print(f" Downloaded {i}/{len(work_dirs)} work directories...")
233
+ if i < 10: # Show first 10 work IDs being processed
234
+ print(f"πŸ” Processing work: {work_id}")
235
 
236
  work_dir = works_dir / work_id
237
  work_dir.mkdir(parents=True, exist_ok=True)
 
246
  # Copy to our cache
247
  import shutil
248
  shutil.copy2(md_file, work_dir / f"{work_id}.md")
249
+ if i < 5: # Debug: Show first few successful downloads
250
+ print(f"βœ… Downloaded markdown for {work_id}")
251
  except Exception as e:
252
  print(f"⚠️ Could not download markdown for {work_id}: {e}")
253
 
 
259
  # Get list of image files for this work
260
  work_files = [f for f in files if f.startswith(f"works/{work_id}/images/")]
261
 
262
+ if i < 3: # Debug: Show image count for first few works
263
+ print(f"πŸ” Found {len(work_files)} images for {work_id}")
264
+
265
  for img_file in work_files:
266
  try:
267
  downloaded_file = hf_hub_download(