samwaugh committed on
Commit
9d2c440
·
1 Parent(s): 3a15d23

Try to fix loading markdown dataset

Browse files
Files changed (1) hide show
  1. backend/runner/config.py +52 -17
backend/runner/config.py CHANGED
@@ -198,28 +198,63 @@ def load_markdown_dataset() -> Optional[Path]:
198
  if DATASETS_AVAILABLE:
199
  from datasets import load_dataset
200
  print("📥 Downloading markdown dataset...")
201
- dataset = load_dataset(ARTEFACT_MARKDOWN_DATASET, split="train")
 
 
202
 
203
- # Save files to local cache
204
- for item in dataset:
205
- work_id = item.get('work_id', 'unknown')
206
- work_dir = markdown_cache_dir / work_id
207
- work_dir.mkdir(exist_ok=True)
208
-
209
- # Save markdown content
210
- if 'markdown' in item:
211
- md_file = work_dir / f"{work_id}.md"
212
- md_file.write_text(item['markdown'], encoding='utf-8')
 
 
 
213
 
214
- # Save images
215
- if 'images' in item and item['images']:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  images_dir = work_dir / "images"
217
  images_dir.mkdir(exist_ok=True)
218
 
219
- for i, img_data in enumerate(item['images']):
220
- img_ext = '.png' if img_data.get('format') == 'PNG' else '.jpg'
221
- img_file = images_dir / f"image-{i:03d}{img_ext}"
222
- img_file.write_bytes(img_data['bytes'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
  print(f"✅ Successfully downloaded markdown dataset to {works_dir}")
225
  return works_dir
 
198
  if DATASETS_AVAILABLE:
199
  from datasets import load_dataset
200
  print("📥 Downloading markdown dataset...")
201
+ # Use huggingface_hub to download files directly instead of datasets library
202
+ from huggingface_hub import list_repo_files
203
+ files = list_repo_files(repo_id=ARTEFACT_MARKDOWN_DATASET, repo_type="dataset")
204
 
205
+ # Filter for work directories and files
206
+ work_dirs = set()
207
+ for file_path in files:
208
+ if file_path.startswith("works/") and "/" in file_path[7:]:
209
+ work_id = file_path.split("/")[1]
210
+ work_dirs.add(work_id)
211
+
212
+ print(f" Found {len(work_dirs)} work directories to download")
213
+
214
+ # Download each work directory
215
+ for i, work_id in enumerate(work_dirs):
216
+ if i % 100 == 0:
217
+ print(f" Downloaded {i}/{len(work_dirs)} work directories...")
218
 
219
+ work_dir = works_dir / work_id
220
+ work_dir.mkdir(parents=True, exist_ok=True)
221
+
222
+ # Download markdown file
223
+ try:
224
+ md_file = hf_hub_download(
225
+ repo_id=ARTEFACT_MARKDOWN_DATASET,
226
+ filename=f"works/{work_id}/{work_id}.md",
227
+ repo_type="dataset"
228
+ )
229
+ # Copy to our cache
230
+ import shutil
231
+ shutil.copy2(md_file, work_dir / f"{work_id}.md")
232
+ except Exception as e:
233
+ print(f"⚠️ Could not download markdown for {work_id}: {e}")
234
+
235
+ # Download images
236
+ try:
237
  images_dir = work_dir / "images"
238
  images_dir.mkdir(exist_ok=True)
239
 
240
+ # Get list of image files for this work
241
+ work_files = [f for f in files if f.startswith(f"works/{work_id}/images/")]
242
+
243
+ for img_file in work_files:
244
+ try:
245
+ downloaded_file = hf_hub_download(
246
+ repo_id=ARTEFACT_MARKDOWN_DATASET,
247
+ filename=img_file,
248
+ repo_type="dataset"
249
+ )
250
+ # Copy to our cache
251
+ img_name = img_file.split("/")[-1]
252
+ shutil.copy2(downloaded_file, images_dir / img_name)
253
+ except Exception as e:
254
+ print(f"⚠️ Could not download image {img_file}: {e}")
255
+
256
+ except Exception as e:
257
+ print(f"⚠️ Could not download images for {work_id}: {e}")
258
 
259
  print(f"✅ Successfully downloaded markdown dataset to {works_dir}")
260
  return works_dir