samwaugh committed on
Commit
9d2c440
·
1 Parent(s): 3a15d23

Try to fix loading markdown dataset

Browse files
Files changed (1) hide show
  1. backend/runner/config.py +52 -17
backend/runner/config.py CHANGED
@@ -198,28 +198,63 @@ def load_markdown_dataset() -> Optional[Path]:
198
  if DATASETS_AVAILABLE:
199
  from datasets import load_dataset
200
  print("📥 Downloading markdown dataset...")
201
- dataset = load_dataset(ARTEFACT_MARKDOWN_DATASET, split="train")
 
 
202
 
203
- # Save files to local cache
204
- for item in dataset:
205
- work_id = item.get('work_id', 'unknown')
206
- work_dir = markdown_cache_dir / work_id
207
- work_dir.mkdir(exist_ok=True)
208
-
209
- # Save markdown content
210
- if 'markdown' in item:
211
- md_file = work_dir / f"{work_id}.md"
212
- md_file.write_text(item['markdown'], encoding='utf-8')
 
 
 
213
 
214
- # Save images
215
- if 'images' in item and item['images']:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  images_dir = work_dir / "images"
217
  images_dir.mkdir(exist_ok=True)
218
 
219
- for i, img_data in enumerate(item['images']):
220
- img_ext = '.png' if img_data.get('format') == 'PNG' else '.jpg'
221
- img_file = images_dir / f"image-{i:03d}{img_ext}"
222
- img_file.write_bytes(img_data['bytes'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
  print(f"✅ Successfully downloaded markdown dataset to {works_dir}")
225
  return works_dir
 
198
  if DATASETS_AVAILABLE:
199
  from datasets import load_dataset
200
  print("📥 Downloading markdown dataset...")
201
+ # Use huggingface_hub to download files directly instead of datasets library
202
+ from huggingface_hub import list_repo_files
203
+ files = list_repo_files(repo_id=ARTEFACT_MARKDOWN_DATASET, repo_type="dataset")
204
 
205
+ # Filter for work directories and files
206
+ work_dirs = set()
207
+ for file_path in files:
208
+ if file_path.startswith("works/") and "/" in file_path[7:]:
209
+ work_id = file_path.split("/")[1]
210
+ work_dirs.add(work_id)
211
+
212
+ print(f" Found {len(work_dirs)} work directories to download")
213
+
214
+ # Download each work directory
215
+ for i, work_id in enumerate(work_dirs):
216
+ if i % 100 == 0:
217
+ print(f" Downloaded {i}/{len(work_dirs)} work directories...")
218
 
219
+ work_dir = works_dir / work_id
220
+ work_dir.mkdir(parents=True, exist_ok=True)
221
+
222
+ # Download markdown file
223
+ try:
224
+ md_file = hf_hub_download(
225
+ repo_id=ARTEFACT_MARKDOWN_DATASET,
226
+ filename=f"works/{work_id}/{work_id}.md",
227
+ repo_type="dataset"
228
+ )
229
+ # Copy to our cache
230
+ import shutil
231
+ shutil.copy2(md_file, work_dir / f"{work_id}.md")
232
+ except Exception as e:
233
+ print(f"⚠️ Could not download markdown for {work_id}: {e}")
234
+
235
+ # Download images
236
+ try:
237
  images_dir = work_dir / "images"
238
  images_dir.mkdir(exist_ok=True)
239
 
240
+ # Get list of image files for this work
241
+ work_files = [f for f in files if f.startswith(f"works/{work_id}/images/")]
242
+
243
+ for img_file in work_files:
244
+ try:
245
+ downloaded_file = hf_hub_download(
246
+ repo_id=ARTEFACT_MARKDOWN_DATASET,
247
+ filename=img_file,
248
+ repo_type="dataset"
249
+ )
250
+ # Copy to our cache
251
+ img_name = img_file.split("/")[-1]
252
+ shutil.copy2(downloaded_file, images_dir / img_name)
253
+ except Exception as e:
254
+ print(f"⚠️ Could not download image {img_file}: {e}")
255
+
256
+ except Exception as e:
257
+ print(f"⚠️ Could not download images for {work_id}: {e}")
258
 
259
  print(f"✅ Successfully downloaded markdown dataset to {works_dir}")
260
  return works_dir