Spaces:

multimodalart
/

ai-toolkit

Running on CPU Upgrade

App Files Files Community

multimodalart HF Staff committed on Sep 17

Commit

e218de2

verified ·

1 Parent(s): 5675b78

new local dataset fix

Browse files

Files changed (1) hide show

ui/src/app/api/hf-jobs/route.ts +33 -13

ui/src/app/api/hf-jobs/route.ts CHANGED Viewed

@@ -244,6 +244,7 @@ def copy_dataset_files(source_dir: str, local_path: str):
             continue
     print(f"Prepared {len(image_files)} images and {captions_to_copy} captions in {local_path}")
 def download_dataset(dataset_repo: str, local_path: str):
@@ -255,16 +256,37 @@ def download_dataset(dataset_repo: str, local_path: str):
     local_source = find_local_dataset_source(dataset_repo)
     if local_source:
         print(f"Found local dataset at {local_source}")
-        copy_dataset_files(local_source, local_path)
-        return
     repo_id = normalize_repo_id(dataset_repo)
     try:
         dataset = load_dataset(repo_id, split="train")
         for i, item in enumerate(dataset):
-            if "image" in item:
                 image_path = os.path.join(local_path, f"image_{i:06d}.jpg")
                 image = item["image"]
@@ -276,24 +298,22 @@ def download_dataset(dataset_repo: str, local_path: str):
                     image = image.convert('RGB')
                 image.save(image_path, 'JPEG')
-            if "text" in item:
                 caption_path = os.path.join(local_path, f"image_{i:06d}.txt")
                 with open(caption_path, "w", encoding="utf-8") as f:
                     f.write(item["text"])
-        print(f"Downloaded {len(dataset)} items to {local_path}")
     except Exception as e:
         print(f"Failed to load as structured dataset: {e}")
-        print("Attempting to download raw files...")
-        temp_repo_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
-        print(f"Downloaded repo to: {temp_repo_path}")
-        print(f"Contents: {os.listdir(temp_repo_path)}")
-        copy_dataset_files(temp_repo_path, local_path)
 def create_config(dataset_path: str, output_path: str):
     """Create training configuration"""

             continue
     print(f"Prepared {len(image_files)} images and {captions_to_copy} captions in {local_path}")
+    return len(image_files), captions_to_copy
 def download_dataset(dataset_repo: str, local_path: str):
     local_source = find_local_dataset_source(dataset_repo)
     if local_source:
         print(f"Found local dataset at {local_source}")
+        images_copied, _ = copy_dataset_files(local_source, local_path)
+        if images_copied > 0:
+            return
+        print("Local dataset did not contain images, falling back to remote download")
     repo_id = normalize_repo_id(dataset_repo)
+    if repo_id:
+        try:
+            print(f"Attempting snapshot download for dataset {repo_id}")
+            temp_repo_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
+            print(f"Downloaded repo to: {temp_repo_path}")
+            print(f"Contents: {os.listdir(temp_repo_path)}")
+            images_copied, _ = copy_dataset_files(temp_repo_path, local_path)
+            if images_copied > 0:
+                return
+            print("Snapshot download did not contain images, attempting structured dataset load")
+        except Exception as snapshot_error:
+            print(f"Snapshot download failed: {snapshot_error}")
+    if not repo_id:
+        raise ValueError("Dataset repository ID is required when no local dataset is available")
     try:
         dataset = load_dataset(repo_id, split="train")
+        images_saved = 0
+        captions_saved = 0
         for i, item in enumerate(dataset):
+            if "image" in item and item["image"] is not None:
                 image_path = os.path.join(local_path, f"image_{i:06d}.jpg")
                 image = item["image"]
                     image = image.convert('RGB')
                 image.save(image_path, 'JPEG')
+                images_saved += 1
+            if "text" in item and item["text"] is not None:
                 caption_path = os.path.join(local_path, f"image_{i:06d}.txt")
                 with open(caption_path, "w", encoding="utf-8") as f:
                     f.write(item["text"])
+                captions_saved += 1
+        if images_saved == 0:
+            raise ValueError(f"Structured dataset load completed but produced 0 images for {repo_id}")
+        print(f"Downloaded {images_saved} items to {local_path}")
     except Exception as e:
         print(f"Failed to load as structured dataset: {e}")
+        raise
 def create_config(dataset_path: str, output_path: str):
     """Create training configuration"""