multimodalart HF Staff committed on
Commit
5675b78
·
verified ·
1 Parent(s): 63037c8

attempt local dataset fix

Browse files
Files changed (1) hide show
  1. ui/src/app/api/hf-jobs/route.ts +108 -74
ui/src/app/api/hf-jobs/route.ts CHANGED
@@ -162,104 +162,138 @@ def setup_ai_toolkit():
162
  sys.path.insert(0, os.path.abspath(repo_dir))
163
  return repo_dir
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
def download_dataset(dataset_repo: str, local_path: str):
    """Download a dataset from the HF Hub into local_path as file pairs.

    First tries to load ``dataset_repo`` as a structured ``datasets`` dataset;
    on any failure, falls back to snapshot-downloading the raw repo files.
    Images are written as ``image_NNNNNN.jpg`` and captions as
    ``image_NNNNNN.txt`` (paired positionally).
    """
    # Hoisted out of the except branch so all dependencies are visible up front.
    import glob
    import shutil

    from huggingface_hub import snapshot_download

    def _to_rgb(image):
        # JPEG cannot store alpha; composite RGBA onto a white background,
        # and convert any other non-JPEG-safe mode to RGB.
        if image.mode == 'RGBA':
            background = Image.new('RGB', image.size, (255, 255, 255))
            background.paste(image, mask=image.split()[-1])  # alpha channel as mask
            return background
        if image.mode not in ['RGB', 'L']:
            return image.convert('RGB')
        return image

    print(f"Downloading dataset from {dataset_repo}...")
    os.makedirs(local_path, exist_ok=True)

    try:
        # Preferred path: the repo is a structured dataset with image/text columns.
        dataset = load_dataset(dataset_repo, split="train")

        for i, item in enumerate(dataset):
            if "image" in item:
                image_path = os.path.join(local_path, f"image_{i:06d}.jpg")
                _to_rgb(item["image"]).save(image_path, 'JPEG')

            if "text" in item:
                caption_path = os.path.join(local_path, f"image_{i:06d}.txt")
                with open(caption_path, "w", encoding="utf-8") as f:
                    f.write(item["text"])

        print(f"Downloaded {len(dataset)} items to {local_path}")

    except Exception as e:
        print(f"Failed to load as structured dataset: {e}")
        print("Attempting to download raw files...")

        # Fallback: grab the raw repository contents.
        temp_repo_path = snapshot_download(repo_id=dataset_repo, repo_type="dataset")

        print(f"Downloaded repo to: {temp_repo_path}")
        print(f"Contents: {os.listdir(temp_repo_path)}")

        image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.webp', '*.bmp', '*.JPG', '*.JPEG', '*.PNG']
        image_files = []
        for ext in image_extensions:
            pattern = os.path.join(temp_repo_path, "**", ext)
            found_files = glob.glob(pattern, recursive=True)
            image_files.extend(found_files)
            print(f"Pattern {pattern} found {len(found_files)} files")

        # Fix: glob order is filesystem-dependent; sort both lists so
        # image i and caption i pair up deterministically across runs.
        image_files.sort()
        text_files = sorted(glob.glob(os.path.join(temp_repo_path, "**", "*.txt"), recursive=True))

        print(f"Found {len(image_files)} image files and {len(text_files)} text files")

        # Re-encode every image as JPEG; skip unreadable files (best-effort).
        for i, img_file in enumerate(image_files):
            dest_path = os.path.join(local_path, f"image_{i:06d}.jpg")
            try:
                with Image.open(img_file) as image:
                    _to_rgb(image).save(dest_path, 'JPEG')
            except Exception as img_error:
                print(f"Error processing image {img_file}: {img_error}")
                continue

        # Copy at most one caption per image, paired positionally.
        for i, txt_file in enumerate(text_files[:len(image_files)]):
            dest_path = os.path.join(local_path, f"image_{i:06d}.txt")
            try:
                shutil.copy2(txt_file, dest_path)
            except Exception as txt_error:
                print(f"Error copying text file {txt_file}: {txt_error}")
                continue

        print(f"Downloaded {len(image_files)} images and {len(text_files)} captions to {local_path}")
263
 
264
  def create_config(dataset_path: str, output_path: str):
265
  """Create training configuration"""
@@ -759,4 +793,4 @@ async function checkHFJobStatus(token: string, jobId: string): Promise<any> {
759
  reject(new Error(`Process error: ${err.message}`));
760
  });
761
  });
762
- }
 
162
  sys.path.insert(0, os.path.abspath(repo_dir))
163
  return repo_dir
164
 
165
def find_local_dataset_source(dataset_repo: str):
    """Return an existing local path for ``dataset_repo``, or None.

    Candidate locations, checked in order:
      1. the value exactly as given;
      2. its absolute form (only for relative paths);
      3. ``/datasets/<normalized repo id>`` (the mounted-dataset convention).
    """
    if not dataset_repo:
        return None

    repo_stripped = dataset_repo.strip()

    # Build the ordered candidate list. The original if/else appended
    # repo_stripped on both branches, and its extra '/datasets/...' check was
    # dead code (such values are absolute and already first in the list) —
    # both simplified here with identical behavior.
    candidates = [repo_stripped]
    if not os.path.isabs(repo_stripped):
        candidates.append(os.path.abspath(repo_stripped))

    normalized = normalize_repo_id(repo_stripped)
    if normalized:
        candidates.append(os.path.join("/datasets", normalized))

    # De-duplicate while preserving order; return the first path that exists.
    seen = set()
    for candidate in candidates:
        if not candidate or candidate in seen:
            continue
        seen.add(candidate)
        if os.path.exists(candidate):
            return candidate

    return None
194
+
195
+
196
def normalize_repo_id(dataset_repo: str) -> str:
    """Reduce a dataset reference to a bare ``owner/name`` repo id.

    Drops a leading '/datasets/' or 'datasets/' prefix (at most one) plus any
    surrounding whitespace and stray slashes.
    """
    repo_id = dataset_repo.strip()
    for prefix in ("/datasets/", "datasets/"):
        if repo_id.startswith(prefix):
            repo_id = repo_id[len(prefix):]
            break  # strip at most one prefix, mirroring the if/elif chain
    return repo_id.strip("/")
203
+
204
+
205
def copy_dataset_files(source_dir: str, local_path: str):
    """Re-encode images from source_dir into local_path and copy captions.

    Every image found (recursively) becomes ``image_NNNNNN.jpg``; ``.txt``
    files become ``image_NNNNNN.txt``. Pairing is positional, so both lists
    are sorted — raw glob order is filesystem-dependent and would otherwise
    mismatch captions across runs.
    """
    print(f"Collecting data files from {source_dir}")

    image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.webp', '*.bmp', '*.JPG', '*.JPEG', '*.PNG']
    image_files = []
    for ext in image_extensions:
        pattern = os.path.join(source_dir, "**", ext)
        found_files = glob.glob(pattern, recursive=True)
        image_files.extend(found_files)
        print(f"Pattern {pattern} found {len(found_files)} files")

    # Fix: sort so image i lines up with caption i deterministically.
    image_files.sort()
    text_files = sorted(glob.glob(os.path.join(source_dir, "**", "*.txt"), recursive=True))

    print(f"Found {len(image_files)} image files and {len(text_files)} text files")

    for i, img_file in enumerate(image_files):
        dest_path = os.path.join(local_path, f"image_{i:06d}.jpg")
        try:
            with Image.open(img_file) as image:
                if image.mode == 'RGBA':
                    # JPEG has no alpha: composite onto white via the alpha channel.
                    background = Image.new('RGB', image.size, (255, 255, 255))
                    background.paste(image, mask=image.split()[-1])
                    image = background
                elif image.mode not in ['RGB', 'L']:
                    image = image.convert('RGB')

                image.save(dest_path, 'JPEG')
        except Exception as img_error:
            # Best-effort: skip unreadable images instead of aborting the copy.
            print(f"Error processing image {img_file}: {img_error}")
            continue

    # Never copy more captions than there are images.
    captions_to_copy = min(len(text_files), len(image_files))
    for i, txt_file in enumerate(text_files[:captions_to_copy]):
        dest_path = os.path.join(local_path, f"image_{i:06d}.txt")
        try:
            shutil.copy2(txt_file, dest_path)
        except Exception as txt_error:
            print(f"Error copying text file {txt_file}: {txt_error}")
            continue

    print(f"Prepared {len(image_files)} images and {captions_to_copy} captions in {local_path}")
247
+
248
+
249
def download_dataset(dataset_repo: str, local_path: str):
    """Download dataset from HF Hub as files"""
    print(f"Downloading dataset from {dataset_repo}...")

    os.makedirs(local_path, exist_ok=True)

    # A dataset already present on disk wins over any Hub download.
    local_source = find_local_dataset_source(dataset_repo)
    if local_source:
        print(f"Found local dataset at {local_source}")
        copy_dataset_files(local_source, local_path)
        return

    repo_id = normalize_repo_id(dataset_repo)

    try:
        # Preferred: load as a structured dataset with image/text columns.
        dataset = load_dataset(repo_id, split="train")

        for idx, sample in enumerate(dataset):
            stem = os.path.join(local_path, f"image_{idx:06d}")

            if "image" in sample:
                picture = sample["image"]
                # JPEG cannot carry alpha: flatten RGBA onto white first.
                if picture.mode == 'RGBA':
                    flattened = Image.new('RGB', picture.size, (255, 255, 255))
                    flattened.paste(picture, mask=picture.split()[-1])
                    picture = flattened
                elif picture.mode not in ['RGB', 'L']:
                    picture = picture.convert('RGB')
                picture.save(stem + ".jpg", 'JPEG')

            if "text" in sample:
                with open(stem + ".txt", "w", encoding="utf-8") as caption_file:
                    caption_file.write(sample["text"])

        print(f"Downloaded {len(dataset)} items to {local_path}")

    except Exception as e:
        print(f"Failed to load as structured dataset: {e}")
        print("Attempting to download raw files...")

        # Fallback: pull the raw repo files and let the shared copier sort them out.
        temp_repo_path = snapshot_download(repo_id=repo_id, repo_type="dataset")

        print(f"Downloaded repo to: {temp_repo_path}")
        print(f"Contents: {os.listdir(temp_repo_path)}")

        copy_dataset_files(temp_repo_path, local_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
 
298
  def create_config(dataset_path: str, output_path: str):
299
  """Create training configuration"""
 
793
  reject(new Error(`Process error: ${err.message}`));
794
  });
795
  });
796
+ }