Spaces: Running on CPU Upgrade
new local dataset fix
Browse files
- ui/src/app/api/hf-jobs/route.ts (+33 -13)
ui/src/app/api/hf-jobs/route.ts CHANGED
|
@@ -244,6 +244,7 @@ def copy_dataset_files(source_dir: str, local_path: str):
|
|
| 244 |
continue
|
| 245 |
|
| 246 |
print(f"Prepared {len(image_files)} images and {captions_to_copy} captions in {local_path}")
|
|
|
|
| 247 |
|
| 248 |
|
| 249 |
def download_dataset(dataset_repo: str, local_path: str):
|
|
@@ -255,16 +256,37 @@ def download_dataset(dataset_repo: str, local_path: str):
|
|
| 255 |
local_source = find_local_dataset_source(dataset_repo)
|
| 256 |
if local_source:
|
| 257 |
print(f"Found local dataset at {local_source}")
|
| 258 |
-
copy_dataset_files(local_source, local_path)
|
| 259 |
-
|
|
|
|
|
|
|
| 260 |
|
| 261 |
repo_id = normalize_repo_id(dataset_repo)
|
| 262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
try:
|
| 264 |
dataset = load_dataset(repo_id, split="train")
|
| 265 |
|
|
|
|
|
|
|
|
|
|
| 266 |
for i, item in enumerate(dataset):
|
| 267 |
-
if "image" in item:
|
| 268 |
image_path = os.path.join(local_path, f"image_{i:06d}.jpg")
|
| 269 |
image = item["image"]
|
| 270 |
|
|
@@ -276,24 +298,22 @@ def download_dataset(dataset_repo: str, local_path: str):
|
|
| 276 |
image = image.convert('RGB')
|
| 277 |
|
| 278 |
image.save(image_path, 'JPEG')
|
|
|
|
| 279 |
|
| 280 |
-
if "text" in item:
|
| 281 |
caption_path = os.path.join(local_path, f"image_{i:06d}.txt")
|
| 282 |
with open(caption_path, "w", encoding="utf-8") as f:
|
| 283 |
f.write(item["text"])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
-
print(f"Downloaded {
|
| 286 |
|
| 287 |
except Exception as e:
|
| 288 |
print(f"Failed to load as structured dataset: {e}")
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
temp_repo_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
|
| 292 |
-
|
| 293 |
-
print(f"Downloaded repo to: {temp_repo_path}")
|
| 294 |
-
print(f"Contents: {os.listdir(temp_repo_path)}")
|
| 295 |
-
|
| 296 |
-
copy_dataset_files(temp_repo_path, local_path)
|
| 297 |
|
| 298 |
def create_config(dataset_path: str, output_path: str):
|
| 299 |
"""Create training configuration"""
|
|
|
|
| 244 |
continue
|
| 245 |
|
| 246 |
print(f"Prepared {len(image_files)} images and {captions_to_copy} captions in {local_path}")
|
| 247 |
+
return len(image_files), captions_to_copy
|
| 248 |
|
| 249 |
|
| 250 |
def download_dataset(dataset_repo: str, local_path: str):
|
|
|
|
| 256 |
local_source = find_local_dataset_source(dataset_repo)
|
| 257 |
if local_source:
|
| 258 |
print(f"Found local dataset at {local_source}")
|
| 259 |
+
images_copied, _ = copy_dataset_files(local_source, local_path)
|
| 260 |
+
if images_copied > 0:
|
| 261 |
+
return
|
| 262 |
+
print("Local dataset did not contain images, falling back to remote download")
|
| 263 |
|
| 264 |
repo_id = normalize_repo_id(dataset_repo)
|
| 265 |
|
| 266 |
+
if repo_id:
|
| 267 |
+
try:
|
| 268 |
+
print(f"Attempting snapshot download for dataset {repo_id}")
|
| 269 |
+
temp_repo_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
|
| 270 |
+
print(f"Downloaded repo to: {temp_repo_path}")
|
| 271 |
+
print(f"Contents: {os.listdir(temp_repo_path)}")
|
| 272 |
+
images_copied, _ = copy_dataset_files(temp_repo_path, local_path)
|
| 273 |
+
if images_copied > 0:
|
| 274 |
+
return
|
| 275 |
+
print("Snapshot download did not contain images, attempting structured dataset load")
|
| 276 |
+
except Exception as snapshot_error:
|
| 277 |
+
print(f"Snapshot download failed: {snapshot_error}")
|
| 278 |
+
|
| 279 |
+
if not repo_id:
|
| 280 |
+
raise ValueError("Dataset repository ID is required when no local dataset is available")
|
| 281 |
+
|
| 282 |
try:
|
| 283 |
dataset = load_dataset(repo_id, split="train")
|
| 284 |
|
| 285 |
+
images_saved = 0
|
| 286 |
+
captions_saved = 0
|
| 287 |
+
|
| 288 |
for i, item in enumerate(dataset):
|
| 289 |
+
if "image" in item and item["image"] is not None:
|
| 290 |
image_path = os.path.join(local_path, f"image_{i:06d}.jpg")
|
| 291 |
image = item["image"]
|
| 292 |
|
|
|
|
| 298 |
image = image.convert('RGB')
|
| 299 |
|
| 300 |
image.save(image_path, 'JPEG')
|
| 301 |
+
images_saved += 1
|
| 302 |
|
| 303 |
+
if "text" in item and item["text"] is not None:
|
| 304 |
caption_path = os.path.join(local_path, f"image_{i:06d}.txt")
|
| 305 |
with open(caption_path, "w", encoding="utf-8") as f:
|
| 306 |
f.write(item["text"])
|
| 307 |
+
captions_saved += 1
|
| 308 |
+
|
| 309 |
+
if images_saved == 0:
|
| 310 |
+
raise ValueError(f"Structured dataset load completed but produced 0 images for {repo_id}")
|
| 311 |
|
| 312 |
+
print(f"Downloaded {images_saved} items to {local_path}")
|
| 313 |
|
| 314 |
except Exception as e:
|
| 315 |
print(f"Failed to load as structured dataset: {e}")
|
| 316 |
+
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
|
| 318 |
def create_config(dataset_path: str, output_path: str):
|
| 319 |
"""Create training configuration"""
|