multimodalart HF Staff committed on
Commit
e218de2
·
verified ·
1 Parent(s): 5675b78

new local dataset fix

Browse files
Files changed (1) hide show
  1. ui/src/app/api/hf-jobs/route.ts +33 -13
ui/src/app/api/hf-jobs/route.ts CHANGED
@@ -244,6 +244,7 @@ def copy_dataset_files(source_dir: str, local_path: str):
244
  continue
245
 
246
  print(f"Prepared {len(image_files)} images and {captions_to_copy} captions in {local_path}")
 
247
 
248
 
249
  def download_dataset(dataset_repo: str, local_path: str):
@@ -255,16 +256,37 @@ def download_dataset(dataset_repo: str, local_path: str):
255
  local_source = find_local_dataset_source(dataset_repo)
256
  if local_source:
257
  print(f"Found local dataset at {local_source}")
258
- copy_dataset_files(local_source, local_path)
259
- return
 
 
260
 
261
  repo_id = normalize_repo_id(dataset_repo)
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  try:
264
  dataset = load_dataset(repo_id, split="train")
265
 
 
 
 
266
  for i, item in enumerate(dataset):
267
- if "image" in item:
268
  image_path = os.path.join(local_path, f"image_{i:06d}.jpg")
269
  image = item["image"]
270
 
@@ -276,24 +298,22 @@ def download_dataset(dataset_repo: str, local_path: str):
276
  image = image.convert('RGB')
277
 
278
  image.save(image_path, 'JPEG')
 
279
 
280
- if "text" in item:
281
  caption_path = os.path.join(local_path, f"image_{i:06d}.txt")
282
  with open(caption_path, "w", encoding="utf-8") as f:
283
  f.write(item["text"])
 
 
 
 
284
 
285
- print(f"Downloaded {len(dataset)} items to {local_path}")
286
 
287
  except Exception as e:
288
  print(f"Failed to load as structured dataset: {e}")
289
- print("Attempting to download raw files...")
290
-
291
- temp_repo_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
292
-
293
- print(f"Downloaded repo to: {temp_repo_path}")
294
- print(f"Contents: {os.listdir(temp_repo_path)}")
295
-
296
- copy_dataset_files(temp_repo_path, local_path)
297
 
298
  def create_config(dataset_path: str, output_path: str):
299
  """Create training configuration"""
 
244
  continue
245
 
246
  print(f"Prepared {len(image_files)} images and {captions_to_copy} captions in {local_path}")
247
+ return len(image_files), captions_to_copy
248
 
249
 
250
  def download_dataset(dataset_repo: str, local_path: str):
 
256
  local_source = find_local_dataset_source(dataset_repo)
257
  if local_source:
258
  print(f"Found local dataset at {local_source}")
259
+ images_copied, _ = copy_dataset_files(local_source, local_path)
260
+ if images_copied > 0:
261
+ return
262
+ print("Local dataset did not contain images, falling back to remote download")
263
 
264
  repo_id = normalize_repo_id(dataset_repo)
265
 
266
+ if repo_id:
267
+ try:
268
+ print(f"Attempting snapshot download for dataset {repo_id}")
269
+ temp_repo_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
270
+ print(f"Downloaded repo to: {temp_repo_path}")
271
+ print(f"Contents: {os.listdir(temp_repo_path)}")
272
+ images_copied, _ = copy_dataset_files(temp_repo_path, local_path)
273
+ if images_copied > 0:
274
+ return
275
+ print("Snapshot download did not contain images, attempting structured dataset load")
276
+ except Exception as snapshot_error:
277
+ print(f"Snapshot download failed: {snapshot_error}")
278
+
279
+ if not repo_id:
280
+ raise ValueError("Dataset repository ID is required when no local dataset is available")
281
+
282
  try:
283
  dataset = load_dataset(repo_id, split="train")
284
 
285
+ images_saved = 0
286
+ captions_saved = 0
287
+
288
  for i, item in enumerate(dataset):
289
+ if "image" in item and item["image"] is not None:
290
  image_path = os.path.join(local_path, f"image_{i:06d}.jpg")
291
  image = item["image"]
292
 
 
298
  image = image.convert('RGB')
299
 
300
  image.save(image_path, 'JPEG')
301
+ images_saved += 1
302
 
303
+ if "text" in item and item["text"] is not None:
304
  caption_path = os.path.join(local_path, f"image_{i:06d}.txt")
305
  with open(caption_path, "w", encoding="utf-8") as f:
306
  f.write(item["text"])
307
+ captions_saved += 1
308
+
309
+ if images_saved == 0:
310
+ raise ValueError(f"Structured dataset load completed but produced 0 images for {repo_id}")
311
 
312
+ print(f"Downloaded {images_saved} items to {local_path}")
313
 
314
  except Exception as e:
315
  print(f"Failed to load as structured dataset: {e}")
316
+ raise
 
 
 
 
 
 
 
317
 
318
  def create_config(dataset_path: str, output_path: str):
319
  """Create training configuration"""