Ali Mohsin committed on
Commit
4619bfc
·
1 Parent(s): a9c2886

Optimizations try 1

Browse files

Enhance dataset preparation logic to check for existing images and splits before processing. Introduce manual dataset preparation option in the UI. Improve user feedback during dataset setup and streamline the bootstrap process.

Files changed (2) hide show
  1. app.py +72 -50
  2. utils/data_fetch.py +3 -1
app.py CHANGED
@@ -270,20 +270,38 @@ def _background_bootstrap():
270
  global BOOT_STATUS
271
  global DATASET_ROOT
272
  try:
273
- BOOT_STATUS = "preparing-dataset"
274
- ds_root = ensure_dataset_ready()
275
- DATASET_ROOT = ds_root
276
- if not ds_root:
277
- BOOT_STATUS = "dataset-not-prepared"
278
- return
279
-
280
- # Prepare splits from official data if missing
281
- splits_dir = os.path.join(ds_root, "splits")
282
- need_prepare = not (
283
  os.path.isfile(os.path.join(splits_dir, "train.json")) or
284
  os.path.isfile(os.path.join(splits_dir, "outfit_triplets_train.json"))
285
  )
286
- if need_prepare:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
  BOOT_STATUS = "creating-splits"
288
  os.makedirs(splits_dir, exist_ok=True)
289
  from scripts.prepare_polyvore import main as prepare_main
@@ -292,10 +310,12 @@ def _background_bootstrap():
292
  argv_bak = sys.argv
293
  try:
294
  # Use official splits from nondisjoint/ and disjoint/ folders with default size limit (500 samples for faster training)
295
- sys.argv = ["prepare_polyvore.py", "--root", ds_root, "--max_samples", "500"]
296
  prepare_main()
297
  finally:
298
  sys.argv = argv_bak
 
 
299
 
300
  # Train if checkpoints are absent
301
  export_dir = os.getenv("EXPORT_DIR", "models/exports")
@@ -1326,7 +1346,7 @@ def start_training_simple(dataset_size: str, res_epochs: int, vit_epochs: int):
1326
  except Exception as e:
1327
  log_message += f"\nError: {e}"
1328
 
1329
- threading.Thread(target=_runner, daemon=True).start()
1330
  return log_message
1331
 
1332
 
@@ -1440,10 +1460,48 @@ with gr.Blocks(fill_height=True, title="Dressify - Advanced Outfit Recommendatio
1440
  with gr.Tab("πŸ”¬ Advanced Training"):
1441
  gr.Markdown("### 🎯 Comprehensive Training Parameter Control\nCustomize every aspect of model training for research and experimentation.")
1442
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1443
  # Global Dataset Size Control
1444
  with gr.Row():
1445
  gr.Markdown("#### 🎯 **Global Dataset Size Control**")
1446
- gr.Markdown("**Note**: Initial bootstrap downloads full dataset (required). Use 'Apply' button to limit splits for testing.")
1447
 
1448
  with gr.Row():
1449
  gr.Markdown("#### πŸ“Š **Current Behavior**")
@@ -1703,42 +1761,6 @@ with gr.Blocks(fill_height=True, title="Dressify - Advanced Outfit Recommendatio
1703
 
1704
  download_all_btn.click(fn=download_all_files, inputs=[], outputs=download_result)
1705
 
1706
- with gr.Tab("πŸ”§ Simple Training"):
1707
- gr.Markdown("### πŸš€ Quick Training with Default Parameters\nFast training with proven configurations for immediate results.")
1708
-
1709
- with gr.Row():
1710
- with gr.Column(scale=1):
1711
- gr.Markdown("#### πŸ“Š Dataset Size Control")
1712
- gr.Markdown("Start small for testing, increase for production training")
1713
- dataset_size = gr.Dropdown(
1714
- choices=["160", "500", "2000", "5000", "10000", "25000", "50000", "full"],
1715
- value="500",
1716
- label="Training Dataset Size"
1717
- )
1718
- gr.Markdown("**2000**: Quick testing (~2-5 min)\n**5000**: Fast validation (~5-10 min)\n**10000**: Good validation (~10-20 min)\n**25000+**: Production training")
1719
-
1720
- with gr.Column(scale=1):
1721
- gr.Markdown("#### βš™οΈ Training Parameters")
1722
- epochs_res = gr.Slider(1, 50, value=3, step=1, label="ResNet epochs")
1723
- epochs_vit = gr.Slider(1, 100, value=3, step=1, label="ViT epochs")
1724
-
1725
- train_log = gr.Textbox(label="Training Log", lines=10)
1726
- start_btn = gr.Button("Start Training")
1727
- start_btn.click(fn=start_training_simple, inputs=[dataset_size, epochs_res, epochs_vit], outputs=train_log)
1728
-
1729
- with gr.Tab("πŸ“Š Embed (Debug)"):
1730
- inp = gr.Files(
1731
- label="Upload Items (multiple images)",
1732
- file_count="multiple"
1733
- # Note: file_types removed to allow API client flexibility
1734
- # Validation is handled by our image_utils.load_images_from_files()
1735
- )
1736
- out = gr.Textbox(label="Embeddings (JSON)")
1737
- btn = gr.Button("Compute Embeddings")
1738
- btn.click(fn=gradio_embed, inputs=inp, outputs=out)
1739
-
1740
-
1741
-
1742
  with gr.Tab("πŸ“ˆ Status"):
1743
  gr.Markdown("### 🚦 System Status and Monitoring\nReal-time status of dataset preparation, training, and system health.")
1744
  status = gr.Textbox(label="Bootstrap Status", value=lambda: BOOT_STATUS)
 
270
  global BOOT_STATUS
271
  global DATASET_ROOT
272
  try:
273
+ # Check if dataset root exists and has basic structure
274
+ root = os.path.abspath(os.path.join(os.getcwd(), "data", "Polyvore"))
275
+ images_dir = os.path.join(root, "images")
276
+ splits_dir = os.path.join(root, "splits")
277
+
278
+ # Only check dataset if images directory doesn't exist
279
+ has_images = os.path.isdir(images_dir) and any(os.listdir(images_dir))
280
+ has_splits = (
 
 
281
  os.path.isfile(os.path.join(splits_dir, "train.json")) or
282
  os.path.isfile(os.path.join(splits_dir, "outfit_triplets_train.json"))
283
  )
284
+
285
+ if has_images and has_splits:
286
+ print("βœ… Dataset and splits already prepared, skipping startup preparation")
287
+ DATASET_ROOT = root
288
+ BOOT_STATUS = "ready"
289
+ return
290
+
291
+ # Only prepare dataset if images are missing
292
+ if not has_images:
293
+ BOOT_STATUS = "preparing-dataset"
294
+ ds_root = ensure_dataset_ready()
295
+ DATASET_ROOT = ds_root
296
+ if not ds_root:
297
+ BOOT_STATUS = "dataset-not-prepared"
298
+ return
299
+ else:
300
+ DATASET_ROOT = root
301
+ print("βœ… Dataset images already exist, skipping extraction")
302
+
303
+ # Only prepare splits if missing
304
+ if not has_splits:
305
  BOOT_STATUS = "creating-splits"
306
  os.makedirs(splits_dir, exist_ok=True)
307
  from scripts.prepare_polyvore import main as prepare_main
 
310
  argv_bak = sys.argv
311
  try:
312
  # Use official splits from nondisjoint/ and disjoint/ folders with default size limit (500 samples for faster training)
313
+ sys.argv = ["prepare_polyvore.py", "--root", DATASET_ROOT, "--max_samples", "500"]
314
  prepare_main()
315
  finally:
316
  sys.argv = argv_bak
317
+ else:
318
+ print("βœ… Splits already prepared, skipping")
319
 
320
  # Train if checkpoints are absent
321
  export_dir = os.getenv("EXPORT_DIR", "models/exports")
 
1346
  except Exception as e:
1347
  log_message += f"\nError: {e}"
1348
 
1349
+ threading.Thread(target=_runner, daemon=True).start()
1350
  return log_message
1351
 
1352
 
 
1460
  with gr.Tab("πŸ”¬ Advanced Training"):
1461
  gr.Markdown("### 🎯 Comprehensive Training Parameter Control\nCustomize every aspect of model training for research and experimentation.")
1462
 
1463
+ # Dataset Preparation Section
1464
+ with gr.Accordion("πŸ“¦ Dataset Preparation (Optional)", open=False):
1465
+ gr.Markdown("**Note**: Dataset is automatically prepared on first startup. Use this only if you need to re-download or re-extract the dataset.")
1466
+ with gr.Row():
1467
+ prepare_dataset_btn = gr.Button("πŸ“₯ Download & Prepare Dataset", variant="secondary")
1468
+ prepare_status = gr.Textbox(label="Dataset Preparation Status", value="Dataset will be prepared if missing", interactive=False)
1469
+
1470
+ def prepare_dataset_manual():
1471
+ """Manually trigger dataset preparation."""
1472
+ global DATASET_ROOT, BOOT_STATUS
1473
+ try:
1474
+ BOOT_STATUS = "preparing-dataset"
1475
+ ds_root = ensure_dataset_ready()
1476
+ DATASET_ROOT = ds_root
1477
+ if not ds_root:
1478
+ BOOT_STATUS = "dataset-not-prepared"
1479
+ return "❌ Failed to prepare dataset"
1480
+
1481
+ # Prepare splits
1482
+ splits_dir = os.path.join(ds_root, "splits")
1483
+ os.makedirs(splits_dir, exist_ok=True)
1484
+ from scripts.prepare_polyvore import main as prepare_main
1485
+ os.environ.setdefault("PYTHONWARNINGS", "ignore")
1486
+ import sys
1487
+ argv_bak = sys.argv
1488
+ try:
1489
+ sys.argv = ["prepare_polyvore.py", "--root", ds_root, "--max_samples", "500"]
1490
+ prepare_main()
1491
+ BOOT_STATUS = "ready"
1492
+ return "βœ… Dataset prepared successfully!"
1493
+ finally:
1494
+ sys.argv = argv_bak
1495
+ except Exception as e:
1496
+ BOOT_STATUS = "error"
1497
+ return f"❌ Error: {str(e)}"
1498
+
1499
+ prepare_dataset_btn.click(fn=prepare_dataset_manual, inputs=[], outputs=prepare_status)
1500
+
1501
  # Global Dataset Size Control
1502
  with gr.Row():
1503
  gr.Markdown("#### 🎯 **Global Dataset Size Control**")
1504
+ gr.Markdown("**Note**: Use 'Apply' button to regenerate splits with different size limits.")
1505
 
1506
  with gr.Row():
1507
  gr.Markdown("#### πŸ“Š **Current Behavior**")
 
1761
 
1762
  download_all_btn.click(fn=download_all_files, inputs=[], outputs=download_result)
1763
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1764
  with gr.Tab("πŸ“ˆ Status"):
1765
  gr.Markdown("### 🚦 System Status and Monitoring\nReal-time status of dataset preparation, training, and system health.")
1766
  status = gr.Textbox(label="Bootstrap Status", value=lambda: BOOT_STATUS)
utils/data_fetch.py CHANGED
@@ -78,9 +78,11 @@ def ensure_dataset_ready() -> Optional[str]:
78
  print("βœ… Dataset already complete")
79
  return root
80
 
81
- # If images are already present, don't return early; still ensure metadata JSONs exist
82
  if not has_images:
83
  _unzip_images_if_needed(root)
 
 
84
 
85
  # Download the HF dataset snapshot into root
86
  try:
 
78
  print("βœ… Dataset already complete")
79
  return root
80
 
81
+ # If images are already present, skip extraction
82
  if not has_images:
83
  _unzip_images_if_needed(root)
84
+ else:
85
+ print("βœ… Images already extracted, skipping extraction")
86
 
87
  # Download the HF dataset snapshot into root
88
  try: