Spaces:
Paused
Paused
Ali Mohsin
committed on
Commit
·
4619bfc
1
Parent(s):
a9c2886
Optimizations try 1
Browse files
Enhance dataset preparation logic to check for existing images and splits before processing. Introduce manual dataset preparation option in the UI. Improve user feedback during dataset setup and streamline the bootstrap process.
- app.py +72 -50
- utils/data_fetch.py +3 -1
app.py
CHANGED
|
@@ -270,20 +270,38 @@ def _background_bootstrap():
|
|
| 270 |
global BOOT_STATUS
|
| 271 |
global DATASET_ROOT
|
| 272 |
try:
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
splits_dir = os.path.join(ds_root, "splits")
|
| 282 |
-
need_prepare = not (
|
| 283 |
os.path.isfile(os.path.join(splits_dir, "train.json")) or
|
| 284 |
os.path.isfile(os.path.join(splits_dir, "outfit_triplets_train.json"))
|
| 285 |
)
|
| 286 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
BOOT_STATUS = "creating-splits"
|
| 288 |
os.makedirs(splits_dir, exist_ok=True)
|
| 289 |
from scripts.prepare_polyvore import main as prepare_main
|
|
@@ -292,10 +310,12 @@ def _background_bootstrap():
|
|
| 292 |
argv_bak = sys.argv
|
| 293 |
try:
|
| 294 |
# Use official splits from nondisjoint/ and disjoint/ folders with default size limit (500 samples for faster training)
|
| 295 |
-
sys.argv = ["prepare_polyvore.py", "--root",
|
| 296 |
prepare_main()
|
| 297 |
finally:
|
| 298 |
sys.argv = argv_bak
|
|
|
|
|
|
|
| 299 |
|
| 300 |
# Train if checkpoints are absent
|
| 301 |
export_dir = os.getenv("EXPORT_DIR", "models/exports")
|
|
@@ -1326,7 +1346,7 @@ def start_training_simple(dataset_size: str, res_epochs: int, vit_epochs: int):
|
|
| 1326 |
except Exception as e:
|
| 1327 |
log_message += f"\nError: {e}"
|
| 1328 |
|
| 1329 |
-
|
| 1330 |
return log_message
|
| 1331 |
|
| 1332 |
|
|
@@ -1440,10 +1460,48 @@ with gr.Blocks(fill_height=True, title="Dressify - Advanced Outfit Recommendatio
|
|
| 1440 |
with gr.Tab("π¬ Advanced Training"):
|
| 1441 |
gr.Markdown("### π― Comprehensive Training Parameter Control\nCustomize every aspect of model training for research and experimentation.")
|
| 1442 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1443 |
# Global Dataset Size Control
|
| 1444 |
with gr.Row():
|
| 1445 |
gr.Markdown("#### π― **Global Dataset Size Control**")
|
| 1446 |
-
gr.Markdown("**Note**:
|
| 1447 |
|
| 1448 |
with gr.Row():
|
| 1449 |
gr.Markdown("#### π **Current Behavior**")
|
|
@@ -1703,42 +1761,6 @@ with gr.Blocks(fill_height=True, title="Dressify - Advanced Outfit Recommendatio
|
|
| 1703 |
|
| 1704 |
download_all_btn.click(fn=download_all_files, inputs=[], outputs=download_result)
|
| 1705 |
|
| 1706 |
-
with gr.Tab("π§ Simple Training"):
|
| 1707 |
-
gr.Markdown("### π Quick Training with Default Parameters\nFast training with proven configurations for immediate results.")
|
| 1708 |
-
|
| 1709 |
-
with gr.Row():
|
| 1710 |
-
with gr.Column(scale=1):
|
| 1711 |
-
gr.Markdown("#### π Dataset Size Control")
|
| 1712 |
-
gr.Markdown("Start small for testing, increase for production training")
|
| 1713 |
-
dataset_size = gr.Dropdown(
|
| 1714 |
-
choices=["160", "500", "2000", "5000", "10000", "25000", "50000", "full"],
|
| 1715 |
-
value="500",
|
| 1716 |
-
label="Training Dataset Size"
|
| 1717 |
-
)
|
| 1718 |
-
gr.Markdown("**2000**: Quick testing (~2-5 min)\n**5000**: Fast validation (~5-10 min)\n**10000**: Good validation (~10-20 min)\n**25000+**: Production training")
|
| 1719 |
-
|
| 1720 |
-
with gr.Column(scale=1):
|
| 1721 |
-
gr.Markdown("#### βοΈ Training Parameters")
|
| 1722 |
-
epochs_res = gr.Slider(1, 50, value=3, step=1, label="ResNet epochs")
|
| 1723 |
-
epochs_vit = gr.Slider(1, 100, value=3, step=1, label="ViT epochs")
|
| 1724 |
-
|
| 1725 |
-
train_log = gr.Textbox(label="Training Log", lines=10)
|
| 1726 |
-
start_btn = gr.Button("Start Training")
|
| 1727 |
-
start_btn.click(fn=start_training_simple, inputs=[dataset_size, epochs_res, epochs_vit], outputs=train_log)
|
| 1728 |
-
|
| 1729 |
-
with gr.Tab("π Embed (Debug)"):
|
| 1730 |
-
inp = gr.Files(
|
| 1731 |
-
label="Upload Items (multiple images)",
|
| 1732 |
-
file_count="multiple"
|
| 1733 |
-
# Note: file_types removed to allow API client flexibility
|
| 1734 |
-
# Validation is handled by our image_utils.load_images_from_files()
|
| 1735 |
-
)
|
| 1736 |
-
out = gr.Textbox(label="Embeddings (JSON)")
|
| 1737 |
-
btn = gr.Button("Compute Embeddings")
|
| 1738 |
-
btn.click(fn=gradio_embed, inputs=inp, outputs=out)
|
| 1739 |
-
|
| 1740 |
-
|
| 1741 |
-
|
| 1742 |
with gr.Tab("π Status"):
|
| 1743 |
gr.Markdown("### π¦ System Status and Monitoring\nReal-time status of dataset preparation, training, and system health.")
|
| 1744 |
status = gr.Textbox(label="Bootstrap Status", value=lambda: BOOT_STATUS)
|
|
|
|
| 270 |
global BOOT_STATUS
|
| 271 |
global DATASET_ROOT
|
| 272 |
try:
|
| 273 |
+
# Check if dataset root exists and has basic structure
|
| 274 |
+
root = os.path.abspath(os.path.join(os.getcwd(), "data", "Polyvore"))
|
| 275 |
+
images_dir = os.path.join(root, "images")
|
| 276 |
+
splits_dir = os.path.join(root, "splits")
|
| 277 |
+
|
| 278 |
+
# Only check dataset if images directory doesn't exist
|
| 279 |
+
has_images = os.path.isdir(images_dir) and any(os.listdir(images_dir))
|
| 280 |
+
has_splits = (
|
|
|
|
|
|
|
| 281 |
os.path.isfile(os.path.join(splits_dir, "train.json")) or
|
| 282 |
os.path.isfile(os.path.join(splits_dir, "outfit_triplets_train.json"))
|
| 283 |
)
|
| 284 |
+
|
| 285 |
+
if has_images and has_splits:
|
| 286 |
+
print("✅ Dataset and splits already prepared, skipping startup preparation")
|
| 287 |
+
DATASET_ROOT = root
|
| 288 |
+
BOOT_STATUS = "ready"
|
| 289 |
+
return
|
| 290 |
+
|
| 291 |
+
# Only prepare dataset if images are missing
|
| 292 |
+
if not has_images:
|
| 293 |
+
BOOT_STATUS = "preparing-dataset"
|
| 294 |
+
ds_root = ensure_dataset_ready()
|
| 295 |
+
DATASET_ROOT = ds_root
|
| 296 |
+
if not ds_root:
|
| 297 |
+
BOOT_STATUS = "dataset-not-prepared"
|
| 298 |
+
return
|
| 299 |
+
else:
|
| 300 |
+
DATASET_ROOT = root
|
| 301 |
+
print("✅ Dataset images already exist, skipping extraction")
|
| 302 |
+
|
| 303 |
+
# Only prepare splits if missing
|
| 304 |
+
if not has_splits:
|
| 305 |
BOOT_STATUS = "creating-splits"
|
| 306 |
os.makedirs(splits_dir, exist_ok=True)
|
| 307 |
from scripts.prepare_polyvore import main as prepare_main
|
|
|
|
| 310 |
argv_bak = sys.argv
|
| 311 |
try:
|
| 312 |
# Use official splits from nondisjoint/ and disjoint/ folders with default size limit (500 samples for faster training)
|
| 313 |
+
sys.argv = ["prepare_polyvore.py", "--root", DATASET_ROOT, "--max_samples", "500"]
|
| 314 |
prepare_main()
|
| 315 |
finally:
|
| 316 |
sys.argv = argv_bak
|
| 317 |
+
else:
|
| 318 |
+
print("✅ Splits already prepared, skipping")
|
| 319 |
|
| 320 |
# Train if checkpoints are absent
|
| 321 |
export_dir = os.getenv("EXPORT_DIR", "models/exports")
|
|
|
|
| 1346 |
except Exception as e:
|
| 1347 |
log_message += f"\nError: {e}"
|
| 1348 |
|
| 1349 |
+
threading.Thread(target=_runner, daemon=True).start()
|
| 1350 |
return log_message
|
| 1351 |
|
| 1352 |
|
|
|
|
| 1460 |
with gr.Tab("π¬ Advanced Training"):
|
| 1461 |
gr.Markdown("### π― Comprehensive Training Parameter Control\nCustomize every aspect of model training for research and experimentation.")
|
| 1462 |
|
| 1463 |
+
# Dataset Preparation Section
|
| 1464 |
+
with gr.Accordion("π¦ Dataset Preparation (Optional)", open=False):
|
| 1465 |
+
gr.Markdown("**Note**: Dataset is automatically prepared on first startup. Use this only if you need to re-download or re-extract the dataset.")
|
| 1466 |
+
with gr.Row():
|
| 1467 |
+
prepare_dataset_btn = gr.Button("π₯ Download & Prepare Dataset", variant="secondary")
|
| 1468 |
+
prepare_status = gr.Textbox(label="Dataset Preparation Status", value="Dataset will be prepared if missing", interactive=False)
|
| 1469 |
+
|
| 1470 |
+
def prepare_dataset_manual():
|
| 1471 |
+
"""Manually trigger dataset preparation."""
|
| 1472 |
+
global DATASET_ROOT, BOOT_STATUS
|
| 1473 |
+
try:
|
| 1474 |
+
BOOT_STATUS = "preparing-dataset"
|
| 1475 |
+
ds_root = ensure_dataset_ready()
|
| 1476 |
+
DATASET_ROOT = ds_root
|
| 1477 |
+
if not ds_root:
|
| 1478 |
+
BOOT_STATUS = "dataset-not-prepared"
|
| 1479 |
+
return "❌ Failed to prepare dataset"
|
| 1480 |
+
|
| 1481 |
+
# Prepare splits
|
| 1482 |
+
splits_dir = os.path.join(ds_root, "splits")
|
| 1483 |
+
os.makedirs(splits_dir, exist_ok=True)
|
| 1484 |
+
from scripts.prepare_polyvore import main as prepare_main
|
| 1485 |
+
os.environ.setdefault("PYTHONWARNINGS", "ignore")
|
| 1486 |
+
import sys
|
| 1487 |
+
argv_bak = sys.argv
|
| 1488 |
+
try:
|
| 1489 |
+
sys.argv = ["prepare_polyvore.py", "--root", ds_root, "--max_samples", "500"]
|
| 1490 |
+
prepare_main()
|
| 1491 |
+
BOOT_STATUS = "ready"
|
| 1492 |
+
return "✅ Dataset prepared successfully!"
|
| 1493 |
+
finally:
|
| 1494 |
+
sys.argv = argv_bak
|
| 1495 |
+
except Exception as e:
|
| 1496 |
+
BOOT_STATUS = "error"
|
| 1497 |
+
return f"❌ Error: {str(e)}"
|
| 1498 |
+
|
| 1499 |
+
prepare_dataset_btn.click(fn=prepare_dataset_manual, inputs=[], outputs=prepare_status)
|
| 1500 |
+
|
| 1501 |
# Global Dataset Size Control
|
| 1502 |
with gr.Row():
|
| 1503 |
gr.Markdown("#### π― **Global Dataset Size Control**")
|
| 1504 |
+
gr.Markdown("**Note**: Use 'Apply' button to regenerate splits with different size limits.")
|
| 1505 |
|
| 1506 |
with gr.Row():
|
| 1507 |
gr.Markdown("#### π **Current Behavior**")
|
|
|
|
| 1761 |
|
| 1762 |
download_all_btn.click(fn=download_all_files, inputs=[], outputs=download_result)
|
| 1763 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1764 |
with gr.Tab("π Status"):
|
| 1765 |
gr.Markdown("### π¦ System Status and Monitoring\nReal-time status of dataset preparation, training, and system health.")
|
| 1766 |
status = gr.Textbox(label="Bootstrap Status", value=lambda: BOOT_STATUS)
|
utils/data_fetch.py
CHANGED
|
@@ -78,9 +78,11 @@ def ensure_dataset_ready() -> Optional[str]:
|
|
| 78 |
print("✅ Dataset already complete")
|
| 79 |
return root
|
| 80 |
|
| 81 |
-
# If images are already present,
|
| 82 |
if not has_images:
|
| 83 |
_unzip_images_if_needed(root)
|
|
|
|
|
|
|
| 84 |
|
| 85 |
# Download the HF dataset snapshot into root
|
| 86 |
try:
|
|
|
|
| 78 |
print("✅ Dataset already complete")
|
| 79 |
return root
|
| 80 |
|
| 81 |
+
# If images are already present, skip extraction
|
| 82 |
if not has_images:
|
| 83 |
_unzip_images_if_needed(root)
|
| 84 |
+
else:
|
| 85 |
+
print("✅ Images already extracted, skipping extraction")
|
| 86 |
|
| 87 |
# Download the HF dataset snapshot into root
|
| 88 |
try:
|