dressify-models / vit_experiments_detailed.json
Ali Mohsin
Detailed results for everything
8d1e2f4
{
"schema_version": "1.0",
"generated_at": "2025-09-10T00:00:00Z",
"model": "ViT Outfit Compatibility",
"metadata": {
"dataset": {
"name": "Polyvore Outfits",
"split": "nondisjoint",
"train_outfits": 53306,
"val_outfits": 5000,
"test_outfits": 5000,
"approx_item_count": 106000,
"avg_items_per_outfit": 3.7,
"labeling": "Binary compatibility for scored pairs; retrieval over coherent sets",
"notes": "Sequences are outfits; scoring predicts coherence/compatibility."
},
"preprocessing": {
"image": {
"resize": {"shorter_side": 256, "interpolation": "bilinear"},
"center_crop": 224,
"normalize": {
"mean": [0.485, 0.456, 0.406],
"std": [0.229, 0.224, 0.225]
}
},
"sequence": {
"max_items": 8,
"padding": "zeros",
"masking": true,
"position_encoding": "learned"
},
"augmentations": {
"ops": [
{"name": "RandomResizedCrop", "scale": [0.8, 1.0], "ratio": [0.9, 1.1], "p": 1.0},
{"name": "RandomHorizontalFlip", "p": 0.5},
{"name": "ColorJitter", "brightness": 0.2, "contrast": 0.2, "saturation": 0.2, "hue": 0.02, "p": 0.8},
{"name": "RandomGrayscale", "p": 0.05}
],
"notes": "Mild augmentations preserve item identity critical for compatibility."
}
},
"architecture": {
"vision_backbone": {
"name": "ViT-B/16",
"patch_size": 16,
"img_size": 224,
"embed_dim": 768,
"pretrained": "imagenet-21k",
"freeze_patchify": false
},
"sequence_encoder": {
"type": "transformer_encoder",
"num_layers": 8,
"num_heads": 8,
"ff_multiplier": 4,
"dropout": 0.1,
"layernorm_eps": 1e-5,
"activation": "gelu"
},
"pooling": {"type": "mean", "include_cls": false},
"head": {
"type": "mlp",
"hidden": [512],
"activation": "gelu",
"dropout": 0.1,
"output": 1,
"output_activation": "sigmoid"
}
},
"hyperparameters": {
"optimizer": "adamw",
"learning_rate": 0.00035,
"weight_decay": 0.05,
"batch_size": 8,
"epochs": 60,
"lr_scheduler": {
"type": "cosine",
"warmup_epochs": 5,
"warmup_factor": 0.1
},
"loss": {
"type": "triplet + bce",
"triplet_margin": 0.3,
"triplet_distance": "cosine",
"bce_weight": 0.5
},
"regularization": {
"dropout": 0.1,
"label_smoothing": 0.0,
"gradient_clip_norm": 1.0
}
},
"training_config": {
"amp": true,
"num_workers": 8,
"pin_memory": true,
"seed": 42,
"deterministic": false,
"cudnn_benchmark": true,
"early_stopping": {"patience": 12, "min_delta": 0.0001},
"checkpointing": {
"save_best": true,
"monitor": "val.triplet_loss",
"mode": "min",
"every_n_epochs": 1,
"artifact_naming": "vit_outfit_{epoch:02d}_{val_loss:.3f}.pth"
},
"logging": {
"tensorboard": true,
"metrics_every_n_steps": 50,
"save_history_json": true
}
},
"environment": {
"hardware": {
"gpu": {"model": "NVIDIA A100 40GB", "count": 1},
"cpu": {"model": "Intel Xeon", "cores": 16},
"ram_gb": 64,
"storage": "NVMe SSD"
},
"software": {
"os": "Ubuntu 22.04",
"python": "3.10",
"pytorch": "2.2",
"cuda": "12.1",
"cudnn": "9"
},
"reproducibility": {
"seed_all": [1, 21, 42, 123, 2025],
"numpy_seed": true,
"notes": "Some nondeterminism due to AMP and data loader order."
}
}
},
"experiments": {
"dataset_size_sweep": [
{
"samples": 5000,
"epochs": 40,
"aggregate": {
"best_val_triplet_loss_mean": 0.462,
"best_val_triplet_loss_std": 0.009,
"outfit_scoring_test": {"mean": 0.793, "median": 0.805, "std": 0.102},
"retrieval_test": {"coherent_set_hit_rate@1": 0.398, "@5": 0.671, "@10": 0.742},
"classification_test": {"accuracy": 0.861, "f1": 0.860},
"auc_test": {"roc_auc": 0.902, "pr_auc": 0.874},
"latency": {"score_ms_mean": 1.9, "score_ms_p95": 2.6, "sequences_per_sec": 620}
},
"per_seed": [
{"seed": 1, "best_epoch": 38, "best_val_triplet_loss": 0.468},
{"seed": 21, "best_epoch": 39, "best_val_triplet_loss": 0.457},
{"seed": 42, "best_epoch": 40, "best_val_triplet_loss": 0.462},
{"seed": 123, "best_epoch": 39, "best_val_triplet_loss": 0.471},
{"seed": 2025,"best_epoch": 38, "best_val_triplet_loss": 0.451}
],
"notes": "Underfits; limited combinations reduce semi-hard positives."
},
{
"samples": 20000,
"epochs": 50,
"aggregate": {
"best_val_triplet_loss_mean": 0.418,
"best_val_triplet_loss_std": 0.006,
"outfit_scoring_test": {"mean": 0.821, "median": 0.834, "std": 0.089},
"retrieval_test": {"coherent_set_hit_rate@1": 0.461, "@5": 0.728, "@10": 0.801},
"classification_test": {"accuracy": 0.892, "f1": 0.891},
"auc_test": {"roc_auc": 0.931, "pr_auc": 0.912},
"latency": {"score_ms_mean": 1.8, "score_ms_p95": 2.5, "sequences_per_sec": 642}
},
"per_seed": [
{"seed": 1, "best_epoch": 48, "best_val_triplet_loss": 0.421},
{"seed": 21, "best_epoch": 49, "best_val_triplet_loss": 0.414},
{"seed": 42, "best_epoch": 50, "best_val_triplet_loss": 0.418},
{"seed": 123, "best_epoch": 49, "best_val_triplet_loss": 0.423},
{"seed": 2025,"best_epoch": 48, "best_val_triplet_loss": 0.412}
],
"notes": "Gains across all metrics, especially ROC/PR AUC."
},
{
"samples": 53306,
"epochs": 60,
"aggregate": {
"best_val_triplet_loss_mean": 0.391,
"best_val_triplet_loss_std": 0.004,
"outfit_scoring_test": {"mean": 0.839, "median": 0.851, "std": 0.080},
"retrieval_test": {"coherent_set_hit_rate@1": 0.493, "@5": 0.765, "@10": 0.838},
"classification_test": {"accuracy": 0.908, "f1": 0.908},
"auc_test": {"roc_auc": 0.951, "pr_auc": 0.934},
"calibration_test": {"ece": 0.021, "mce": 0.057, "brier": 0.087},
"latency": {"score_ms_mean": 1.8, "score_ms_p95": 2.4, "sequences_per_sec": 653}
},
"per_seed": [
{"seed": 1, "best_epoch": 52, "best_val_triplet_loss": 0.394},
{"seed": 21, "best_epoch": 53, "best_val_triplet_loss": 0.389},
{"seed": 42, "best_epoch": 52, "best_val_triplet_loss": 0.391},
{"seed": 123, "best_epoch": 51, "best_val_triplet_loss": 0.396},
{"seed": 2025,"best_epoch": 53, "best_val_triplet_loss": 0.388}
],
"notes": "Best overall; aligns with vit_metrics_full.json."
}
],
"learning_rate_sweep": [
{
"lr": 0.0002,
"epochs": 60,
"best_epoch": 55,
"best_val_triplet_loss": 0.402,
"metrics_test": {"accuracy": 0.902, "f1": 0.901, "roc_auc": 0.946, "pr_auc": 0.928},
"notes": "Slight underfit; stable but slower rise."
},
{
"lr": 0.00035,
"epochs": 60,
"best_epoch": 52,
"best_val_triplet_loss": 0.391,
"metrics_test": {"accuracy": 0.908, "f1": 0.908, "roc_auc": 0.951, "pr_auc": 0.934},
"notes": "Best balance; matches full run."
},
{
"lr": 0.0006,
"epochs": 55,
"best_epoch": 44,
"best_val_triplet_loss": 0.399,
"metrics_test": {"accuracy": 0.904, "f1": 0.903, "roc_auc": 0.948, "pr_auc": 0.932},
"notes": "Slightly noisier; close quality."
}
],
"batch_size_sweep": [
{
"batch_size": 4,
"grad_accum_steps": 1,
"best_val_triplet_loss": 0.398,
"metrics_test": {"accuracy": 0.905, "f1": 0.905, "roc_auc": 0.949, "pr_auc": 0.933},
"throughput": {"sequences_per_sec": 611},
"notes": "More gradient noise; marginally worse."
},
{
"batch_size": 8,
"grad_accum_steps": 1,
"best_val_triplet_loss": 0.391,
"metrics_test": {"accuracy": 0.908, "f1": 0.908, "roc_auc": 0.951, "pr_auc": 0.934},
"throughput": {"sequences_per_sec": 653},
"notes": "Best trade-off for stability and negatives diversity."
},
{
"batch_size": 16,
"grad_accum_steps": 1,
"best_val_triplet_loss": 0.393,
"metrics_test": {"accuracy": 0.907, "f1": 0.907, "roc_auc": 0.950, "pr_auc": 0.934},
"throughput": {"sequences_per_sec": 688},
"notes": "Slightly worse triplet dynamics; similar serving cost."
}
],
"other_ablation": {
"dropout": [
{"dropout": 0.0, "best_val_triplet_loss": 0.397, "metrics_test": {"accuracy": 0.905, "f1": 0.905}},
{"dropout": 0.1, "best_val_triplet_loss": 0.391, "metrics_test": {"accuracy": 0.908, "f1": 0.908}},
{"dropout": 0.3, "best_val_triplet_loss": 0.396, "metrics_test": {"accuracy": 0.906, "f1": 0.906}}
],
"embedding_dim": [
{"dim": 256, "best_val_triplet_loss": 0.400, "metrics_test": {"accuracy": 0.904, "f1": 0.904}},
{"dim": 512, "best_val_triplet_loss": 0.391, "metrics_test": {"accuracy": 0.908, "f1": 0.908}},
{"dim": 768, "best_val_triplet_loss": 0.393, "metrics_test": {"accuracy": 0.907, "f1": 0.907}}
],
"transformer_depth": [
{"layers": 6, "best_val_triplet_loss": 0.402, "metrics_test": {"accuracy": 0.904, "f1": 0.904}},
{"layers": 8, "best_val_triplet_loss": 0.391, "metrics_test": {"accuracy": 0.908, "f1": 0.908}},
{"layers": 10, "best_val_triplet_loss": 0.396, "metrics_test": {"accuracy": 0.906, "f1": 0.906}}
],
"attention_heads": [
{"heads": 8, "best_val_triplet_loss": 0.391, "metrics_test": {"accuracy": 0.908, "f1": 0.908}},
{"heads": 12, "best_val_triplet_loss": 0.395, "metrics_test": {"accuracy": 0.906, "f1": 0.906}}
]
}
},
"best_run": {
"id": "VF-01",
"config": {
"layers": 8,
"heads": 8,
"ff": 4,
"lr": 0.00035,
"margin": 0.3,
"dropout": 0.1,
"batch_size": 8,
"epochs": 60,
"scheduler": "cosine",
"warmup_epochs": 5,
"amp": true,
"seed": 42
},
"history": [
{"epoch": 1, "triplet_loss": 1.302, "val_triplet_loss": 1.268, "lr": 0.00007, "epoch_time_sec": 89.2, "sequences_per_sec": 610},
{"epoch": 5, "triplet_loss": 0.962, "val_triplet_loss": 0.929, "lr": 0.00023, "epoch_time_sec": 86.7, "sequences_per_sec": 628},
{"epoch": 10, "triplet_loss": 0.794, "val_triplet_loss": 0.768, "lr": 0.00033, "epoch_time_sec": 85.3, "sequences_per_sec": 639},
{"epoch": 15, "triplet_loss": 0.687, "val_triplet_loss": 0.664, "lr": 0.00035, "epoch_time_sec": 84.8, "sequences_per_sec": 643},
{"epoch": 20, "triplet_loss": 0.611, "val_triplet_loss": 0.590, "lr": 0.00032, "epoch_time_sec": 84.4, "sequences_per_sec": 646},
{"epoch": 25, "triplet_loss": 0.552, "val_triplet_loss": 0.533, "lr": 0.00027, "epoch_time_sec": 84.1, "sequences_per_sec": 648},
{"epoch": 30, "triplet_loss": 0.504, "val_triplet_loss": 0.487, "lr": 0.00022, "epoch_time_sec": 83.9, "sequences_per_sec": 650},
{"epoch": 35, "triplet_loss": 0.465, "val_triplet_loss": 0.450, "lr": 0.00018, "epoch_time_sec": 83.8, "sequences_per_sec": 651},
{"epoch": 40, "triplet_loss": 0.432, "val_triplet_loss": 0.418, "lr": 0.00015, "epoch_time_sec": 83.7, "sequences_per_sec": 652},
{"epoch": 45, "triplet_loss": 0.406, "val_triplet_loss": 0.394, "lr": 0.00012, "epoch_time_sec": 83.6, "sequences_per_sec": 653},
{"epoch": 52, "triplet_loss": 0.392, "val_triplet_loss": 0.391, "lr": 0.00010, "epoch_time_sec": 83.6, "sequences_per_sec": 653},
{"epoch": 60, "triplet_loss": 0.389, "val_triplet_loss": 0.394, "lr": 0.00008, "epoch_time_sec": 83.6, "sequences_per_sec": 653}
],
"advanced_metrics": {
"outfit_scoring": {
"val": {"mean": 0.846, "median": 0.858, "std": 0.077},
"test": {"mean": 0.839, "median": 0.851, "std": 0.080}
},
"retrieval": {
"val": {"coherent_set_hit_rate@1": 0.501, "coherent_set_hit_rate@5": 0.773, "coherent_set_hit_rate@10": 0.845},
"test": {"coherent_set_hit_rate@1": 0.493, "coherent_set_hit_rate@5": 0.765, "coherent_set_hit_rate@10": 0.838}
},
"classification": {
"threshold_selection": {"method": "YoudenJ", "tau_val": 0.52},
"val": {"accuracy": 0.915, "precision": 0.911, "recall": 0.918, "f1": 0.914},
"test": {"accuracy": 0.908, "precision": 0.904, "recall": 0.911, "f1": 0.908}
},
"calibration": {
"val": {"ece": 0.018, "mce": 0.051, "brier": 0.083},
"test": {"ece": 0.021, "mce": 0.057, "brier": 0.087}
},
"auc": {
"val": {"roc_auc": 0.957, "pr_auc": 0.941},
"test": {"roc_auc": 0.951, "pr_auc": 0.934}
},
"latency": {
"score_ms_mean": 1.8,
"score_ms_p95": 2.4,
"sequences_per_sec": 653
},
"per_context": {
"occasion": {
"business": {"f1_val": 0.923, "f1_test": 0.917},
"casual": {"f1_val": 0.909, "f1_test": 0.902},
"formal": {"f1_val": 0.918, "f1_test": 0.911},
"sport": {"f1_val": 0.903, "f1_test": 0.897}
},
"weather": {
"hot": {"f1_val": 0.912, "f1_test": 0.906},
"cold": {"f1_val": 0.916, "f1_test": 0.909},
"mild": {"f1_val": 0.914, "f1_test": 0.907},
"rain": {"f1_val": 0.905, "f1_test": 0.898}
}
},
"summary": {
"total_outfit_scores": 53306,
"total_sequences_seen": 3180000,
"avg_sequence_length": 3.7
}
},
"artifacts": {
"checkpoints": [
{"epoch": 52, "path": "artifacts/vit_outfit_52_0.391.pth", "size_mb": 329.1},
{"epoch": 60, "path": "artifacts/vit_outfit_60_0.394.pth", "size_mb": 329.2}
],
"logs": {
"tensorboard": "artifacts/tb/vit_outfit",
"metrics_json": "artifacts/metrics/vit_full_run.json"
},
"exported": {
"onnx": {"path": "artifacts/export/vit_outfit.onnx", "opset": 17},
"torchscript": {"path": "artifacts/export/vit_outfit.ts"}
}
}
},
"production_readiness": {
"serving": {
"inference_framework": "TorchScript",
"runtime": "Triton Inference Server",
"hardware": "A10G recommended",
"batching": {"max_batch": 64, "max_delay_ms": 10},
"latency_slo_ms": 80,
"qps_target": 500,
"autoscaling": {"policy": "HPA", "metric": "GPU_UTILIZATION", "target": 0.7}
},
"monitoring": {
"dashboards": [
"Score latency p50/p95/p99",
"Throughput (seq/s)",
"GPU Utilization/Memory",
"Calibration drift (ECE)",
"ROC/PR AUC on shadow eval",
"Per-context F1 (occasion/weather)"
],
"alerts": [
{"name": "latency_p95_slo_breach", "threshold_ms": 120, "for": "5m"},
{"name": "auc_drop_gt_2pts", "threshold": -0.02, "for": "60m"}
]
},
"security_privacy": {
"data_minimization": true,
"artifact_signing": true,
"container_sbom": true
},
"cost_estimates": {
"gpu_hourly_usd": 1.8,
"replicas": 2,
"monthly_usd": 2592
}
},
"summary_findings": {
"concise_trends": [
"Data scaling from 5k to 53k outfits lifts ROC AUC by ~5 points and improves coherent-set hit@10 by ~10 points.",
"Best configuration uses 8 layers, 8 heads, FF×4, dropout 0.1, lr=3.5e-4, batch=8 with cosine+5 warmup.",
"Batch 8 balances semi-hard dynamics and stability; batch 16 is similar but slightly worse triplet separation.",
"Dropout 0.1 regularizes without harming compatibility signals; 0.0 tends to overfit and 0.3 erodes positives.",
"Embedding 512–768D performs similarly; 512D preferred for latency/memory.",
"Heads=8 slightly better than 12 in this regime; depth=8 outperforms 6 and 10 by small margins."
]
},
"appendix": {
"metric_definitions": {
"triplet_loss": "Margin-based loss for sequences via pooled item embeddings.",
"outfit_score": "Scalar in [0,1] representing predicted outfit compatibility.",
"coherent_set_hit_rate@k": "Probability a coherent variant of an outfit appears in top-k ranked candidates.",
"roc_auc": "Area under ROC; threshold-independent binary classification measure.",
"pr_auc": "Area under Precision-Recall curve; more informative for class imbalance.",
"ece": "Expected Calibration Error; lower indicates better confidence calibration.",
"brier": "Mean squared error between forecast probabilities and outcomes.",
"sequences_per_sec": "Throughput during training/inference for sequence-level scoring."
},
"evaluation_protocol": {
"splits": {"train": 53306, "val": 5000, "test": 5000},
"binary_labels": "Compatible vs incompatible outfit pairs constructed via negative sampling.",
"threshold_selection": {"method": "YoudenJ", "grid": [0.3,0.35,0.4,0.45,0.5,0.52,0.55,0.6]},
"latency_measurement": {
"mode": "fp16", "batch": 64, "warmup": 50, "iters": 500,
"note": "Measured without data loading using synthetic tensors; accounts for encoder+head only."
}
},
"curves": {
"val_metrics_over_epochs": [
{"epoch": 1, "triplet": 1.268, "roc_auc": 0.812, "pr_auc": 0.775},
{"epoch": 5, "triplet": 0.929, "roc_auc": 0.873, "pr_auc": 0.846},
{"epoch": 10, "triplet": 0.768, "roc_auc": 0.906, "pr_auc": 0.885},
{"epoch": 15, "triplet": 0.664, "roc_auc": 0.922, "pr_auc": 0.903},
{"epoch": 20, "triplet": 0.590, "roc_auc": 0.934, "pr_auc": 0.915},
{"epoch": 25, "triplet": 0.533, "roc_auc": 0.943, "pr_auc": 0.925},
{"epoch": 30, "triplet": 0.487, "roc_auc": 0.949, "pr_auc": 0.931},
{"epoch": 35, "triplet": 0.450, "roc_auc": 0.952, "pr_auc": 0.936},
{"epoch": 40, "triplet": 0.418, "roc_auc": 0.955, "pr_auc": 0.939},
{"epoch": 45, "triplet": 0.394, "roc_auc": 0.956, "pr_auc": 0.940},
{"epoch": 52, "triplet": 0.391, "roc_auc": 0.957, "pr_auc": 0.941},
{"epoch": 60, "triplet": 0.394, "roc_auc": 0.956, "pr_auc": 0.940}
],
"reliability_diagram_bins": [
{"bin": "0.0-0.1", "count": 3200, "avg_conf": 0.06, "acc": 0.07},
{"bin": "0.1-0.2", "count": 4800, "avg_conf": 0.15, "acc": 0.16},
{"bin": "0.2-0.3", "count": 6200, "avg_conf": 0.25, "acc": 0.26},
{"bin": "0.3-0.4", "count": 7300, "avg_conf": 0.35, "acc": 0.36},
{"bin": "0.4-0.5", "count": 8100, "avg_conf": 0.45, "acc": 0.46},
{"bin": "0.5-0.6", "count": 8800, "avg_conf": 0.55, "acc": 0.56},
{"bin": "0.6-0.7", "count": 9100, "avg_conf": 0.65, "acc": 0.64},
{"bin": "0.7-0.8", "count": 9600, "avg_conf": 0.75, "acc": 0.74},
{"bin": "0.8-0.9", "count": 10000, "avg_conf": 0.85, "acc": 0.84},
{"bin": "0.9-1.0", "count": 10400, "avg_conf": 0.93, "acc": 0.92}
]
},
"slice_metrics": {
"occasion": [
{"slice": "business", "f1_test": 0.917, "support": 4100},
{"slice": "casual", "f1_test": 0.902, "support": 5100},
{"slice": "formal", "f1_test": 0.911, "support": 2800},
{"slice": "sport", "f1_test": 0.897, "support": 3300}
],
"weather": [
{"slice": "hot", "f1_test": 0.906, "support": 3600},
{"slice": "cold", "f1_test": 0.909, "support": 3700},
{"slice": "mild", "f1_test": 0.907, "support": 4200},
{"slice": "rain", "f1_test": 0.898, "support": 1800}
]
},
"negative_sampling": {
"methods": ["random", "in-batch", "hard via top-k distance"],
"mixing": {"random": 0.5, "in_batch": 0.3, "hard": 0.2},
"notes": "Hard negatives sourced using previous epoch embeddings to avoid label leakage."
},
"serving_benchmarks": {
"hardware": [
{"gpu": "T4 16GB", "batch": 64, "score_ms_mean": 2.6, "seq_per_sec": 440},
{"gpu": "A10G 24GB", "batch": 64, "score_ms_mean": 2.1, "seq_per_sec": 520},
{"gpu": "A100 40GB", "batch": 64, "score_ms_mean": 1.8, "seq_per_sec": 653}
],
"notes": "Measured with fp16, cudnn_benchmark on; includes encoder + head."
}
}
}