{ "schema_version": "1.0", "generated_at": "2025-09-10T00:00:00Z", "model": "ViT Outfit Compatibility", "metadata": { "dataset": { "name": "Polyvore Outfits", "split": "nondisjoint", "train_outfits": 53306, "val_outfits": 5000, "test_outfits": 5000, "approx_item_count": 106000, "avg_items_per_outfit": 3.7, "labeling": "Binary compatibility for scored pairs; retrieval over coherent sets", "notes": "Sequences are outfits; scoring predicts coherence/compatibility." }, "preprocessing": { "image": { "resize": {"shorter_side": 256, "interpolation": "bilinear"}, "center_crop": 224, "normalize": { "mean": [0.485, 0.456, 0.406], "std": [0.229, 0.224, 0.225] } }, "sequence": { "max_items": 8, "padding": "zeros", "masking": true, "position_encoding": "learned" }, "augmentations": { "ops": [ {"name": "RandomResizedCrop", "scale": [0.8, 1.0], "ratio": [0.9, 1.1], "p": 1.0}, {"name": "RandomHorizontalFlip", "p": 0.5}, {"name": "ColorJitter", "brightness": 0.2, "contrast": 0.2, "saturation": 0.2, "hue": 0.02, "p": 0.8}, {"name": "RandomGrayscale", "p": 0.05} ], "notes": "Mild augmentations preserve item identity critical for compatibility." 
} }, "architecture": { "vision_backbone": { "name": "ViT-B/16", "patch_size": 16, "img_size": 224, "embed_dim": 768, "pretrained": "imagenet-21k", "freeze_patchify": false }, "sequence_encoder": { "type": "transformer_encoder", "num_layers": 8, "num_heads": 8, "ff_multiplier": 4, "dropout": 0.1, "layernorm_eps": 1e-5, "activation": "gelu" }, "pooling": {"type": "mean", "include_cls": false}, "head": { "type": "mlp", "hidden": [512], "activation": "gelu", "dropout": 0.1, "output": 1, "output_activation": "sigmoid" } }, "hyperparameters": { "optimizer": "adamw", "learning_rate": 0.00035, "weight_decay": 0.05, "batch_size": 8, "epochs": 60, "lr_scheduler": { "type": "cosine", "warmup_epochs": 5, "warmup_factor": 0.1 }, "loss": { "type": "triplet + bce", "triplet_margin": 0.3, "triplet_distance": "cosine", "bce_weight": 0.5 }, "regularization": { "dropout": 0.1, "label_smoothing": 0.0, "gradient_clip_norm": 1.0 } }, "training_config": { "amp": true, "num_workers": 8, "pin_memory": true, "seed": 42, "deterministic": false, "cudnn_benchmark": true, "early_stopping": {"patience": 12, "min_delta": 0.0001}, "checkpointing": { "save_best": true, "monitor": "val.triplet_loss", "mode": "min", "every_n_epochs": 1, "artifact_naming": "vit_outfit_{epoch:02d}_{val_loss:.3f}.pth" }, "logging": { "tensorboard": true, "metrics_every_n_steps": 50, "save_history_json": true } }, "environment": { "hardware": { "gpu": {"model": "NVIDIA A100 40GB", "count": 1}, "cpu": {"model": "Intel Xeon", "cores": 16}, "ram_gb": 64, "storage": "NVMe SSD" }, "software": { "os": "Ubuntu 22.04", "python": "3.10", "pytorch": "2.2", "cuda": "12.1", "cudnn": "9" }, "reproducibility": { "seed_all": [1, 21, 42, 123, 2025], "numpy_seed": true, "notes": "Some nondeterminism due to AMP and data loader order." 
} } }, "experiments": { "dataset_size_sweep": [ { "samples": 5000, "epochs": 40, "aggregate": { "best_val_triplet_loss_mean": 0.462, "best_val_triplet_loss_std": 0.009, "outfit_scoring_test": {"mean": 0.793, "median": 0.805, "std": 0.102}, "retrieval_test": {"coherent_set_hit_rate@1": 0.398, "@5": 0.671, "@10": 0.742}, "classification_test": {"accuracy": 0.861, "f1": 0.860}, "auc_test": {"roc_auc": 0.902, "pr_auc": 0.874}, "latency": {"score_ms_mean": 1.9, "score_ms_p95": 2.6, "sequences_per_sec": 620} }, "per_seed": [ {"seed": 1, "best_epoch": 38, "best_val_triplet_loss": 0.468}, {"seed": 21, "best_epoch": 39, "best_val_triplet_loss": 0.457}, {"seed": 42, "best_epoch": 40, "best_val_triplet_loss": 0.462}, {"seed": 123, "best_epoch": 39, "best_val_triplet_loss": 0.471}, {"seed": 2025,"best_epoch": 38, "best_val_triplet_loss": 0.451} ], "notes": "Underfits; limited combinations reduce semi-hard negatives." }, { "samples": 20000, "epochs": 50, "aggregate": { "best_val_triplet_loss_mean": 0.418, "best_val_triplet_loss_std": 0.006, "outfit_scoring_test": {"mean": 0.821, "median": 0.834, "std": 0.089}, "retrieval_test": {"coherent_set_hit_rate@1": 0.461, "@5": 0.728, "@10": 0.801}, "classification_test": {"accuracy": 0.892, "f1": 0.891}, "auc_test": {"roc_auc": 0.931, "pr_auc": 0.912}, "latency": {"score_ms_mean": 1.8, "score_ms_p95": 2.5, "sequences_per_sec": 642} }, "per_seed": [ {"seed": 1, "best_epoch": 48, "best_val_triplet_loss": 0.421}, {"seed": 21, "best_epoch": 49, "best_val_triplet_loss": 0.414}, {"seed": 42, "best_epoch": 50, "best_val_triplet_loss": 0.418}, {"seed": 123, "best_epoch": 49, "best_val_triplet_loss": 0.423}, {"seed": 2025,"best_epoch": 48, "best_val_triplet_loss": 0.412} ], "notes": "Gains across all metrics, especially ROC/PR AUC." 
}, { "samples": 53306, "epochs": 60, "aggregate": { "best_val_triplet_loss_mean": 0.391, "best_val_triplet_loss_std": 0.004, "outfit_scoring_test": {"mean": 0.839, "median": 0.851, "std": 0.080}, "retrieval_test": {"coherent_set_hit_rate@1": 0.493, "@5": 0.765, "@10": 0.838}, "classification_test": {"accuracy": 0.908, "f1": 0.908}, "auc_test": {"roc_auc": 0.951, "pr_auc": 0.934}, "calibration_test": {"ece": 0.021, "mce": 0.057, "brier": 0.087}, "latency": {"score_ms_mean": 1.8, "score_ms_p95": 2.4, "sequences_per_sec": 653} }, "per_seed": [ {"seed": 1, "best_epoch": 52, "best_val_triplet_loss": 0.394}, {"seed": 21, "best_epoch": 53, "best_val_triplet_loss": 0.389}, {"seed": 42, "best_epoch": 52, "best_val_triplet_loss": 0.391}, {"seed": 123, "best_epoch": 51, "best_val_triplet_loss": 0.396}, {"seed": 2025,"best_epoch": 53, "best_val_triplet_loss": 0.388} ], "notes": "Best overall; aligns with vit_metrics_full.json." } ], "learning_rate_sweep": [ { "lr": 0.0002, "epochs": 60, "best_epoch": 55, "best_val_triplet_loss": 0.402, "metrics_test": {"accuracy": 0.902, "f1": 0.901, "roc_auc": 0.946, "pr_auc": 0.928}, "notes": "Slight underfit; stable but slower rise." }, { "lr": 0.00035, "epochs": 60, "best_epoch": 52, "best_val_triplet_loss": 0.391, "metrics_test": {"accuracy": 0.908, "f1": 0.908, "roc_auc": 0.951, "pr_auc": 0.934}, "notes": "Best balance; matches full run." }, { "lr": 0.0006, "epochs": 55, "best_epoch": 44, "best_val_triplet_loss": 0.399, "metrics_test": {"accuracy": 0.904, "f1": 0.903, "roc_auc": 0.948, "pr_auc": 0.932}, "notes": "Slightly noisier; close quality." } ], "batch_size_sweep": [ { "batch_size": 4, "grad_accum_steps": 1, "best_val_triplet_loss": 0.398, "metrics_test": {"accuracy": 0.905, "f1": 0.905, "roc_auc": 0.949, "pr_auc": 0.933}, "throughput": {"sequences_per_sec": 611}, "notes": "More gradient noise; marginally worse." 
}, { "batch_size": 8, "grad_accum_steps": 1, "best_val_triplet_loss": 0.391, "metrics_test": {"accuracy": 0.908, "f1": 0.908, "roc_auc": 0.951, "pr_auc": 0.934}, "throughput": {"sequences_per_sec": 653}, "notes": "Best trade-off for stability and negatives diversity." }, { "batch_size": 16, "grad_accum_steps": 1, "best_val_triplet_loss": 0.393, "metrics_test": {"accuracy": 0.907, "f1": 0.907, "roc_auc": 0.950, "pr_auc": 0.934}, "throughput": {"sequences_per_sec": 688}, "notes": "Slightly worse triplet dynamics; similar serving cost." } ], "other_ablation": { "dropout": [ {"dropout": 0.0, "best_val_triplet_loss": 0.397, "metrics_test": {"accuracy": 0.905, "f1": 0.905}}, {"dropout": 0.1, "best_val_triplet_loss": 0.391, "metrics_test": {"accuracy": 0.908, "f1": 0.908}}, {"dropout": 0.3, "best_val_triplet_loss": 0.396, "metrics_test": {"accuracy": 0.906, "f1": 0.906}} ], "embedding_dim": [ {"dim": 256, "best_val_triplet_loss": 0.400, "metrics_test": {"accuracy": 0.904, "f1": 0.904}}, {"dim": 512, "best_val_triplet_loss": 0.391, "metrics_test": {"accuracy": 0.908, "f1": 0.908}}, {"dim": 768, "best_val_triplet_loss": 0.393, "metrics_test": {"accuracy": 0.907, "f1": 0.907}} ], "transformer_depth": [ {"layers": 6, "best_val_triplet_loss": 0.402, "metrics_test": {"accuracy": 0.904, "f1": 0.904}}, {"layers": 8, "best_val_triplet_loss": 0.391, "metrics_test": {"accuracy": 0.908, "f1": 0.908}}, {"layers": 10, "best_val_triplet_loss": 0.396, "metrics_test": {"accuracy": 0.906, "f1": 0.906}} ], "attention_heads": [ {"heads": 8, "best_val_triplet_loss": 0.391, "metrics_test": {"accuracy": 0.908, "f1": 0.908}}, {"heads": 12, "best_val_triplet_loss": 0.395, "metrics_test": {"accuracy": 0.906, "f1": 0.906}} ] } }, "best_run": { "id": "VF-01", "config": { "layers": 8, "heads": 8, "ff": 4, "lr": 0.00035, "margin": 0.3, "dropout": 0.1, "batch_size": 8, "epochs": 60, "scheduler": "cosine", "warmup_epochs": 5, "amp": true, "seed": 42 }, "history": [ {"epoch": 1, "triplet_loss": 1.302, 
"val_triplet_loss": 1.268, "lr": 0.00007, "epoch_time_sec": 89.2, "sequences_per_sec": 610}, {"epoch": 5, "triplet_loss": 0.962, "val_triplet_loss": 0.929, "lr": 0.00023, "epoch_time_sec": 86.7, "sequences_per_sec": 628}, {"epoch": 10, "triplet_loss": 0.794, "val_triplet_loss": 0.768, "lr": 0.00033, "epoch_time_sec": 85.3, "sequences_per_sec": 639}, {"epoch": 15, "triplet_loss": 0.687, "val_triplet_loss": 0.664, "lr": 0.00035, "epoch_time_sec": 84.8, "sequences_per_sec": 643}, {"epoch": 20, "triplet_loss": 0.611, "val_triplet_loss": 0.590, "lr": 0.00032, "epoch_time_sec": 84.4, "sequences_per_sec": 646}, {"epoch": 25, "triplet_loss": 0.552, "val_triplet_loss": 0.533, "lr": 0.00027, "epoch_time_sec": 84.1, "sequences_per_sec": 648}, {"epoch": 30, "triplet_loss": 0.504, "val_triplet_loss": 0.487, "lr": 0.00022, "epoch_time_sec": 83.9, "sequences_per_sec": 650}, {"epoch": 35, "triplet_loss": 0.465, "val_triplet_loss": 0.450, "lr": 0.00018, "epoch_time_sec": 83.8, "sequences_per_sec": 651}, {"epoch": 40, "triplet_loss": 0.432, "val_triplet_loss": 0.418, "lr": 0.00015, "epoch_time_sec": 83.7, "sequences_per_sec": 652}, {"epoch": 45, "triplet_loss": 0.406, "val_triplet_loss": 0.394, "lr": 0.00012, "epoch_time_sec": 83.6, "sequences_per_sec": 653}, {"epoch": 52, "triplet_loss": 0.392, "val_triplet_loss": 0.391, "lr": 0.00010, "epoch_time_sec": 83.6, "sequences_per_sec": 653}, {"epoch": 60, "triplet_loss": 0.389, "val_triplet_loss": 0.394, "lr": 0.00008, "epoch_time_sec": 83.6, "sequences_per_sec": 653} ], "advanced_metrics": { "outfit_scoring": { "val": {"mean": 0.846, "median": 0.858, "std": 0.077}, "test": {"mean": 0.839, "median": 0.851, "std": 0.080} }, "retrieval": { "val": {"coherent_set_hit_rate@1": 0.501, "coherent_set_hit_rate@5": 0.773, "coherent_set_hit_rate@10": 0.845}, "test": {"coherent_set_hit_rate@1": 0.493, "coherent_set_hit_rate@5": 0.765, "coherent_set_hit_rate@10": 0.838} }, "classification": { "threshold_selection": {"method": "YoudenJ", "tau_val": 
0.52}, "val": {"accuracy": 0.915, "precision": 0.911, "recall": 0.918, "f1": 0.914}, "test": {"accuracy": 0.908, "precision": 0.904, "recall": 0.911, "f1": 0.908} }, "calibration": { "val": {"ece": 0.018, "mce": 0.051, "brier": 0.083}, "test": {"ece": 0.021, "mce": 0.057, "brier": 0.087} }, "auc": { "val": {"roc_auc": 0.957, "pr_auc": 0.941}, "test": {"roc_auc": 0.951, "pr_auc": 0.934} }, "latency": { "score_ms_mean": 1.8, "score_ms_p95": 2.4, "sequences_per_sec": 653 }, "per_context": { "occasion": { "business": {"f1_val": 0.923, "f1_test": 0.917}, "casual": {"f1_val": 0.909, "f1_test": 0.902}, "formal": {"f1_val": 0.918, "f1_test": 0.911}, "sport": {"f1_val": 0.903, "f1_test": 0.897} }, "weather": { "hot": {"f1_val": 0.912, "f1_test": 0.906}, "cold": {"f1_val": 0.916, "f1_test": 0.909}, "mild": {"f1_val": 0.914, "f1_test": 0.907}, "rain": {"f1_val": 0.905, "f1_test": 0.898} } }, "summary": { "total_outfit_scores": 53306, "total_sequences_seen": 3180000, "avg_sequence_length": 3.7 } }, "artifacts": { "checkpoints": [ {"epoch": 52, "path": "artifacts/vit_outfit_52_0.391.pth", "size_mb": 329.1}, {"epoch": 60, "path": "artifacts/vit_outfit_60_0.394.pth", "size_mb": 329.2} ], "logs": { "tensorboard": "artifacts/tb/vit_outfit", "metrics_json": "artifacts/metrics/vit_full_run.json" }, "exported": { "onnx": {"path": "artifacts/export/vit_outfit.onnx", "opset": 17}, "torchscript": {"path": "artifacts/export/vit_outfit.ts"} } } }, "production_readiness": { "serving": { "inference_framework": "TorchScript", "runtime": "Triton Inference Server", "hardware": "A10G recommended", "batching": {"max_batch": 64, "max_delay_ms": 10}, "latency_slo_ms": 80, "qps_target": 500, "autoscaling": {"policy": "HPA", "metric": "GPU_UTILIZATION", "target": 0.7} }, "monitoring": { "dashboards": [ "Score latency p50/p95/p99", "Throughput (seq/s)", "GPU Utilization/Memory", "Calibration drift (ECE)", "ROC/PR AUC on shadow eval", "Per-context F1 (occasion/weather)" ], "alerts": [ {"name": 
"latency_p95_slo_breach", "threshold_ms": 120, "for": "5m"}, {"name": "auc_drop_gt_2pts", "threshold": -0.02, "for": "60m"} ] }, "security_privacy": { "data_minimization": true, "artifact_signing": true, "container_sbom": true }, "cost_estimates": { "gpu_hourly_usd": 1.8, "replicas": 2, "monthly_usd": 2592 } }, "summary_findings": { "concise_trends": [ "Data scaling from 5k to 53k outfits lifts ROC AUC by ~5 points and improves coherent-set hit@10 by ~10 points.", "Best configuration uses 8 layers, 8 heads, FF×4, dropout 0.1, lr=3.5e-4, batch=8 with a cosine LR schedule and 5 warmup epochs.", "Batch 8 balances semi-hard dynamics and stability; batch 16 is similar but has slightly worse triplet separation.", "Dropout 0.1 regularizes without harming compatibility signals; 0.0 tends to overfit and 0.3 erodes positives.", "Embedding dimensions 512 and 768 perform similarly; 512 is preferred for latency/memory.", "Heads=8 slightly better than 12 in this regime; depth=8 outperforms 6 and 10 by small margins." ] }, "appendix": { "metric_definitions": { "triplet_loss": "Margin-based loss for sequences via pooled item embeddings.", "outfit_score": "Scalar in [0,1] representing predicted outfit compatibility.", "coherent_set_hit_rate@k": "Probability a coherent variant of an outfit appears in top-k ranked candidates.", "roc_auc": "Area under ROC; threshold-independent binary classification measure.", "pr_auc": "Area under Precision-Recall curve; more informative for class imbalance.", "ece": "Expected Calibration Error; lower indicates better confidence calibration.", "brier": "Mean squared error between forecast probabilities and outcomes.", "sequences_per_sec": "Throughput during training/inference for sequence-level scoring." 
}, "evaluation_protocol": { "splits": {"train": 53306, "val": 5000, "test": 5000}, "binary_labels": "Compatible vs incompatible outfit pairs constructed via negative sampling.", "threshold_selection": {"method": "YoudenJ", "grid": [0.3,0.35,0.4,0.45,0.5,0.52,0.55,0.6]}, "latency_measurement": { "mode": "fp16", "batch": 64, "warmup": 50, "iters": 500, "note": "Measured without data loading using synthetic tensors; accounts for encoder+head only." } }, "curves": { "val_metrics_over_epochs": [ {"epoch": 1, "triplet": 1.268, "roc_auc": 0.812, "pr_auc": 0.775}, {"epoch": 5, "triplet": 0.929, "roc_auc": 0.873, "pr_auc": 0.846}, {"epoch": 10, "triplet": 0.768, "roc_auc": 0.906, "pr_auc": 0.885}, {"epoch": 15, "triplet": 0.664, "roc_auc": 0.922, "pr_auc": 0.903}, {"epoch": 20, "triplet": 0.590, "roc_auc": 0.934, "pr_auc": 0.915}, {"epoch": 25, "triplet": 0.533, "roc_auc": 0.943, "pr_auc": 0.925}, {"epoch": 30, "triplet": 0.487, "roc_auc": 0.949, "pr_auc": 0.931}, {"epoch": 35, "triplet": 0.450, "roc_auc": 0.952, "pr_auc": 0.936}, {"epoch": 40, "triplet": 0.418, "roc_auc": 0.955, "pr_auc": 0.939}, {"epoch": 45, "triplet": 0.394, "roc_auc": 0.956, "pr_auc": 0.940}, {"epoch": 52, "triplet": 0.391, "roc_auc": 0.957, "pr_auc": 0.941}, {"epoch": 60, "triplet": 0.394, "roc_auc": 0.956, "pr_auc": 0.940} ], "reliability_diagram_bins": [ {"bin": "0.0-0.1", "count": 3200, "avg_conf": 0.06, "acc": 0.07}, {"bin": "0.1-0.2", "count": 4800, "avg_conf": 0.15, "acc": 0.16}, {"bin": "0.2-0.3", "count": 6200, "avg_conf": 0.25, "acc": 0.26}, {"bin": "0.3-0.4", "count": 7300, "avg_conf": 0.35, "acc": 0.36}, {"bin": "0.4-0.5", "count": 8100, "avg_conf": 0.45, "acc": 0.46}, {"bin": "0.5-0.6", "count": 8800, "avg_conf": 0.55, "acc": 0.56}, {"bin": "0.6-0.7", "count": 9100, "avg_conf": 0.65, "acc": 0.64}, {"bin": "0.7-0.8", "count": 9600, "avg_conf": 0.75, "acc": 0.74}, {"bin": "0.8-0.9", "count": 10000, "avg_conf": 0.85, "acc": 0.84}, {"bin": "0.9-1.0", "count": 10400, "avg_conf": 0.93, "acc": 
0.92} ] }, "slice_metrics": { "occasion": [ {"slice": "business", "f1_test": 0.917, "support": 4100}, {"slice": "casual", "f1_test": 0.902, "support": 5100}, {"slice": "formal", "f1_test": 0.911, "support": 2800}, {"slice": "sport", "f1_test": 0.897, "support": 3300} ], "weather": [ {"slice": "hot", "f1_test": 0.906, "support": 3600}, {"slice": "cold", "f1_test": 0.909, "support": 3700}, {"slice": "mild", "f1_test": 0.907, "support": 4200}, {"slice": "rain", "f1_test": 0.898, "support": 1800} ] }, "negative_sampling": { "methods": ["random", "in-batch", "hard via top-k distance"], "mixing": {"random": 0.5, "in_batch": 0.3, "hard": 0.2}, "notes": "Hard negatives sourced using previous epoch embeddings to avoid label leakage." }, "serving_benchmarks": { "hardware": [ {"gpu": "T4 16GB", "batch": 64, "score_ms_mean": 2.6, "seq_per_sec": 440}, {"gpu": "A10G 24GB", "batch": 64, "score_ms_mean": 2.1, "seq_per_sec": 520}, {"gpu": "A100 40GB", "batch": 64, "score_ms_mean": 1.8, "seq_per_sec": 653} ], "notes": "Measured with fp16, cudnn_benchmark on; includes encoder + head." } } }