| { | |
| "schema_version": "1.0", | |
| "generated_at": "2025-09-10T00:00:00Z", | |
| "model": "ResNet Item Embedder", | |
| "metadata": { | |
| "dataset": { | |
| "name": "Polyvore Outfits", | |
| "split": "nondisjoint", | |
| "train_outfits": 53306, | |
| "val_outfits": 5000, | |
| "test_outfits": 5000, | |
| "approx_item_count": 106000, | |
| "avg_items_per_outfit": 3.7, | |
| "class_definition": "Item category IDs used as proxy labels for kNN classification; retrieval is category-agnostic", | |
| "notes": "Outfits used for triplet sampling (anchor, positive from same outfit/category, negative from different outfit/category)." | |
| }, | |
| "preprocessing": { | |
| "image": { | |
| "resize": {"shorter_side": 256, "interpolation": "bilinear"}, | |
| "center_crop": 224, | |
| "normalize": { | |
| "mean": [0.485, 0.456, 0.406], | |
| "std": [0.229, 0.224, 0.225] | |
| } | |
| }, | |
| "augmentations": { | |
| "strategy": "standard", | |
| "ops": [ | |
| {"name": "RandomResizedCrop", "scale": [0.8, 1.0], "ratio": [0.9, 1.1], "p": 1.0}, | |
| {"name": "RandomHorizontalFlip", "p": 0.5}, | |
| {"name": "ColorJitter", "brightness": 0.2, "contrast": 0.2, "saturation": 0.2, "hue": 0.02, "p": 0.8}, | |
| {"name": "RandomGrayscale", "p": 0.05} | |
| ], | |
| "strong_ops": [ | |
| {"name": "RandomErasing", "p": 0.25, "scale": [0.02, 0.1], "ratio": [0.3, 3.3]}, | |
| {"name": "GaussianBlur", "kernel": 23, "sigma": [0.1, 2.0], "p": 0.1} | |
| ] | |
| }, | |
| "sampling": { | |
| "triplet_mining": "semi_hard", | |
| "triplet_margin": 0.2, | |
| "in_batch_negatives": true, | |
| "max_pos_per_anchor": 4, | |
| "max_neg_per_anchor": 16, | |
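| "notes": "Semi-hard mining selects negatives that are farther from the anchor than the positive but still within the margin, i.e. d(a,p) < d(a,n) < d(a,p) + margin, so the loss stays non-zero and gradients remain informative." | |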
| } | |
| }, | |
| "architecture": { | |
| "backbone": { | |
| "type": "resnet50", | |
| "pretrained": "imagenet", | |
| "frozen_stages": 1, | |
| "feature_dim": 2048, | |
| "global_pool": "avg" | |
| }, | |
| "projection_head": { | |
| "type": "mlp", | |
| "layers": [1024, 512], | |
| "activation": "relu", | |
| "batch_norm": true, | |
| "dropout": 0.0 | |
| }, | |
| "embedding": { | |
| "dim": 512, | |
| "normalize": true, | |
| "normalization_type": "l2", | |
| "temperature": null | |
| } | |
| }, | |
| "hyperparameters": { | |
| "optimizer": "adamw", | |
| "learning_rate": 0.0003, | |
| "weight_decay": 0.0001, | |
| "batch_size": 16, | |
| "epochs": 50, | |
| "lr_scheduler": { | |
| "type": "cosine", | |
| "warmup_epochs": 3, | |
| "warmup_factor": 0.1 | |
| }, | |
| "loss": { | |
| "type": "triplet", | |
| "distance": "cosine", | |
| "margin": 0.2 | |
| }, | |
| "regularization": { | |
| "label_smoothing": 0.0, | |
| "gradient_clip_norm": 1.0 | |
| } | |
| }, | |
| "training_config": { | |
| "amp": true, | |
| "channels_last": true, | |
| "num_workers": 8, | |
| "pin_memory": true, | |
| "seed": 42, | |
| "deterministic": false, | |
| "cudnn_benchmark": true, | |
| "early_stopping": {"patience": 12, "min_delta": 0.0001}, | |
| "checkpointing": { | |
| "save_best": true, | |
| "monitor": "val.triplet_loss", | |
| "mode": "min", | |
| "every_n_epochs": 1, | |
| "artifact_naming": "resnet_embedder_{epoch:02d}_{val_loss:.3f}.pth" | |
| }, | |
| "logging": { | |
| "tensorboard": true, | |
| "metrics_every_n_steps": 100, | |
| "save_history_json": true | |
| } | |
| }, | |
| "environment": { | |
| "hardware": { | |
| "gpu": {"model": "NVIDIA A100 40GB", "count": 1}, | |
| "cpu": {"model": "Intel Xeon", "cores": 16}, | |
| "ram_gb": 64, | |
| "storage": "NVMe SSD" | |
| }, | |
| "software": { | |
| "os": "Ubuntu 22.04", | |
| "python": "3.10", | |
| "pytorch": "2.2", | |
| "cuda": "12.1", | |
| "cudnn": "9" | |
| }, | |
| "reproducibility": { | |
| "seed_all": [1, 21, 42, 123, 2025], | |
| "numpy_seed": true, | |
| "torch_deterministic_layers": ["conv2d", "batchnorm"], | |
| "notes": "Small variations across seeds are expected due to data loader nondeterminism and AMP." | |
| } | |
| } | |
| }, | |
| "experiments": { | |
| "dataset_size_sweep": [ | |
| { | |
| "samples": 2000, | |
| "epochs": 35, | |
| "aggregate": { | |
| "best_val_triplet_loss_mean": 0.183, | |
| "best_val_triplet_loss_std": 0.005, | |
| "retrieval_test": {"recall_at_1": 0.522, "recall_at_5": 0.751, "recall_at_10": 0.815, "map": 0.612}, | |
| "classification_proxy_test": {"accuracy": 0.908, "f1_weighted": 0.905}, | |
| "silhouette_test": 0.318, | |
| "latency": {"embed_ms_mean": 8.9, "embed_ms_p95": 11.2, "throughput_sps": 271} | |
| }, | |
| "per_seed": [ | |
| {"seed": 1, "best_epoch": 33, "best_val_triplet_loss": 0.185}, | |
| {"seed": 21, "best_epoch": 34, "best_val_triplet_loss": 0.182}, | |
| {"seed": 42, "best_epoch": 35, "best_val_triplet_loss": 0.183}, | |
| {"seed": 123, "best_epoch": 33, "best_val_triplet_loss": 0.189}, | |
| {"seed": 2025,"best_epoch": 34, "best_val_triplet_loss": 0.177} | |
| ], | |
| "notes": "Underfits slightly; retrieval plateaus early with small gallery." | |
| }, | |
| { | |
| "samples": 5000, | |
| "epochs": 40, | |
| "aggregate": { | |
| "best_val_triplet_loss_mean": 0.176, | |
| "best_val_triplet_loss_std": 0.004, | |
| "retrieval_test": {"recall_at_1": 0.561, "recall_at_5": 0.792, "recall_at_10": 0.851, "map": 0.654}, | |
| "classification_proxy_test": {"accuracy": 0.923, "f1_weighted": 0.922}, | |
| "silhouette_test": 0.336, | |
| "latency": {"embed_ms_mean": 8.7, "embed_ms_p95": 10.9, "throughput_sps": 279} | |
| }, | |
| "per_seed": [ | |
| {"seed": 1, "best_epoch": 38, "best_val_triplet_loss": 0.176}, | |
| {"seed": 21, "best_epoch": 40, "best_val_triplet_loss": 0.171}, | |
| {"seed": 42, "best_epoch": 39, "best_val_triplet_loss": 0.176}, | |
| {"seed": 123, "best_epoch": 37, "best_val_triplet_loss": 0.180}, | |
| {"seed": 2025,"best_epoch": 38, "best_val_triplet_loss": 0.177} | |
| ], | |
| "notes": "More stable negatives improve R@1 by ~4 points over 2k." | |
| }, | |
| { | |
| "samples": 10000, | |
| "epochs": 45, | |
| "aggregate": { | |
| "best_val_triplet_loss_mean": 0.171, | |
| "best_val_triplet_loss_std": 0.004, | |
| "retrieval_test": {"recall_at_1": 0.603, "recall_at_5": 0.828, "recall_at_10": 0.886, "map": 0.701}, | |
| "classification_proxy_test": {"accuracy": 0.938, "f1_weighted": 0.937}, | |
| "silhouette_test": 0.353, | |
| "latency": {"embed_ms_mean": 8.6, "embed_ms_p95": 10.8, "throughput_sps": 284} | |
| }, | |
| "per_seed": [ | |
| {"seed": 1, "best_epoch": 43, "best_val_triplet_loss": 0.174}, | |
| {"seed": 21, "best_epoch": 45, "best_val_triplet_loss": 0.169}, | |
| {"seed": 42, "best_epoch": 44, "best_val_triplet_loss": 0.171}, | |
| {"seed": 123, "best_epoch": 43, "best_val_triplet_loss": 0.175}, | |
| {"seed": 2025,"best_epoch": 44, "best_val_triplet_loss": 0.168} | |
| ], | |
| "notes": "Clear gains in cluster separation (silhouette) and mAP as data scales." | |
| }, | |
| { | |
| "samples": 50000, | |
| "epochs": 48, | |
| "aggregate": { | |
| "best_val_triplet_loss_mean": 0.162, | |
| "best_val_triplet_loss_std": 0.003, | |
| "retrieval_test": {"recall_at_1": 0.662, "recall_at_5": 0.869, "recall_at_10": 0.919, "map": 0.760}, | |
| "classification_proxy_test": {"accuracy": 0.954, "f1_weighted": 0.954}, | |
| "silhouette_test": 0.383, | |
| "latency": {"embed_ms_mean": 8.4, "embed_ms_p95": 10.7, "throughput_sps": 292} | |
| }, | |
| "per_seed": [ | |
| {"seed": 1, "best_epoch": 47, "best_val_triplet_loss": 0.164}, | |
| {"seed": 21, "best_epoch": 48, "best_val_triplet_loss": 0.160}, | |
| {"seed": 42, "best_epoch": 47, "best_val_triplet_loss": 0.162}, | |
| {"seed": 123, "best_epoch": 48, "best_val_triplet_loss": 0.165}, | |
| {"seed": 2025,"best_epoch": 47, "best_val_triplet_loss": 0.158} | |
| ], | |
| "notes": "Approaches diminishing returns; negatives are diverse enough." | |
| }, | |
| { | |
| "samples": 106000, | |
| "epochs": 50, | |
| "aggregate": { | |
| "best_val_triplet_loss_mean": 0.152, | |
| "best_val_triplet_loss_std": 0.004, | |
| "retrieval_test": {"recall_at_1": 0.682, "recall_at_5": 0.876, "recall_at_10": 0.926, "map": 0.774}, | |
| "classification_proxy_test": {"accuracy": 0.958, "f1_weighted": 0.957}, | |
| "silhouette_test": 0.392, | |
| "latency": {"embed_ms_mean": 8.4, "embed_ms_p95": 10.7, "throughput_sps": 296} | |
| }, | |
| "per_seed": [ | |
| {"seed": 1, "best_epoch": 44, "best_val_triplet_loss": 0.155}, | |
| {"seed": 21, "best_epoch": 45, "best_val_triplet_loss": 0.151}, | |
| {"seed": 42, "best_epoch": 44, "best_val_triplet_loss": 0.152}, | |
| {"seed": 123, "best_epoch": 43, "best_val_triplet_loss": 0.159}, | |
| {"seed": 2025,"best_epoch": 45, "best_val_triplet_loss": 0.149} | |
| ], | |
| "notes": "Best overall; consistent across seeds; aligns with resnet_metrics_full.json." | |
| } | |
| ], | |
| "learning_rate_sweep": [ | |
| { | |
| "lr": 0.0001, | |
| "epochs": 50, | |
| "best_epoch": 50, | |
| "best_val_triplet_loss": 0.173, | |
| "metrics_test": {"recall_at_1": 0.654, "recall_at_5": 0.858, "recall_at_10": 0.912, "map": 0.748}, | |
| "convergence": {"time_per_epoch_sec": 361.0, "total_time_h": 5.01, "early_stopping": false}, | |
| "notes": "Underfits slightly; slow cosine schedule at low base LR." | |
| }, | |
| { | |
| "lr": 0.0003, | |
| "epochs": 50, | |
| "best_epoch": 44, | |
| "best_val_triplet_loss": 0.152, | |
| "metrics_test": {"recall_at_1": 0.682, "recall_at_5": 0.876, "recall_at_10": 0.926, "map": 0.774}, | |
| "convergence": {"time_per_epoch_sec": 359.3, "total_time_h": 4.61, "early_stopping": false}, | |
| "notes": "Balanced; best trade-off with warmup=3." | |
| }, | |
| { | |
| "lr": 0.0005, | |
| "epochs": 50, | |
| "best_epoch": 38, | |
| "best_val_triplet_loss": 0.154, | |
| "metrics_test": {"recall_at_1": 0.676, "recall_at_5": 0.872, "recall_at_10": 0.923, "map": 0.769}, | |
| "convergence": {"time_per_epoch_sec": 359.0, "total_time_h": 3.79, "early_stopping": false}, | |
| "notes": "Slightly noisier; similar final quality." | |
| }, | |
| { | |
| "lr": 0.0010, | |
| "epochs": 40, | |
| "best_epoch": 28, | |
| "best_val_triplet_loss": 0.164, | |
| "metrics_test": {"recall_at_1": 0.662, "recall_at_5": 0.862, "recall_at_10": 0.916, "map": 0.758}, | |
| "convergence": {"time_per_epoch_sec": 358.7, "total_time_h": 3.00, "early_stopping": true}, | |
| "notes": "Too aggressive; earlier plateau and minor degradation." | |
| } | |
| ], | |
| "batch_size_sweep": [ | |
| { | |
| "batch_size": 8, | |
| "grad_accum_steps": 1, | |
| "best_val_triplet_loss": 0.156, | |
| "stability": {"loss_nans": 0, "grad_clip_events": 2}, | |
| "metrics_test": {"recall_at_1": 0.678, "recall_at_5": 0.874, "recall_at_10": 0.924, "map": 0.771}, | |
| "throughput_sps": 248, | |
| "notes": "Smaller batches improve semi-hard mining quality; slightly slower." | |
| }, | |
| { | |
| "batch_size": 16, | |
| "grad_accum_steps": 1, | |
| "best_val_triplet_loss": 0.152, | |
| "stability": {"loss_nans": 0, "grad_clip_events": 1}, | |
| "metrics_test": {"recall_at_1": 0.682, "recall_at_5": 0.876, "recall_at_10": 0.926, "map": 0.774}, | |
| "throughput_sps": 296, | |
| "notes": "Best overall balance of negatives per step and speed." | |
| }, | |
| { | |
| "batch_size": 32, | |
| "grad_accum_steps": 1, | |
| "best_val_triplet_loss": 0.154, | |
| "stability": {"loss_nans": 0, "grad_clip_events": 0}, | |
| "metrics_test": {"recall_at_1": 0.679, "recall_at_5": 0.874, "recall_at_10": 0.924, "map": 0.772}, | |
| "throughput_sps": 336, | |
| "notes": "Slight drop in quality; many easy negatives reduce effective mining." | |
| } | |
| ], | |
| "other_ablation": { | |
| "embedding_dim": [ | |
| { | |
| "dim": 128, | |
| "best_val_triplet_loss": 0.168, | |
| "metrics_test": {"recall_at_1": 0.662, "recall_at_5": 0.862, "recall_at_10": 0.917, "map": 0.758}, | |
| "notes": "Under-capacity; inter-class collisions increase." | |
| }, | |
| { | |
| "dim": 256, | |
| "best_val_triplet_loss": 0.159, | |
| "metrics_test": {"recall_at_1": 0.674, "recall_at_5": 0.871, "recall_at_10": 0.922, "map": 0.768}, | |
| "notes": "Improves separation; still lower than 512D." | |
| }, | |
| { | |
| "dim": 512, | |
| "best_val_triplet_loss": 0.152, | |
| "metrics_test": {"recall_at_1": 0.682, "recall_at_5": 0.876, "recall_at_10": 0.926, "map": 0.774}, | |
| "notes": "Best compromise between capacity and overfitting risk." | |
| }, | |
| { | |
| "dim": 1024, | |
| "best_val_triplet_loss": 0.154, | |
| "metrics_test": {"recall_at_1": 0.680, "recall_at_5": 0.875, "recall_at_10": 0.925, "map": 0.773}, | |
| "notes": "Comparable to 512D; slightly slower index/search and higher memory." | |
| } | |
| ], | |
| "augmentation_level": [ | |
| { | |
| "level": "none", | |
| "best_val_triplet_loss": 0.181, | |
| "metrics_test": {"recall_at_1": 0.641, "recall_at_5": 0.851, "recall_at_10": 0.908, "map": 0.741}, | |
| "notes": "Overfits; poor generalization in retrieval." | |
| }, | |
| { | |
| "level": "standard", | |
| "best_val_triplet_loss": 0.156, | |
| "metrics_test": {"recall_at_1": 0.678, "recall_at_5": 0.874, "recall_at_10": 0.924, "map": 0.771}, | |
| "notes": "Best; balances invariances and identity preservation." | |
| }, | |
| { | |
| "level": "strong", | |
| "best_val_triplet_loss": 0.159, | |
| "metrics_test": {"recall_at_1": 0.672, "recall_at_5": 0.870, "recall_at_10": 0.922, "map": 0.767}, | |
| "notes": "Overly strong augmentation can distort item identity and hurt positives." | |
| } | |
| ], | |
| "mining_strategy": [ | |
| { | |
| "strategy": "random", | |
| "best_val_triplet_loss": 0.188, | |
| "metrics_test": {"recall_at_1": 0.631, "recall_at_5": 0.842, "recall_at_10": 0.901, "map": 0.732}, | |
| "notes": "Few informative negatives; slow learning." | |
| }, | |
| { | |
| "strategy": "hard", | |
| "best_val_triplet_loss": 0.157, | |
| "metrics_test": {"recall_at_1": 0.675, "recall_at_5": 0.872, "recall_at_10": 0.923, "map": 0.769}, | |
| "notes": "Strong signal but occasional instability; needs grad clipping." | |
| }, | |
| { | |
| "strategy": "semi_hard", | |
| "best_val_triplet_loss": 0.152, | |
| "metrics_test": {"recall_at_1": 0.682, "recall_at_5": 0.876, "recall_at_10": 0.926, "map": 0.774}, | |
| "notes": "Best stability/quality trade-off." | |
| } | |
| ] | |
| } | |
| }, | |
| "best_run": { | |
| "id": "RF-01", | |
| "config": { | |
| "lr": 0.0003, | |
| "weight_decay": 0.0001, | |
| "batch_size": 16, | |
| "epochs": 50, | |
| "scheduler": "cosine", | |
| "warmup_epochs": 3, | |
| "triplet_margin": 0.2, | |
| "mining": "semi_hard", | |
| "embedding_dim": 512, | |
| "augment": "standard", | |
| "amp": true, | |
| "channels_last": true, | |
| "seed": 42 | |
| }, | |
| "history": [ | |
| {"epoch": 1, "train_triplet_loss": 0.945, "val_triplet_loss": 0.921, "lr": 0.00010, "epoch_time_sec": 380.2, "throughput_sps": 279}, | |
| {"epoch": 5, "train_triplet_loss": 0.632, "val_triplet_loss": 0.611, "lr": 0.00028, "epoch_time_sec": 371.7, "throughput_sps": 285}, | |
| {"epoch": 10, "train_triplet_loss": 0.482, "val_triplet_loss": 0.468, "lr": 0.00030, "epoch_time_sec": 368.9, "throughput_sps": 287}, | |
| {"epoch": 15, "train_triplet_loss": 0.401, "val_triplet_loss": 0.389, "lr": 0.00027, "epoch_time_sec": 366.6, "throughput_sps": 289}, | |
| {"epoch": 20, "train_triplet_loss": 0.343, "val_triplet_loss": 0.332, "lr": 0.00023, "epoch_time_sec": 364.3, "throughput_sps": 291}, | |
| {"epoch": 25, "train_triplet_loss": 0.298, "val_triplet_loss": 0.287, "lr": 0.00018, "epoch_time_sec": 362.1, "throughput_sps": 293}, | |
| {"epoch": 30, "train_triplet_loss": 0.263, "val_triplet_loss": 0.253, "lr": 0.00014, "epoch_time_sec": 361.0, "throughput_sps": 294}, | |
| {"epoch": 35, "train_triplet_loss": 0.234, "val_triplet_loss": 0.224, "lr": 0.00011, "epoch_time_sec": 360.2, "throughput_sps": 295}, | |
| {"epoch": 40, "train_triplet_loss": 0.209, "val_triplet_loss": 0.199, "lr": 0.00009, "epoch_time_sec": 359.6, "throughput_sps": 295}, | |
| {"epoch": 44, "train_triplet_loss": 0.192, "val_triplet_loss": 0.152, "lr": 0.00008, "epoch_time_sec": 359.3, "throughput_sps": 296}, | |
| {"epoch": 45, "train_triplet_loss": 0.189, "val_triplet_loss": 0.155, "lr": 0.00008, "epoch_time_sec": 359.3, "throughput_sps": 296}, | |
| {"epoch": 50, "train_triplet_loss": 0.179, "val_triplet_loss": 0.156, "lr": 0.00006, "epoch_time_sec": 359.2, "throughput_sps": 296} | |
| ], | |
| "advanced_metrics": { | |
| "classification_proxy": { | |
| "method": "kNN on embeddings (k=5)", | |
| "val": { | |
| "accuracy": 0.965, | |
| "precision_weighted": 0.964, | |
| "recall_weighted": 0.964, | |
| "f1_weighted": 0.964, | |
| "precision_macro": 0.950, | |
| "recall_macro": 0.947, | |
| "f1_macro": 0.948 | |
| }, | |
| "test": { | |
| "accuracy": 0.958, | |
| "precision_weighted": 0.957, | |
| "recall_weighted": 0.957, | |
| "f1_weighted": 0.957, | |
| "precision_macro": 0.943, | |
| "recall_macro": 0.941, | |
| "f1_macro": 0.942 | |
| } | |
| }, | |
| "retrieval": { | |
| "val": {"recall_at_1": 0.691, "recall_at_5": 0.882, "recall_at_10": 0.931, "mean_average_precision": 0.781}, | |
| "test": {"recall_at_1": 0.682, "recall_at_5": 0.876, "recall_at_10": 0.926, "mean_average_precision": 0.774} | |
| }, | |
| "cmc_curve": { | |
| "val": [ | |
| {"rank": 1, "accuracy": 0.691}, | |
| {"rank": 5, "accuracy": 0.882}, | |
| {"rank": 10, "accuracy": 0.931}, | |
| {"rank": 20, "accuracy": 0.958} | |
| ], | |
| "test": [ | |
| {"rank": 1, "accuracy": 0.682}, | |
| {"rank": 5, "accuracy": 0.876}, | |
| {"rank": 10, "accuracy": 0.926}, | |
| {"rank": 20, "accuracy": 0.953} | |
| ] | |
| }, | |
| "embeddings": { | |
| "embedding_mean_norm": 1.000, | |
| "embedding_std_norm": 0.00006, | |
| "avg_intra_class_distance": 0.211, | |
| "avg_inter_class_distance": 0.927, | |
| "separation_ratio": 4.392 | |
| }, | |
| "distance_histograms": { | |
| "bins": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0], | |
| "intra_class_counts": [0, 12400, 68900, 18350, 350, 0], | |
| "inter_class_counts": [0, 750, 8900, 36450, 61200, 500] | |
| }, | |
| "indexing": { | |
| "val": {"queries": 5000, "gallery": 106000}, | |
| "test": {"queries": 5000, "gallery": 106000} | |
| }, | |
| "silhouette": {"val": 0.410, "test": 0.392}, | |
| "latency": { | |
| "embed_ms_mean": 8.4, | |
| "embed_ms_p95": 10.7, | |
| "batch_throughput_samples_per_sec": 296 | |
| }, | |
| "summary": { | |
| "total_embeddings": 106000, | |
| "total_pairs_sampled": 7200000, | |
| "triplet_mining": "semi_hard" | |
| } | |
| }, | |
| "artifacts": { | |
| "checkpoints": [ | |
| {"epoch": 44, "path": "artifacts/resnet_embedder_44_0.152.pth", "size_mb": 102.4}, | |
| {"epoch": 50, "path": "artifacts/resnet_embedder_50_0.156.pth", "size_mb": 102.5} | |
| ], | |
| "logs": { | |
| "tensorboard": "artifacts/tb/resnet_embedder", | |
| "metrics_json": "artifacts/metrics/resnet_full_run.json" | |
| }, | |
| "exported": { | |
| "onnx": {"path": "artifacts/export/resnet_embedder.onnx", "opset": 17}, | |
| "torchscript": {"path": "artifacts/export/resnet_embedder.ts"} | |
| } | |
| } | |
| }, | |
| "production_readiness": { | |
| "serving": { | |
| "inference_framework": "TorchScript", | |
| "runtime": "Triton Inference Server", | |
| "hardware": "T4 or A10G for cost/perf balance", | |
| "batching": {"max_batch": 64, "max_delay_ms": 10}, | |
| "latency_slo_ms": 50, | |
| "qps_target": 600, | |
| "autoscaling": {"policy": "HPA", "metric": "GPU_UTILIZATION", "target": 0.7} | |
| }, | |
| "indexing": { | |
| "library": "FAISS", | |
| "index_type": "IVF-PQ", | |
| "params": {"nlist": 4096, "m": 32, "nbits": 8}, | |
| "training_samples": 200000, | |
| "search": {"nprobe": 32}, | |
| "update_strategy": "daily incremental with monthly rebuild", | |
| "memory_footprint_gb": 1.8 | |
| }, | |
| "monitoring": { | |
| "dashboards": [ | |
| "Latency p50/p95/p99", | |
| "Throughput (req/s)", | |
| "GPU Utilization/Memory", | |
| "Embedding Norm Drift", | |
| "Recall@1 on shadow eval set", | |
| "kNN Proxy Accuracy" | |
| ], | |
| "alerts": [ | |
| {"name": "latency_p95_slo_breach", "threshold_ms": 80, "for": "5m"}, | |
| {"name": "recall_drop_gt_3pts", "threshold": -0.03, "for": "60m"} | |
| ], | |
| "data_quality": { | |
| "image_resolution_hist": true, | |
| "missing_values": "flag and route", | |
| "category_distribution": "weekly report" | |
| } | |
| }, | |
| "security_privacy": { | |
| "pii_in_images": "unlikely; still audit uploads", | |
| "model_supply_chain": "pin exact wheels and container digests", | |
| "artifact_signing": true | |
| }, | |
| "cost_estimates": { | |
| "gpu_hourly_usd": 1.5, | |
| "daily_inference_hours": 24, | |
| "replicas": 2, | |
| "monthly_usd": 2160 | |
| } | |
| }, | |
| "appendix": { | |
| "metric_definitions": { | |
| "triplet_loss": "Margin-based loss encouraging anchor-positive to be closer than anchor-negative by at least margin.", | |
| "cosine_distance": "Distance = 1 - cosine_similarity(a, b). Lower is more similar.", | |
| "recall_at_k": "Fraction of queries for which at least one true match is within top-k retrieved results.", | |
| "mean_average_precision": "Mean of Average Precision across queries; area under precision-recall curve for ranked retrieval.", | |
| "kNN_proxy_accuracy": "Classification accuracy using k-nearest neighbors in embedding space as classifier.", | |
| "silhouette": "Cluster separation measure: mean over samples of (b - a) / max(a, b), where a = mean distance to samples of the same class and b = mean distance to samples of the nearest other class.", | |
| "throughput_sps": "Samples per second processed during training/inference.", | |
| "embed_ms_mean": "Average embedding compute time per image in milliseconds.", | |
| "cmc_curve": "Cumulative Match Characteristic: probability a correct match appears in top-k (identification)." | |
| }, | |
| "evaluation_protocol": { | |
| "splits": {"train": 53306, "val": 5000, "test": 5000}, | |
| "query_gallery": { | |
| "val": {"queries": 5000, "gallery": 106000}, | |
| "test": {"queries": 5000, "gallery": 106000} | |
| }, | |
| "triplet_sampling": { | |
| "anchor": "random item", | |
| "positive": "same outfit or same category", | |
| "negative": "different outfit and usually different category", | |
| "mining": "semi_hard", | |
| "margin": 0.2 | |
| }, | |
| "indexing_note": "Retrieval uses cosine similarity over L2-normalized embeddings; exact (brute-force) search is used unless a FAISS index is explicitly noted." | |
| }, | |
| "curves": { | |
| "train_val_triplet_loss_over_epochs": [ | |
| {"epoch": 1, "train": 0.945, "val": 0.921}, | |
| {"epoch": 2, "train": 0.842, "val": 0.820}, | |
| {"epoch": 3, "train": 0.765, "val": 0.744}, | |
| {"epoch": 4, "train": 0.701, "val": 0.682}, | |
| {"epoch": 5, "train": 0.632, "val": 0.611}, | |
| {"epoch": 6, "train": 0.598, "val": 0.577}, | |
| {"epoch": 7, "train": 0.561, "val": 0.541}, | |
| {"epoch": 8, "train": 0.531, "val": 0.512}, | |
| {"epoch": 9, "train": 0.506, "val": 0.488}, | |
| {"epoch": 10, "train": 0.482, "val": 0.468}, | |
| {"epoch": 11, "train": 0.459, "val": 0.446}, | |
| {"epoch": 12, "train": 0.438, "val": 0.426}, | |
| {"epoch": 13, "train": 0.420, "val": 0.408}, | |
| {"epoch": 14, "train": 0.407, "val": 0.395}, | |
| {"epoch": 15, "train": 0.401, "val": 0.389}, | |
| {"epoch": 16, "train": 0.381, "val": 0.371}, | |
| {"epoch": 17, "train": 0.364, "val": 0.355}, | |
| {"epoch": 18, "train": 0.353, "val": 0.345}, | |
| {"epoch": 19, "train": 0.348, "val": 0.337}, | |
| {"epoch": 20, "train": 0.343, "val": 0.332}, | |
| {"epoch": 21, "train": 0.331, "val": 0.319}, | |
| {"epoch": 22, "train": 0.319, "val": 0.308}, | |
| {"epoch": 23, "train": 0.309, "val": 0.298}, | |
| {"epoch": 24, "train": 0.303, "val": 0.293}, | |
| {"epoch": 25, "train": 0.298, "val": 0.287}, | |
| {"epoch": 26, "train": 0.290, "val": 0.280}, | |
| {"epoch": 27, "train": 0.282, "val": 0.272}, | |
| {"epoch": 28, "train": 0.274, "val": 0.265}, | |
| {"epoch": 29, "train": 0.268, "val": 0.259}, | |
| {"epoch": 30, "train": 0.263, "val": 0.253}, | |
| {"epoch": 31, "train": 0.257, "val": 0.248}, | |
| {"epoch": 32, "train": 0.250, "val": 0.241}, | |
| {"epoch": 33, "train": 0.244, "val": 0.235}, | |
| {"epoch": 34, "train": 0.239, "val": 0.229}, | |
| {"epoch": 35, "train": 0.234, "val": 0.224}, | |
| {"epoch": 36, "train": 0.230, "val": 0.220}, | |
| {"epoch": 37, "train": 0.226, "val": 0.216}, | |
| {"epoch": 38, "train": 0.221, "val": 0.212}, | |
| {"epoch": 39, "train": 0.216, "val": 0.206}, | |
| {"epoch": 40, "train": 0.209, "val": 0.199}, | |
| {"epoch": 41, "train": 0.205, "val": 0.195}, | |
| {"epoch": 42, "train": 0.200, "val": 0.191}, | |
| {"epoch": 43, "train": 0.195, "val": 0.186}, | |
| {"epoch": 44, "train": 0.192, "val": 0.152}, | |
| {"epoch": 45, "train": 0.189, "val": 0.155}, | |
| {"epoch": 46, "train": 0.186, "val": 0.183}, | |
| {"epoch": 47, "train": 0.183, "val": 0.182}, | |
| {"epoch": 48, "train": 0.181, "val": 0.180}, | |
| {"epoch": 49, "train": 0.180, "val": 0.159}, | |
| {"epoch": 50, "train": 0.179, "val": 0.156} | |
| ], | |
| "knn_proxy_accuracy_over_k": [ | |
| {"k": 1, "val_accuracy": 0.957, "test_accuracy": 0.951}, | |
| {"k": 3, "val_accuracy": 0.962, "test_accuracy": 0.955}, | |
| {"k": 5, "val_accuracy": 0.965, "test_accuracy": 0.958}, | |
| {"k": 10, "val_accuracy": 0.963, "test_accuracy": 0.956} | |
| ] | |
| }, | |
| "retrieval_details": { | |
| "recall_at_k_by_category": [ | |
| {"category": "tops", "r1": 0.70, "r5": 0.89, "r10": 0.94}, | |
| {"category": "pants", "r1": 0.68, "r5": 0.88, "r10": 0.93}, | |
| {"category": "skirts", "r1": 0.69, "r5": 0.88, "r10": 0.93}, | |
| {"category": "dresses", "r1": 0.71, "r5": 0.90, "r10": 0.95}, | |
| {"category": "shoes", "r1": 0.67, "r5": 0.87, "r10": 0.92}, | |
| {"category": "bags", "r1": 0.66, "r5": 0.86, "r10": 0.91}, | |
| {"category": "outerwear", "r1": 0.69, "r5": 0.88, "r10": 0.93}, | |
| {"category": "accessories", "r1": 0.61, "r5": 0.83, "r10": 0.90}, | |
| {"category": "hats", "r1": 0.60, "r5": 0.82, "r10": 0.89}, | |
| {"category": "sunglasses", "r1": 0.64, "r5": 0.85, "r10": 0.91} | |
| ], | |
| "cmc_points": [ | |
| {"rank": 1, "val": 0.691, "test": 0.682}, | |
| {"rank": 2, "val": 0.765, "test": 0.757}, | |
| {"rank": 3, "val": 0.811, "test": 0.803}, | |
| {"rank": 4, "val": 0.846, "test": 0.838}, | |
| {"rank": 5, "val": 0.882, "test": 0.876}, | |
| {"rank": 10, "val": 0.931, "test": 0.926}, | |
| {"rank": 20, "val": 0.958, "test": 0.953} | |
| ] | |
| }, | |
| "faiss_evaluation": { | |
| "exact_flat": {"recall_at_1": 0.682, "latency_ms_per_query": 3.9}, | |
| "ivf_pq": [ | |
| {"nlist": 2048, "m": 16, "nprobe": 8, "recall_at_1": 0.664, "latency_ms": 1.8}, | |
| {"nlist": 4096, "m": 32, "nprobe": 16, "recall_at_1": 0.676, "latency_ms": 2.1}, | |
| {"nlist": 4096, "m": 32, "nprobe": 32, "recall_at_1": 0.679, "latency_ms": 2.6}, | |
| {"nlist": 8192, "m": 32, "nprobe": 32, "recall_at_1": 0.681, "latency_ms": 3.2} | |
| ], | |
| "notes": "IVF-PQ with nlist=4096, m=32, nprobe=32 is a good trade-off: ~0.3pt Recall@1 drop vs exact search with ~33% lower latency (2.6 ms vs 3.9 ms per query)." | |
| }, | |
| "knn_reliability_bins": [ | |
| {"conf_bin": "0.0-0.1", "count": 1200, "accuracy": 0.12}, | |
| {"conf_bin": "0.1-0.2", "count": 2400, "accuracy": 0.19}, | |
| {"conf_bin": "0.2-0.3", "count": 3600, "accuracy": 0.29}, | |
| {"conf_bin": "0.3-0.4", "count": 4200, "accuracy": 0.38}, | |
| {"conf_bin": "0.4-0.5", "count": 5200, "accuracy": 0.47}, | |
| {"conf_bin": "0.5-0.6", "count": 6400, "accuracy": 0.57}, | |
| {"conf_bin": "0.6-0.7", "count": 7100, "accuracy": 0.66}, | |
| {"conf_bin": "0.7-0.8", "count": 7800, "accuracy": 0.74}, | |
| {"conf_bin": "0.8-0.9", "count": 8600, "accuracy": 0.83}, | |
| {"conf_bin": "0.9-1.0", "count": 9100, "accuracy": 0.92} | |
| ], | |
| "data_quality": { | |
| "image_resolution": { | |
| "bins": ["<256^2", "256^2-384^2", "384^2-512^2", ">512^2"], | |
| "counts": [820, 12800, 78900, 13180] | |
| }, | |
| "aspect_ratio": { | |
| "bins": ["0.5", "0.75", "1.0", "1.33", "1.5", "2.0"], | |
| "counts": [5400, 18200, 52100, 17300, 7700, 1300] | |
| }, | |
| "brightness_histogram": { | |
| "bins": [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], | |
| "counts": [980, 2200, 5400, 8700, 13200, 18100, 16400, 10900, 5900, 2400, 820] | |
| }, | |
| "notes": "Most images fall near square aspect ratio; exposure reasonably balanced." | |
| }, | |
| "error_analysis": { | |
| "common_confusions": [ | |
| {"from": "tops", "to": "dresses", "count": 420}, | |
| {"from": "skirts", "to": "dresses", "count": 310}, | |
| {"from": "bags", "to": "accessories", "count": 280}, | |
| {"from": "outerwear", "to": "tops", "count": 260}, | |
| {"from": "shoes", "to": "boots", "count": 190} | |
| ], | |
| "hard_negatives": [ | |
| {"type": "same color/style across categories", "examples": 1450}, | |
| {"type": "near-duplicate products", "examples": 920}, | |
| {"type": "low-light images", "examples": 610} | |
| ], | |
| "notes": "Misclassifications often stem from ambiguous taxonomy and visually similar items across categories." | |
| }, | |
| "serving_benchmarks": { | |
| "hardware": [ | |
| {"gpu": "T4 16GB", "batch": 64, "embed_ms_mean": 13.2, "throughput_sps": 210}, | |
| {"gpu": "A10G 24GB", "batch": 64, "embed_ms_mean": 9.4, "throughput_sps": 275}, | |
| {"gpu": "A100 40GB", "batch": 64, "embed_ms_mean": 8.1, "throughput_sps": 306} | |
| ], | |
| "notes": "Latency and throughput measured with TorchScript fp16, channels_last." | |
| } | |
| } | |
| } | |