{ "schema_version": "1.0", "generated_at": "2025-09-10T00:00:00Z", "model": "ResNet Item Embedder", "metadata": { "dataset": { "name": "Polyvore Outfits", "split": "nondisjoint", "train_outfits": 53306, "val_outfits": 5000, "test_outfits": 5000, "approx_item_count": 106000, "avg_items_per_outfit": 3.7, "class_definition": "Item category IDs used as proxy labels for kNN classification; retrieval is category-agnostic", "notes": "Outfits used for triplet sampling (anchor, positive from same outfit/category, negative from different outfit/category)." }, "preprocessing": { "image": { "resize": {"shorter_side": 256, "interpolation": "bilinear"}, "center_crop": 224, "normalize": { "mean": [0.485, 0.456, 0.406], "std": [0.229, 0.224, 0.225] } }, "augmentations": { "strategy": "standard", "ops": [ {"name": "RandomResizedCrop", "scale": [0.8, 1.0], "ratio": [0.9, 1.1], "p": 1.0}, {"name": "RandomHorizontalFlip", "p": 0.5}, {"name": "ColorJitter", "brightness": 0.2, "contrast": 0.2, "saturation": 0.2, "hue": 0.02, "p": 0.8}, {"name": "RandomGrayscale", "p": 0.05} ], "strong_ops": [ {"name": "RandomErasing", "p": 0.25, "scale": [0.02, 0.1], "ratio": [0.3, 3.3]}, {"name": "GaussianBlur", "kernel": 23, "sigma": [0.1, 2.0], "p": 0.1} ] }, "sampling": { "triplet_mining": "semi_hard", "triplet_margin": 0.2, "in_batch_negatives": true, "max_pos_per_anchor": 4, "max_neg_per_anchor": 16, "notes": "Semi-hard selects negatives farther than positives but still within margin to improve gradients." 
} }, "architecture": { "backbone": { "type": "resnet50", "pretrained": "imagenet", "frozen_stages": 1, "feature_dim": 2048, "global_pool": "avg" }, "projection_head": { "type": "mlp", "layers": [1024, 512], "activation": "relu", "batch_norm": true, "dropout": 0.0 }, "embedding": { "dim": 512, "normalize": true, "normalization_type": "l2", "temperature": null } }, "hyperparameters": { "optimizer": "adamw", "learning_rate": 0.0003, "weight_decay": 0.0001, "batch_size": 16, "epochs": 50, "lr_scheduler": { "type": "cosine", "warmup_epochs": 3, "warmup_factor": 0.1 }, "loss": { "type": "triplet", "distance": "cosine", "margin": 0.2 }, "regularization": { "label_smoothing": 0.0, "gradient_clip_norm": 1.0 } }, "training_config": { "amp": true, "channels_last": true, "num_workers": 8, "pin_memory": true, "seed": 42, "deterministic": false, "cudnn_benchmark": true, "early_stopping": {"patience": 12, "min_delta": 0.0001}, "checkpointing": { "save_best": true, "monitor": "val.triplet_loss", "mode": "min", "every_n_epochs": 1, "artifact_naming": "resnet_embedder_{epoch:02d}_{val_loss:.3f}.pth" }, "logging": { "tensorboard": true, "metrics_every_n_steps": 100, "save_history_json": true } }, "environment": { "hardware": { "gpu": {"model": "NVIDIA A100 40GB", "count": 1}, "cpu": {"model": "Intel Xeon", "cores": 16}, "ram_gb": 64, "storage": "NVMe SSD" }, "software": { "os": "Ubuntu 22.04", "python": "3.10", "pytorch": "2.2", "cuda": "12.1", "cudnn": "9" }, "reproducibility": { "seed_all": [1, 21, 42, 123, 2025], "numpy_seed": true, "torch_deterministic_layers": ["conv2d", "batchnorm"], "notes": "Small variations across seeds are expected due to data loader nondeterminism and AMP." 
} } }, "experiments": { "dataset_size_sweep": [ { "samples": 2000, "epochs": 35, "aggregate": { "best_val_triplet_loss_mean": 0.183, "best_val_triplet_loss_std": 0.005, "retrieval_test": {"recall_at_1": 0.522, "recall_at_5": 0.751, "recall_at_10": 0.815, "map": 0.612}, "classification_proxy_test": {"accuracy": 0.908, "f1_weighted": 0.905}, "silhouette_test": 0.318, "latency": {"embed_ms_mean": 8.9, "embed_ms_p95": 11.2, "throughput_sps": 271} }, "per_seed": [ {"seed": 1, "best_epoch": 33, "best_val_triplet_loss": 0.185}, {"seed": 21, "best_epoch": 34, "best_val_triplet_loss": 0.182}, {"seed": 42, "best_epoch": 35, "best_val_triplet_loss": 0.183}, {"seed": 123, "best_epoch": 33, "best_val_triplet_loss": 0.189}, {"seed": 2025,"best_epoch": 34, "best_val_triplet_loss": 0.177} ], "notes": "Underfits slightly; retrieval plateaus early with small gallery." }, { "samples": 5000, "epochs": 40, "aggregate": { "best_val_triplet_loss_mean": 0.176, "best_val_triplet_loss_std": 0.004, "retrieval_test": {"recall_at_1": 0.561, "recall_at_5": 0.792, "recall_at_10": 0.851, "map": 0.654}, "classification_proxy_test": {"accuracy": 0.923, "f1_weighted": 0.922}, "silhouette_test": 0.336, "latency": {"embed_ms_mean": 8.7, "embed_ms_p95": 10.9, "throughput_sps": 279} }, "per_seed": [ {"seed": 1, "best_epoch": 38, "best_val_triplet_loss": 0.176}, {"seed": 21, "best_epoch": 40, "best_val_triplet_loss": 0.171}, {"seed": 42, "best_epoch": 39, "best_val_triplet_loss": 0.176}, {"seed": 123, "best_epoch": 37, "best_val_triplet_loss": 0.180}, {"seed": 2025,"best_epoch": 38, "best_val_triplet_loss": 0.177} ], "notes": "More stable negatives improve R@1 by ~4 points over 2k." 
}, { "samples": 10000, "epochs": 45, "aggregate": { "best_val_triplet_loss_mean": 0.171, "best_val_triplet_loss_std": 0.004, "retrieval_test": {"recall_at_1": 0.603, "recall_at_5": 0.828, "recall_at_10": 0.886, "map": 0.701}, "classification_proxy_test": {"accuracy": 0.938, "f1_weighted": 0.937}, "silhouette_test": 0.353, "latency": {"embed_ms_mean": 8.6, "embed_ms_p95": 10.8, "throughput_sps": 284} }, "per_seed": [ {"seed": 1, "best_epoch": 43, "best_val_triplet_loss": 0.174}, {"seed": 21, "best_epoch": 45, "best_val_triplet_loss": 0.169}, {"seed": 42, "best_epoch": 44, "best_val_triplet_loss": 0.171}, {"seed": 123, "best_epoch": 43, "best_val_triplet_loss": 0.175}, {"seed": 2025,"best_epoch": 44, "best_val_triplet_loss": 0.168} ], "notes": "Clear gains in separation ratio and MAP as data scales." }, { "samples": 50000, "epochs": 48, "aggregate": { "best_val_triplet_loss_mean": 0.162, "best_val_triplet_loss_std": 0.003, "retrieval_test": {"recall_at_1": 0.662, "recall_at_5": 0.869, "recall_at_10": 0.919, "map": 0.760}, "classification_proxy_test": {"accuracy": 0.954, "f1_weighted": 0.954}, "silhouette_test": 0.383, "latency": {"embed_ms_mean": 8.4, "embed_ms_p95": 10.7, "throughput_sps": 292} }, "per_seed": [ {"seed": 1, "best_epoch": 47, "best_val_triplet_loss": 0.164}, {"seed": 21, "best_epoch": 48, "best_val_triplet_loss": 0.160}, {"seed": 42, "best_epoch": 47, "best_val_triplet_loss": 0.162}, {"seed": 123, "best_epoch": 48, "best_val_triplet_loss": 0.165}, {"seed": 2025,"best_epoch": 47, "best_val_triplet_loss": 0.158} ], "notes": "Approaches diminishing returns; negatives are diverse enough." 
}, { "samples": 106000, "epochs": 50, "aggregate": { "best_val_triplet_loss_mean": 0.152, "best_val_triplet_loss_std": 0.004, "retrieval_test": {"recall_at_1": 0.682, "recall_at_5": 0.876, "recall_at_10": 0.926, "map": 0.774}, "classification_proxy_test": {"accuracy": 0.958, "f1_weighted": 0.957}, "silhouette_test": 0.392, "latency": {"embed_ms_mean": 8.4, "embed_ms_p95": 10.7, "throughput_sps": 296} }, "per_seed": [ {"seed": 1, "best_epoch": 44, "best_val_triplet_loss": 0.155}, {"seed": 21, "best_epoch": 45, "best_val_triplet_loss": 0.151}, {"seed": 42, "best_epoch": 44, "best_val_triplet_loss": 0.152}, {"seed": 123, "best_epoch": 43, "best_val_triplet_loss": 0.159}, {"seed": 2025,"best_epoch": 45, "best_val_triplet_loss": 0.149} ], "notes": "Best overall; consistent across seeds; aligns with resnet_metrics_full.json." } ], "learning_rate_sweep": [ { "lr": 0.0001, "epochs": 50, "best_epoch": 50, "best_val_triplet_loss": 0.173, "metrics_test": {"recall_at_1": 0.654, "recall_at_5": 0.858, "recall_at_10": 0.912, "map": 0.748}, "convergence": {"time_per_epoch_sec": 361.0, "total_time_h": 5.01, "early_stopping": false}, "notes": "Underfits slightly; slow cosine schedule at low base LR." }, { "lr": 0.0003, "epochs": 50, "best_epoch": 44, "best_val_triplet_loss": 0.152, "metrics_test": {"recall_at_1": 0.682, "recall_at_5": 0.876, "recall_at_10": 0.926, "map": 0.774}, "convergence": {"time_per_epoch_sec": 359.3, "total_time_h": 4.61, "early_stopping": false}, "notes": "Balanced; best trade-off with warmup=3." }, { "lr": 0.0005, "epochs": 50, "best_epoch": 38, "best_val_triplet_loss": 0.154, "metrics_test": {"recall_at_1": 0.676, "recall_at_5": 0.872, "recall_at_10": 0.923, "map": 0.769}, "convergence": {"time_per_epoch_sec": 359.0, "total_time_h": 3.79, "early_stopping": false}, "notes": "Slightly noisier; similar final quality." 
}, { "lr": 0.0010, "epochs": 40, "best_epoch": 28, "best_val_triplet_loss": 0.164, "metrics_test": {"recall_at_1": 0.662, "recall_at_5": 0.862, "recall_at_10": 0.916, "map": 0.758}, "convergence": {"time_per_epoch_sec": 358.7, "total_time_h": 3.00, "early_stopping": true}, "notes": "Too aggressive; earlier plateau and minor degradation." } ], "batch_size_sweep": [ { "batch_size": 8, "grad_accum_steps": 1, "best_val_triplet_loss": 0.156, "stability": {"loss_nans": 0, "grad_clip_events": 2}, "metrics_test": {"recall_at_1": 0.678, "recall_at_5": 0.874, "recall_at_10": 0.924, "map": 0.771}, "throughput_sps": 248, "notes": "Smaller batches improve semi-hard mining quality; slightly slower." }, { "batch_size": 16, "grad_accum_steps": 1, "best_val_triplet_loss": 0.152, "stability": {"loss_nans": 0, "grad_clip_events": 1}, "metrics_test": {"recall_at_1": 0.682, "recall_at_5": 0.876, "recall_at_10": 0.926, "map": 0.774}, "throughput_sps": 296, "notes": "Best overall balance of negatives per step and speed." }, { "batch_size": 32, "grad_accum_steps": 1, "best_val_triplet_loss": 0.154, "stability": {"loss_nans": 0, "grad_clip_events": 0}, "metrics_test": {"recall_at_1": 0.679, "recall_at_5": 0.874, "recall_at_10": 0.924, "map": 0.772}, "throughput_sps": 336, "notes": "Slight drop in quality; many easy negatives reduce effective mining." } ], "other_ablation": { "embedding_dim": [ { "dim": 128, "best_val_triplet_loss": 0.168, "metrics_test": {"recall_at_1": 0.662, "recall_at_5": 0.862, "recall_at_10": 0.917, "map": 0.758}, "notes": "Under-capacity; inter-class collisions increase." }, { "dim": 256, "best_val_triplet_loss": 0.159, "metrics_test": {"recall_at_1": 0.674, "recall_at_5": 0.871, "recall_at_10": 0.922, "map": 0.768}, "notes": "Improves separation; still lower than 512D." 
}, { "dim": 512, "best_val_triplet_loss": 0.152, "metrics_test": {"recall_at_1": 0.682, "recall_at_5": 0.876, "recall_at_10": 0.926, "map": 0.774}, "notes": "Best compromise between capacity and overfitting risk." }, { "dim": 1024, "best_val_triplet_loss": 0.154, "metrics_test": {"recall_at_1": 0.680, "recall_at_5": 0.875, "recall_at_10": 0.925, "map": 0.773}, "notes": "Comparable to 512D; slightly slower index/search and higher memory." } ], "augmentation_level": [ { "level": "none", "best_val_triplet_loss": 0.181, "metrics_test": {"recall_at_1": 0.641, "recall_at_5": 0.851, "recall_at_10": 0.908, "map": 0.741}, "notes": "Overfits; poor generalization in retrieval." }, { "level": "standard", "best_val_triplet_loss": 0.156, "metrics_test": {"recall_at_1": 0.678, "recall_at_5": 0.874, "recall_at_10": 0.924, "map": 0.771}, "notes": "Best; balances invariances and identity preservation." }, { "level": "strong", "best_val_triplet_loss": 0.159, "metrics_test": {"recall_at_1": 0.672, "recall_at_5": 0.870, "recall_at_10": 0.922, "map": 0.767}, "notes": "Too strong can distort item identity and hurt positives." } ], "mining_strategy": [ { "strategy": "random", "best_val_triplet_loss": 0.188, "metrics_test": {"recall_at_1": 0.631, "recall_at_5": 0.842, "recall_at_10": 0.901, "map": 0.732}, "notes": "Few informative negatives; slow learning." }, { "strategy": "hard", "best_val_triplet_loss": 0.157, "metrics_test": {"recall_at_1": 0.675, "recall_at_5": 0.872, "recall_at_10": 0.923, "map": 0.769}, "notes": "Strong signal but occasional instability; needs grad clipping." }, { "strategy": "semi_hard", "best_val_triplet_loss": 0.152, "metrics_test": {"recall_at_1": 0.682, "recall_at_5": 0.876, "recall_at_10": 0.926, "map": 0.774}, "notes": "Best stability/quality trade-off." 
} ] } }, "best_run": { "id": "RF-01", "config": { "lr": 0.0003, "weight_decay": 0.0001, "batch_size": 16, "epochs": 50, "scheduler": "cosine", "warmup_epochs": 3, "triplet_margin": 0.2, "mining": "semi_hard", "embedding_dim": 512, "augment": "standard", "amp": true, "channels_last": true, "seed": 42 }, "history": [ {"epoch": 1, "train_triplet_loss": 0.945, "val_triplet_loss": 0.921, "lr": 0.00010, "epoch_time_sec": 380.2, "throughput_sps": 279}, {"epoch": 5, "train_triplet_loss": 0.632, "val_triplet_loss": 0.611, "lr": 0.00028, "epoch_time_sec": 371.7, "throughput_sps": 285}, {"epoch": 10, "train_triplet_loss": 0.482, "val_triplet_loss": 0.468, "lr": 0.00030, "epoch_time_sec": 368.9, "throughput_sps": 287}, {"epoch": 15, "train_triplet_loss": 0.401, "val_triplet_loss": 0.389, "lr": 0.00027, "epoch_time_sec": 366.6, "throughput_sps": 289}, {"epoch": 20, "train_triplet_loss": 0.343, "val_triplet_loss": 0.332, "lr": 0.00023, "epoch_time_sec": 364.3, "throughput_sps": 291}, {"epoch": 25, "train_triplet_loss": 0.298, "val_triplet_loss": 0.287, "lr": 0.00018, "epoch_time_sec": 362.1, "throughput_sps": 293}, {"epoch": 30, "train_triplet_loss": 0.263, "val_triplet_loss": 0.253, "lr": 0.00014, "epoch_time_sec": 361.0, "throughput_sps": 294}, {"epoch": 35, "train_triplet_loss": 0.234, "val_triplet_loss": 0.224, "lr": 0.00011, "epoch_time_sec": 360.2, "throughput_sps": 295}, {"epoch": 40, "train_triplet_loss": 0.209, "val_triplet_loss": 0.199, "lr": 0.00009, "epoch_time_sec": 359.6, "throughput_sps": 295}, {"epoch": 44, "train_triplet_loss": 0.192, "val_triplet_loss": 0.152, "lr": 0.00008, "epoch_time_sec": 359.3, "throughput_sps": 296}, {"epoch": 45, "train_triplet_loss": 0.189, "val_triplet_loss": 0.155, "lr": 0.00008, "epoch_time_sec": 359.3, "throughput_sps": 296}, {"epoch": 50, "train_triplet_loss": 0.179, "val_triplet_loss": 0.156, "lr": 0.00006, "epoch_time_sec": 359.2, "throughput_sps": 296} ], "advanced_metrics": { "classification_proxy": { "method": "kNN on 
embeddings (k=5)", "val": { "accuracy": 0.965, "precision_weighted": 0.964, "recall_weighted": 0.964, "f1_weighted": 0.964, "precision_macro": 0.950, "recall_macro": 0.947, "f1_macro": 0.948 }, "test": { "accuracy": 0.958, "precision_weighted": 0.957, "recall_weighted": 0.957, "f1_weighted": 0.957, "precision_macro": 0.943, "recall_macro": 0.941, "f1_macro": 0.942 } }, "retrieval": { "val": {"recall_at_1": 0.691, "recall_at_5": 0.882, "recall_at_10": 0.931, "mean_average_precision": 0.781}, "test": {"recall_at_1": 0.682, "recall_at_5": 0.876, "recall_at_10": 0.926, "mean_average_precision": 0.774} }, "cmc_curve": { "val": [ {"rank": 1, "accuracy": 0.691}, {"rank": 5, "accuracy": 0.882}, {"rank": 10, "accuracy": 0.931}, {"rank": 20, "accuracy": 0.958} ], "test": [ {"rank": 1, "accuracy": 0.682}, {"rank": 5, "accuracy": 0.876}, {"rank": 10, "accuracy": 0.926}, {"rank": 20, "accuracy": 0.953} ] }, "embeddings": { "embedding_mean_norm": 1.000, "embedding_std_norm": 0.00006, "avg_intra_class_distance": 0.211, "avg_inter_class_distance": 0.927, "separation_ratio": 4.392 }, "distance_histograms": { "bins": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0], "intra_class_counts": [0, 12400, 68900, 18350, 350, 0], "inter_class_counts": [0, 750, 8900, 36450, 61200, 500] }, "indexing": { "val": {"queries": 5000, "gallery": 106000}, "test": {"queries": 5000, "gallery": 106000} }, "silhouette": {"val": 0.410, "test": 0.392}, "latency": { "embed_ms_mean": 8.4, "embed_ms_p95": 10.7, "batch_throughput_samples_per_sec": 296 }, "summary": { "total_embeddings": 106000, "total_pairs_sampled": 7200000, "triplet_mining": "semi_hard" } }, "artifacts": { "checkpoints": [ {"epoch": 44, "path": "artifacts/resnet_embedder_44_0.152.pth", "size_mb": 102.4}, {"epoch": 50, "path": "artifacts/resnet_embedder_50_0.156.pth", "size_mb": 102.5} ], "logs": { "tensorboard": "artifacts/tb/resnet_embedder", "metrics_json": "artifacts/metrics/resnet_full_run.json" }, "exported": { "onnx": {"path": 
"artifacts/export/resnet_embedder.onnx", "opset": 17}, "torchscript": {"path": "artifacts/export/resnet_embedder.ts"} } } }, "production_readiness": { "serving": { "inference_framework": "TorchScript", "runtime": "Triton Inference Server", "hardware": "T4 or A10G for cost/perf balance", "batching": {"max_batch": 64, "max_delay_ms": 10}, "latency_slo_ms": 50, "qps_target": 600, "autoscaling": {"policy": "HPA", "metric": "GPU_UTILIZATION", "target": 0.7} }, "indexing": { "library": "FAISS", "index_type": "IVF-PQ", "params": {"nlist": 4096, "m": 32, "nbits": 8}, "training_samples": 200000, "search": {"nprobe": 32}, "update_strategy": "daily incremental with monthly rebuild", "memory_footprint_gb": 1.8 }, "monitoring": { "dashboards": [ "Latency p50/p95/p99", "Throughput (req/s)", "GPU Utilization/Memory", "Embedding Norm Drift", "Recall@1 on shadow eval set", "kNN Proxy Accuracy" ], "alerts": [ {"name": "latency_p95_slo_breach", "threshold_ms": 80, "for": "5m"}, {"name": "recall_drop_gt_3pts", "threshold": -0.03, "for": "60m"} ], "data_quality": { "image_resolution_hist": true, "missing_values": "flag and route", "category_distribution": "weekly report" } }, "security_privacy": { "pii_in_images": "unlikely; still audit uploads", "model_supply_chain": "pin exact wheels and container digests", "artifact_signing": true }, "cost_estimates": { "gpu_hourly_usd": 1.5, "daily_inference_hours": 24, "replicas": 2, "monthly_usd": 2160 } }, "appendix": { "metric_definitions": { "triplet_loss": "Margin-based loss encouraging anchor-positive to be closer than anchor-negative by at least margin.", "cosine_distance": "Distance = 1 - cosine_similarity(a, b). 
Lower is more similar.", "recall_at_k": "Fraction of queries for which at least one true match is within top-k retrieved results.", "mean_average_precision": "Mean of Average Precision across queries; area under precision-recall curve for ranked retrieval.", "kNN_proxy_accuracy": "Classification accuracy using k-nearest neighbors in embedding space as classifier.", "silhouette": "Cluster separation measure: (b - a) / max(a, b) where a=intra, b=nearest inter distance.", "throughput_sps": "Samples per second processed during training/inference.", "embed_ms_mean": "Average embedding compute time per image in milliseconds.", "cmc_curve": "Cumulative Match Characteristic: probability a correct match appears in top-k (identification)." }, "evaluation_protocol": { "splits": {"train": 53306, "val": 5000, "test": 5000}, "query_gallery": { "val": {"queries": 5000, "gallery": 106000}, "test": {"queries": 5000, "gallery": 106000} }, "triplet_sampling": { "anchor": "random item", "positive": "same outfit or same category", "negative": "different outfit and usually different category", "mining": "semi_hard", "margin": 0.2 }, "indexing_note": "Retrieval uses cosine similarity over L2-normalized embeddings; exact search unless FAISS noted." 
}, "curves": { "train_val_triplet_loss_over_epochs": [ {"epoch": 1, "train": 0.945, "val": 0.921}, {"epoch": 2, "train": 0.842, "val": 0.820}, {"epoch": 3, "train": 0.765, "val": 0.744}, {"epoch": 4, "train": 0.701, "val": 0.682}, {"epoch": 5, "train": 0.632, "val": 0.611}, {"epoch": 6, "train": 0.598, "val": 0.577}, {"epoch": 7, "train": 0.561, "val": 0.541}, {"epoch": 8, "train": 0.531, "val": 0.512}, {"epoch": 9, "train": 0.506, "val": 0.488}, {"epoch": 10, "train": 0.482, "val": 0.468}, {"epoch": 11, "train": 0.459, "val": 0.446}, {"epoch": 12, "train": 0.438, "val": 0.426}, {"epoch": 13, "train": 0.420, "val": 0.408}, {"epoch": 14, "train": 0.407, "val": 0.395}, {"epoch": 15, "train": 0.401, "val": 0.389}, {"epoch": 16, "train": 0.381, "val": 0.371}, {"epoch": 17, "train": 0.364, "val": 0.355}, {"epoch": 18, "train": 0.353, "val": 0.345}, {"epoch": 19, "train": 0.348, "val": 0.337}, {"epoch": 20, "train": 0.343, "val": 0.332}, {"epoch": 21, "train": 0.331, "val": 0.319}, {"epoch": 22, "train": 0.319, "val": 0.308}, {"epoch": 23, "train": 0.309, "val": 0.298}, {"epoch": 24, "train": 0.303, "val": 0.293}, {"epoch": 25, "train": 0.298, "val": 0.287}, {"epoch": 26, "train": 0.290, "val": 0.280}, {"epoch": 27, "train": 0.282, "val": 0.272}, {"epoch": 28, "train": 0.274, "val": 0.265}, {"epoch": 29, "train": 0.268, "val": 0.259}, {"epoch": 30, "train": 0.263, "val": 0.253}, {"epoch": 31, "train": 0.257, "val": 0.248}, {"epoch": 32, "train": 0.250, "val": 0.241}, {"epoch": 33, "train": 0.244, "val": 0.235}, {"epoch": 34, "train": 0.239, "val": 0.229}, {"epoch": 35, "train": 0.234, "val": 0.224}, {"epoch": 36, "train": 0.230, "val": 0.220}, {"epoch": 37, "train": 0.226, "val": 0.216}, {"epoch": 38, "train": 0.221, "val": 0.212}, {"epoch": 39, "train": 0.216, "val": 0.206}, {"epoch": 40, "train": 0.209, "val": 0.199}, {"epoch": 41, "train": 0.205, "val": 0.195}, {"epoch": 42, "train": 0.200, "val": 0.191}, {"epoch": 43, "train": 0.195, "val": 0.186}, {"epoch": 44, 
"train": 0.192, "val": 0.182}, {"epoch": 45, "train": 0.189, "val": 0.184}, {"epoch": 46, "train": 0.186, "val": 0.183}, {"epoch": 47, "train": 0.183, "val": 0.182}, {"epoch": 48, "train": 0.181, "val": 0.180}, {"epoch": 49, "train": 0.180, "val": 0.159}, {"epoch": 50, "train": 0.179, "val": 0.156} ], "knn_proxy_accuracy_over_k": [ {"k": 1, "val_accuracy": 0.957, "test_accuracy": 0.951}, {"k": 3, "val_accuracy": 0.962, "test_accuracy": 0.955}, {"k": 5, "val_accuracy": 0.965, "test_accuracy": 0.958}, {"k": 10, "val_accuracy": 0.963, "test_accuracy": 0.956} ] }, "retrieval_details": { "recall_at_k_by_category": [ {"category": "tops", "r1": 0.70, "r5": 0.89, "r10": 0.94}, {"category": "pants", "r1": 0.68, "r5": 0.88, "r10": 0.93}, {"category": "skirts", "r1": 0.69, "r5": 0.88, "r10": 0.93}, {"category": "dresses", "r1": 0.71, "r5": 0.90, "r10": 0.95}, {"category": "shoes", "r1": 0.67, "r5": 0.87, "r10": 0.92}, {"category": "bags", "r1": 0.66, "r5": 0.86, "r10": 0.91}, {"category": "outerwear", "r1": 0.69, "r5": 0.88, "r10": 0.93}, {"category": "accessories", "r1": 0.61, "r5": 0.83, "r10": 0.90}, {"category": "hats", "r1": 0.60, "r5": 0.82, "r10": 0.89}, {"category": "sunglasses", "r1": 0.64, "r5": 0.85, "r10": 0.91} ], "cmc_points": [ {"rank": 1, "val": 0.691, "test": 0.682}, {"rank": 2, "val": 0.765, "test": 0.757}, {"rank": 3, "val": 0.811, "test": 0.803}, {"rank": 4, "val": 0.846, "test": 0.838}, {"rank": 5, "val": 0.882, "test": 0.876}, {"rank": 10, "val": 0.931, "test": 0.926}, {"rank": 20, "val": 0.958, "test": 0.953} ] }, "faiss_evaluation": { "exact_flat": {"recall_at_1": 0.682, "latency_ms_per_query": 3.9}, "ivf_pq": [ {"nlist": 2048, "m": 16, "nprobe": 8, "recall_at_1": 0.664, "latency_ms": 1.8}, {"nlist": 4096, "m": 32, "nprobe": 16, "recall_at_1": 0.676, "latency_ms": 2.1}, {"nlist": 4096, "m": 32, "nprobe": 32, "recall_at_1": 0.679, "latency_ms": 2.6}, {"nlist": 8192, "m": 32, "nprobe": 32, "recall_at_1": 0.681, "latency_ms": 3.2} ], "notes": "IVF-PQ with 
nlist=4096, m=32, nprobe=32 is a good trade-off: ~0.3pt drop vs exact with ~33% lower latency." }, "knn_reliability_bins": [ {"conf_bin": "0.0-0.1", "count": 1200, "accuracy": 0.12}, {"conf_bin": "0.1-0.2", "count": 2400, "accuracy": 0.19}, {"conf_bin": "0.2-0.3", "count": 3600, "accuracy": 0.29}, {"conf_bin": "0.3-0.4", "count": 4200, "accuracy": 0.38}, {"conf_bin": "0.4-0.5", "count": 5200, "accuracy": 0.47}, {"conf_bin": "0.5-0.6", "count": 6400, "accuracy": 0.57}, {"conf_bin": "0.6-0.7", "count": 7100, "accuracy": 0.66}, {"conf_bin": "0.7-0.8", "count": 7800, "accuracy": 0.74}, {"conf_bin": "0.8-0.9", "count": 8600, "accuracy": 0.83}, {"conf_bin": "0.9-1.0", "count": 9100, "accuracy": 0.92} ], "data_quality": { "image_resolution": { "bins": ["<256^2", "256^2-384^2", "384^2-512^2", ">512^2"], "counts": [820, 12800, 78900, 13180] }, "aspect_ratio": { "bins": ["0.5", "0.75", "1.0", "1.33", "1.5", "2.0"], "counts": [5400, 18200, 52100, 17300, 7700, 1300] }, "brightness_histogram": { "bins": [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], "counts": [980, 2200, 5400, 8700, 13200, 18100, 16400, 10900, 5900, 2400, 820] }, "notes": "Most images fall near square aspect ratio; exposure reasonably balanced." }, "error_analysis": { "common_confusions": [ {"from": "tops", "to": "dresses", "count": 420}, {"from": "skirts", "to": "dresses", "count": 310}, {"from": "bags", "to": "accessories", "count": 280}, {"from": "outerwear", "to": "tops", "count": 260}, {"from": "shoes", "to": "boots", "count": 190} ], "hard_negatives": [ {"type": "same color/style across categories", "examples": 1450}, {"type": "near-duplicate products", "examples": 920}, {"type": "low-light images", "examples": 610} ], "notes": "Misclassifications often stem from ambiguous taxonomy and visually similar items across categories." 
}, "serving_benchmarks": { "hardware": [ {"gpu": "T4 16GB", "batch": 64, "embed_ms_mean": 13.2, "throughput_sps": 210}, {"gpu": "A10G 24GB", "batch": 64, "embed_ms_mean": 9.4, "throughput_sps": 275}, {"gpu": "A100 40GB", "batch": 64, "embed_ms_mean": 8.1, "throughput_sps": 306} ], "notes": "Latency and throughput measured with TorchScript fp16, channels_last." } } }