Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/out.log +195 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/params.txt +103 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/out.log +195 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/params.txt +103 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "cifar100", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7589, "acc5": 0.9387, "mean_per_class_recall": 0.7590999999999998}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "country211", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.1975829383886256, "acc5": 0.4209952606635071, "mean_per_class_recall": 0.19763033175355452}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "dtd", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5484042553191489, "acc5": 0.8191489361702128, "mean_per_class_recall": 0.547872340425532}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "flickr30k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.656000018119812, "text_retrieval_recall@1": 0.8009999990463257, "image_retrieval_recall@5": 0.8655999898910522, "text_retrieval_recall@5": 0.9539999961853027, "image_retrieval_recall@10": 0.9193999767303467, "text_retrieval_recall@10": 0.9760000109672546}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "flowers", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7487396324605627, "acc5": 0.897869572288177, "mean_per_class_recall": 0.7396028874215905}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "food101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8691485148514851, "acc5": 0.980039603960396, "mean_per_class_recall": 0.8692277227722772}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "gtsrb", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.510609659540776, "acc5": 0.7530482977038796, "mean_per_class_recall": 0.4616824052066722}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "imagenet1k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.68688, "acc5": 0.90938, "mean_per_class_recall": 0.6868800000000002}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "pets", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8947942218588171, "acc5": 0.9964568002180431, "mean_per_class_recall": 0.8933304274009377}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "sun397", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6720764293727127, "acc5": 0.928158964267981, "mean_per_class_recall": 0.6638586591957565}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "vtab/resisc45", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6063492063492063, "acc5": 0.9077777777777778, "mean_per_class_recall": 0.6129633195377677}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/out.log
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-04-26,19:20:36 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
|
| 2 |
+
2025-04-26,19:20:36 | INFO | Loaded ViT-B-16 model config.
|
| 3 |
+
2025-04-26,19:20:37 | INFO | Loading pretrained ViT-B-16 weights (datacomp_xl_s13b_b90k).
|
| 4 |
+
2025-04-26,19:20:38 | INFO | Model:
|
| 5 |
+
2025-04-26,19:20:38 | INFO | CLIP(
|
| 6 |
+
(visual): VisionTransformer(
|
| 7 |
+
(conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
|
| 8 |
+
(patch_dropout): Identity()
|
| 9 |
+
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 10 |
+
(transformer): Transformer(
|
| 11 |
+
(resblocks): ModuleList(
|
| 12 |
+
(0-11): 12 x ResidualAttentionBlock(
|
| 13 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 14 |
+
(attn): MultiheadAttention(
|
| 15 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 16 |
+
)
|
| 17 |
+
(ls_1): Identity()
|
| 18 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 19 |
+
(mlp): Sequential(
|
| 20 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 21 |
+
(gelu): GELU(approximate='none')
|
| 22 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 23 |
+
)
|
| 24 |
+
(ls_2): Identity()
|
| 25 |
+
)
|
| 26 |
+
)
|
| 27 |
+
)
|
| 28 |
+
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 29 |
+
)
|
| 30 |
+
(transformer): Transformer(
|
| 31 |
+
(resblocks): ModuleList(
|
| 32 |
+
(0-11): 12 x ResidualAttentionBlock(
|
| 33 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 34 |
+
(attn): MultiheadAttention(
|
| 35 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 36 |
+
)
|
| 37 |
+
(ls_1): Identity()
|
| 38 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 39 |
+
(mlp): Sequential(
|
| 40 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 41 |
+
(gelu): GELU(approximate='none')
|
| 42 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 43 |
+
)
|
| 44 |
+
(ls_2): Identity()
|
| 45 |
+
)
|
| 46 |
+
)
|
| 47 |
+
)
|
| 48 |
+
(token_embedding): Embedding(49408, 512)
|
| 49 |
+
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 50 |
+
)
|
| 51 |
+
2025-04-26,19:20:38 | INFO | Params:
|
| 52 |
+
2025-04-26,19:20:38 | INFO | accum_freq: 2
|
| 53 |
+
2025-04-26,19:20:38 | INFO | aug_cfg: {}
|
| 54 |
+
2025-04-26,19:20:38 | INFO | batch_size: 2048
|
| 55 |
+
2025-04-26,19:20:38 | INFO | beta1: 0.9
|
| 56 |
+
2025-04-26,19:20:38 | INFO | beta2: 0.98
|
| 57 |
+
2025-04-26,19:20:38 | INFO | cache_dir: None
|
| 58 |
+
2025-04-26,19:20:38 | INFO | caption_ratio: 0.1
|
| 59 |
+
2025-04-26,19:20:38 | INFO | checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints
|
| 60 |
+
2025-04-26,19:20:38 | INFO | coca_caption_loss_weight: 2.0
|
| 61 |
+
2025-04-26,19:20:38 | INFO | coca_contrastive_loss_weight: 1.0
|
| 62 |
+
2025-04-26,19:20:38 | INFO | copy_codebase: False
|
| 63 |
+
2025-04-26,19:20:38 | INFO | csv_caption_key: title
|
| 64 |
+
2025-04-26,19:20:38 | INFO | csv_img_key: filepath
|
| 65 |
+
2025-04-26,19:20:38 | INFO | csv_separator:
|
| 66 |
+
2025-04-26,19:20:38 | INFO | dataset_resampled: False
|
| 67 |
+
2025-04-26,19:20:38 | INFO | dataset_type: synthetic
|
| 68 |
+
2025-04-26,19:20:38 | INFO | ddp_static_graph: False
|
| 69 |
+
2025-04-26,19:20:38 | INFO | debug: False
|
| 70 |
+
2025-04-26,19:20:38 | INFO | delete_previous_checkpoint: False
|
| 71 |
+
2025-04-26,19:20:38 | INFO | device: cuda:0
|
| 72 |
+
2025-04-26,19:20:38 | INFO | dist_backend: None
|
| 73 |
+
2025-04-26,19:20:38 | INFO | dist_url: None
|
| 74 |
+
2025-04-26,19:20:38 | INFO | distill: False
|
| 75 |
+
2025-04-26,19:20:38 | INFO | distill_model: None
|
| 76 |
+
2025-04-26,19:20:38 | INFO | distill_pretrained: None
|
| 77 |
+
2025-04-26,19:20:38 | INFO | distributed: True
|
| 78 |
+
2025-04-26,19:20:38 | INFO | epochs: 10
|
| 79 |
+
2025-04-26,19:20:38 | INFO | epochs_cooldown: None
|
| 80 |
+
2025-04-26,19:20:38 | INFO | eps: 1e-08
|
| 81 |
+
2025-04-26,19:20:38 | INFO | force_custom_text: False
|
| 82 |
+
2025-04-26,19:20:38 | INFO | force_image_size: None
|
| 83 |
+
2025-04-26,19:20:38 | INFO | force_patch_dropout: None
|
| 84 |
+
2025-04-26,19:20:38 | INFO | force_quick_gelu: False
|
| 85 |
+
2025-04-26,19:20:38 | INFO | gather_with_grad: True
|
| 86 |
+
2025-04-26,19:20:38 | INFO | grad_checkpointing: True
|
| 87 |
+
2025-04-26,19:20:38 | INFO | grad_clip_norm: None
|
| 88 |
+
2025-04-26,19:20:38 | INFO | horovod: False
|
| 89 |
+
2025-04-26,19:20:38 | INFO | image_interpolation: None
|
| 90 |
+
2025-04-26,19:20:38 | INFO | image_mean: None
|
| 91 |
+
2025-04-26,19:20:38 | INFO | image_resize_mode: None
|
| 92 |
+
2025-04-26,19:20:38 | INFO | image_std: None
|
| 93 |
+
2025-04-26,19:20:38 | INFO | imagenet_v2: None
|
| 94 |
+
2025-04-26,19:20:38 | INFO | imagenet_val: None
|
| 95 |
+
2025-04-26,19:20:38 | INFO | keep_func_name: keep_image_farest
|
| 96 |
+
2025-04-26,19:20:38 | INFO | local_loss: False
|
| 97 |
+
2025-04-26,19:20:38 | INFO | local_rank: 0
|
| 98 |
+
2025-04-26,19:20:38 | INFO | lock_image: False
|
| 99 |
+
2025-04-26,19:20:38 | INFO | lock_image_freeze_bn_stats: False
|
| 100 |
+
2025-04-26,19:20:38 | INFO | lock_image_unlocked_groups: 0
|
| 101 |
+
2025-04-26,19:20:38 | INFO | lock_text: True
|
| 102 |
+
2025-04-26,19:20:38 | INFO | lock_text_freeze_layer_norm: False
|
| 103 |
+
2025-04-26,19:20:38 | INFO | lock_text_unlocked_layers: 0
|
| 104 |
+
2025-04-26,19:20:38 | INFO | log_every_n_steps: 100
|
| 105 |
+
2025-04-26,19:20:38 | INFO | log_level: 20
|
| 106 |
+
2025-04-26,19:20:38 | INFO | log_local: False
|
| 107 |
+
2025-04-26,19:20:38 | INFO | log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/out.log
|
| 108 |
+
2025-04-26,19:20:38 | INFO | logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text
|
| 109 |
+
2025-04-26,19:20:38 | INFO | loss_dist_impl: None
|
| 110 |
+
2025-04-26,19:20:38 | INFO | lr: 4e-05
|
| 111 |
+
2025-04-26,19:20:38 | INFO | lr_cooldown_end: 0.0
|
| 112 |
+
2025-04-26,19:20:38 | INFO | lr_cooldown_power: 1.0
|
| 113 |
+
2025-04-26,19:20:38 | INFO | lr_scheduler: cosine
|
| 114 |
+
2025-04-26,19:20:38 | INFO | map_func_name: use_all
|
| 115 |
+
2025-04-26,19:20:38 | INFO | model: ViT-B-16
|
| 116 |
+
2025-04-26,19:20:38 | INFO | momentum: None
|
| 117 |
+
2025-04-26,19:20:38 | INFO | name: keep_image_farest
|
| 118 |
+
2025-04-26,19:20:38 | INFO | no_set_device_rank: False
|
| 119 |
+
2025-04-26,19:20:38 | INFO | opt: adamw
|
| 120 |
+
2025-04-26,19:20:38 | INFO | precision: amp
|
| 121 |
+
2025-04-26,19:20:38 | INFO | pretrained: datacomp_xl_s13b_b90k
|
| 122 |
+
2025-04-26,19:20:38 | INFO | pretrained_image: False
|
| 123 |
+
2025-04-26,19:20:38 | INFO | rank: 0
|
| 124 |
+
2025-04-26,19:20:38 | INFO | remote_sync: None
|
| 125 |
+
2025-04-26,19:20:38 | INFO | remote_sync_frequency: 300
|
| 126 |
+
2025-04-26,19:20:38 | INFO | remote_sync_protocol: s3
|
| 127 |
+
2025-04-26,19:20:38 | INFO | report_to: tensorboard,wandb
|
| 128 |
+
2025-04-26,19:20:38 | INFO | resume: None
|
| 129 |
+
2025-04-26,19:20:38 | INFO | save_frequency: 10
|
| 130 |
+
2025-04-26,19:20:38 | INFO | save_most_recent: False
|
| 131 |
+
2025-04-26,19:20:38 | INFO | seed: 0
|
| 132 |
+
2025-04-26,19:20:38 | INFO | siglip: False
|
| 133 |
+
2025-04-26,19:20:38 | INFO | skip_scheduler: False
|
| 134 |
+
2025-04-26,19:20:38 | INFO | tensorboard: True
|
| 135 |
+
2025-04-26,19:20:38 | INFO | tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/tensorboard
|
| 136 |
+
2025-04-26,19:20:38 | INFO | torchcompile: False
|
| 137 |
+
2025-04-26,19:20:38 | INFO | torchscript: False
|
| 138 |
+
2025-04-26,19:20:38 | INFO | trace: False
|
| 139 |
+
2025-04-26,19:20:38 | INFO | train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar
|
| 140 |
+
2025-04-26,19:20:38 | INFO | train_data_upsampling_factors: None
|
| 141 |
+
2025-04-26,19:20:38 | INFO | train_num_samples: 9011874
|
| 142 |
+
2025-04-26,19:20:38 | INFO | use_bn_sync: False
|
| 143 |
+
2025-04-26,19:20:38 | INFO | use_bnb_linear: None
|
| 144 |
+
2025-04-26,19:20:38 | INFO | val_data: None
|
| 145 |
+
2025-04-26,19:20:38 | INFO | val_frequency: 1
|
| 146 |
+
2025-04-26,19:20:38 | INFO | val_num_samples: None
|
| 147 |
+
2025-04-26,19:20:38 | INFO | wandb: True
|
| 148 |
+
2025-04-26,19:20:38 | INFO | wandb_notes:
|
| 149 |
+
2025-04-26,19:20:38 | INFO | wandb_project_name: open-clip
|
| 150 |
+
2025-04-26,19:20:38 | INFO | warmup: 110
|
| 151 |
+
2025-04-26,19:20:38 | INFO | wd: 0.5
|
| 152 |
+
2025-04-26,19:20:38 | INFO | workers: 16
|
| 153 |
+
2025-04-26,19:20:38 | INFO | world_size: 2
|
| 154 |
+
2025-04-26,19:20:38 | INFO | zeroshot_frequency: 2
|
| 155 |
+
2025-04-26,19:20:39 | INFO | Created AdamW (adamw) optimizer: lr: 4e-05, betas: (0.9, 0.98), eps: 1e-08, weight_decay: 0.5, amsgrad: False, foreach: None, maximize: False, capturable: False, differentiable: False, fused: None
|
| 156 |
+
2025-04-26,19:20:57 | INFO | Start epoch 0
|
| 157 |
+
2025-04-26,19:21:51 | INFO | Train Epoch: 0 [ 8192/917504 (1%)] Data (t): 44.342 Batch (t): 53.918, 151.934/s, 75.9669/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 28.554 (28.554) Imm_text: 28.554 (28.554) Isd_image: 3.2214 (3.2214) Isd_text: 3.2214 (3.2214) Contrastive_loss: 1.4326 (1.4326) Loss: 1.4326 (1.4326)
|
| 158 |
+
2025-04-26,19:36:52 | INFO | Train Epoch: 0 [827392/917504 (90%)] Data (t): 0.998 Batch (t): 9.004, 917.588/s, 458.794/s/gpu LR: 0.000037 Logit Scale: 99.933 Imm_image: 28.962 (28.758) Imm_text: 28.962 (28.758) Isd_image: 5.8420 (4.5317) Isd_text: 5.8420 (4.5317) Contrastive_loss: 0.73414 (1.0834) Loss: 0.73414 (1.0834)
|
| 159 |
+
2025-04-26,19:38:30 | INFO | Train Epoch: 0 [917504/917504 (100%)] Data (t): 0.953 Batch (t): 8.939, 913.115/s, 456.558/s/gpu LR: 0.000040 Logit Scale: 99.924 Imm_image: 28.947 (28.821) Imm_text: 28.947 (28.821) Isd_image: 5.6367 (4.9000) Isd_text: 5.6367 (4.9000) Contrastive_loss: 0.69624 (0.95433) Loss: 0.69624 (0.95433)
|
| 160 |
+
2025-04-26,19:38:30 | INFO | Start epoch 1
|
| 161 |
+
2025-04-26,19:39:13 | INFO | Train Epoch: 1 [ 8192/917504 (1%)] Data (t): 33.564 Batch (t): 43.142, 189.886/s, 94.9431/s/gpu LR: 0.000040 Logit Scale: 99.923 Imm_image: 29.097 (29.097) Imm_text: 29.097 (29.097) Isd_image: 5.5602 (5.5602) Isd_text: 5.5602 (5.5602) Contrastive_loss: 0.60997 (0.60997) Loss: 0.60997 (0.60997)
|
| 162 |
+
2025-04-26,19:54:15 | INFO | Train Epoch: 1 [827392/917504 (90%)] Data (t): 1.005 Batch (t): 9.013, 917.390/s, 458.695/s/gpu LR: 0.000039 Logit Scale: 99.881 Imm_image: 28.927 (29.012) Imm_text: 28.927 (29.012) Isd_image: 4.1130 (4.8366) Isd_text: 4.1130 (4.8366) Contrastive_loss: 0.62246 (0.61621) Loss: 0.62246 (0.61621)
|
| 163 |
+
2025-04-26,19:55:53 | INFO | Train Epoch: 1 [917504/917504 (100%)] Data (t): 0.953 Batch (t): 8.938, 915.751/s, 457.875/s/gpu LR: 0.000039 Logit Scale: 99.881 Imm_image: 29.309 (29.111) Imm_text: 29.309 (29.111) Isd_image: 3.7556 (4.4762) Isd_text: 3.7556 (4.4762) Contrastive_loss: 0.49672 (0.57638) Loss: 0.49672 (0.57638)
|
| 164 |
+
2025-04-26,19:55:53 | INFO | Start epoch 2
|
| 165 |
+
2025-04-26,19:56:39 | INFO | Train Epoch: 2 [ 8192/917504 (1%)] Data (t): 35.471 Batch (t): 45.796, 178.882/s, 89.4409/s/gpu LR: 0.000039 Logit Scale: 99.882 Imm_image: 29.290 (29.290) Imm_text: 29.290 (29.290) Isd_image: 3.6671 (3.6671) Isd_text: 3.6671 (3.6671) Contrastive_loss: 0.49962 (0.49962) Loss: 0.49962 (0.49962)
|
| 166 |
+
2025-04-26,20:11:40 | INFO | Train Epoch: 2 [827392/917504 (90%)] Data (t): 0.999 Batch (t): 9.008, 913.482/s, 456.741/s/gpu LR: 0.000036 Logit Scale: 100.000 Imm_image: 29.430 (29.360) Imm_text: 29.430 (29.360) Isd_image: 2.6187 (3.1429) Isd_text: 2.6187 (3.1429) Contrastive_loss: 0.45664 (0.47813) Loss: 0.45664 (0.47813)
|
| 167 |
+
2025-04-26,20:13:18 | INFO | Train Epoch: 2 [917504/917504 (100%)] Data (t): 0.958 Batch (t): 8.943, 914.170/s, 457.085/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.466 (29.396) Imm_text: 29.466 (29.396) Isd_image: 2.6329 (2.9729) Isd_text: 2.6329 (2.9729) Contrastive_loss: 0.41372 (0.45666) Loss: 0.41372 (0.45666)
|
| 168 |
+
2025-04-26,20:13:18 | INFO | Start epoch 3
|
| 169 |
+
2025-04-26,20:14:03 | INFO | Train Epoch: 3 [ 8192/917504 (1%)] Data (t): 36.546 Batch (t): 44.617, 183.607/s, 91.8035/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.504 (29.504) Imm_text: 29.504 (29.504) Isd_image: 2.5284 (2.5284) Isd_text: 2.5284 (2.5284) Contrastive_loss: 0.39021 (0.39021) Loss: 0.39021 (0.39021)
|
| 170 |
+
2025-04-26,20:29:06 | INFO | Train Epoch: 3 [827392/917504 (90%)] Data (t): 1.015 Batch (t): 9.027, 917.585/s, 458.792/s/gpu LR: 0.000031 Logit Scale: 100.000 Imm_image: 29.626 (29.565) Imm_text: 29.626 (29.565) Isd_image: 1.6747 (2.1016) Isd_text: 1.6747 (2.1016) Contrastive_loss: 0.41409 (0.40215) Loss: 0.41409 (0.40215)
|
| 171 |
+
2025-04-26,20:30:44 | INFO | Train Epoch: 3 [917504/917504 (100%)] Data (t): 0.962 Batch (t): 8.949, 915.455/s, 457.728/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.619 (29.583) Imm_text: 29.619 (29.583) Isd_image: 1.6334 (1.9455) Isd_text: 1.6334 (1.9455) Contrastive_loss: 0.36740 (0.39056) Loss: 0.36740 (0.39056)
|
| 172 |
+
2025-04-26,20:30:44 | INFO | Start epoch 4
|
| 173 |
+
2025-04-26,20:31:29 | INFO | Train Epoch: 4 [ 8192/917504 (1%)] Data (t): 35.128 Batch (t): 44.257, 185.102/s, 92.5508/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.757 (29.757) Imm_text: 29.757 (29.757) Isd_image: 1.5362 (1.5362) Isd_text: 1.5362 (1.5362) Contrastive_loss: 0.33672 (0.33672) Loss: 0.33672 (0.33672)
|
| 174 |
+
2025-04-26,20:46:29 | INFO | Train Epoch: 4 [827392/917504 (90%)] Data (t): 0.997 Batch (t): 9.004, 917.946/s, 458.973/s/gpu LR: 0.000024 Logit Scale: 100.000 Imm_image: 29.850 (29.804) Imm_text: 29.850 (29.804) Isd_image: 1.1166 (1.3264) Isd_text: 1.1166 (1.3264) Contrastive_loss: 0.33999 (0.33835) Loss: 0.33999 (0.33835)
|
| 175 |
+
2025-04-26,20:48:07 | INFO | Train Epoch: 4 [917504/917504 (100%)] Data (t): 0.954 Batch (t): 8.939, 915.946/s, 457.973/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.934 (29.847) Imm_text: 29.934 (29.847) Isd_image: 1.0718 (1.2415) Isd_text: 1.0718 (1.2415) Contrastive_loss: 0.28950 (0.32207) Loss: 0.28950 (0.32207)
|
| 176 |
+
2025-04-26,20:48:08 | INFO | Start epoch 5
|
| 177 |
+
2025-04-26,20:48:52 | INFO | Train Epoch: 5 [ 8192/917504 (1%)] Data (t): 36.121 Batch (t): 44.476, 184.189/s, 92.0946/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.804 (29.804) Imm_text: 29.804 (29.804) Isd_image: 1.1836 (1.1836) Isd_text: 1.1836 (1.1836) Contrastive_loss: 0.33592 (0.33592) Loss: 0.33592 (0.33592)
|
| 178 |
+
2025-04-26,21:03:54 | INFO | Train Epoch: 5 [827392/917504 (90%)] Data (t): 1.010 Batch (t): 9.022, 916.297/s, 458.148/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 30.026 (29.915) Imm_text: 30.026 (29.915) Isd_image: 0.84884 (1.0162) Isd_text: 0.84884 (1.0162) Contrastive_loss: 0.26665 (0.30128) Loss: 0.26665 (0.30128)
|
| 179 |
+
2025-04-26,21:05:33 | INFO | Train Epoch: 5 [917504/917504 (100%)] Data (t): 0.952 Batch (t): 8.936, 915.004/s, 457.502/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 29.985 (29.938) Imm_text: 29.985 (29.938) Isd_image: 0.86452 (0.96566) Isd_text: 0.86452 (0.96566) Contrastive_loss: 0.26700 (0.28985) Loss: 0.26700 (0.28985)
|
| 180 |
+
2025-04-26,21:05:33 | INFO | Start epoch 6
|
| 181 |
+
2025-04-26,21:06:18 | INFO | Train Epoch: 6 [ 8192/917504 (1%)] Data (t): 36.805 Batch (t): 44.934, 182.311/s, 91.1555/s/gpu LR: 0.000016 Logit Scale: 100.000 Imm_image: 30.204 (30.204) Imm_text: 30.204 (30.204) Isd_image: 0.70070 (0.70070) Isd_text: 0.70070 (0.70070) Contrastive_loss: 0.23083 (0.23083) Loss: 0.23083 (0.23083)
|
| 182 |
+
2025-04-26,21:21:18 | INFO | Train Epoch: 6 [827392/917504 (90%)] Data (t): 0.999 Batch (t): 9.007, 917.786/s, 458.893/s/gpu LR: 0.000011 Logit Scale: 100.000 Imm_image: 30.193 (30.199) Imm_text: 30.193 (30.199) Isd_image: 0.75573 (0.72822) Isd_text: 0.75573 (0.72822) Contrastive_loss: 0.24233 (0.23658) Loss: 0.24233 (0.23658)
|
| 183 |
+
2025-04-26,21:22:57 | INFO | Train Epoch: 6 [917504/917504 (100%)] Data (t): 0.954 Batch (t): 8.940, 916.499/s, 458.250/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.249 (30.216) Imm_text: 30.249 (30.216) Isd_image: 0.70645 (0.72096) Isd_text: 0.70645 (0.72096) Contrastive_loss: 0.21447 (0.22921) Loss: 0.21447 (0.22921)
|
| 184 |
+
2025-04-26,21:22:57 | INFO | Start epoch 7
|
| 185 |
+
2025-04-26,21:23:42 | INFO | Train Epoch: 7 [ 8192/917504 (1%)] Data (t): 36.516 Batch (t): 44.643, 183.500/s, 91.7499/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.320 (30.320) Imm_text: 30.320 (30.320) Isd_image: 0.57832 (0.57832) Isd_text: 0.57832 (0.57832) Contrastive_loss: 0.22137 (0.22137) Loss: 0.22137 (0.22137)
|
| 186 |
+
2025-04-26,21:38:42 | INFO | Train Epoch: 7 [827392/917504 (90%)] Data (t): 0.998 Batch (t): 9.006, 915.734/s, 457.867/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.290 (30.305) Imm_text: 30.290 (30.305) Isd_image: 0.58627 (0.58230) Isd_text: 0.58627 (0.58230) Contrastive_loss: 0.24158 (0.23147) Loss: 0.24158 (0.23147)
|
| 187 |
+
2025-04-26,21:40:21 | INFO | Train Epoch: 7 [917504/917504 (100%)] Data (t): 0.959 Batch (t): 8.944, 914.296/s, 457.148/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.336 (30.315) Imm_text: 30.336 (30.315) Isd_image: 0.57076 (0.57845) Isd_text: 0.57076 (0.57845) Contrastive_loss: 0.20793 (0.22363) Loss: 0.20793 (0.22363)
|
| 188 |
+
2025-04-26,21:40:21 | INFO | Start epoch 8
|
| 189 |
+
2025-04-26,21:41:05 | INFO | Train Epoch: 8 [ 8192/917504 (1%)] Data (t): 34.989 Batch (t): 44.032, 186.047/s, 93.0235/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.384 (30.384) Imm_text: 30.384 (30.384) Isd_image: 0.50557 (0.50557) Isd_text: 0.50557 (0.50557) Contrastive_loss: 0.19039 (0.19039) Loss: 0.19039 (0.19039)
|
| 190 |
+
2025-04-26,21:56:10 | INFO | Train Epoch: 8 [827392/917504 (90%)] Data (t): 1.028 Batch (t): 9.047, 913.226/s, 456.613/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.238 (30.311) Imm_text: 30.238 (30.311) Isd_image: 0.65661 (0.58109) Isd_text: 0.65661 (0.58109) Contrastive_loss: 0.24421 (0.21730) Loss: 0.24421 (0.21730)
|
| 191 |
+
2025-04-26,21:57:48 | INFO | Train Epoch: 8 [917504/917504 (100%)] Data (t): 0.951 Batch (t): 8.934, 914.753/s, 457.376/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.413 (30.345) Imm_text: 30.413 (30.345) Isd_image: 0.54766 (0.56995) Isd_text: 0.54766 (0.56995) Contrastive_loss: 0.19143 (0.20867) Loss: 0.19143 (0.20867)
|
| 192 |
+
2025-04-26,21:57:48 | INFO | Start epoch 9
|
| 193 |
+
2025-04-26,21:58:33 | INFO | Train Epoch: 9 [ 8192/917504 (1%)] Data (t): 36.704 Batch (t): 44.784, 182.922/s, 91.4612/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.425 (30.425) Imm_text: 30.425 (30.425) Isd_image: 0.52132 (0.52132) Isd_text: 0.52132 (0.52132) Contrastive_loss: 0.20011 (0.20011) Loss: 0.20011 (0.20011)
|
| 194 |
+
2025-04-26,22:13:32 | INFO | Train Epoch: 9 [827392/917504 (90%)] Data (t): 0.981 Batch (t): 8.986, 916.852/s, 458.426/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.419 (30.422) Imm_text: 30.419 (30.422) Isd_image: 0.57758 (0.54945) Isd_text: 0.57758 (0.54945) Contrastive_loss: 0.21143 (0.20577) Loss: 0.21143 (0.20577)
|
| 195 |
+
2025-04-26,22:15:10 | INFO | Train Epoch: 9 [917504/917504 (100%)] Data (t): 0.936 Batch (t): 8.913, 916.256/s, 458.128/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.455 (30.433) Imm_text: 30.455 (30.433) Isd_image: 0.51568 (0.53820) Isd_text: 0.51568 (0.53820) Contrastive_loss: 0.18257 (0.19804) Loss: 0.18257 (0.19804)
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/params.txt
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accum_freq: 2
|
| 2 |
+
aug_cfg: {}
|
| 3 |
+
batch_size: 2048
|
| 4 |
+
beta1: 0.9
|
| 5 |
+
beta2: 0.98
|
| 6 |
+
cache_dir: None
|
| 7 |
+
caption_ratio: 0.1
|
| 8 |
+
checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints
|
| 9 |
+
coca_caption_loss_weight: 2.0
|
| 10 |
+
coca_contrastive_loss_weight: 1.0
|
| 11 |
+
copy_codebase: False
|
| 12 |
+
csv_caption_key: title
|
| 13 |
+
csv_img_key: filepath
|
| 14 |
+
csv_separator:
|
| 15 |
+
dataset_resampled: False
|
| 16 |
+
dataset_type: synthetic
|
| 17 |
+
ddp_static_graph: False
|
| 18 |
+
debug: False
|
| 19 |
+
delete_previous_checkpoint: False
|
| 20 |
+
device: cuda:0
|
| 21 |
+
dist_backend: None
|
| 22 |
+
dist_url: None
|
| 23 |
+
distill: False
|
| 24 |
+
distill_model: None
|
| 25 |
+
distill_pretrained: None
|
| 26 |
+
distributed: True
|
| 27 |
+
epochs: 10
|
| 28 |
+
epochs_cooldown: None
|
| 29 |
+
eps: 1e-08
|
| 30 |
+
force_custom_text: False
|
| 31 |
+
force_image_size: None
|
| 32 |
+
force_patch_dropout: None
|
| 33 |
+
force_quick_gelu: False
|
| 34 |
+
gather_with_grad: True
|
| 35 |
+
grad_checkpointing: True
|
| 36 |
+
grad_clip_norm: None
|
| 37 |
+
horovod: False
|
| 38 |
+
image_interpolation: None
|
| 39 |
+
image_mean: None
|
| 40 |
+
image_resize_mode: None
|
| 41 |
+
image_std: None
|
| 42 |
+
imagenet_v2: None
|
| 43 |
+
imagenet_val: None
|
| 44 |
+
keep_func_name: keep_image_farest
|
| 45 |
+
local_loss: False
|
| 46 |
+
local_rank: 0
|
| 47 |
+
lock_image: False
|
| 48 |
+
lock_image_freeze_bn_stats: False
|
| 49 |
+
lock_image_unlocked_groups: 0
|
| 50 |
+
lock_text: True
|
| 51 |
+
lock_text_freeze_layer_norm: False
|
| 52 |
+
lock_text_unlocked_layers: 0
|
| 53 |
+
log_every_n_steps: 100
|
| 54 |
+
log_level: 20
|
| 55 |
+
log_local: False
|
| 56 |
+
log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/out.log
|
| 57 |
+
logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text
|
| 58 |
+
loss_dist_impl: None
|
| 59 |
+
lr: 4e-05
|
| 60 |
+
lr_cooldown_end: 0.0
|
| 61 |
+
lr_cooldown_power: 1.0
|
| 62 |
+
lr_scheduler: cosine
|
| 63 |
+
map_func_name: use_all
|
| 64 |
+
model: ViT-B-16
|
| 65 |
+
momentum: None
|
| 66 |
+
name: keep_image_farest
|
| 67 |
+
no_set_device_rank: False
|
| 68 |
+
opt: adamw
|
| 69 |
+
precision: amp
|
| 70 |
+
pretrained: datacomp_xl_s13b_b90k
|
| 71 |
+
pretrained_image: False
|
| 72 |
+
rank: 0
|
| 73 |
+
remote_sync: None
|
| 74 |
+
remote_sync_frequency: 300
|
| 75 |
+
remote_sync_protocol: s3
|
| 76 |
+
report_to: tensorboard,wandb
|
| 77 |
+
resume: None
|
| 78 |
+
save_frequency: 10
|
| 79 |
+
save_most_recent: False
|
| 80 |
+
seed: 0
|
| 81 |
+
siglip: False
|
| 82 |
+
skip_scheduler: False
|
| 83 |
+
tensorboard: True
|
| 84 |
+
tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/tensorboard
|
| 85 |
+
torchcompile: False
|
| 86 |
+
torchscript: False
|
| 87 |
+
trace: False
|
| 88 |
+
train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar
|
| 89 |
+
train_data_upsampling_factors: None
|
| 90 |
+
train_num_samples: 9011874
|
| 91 |
+
use_bn_sync: False
|
| 92 |
+
use_bnb_linear: None
|
| 93 |
+
val_data: None
|
| 94 |
+
val_frequency: 1
|
| 95 |
+
val_num_samples: None
|
| 96 |
+
wandb: True
|
| 97 |
+
wandb_notes:
|
| 98 |
+
wandb_project_name: open-clip
|
| 99 |
+
warmup: 110
|
| 100 |
+
wd: 0.5
|
| 101 |
+
workers: 16
|
| 102 |
+
world_size: 2
|
| 103 |
+
zeroshot_frequency: 2
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "caltech101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8642825896762905, "acc5": 0.968503937007874, "mean_per_class_recall": 0.9268833112583469}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "cars", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8307424449695312, "acc5": 0.9890560875512996, "mean_per_class_recall": 0.831079202139019}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "cifar100", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7507, "acc5": 0.9307, "mean_per_class_recall": 0.7508}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "cifar10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.9424, "acc5": 0.9983, "mean_per_class_recall": 0.9426}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "country211", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.19488151658767772, "acc5": 0.42402843601895734, "mean_per_class_recall": 0.19497630331753554}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "dtd", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5574468085106383, "acc5": 0.8218085106382979, "mean_per_class_recall": 0.5574468085106383}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "eurosat", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.4812222222222222, "acc5": 0.9082592592592592, "mean_per_class_recall": 0.49873}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "fgvc_aircraft", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.24302430243024303, "acc5": 0.5886588658865887, "mean_per_class_recall": 0.24406417112299464}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "flickr30k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6453999876976013, "text_retrieval_recall@1": 0.8119999766349792, "image_retrieval_recall@5": 0.8736000061035156, "text_retrieval_recall@5": 0.9599999785423279, "image_retrieval_recall@10": 0.9232000112533569, "text_retrieval_recall@10": 0.9769999980926514}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "flowers", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7315010570824524, "acc5": 0.9033989266547406, "mean_per_class_recall": 0.7427204309687857}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "food101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8706930693069307, "acc5": 0.981069306930693, "mean_per_class_recall": 0.8706138613861385}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "gtsrb", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5311955661124307, "acc5": 0.7740300870942202, "mean_per_class_recall": 0.5028962559086422}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "imagenet1k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.68738, "acc5": 0.90974, "mean_per_class_recall": 0.68722}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "mscoco_captions", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.38168731331825256, "text_retrieval_recall@1": 0.5389999747276306, "image_retrieval_recall@5": 0.6399040222167969, "text_retrieval_recall@5": 0.7764000296592712, "image_retrieval_recall@10": 0.7417033314704895, "text_retrieval_recall@10": 0.8560000061988831}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "pets", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8997001907876806, "acc5": 0.9964568002180431, "mean_per_class_recall": 0.8992468186086529}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "stl10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.970375, "acc5": 0.999875, "mean_per_class_recall": 0.9705}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "sun397", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6735016643066002, "acc5": 0.927404969012634, "mean_per_class_recall": 0.6633693099922054}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "vtab/resisc45", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6115873015873016, "acc5": 0.9157142857142857, "mean_per_class_recall": 0.6191834588628579}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/out.log
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-04-27,01:37:31 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
|
| 2 |
+
2025-04-27,01:37:31 | INFO | Loaded ViT-B-16 model config.
|
| 3 |
+
2025-04-27,01:37:33 | INFO | Loading pretrained ViT-B-16 weights (datacomp_xl_s13b_b90k).
|
| 4 |
+
2025-04-27,01:37:33 | INFO | Model:
|
| 5 |
+
2025-04-27,01:37:33 | INFO | CLIP(
|
| 6 |
+
(visual): VisionTransformer(
|
| 7 |
+
(conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
|
| 8 |
+
(patch_dropout): Identity()
|
| 9 |
+
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 10 |
+
(transformer): Transformer(
|
| 11 |
+
(resblocks): ModuleList(
|
| 12 |
+
(0-11): 12 x ResidualAttentionBlock(
|
| 13 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 14 |
+
(attn): MultiheadAttention(
|
| 15 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 16 |
+
)
|
| 17 |
+
(ls_1): Identity()
|
| 18 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 19 |
+
(mlp): Sequential(
|
| 20 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 21 |
+
(gelu): GELU(approximate='none')
|
| 22 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 23 |
+
)
|
| 24 |
+
(ls_2): Identity()
|
| 25 |
+
)
|
| 26 |
+
)
|
| 27 |
+
)
|
| 28 |
+
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 29 |
+
)
|
| 30 |
+
(transformer): Transformer(
|
| 31 |
+
(resblocks): ModuleList(
|
| 32 |
+
(0-11): 12 x ResidualAttentionBlock(
|
| 33 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 34 |
+
(attn): MultiheadAttention(
|
| 35 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 36 |
+
)
|
| 37 |
+
(ls_1): Identity()
|
| 38 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 39 |
+
(mlp): Sequential(
|
| 40 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 41 |
+
(gelu): GELU(approximate='none')
|
| 42 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 43 |
+
)
|
| 44 |
+
(ls_2): Identity()
|
| 45 |
+
)
|
| 46 |
+
)
|
| 47 |
+
)
|
| 48 |
+
(token_embedding): Embedding(49408, 512)
|
| 49 |
+
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 50 |
+
)
|
| 51 |
+
2025-04-27,01:37:33 | INFO | Params:
|
| 52 |
+
2025-04-27,01:37:33 | INFO | accum_freq: 2
|
| 53 |
+
2025-04-27,01:37:33 | INFO | aug_cfg: {}
|
| 54 |
+
2025-04-27,01:37:33 | INFO | batch_size: 2048
|
| 55 |
+
2025-04-27,01:37:33 | INFO | beta1: 0.9
|
| 56 |
+
2025-04-27,01:37:33 | INFO | beta2: 0.98
|
| 57 |
+
2025-04-27,01:37:33 | INFO | cache_dir: None
|
| 58 |
+
2025-04-27,01:37:33 | INFO | caption_ratio: 0.1
|
| 59 |
+
2025-04-27,01:37:33 | INFO | checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints
|
| 60 |
+
2025-04-27,01:37:33 | INFO | coca_caption_loss_weight: 2.0
|
| 61 |
+
2025-04-27,01:37:33 | INFO | coca_contrastive_loss_weight: 1.0
|
| 62 |
+
2025-04-27,01:37:33 | INFO | copy_codebase: False
|
| 63 |
+
2025-04-27,01:37:33 | INFO | csv_caption_key: title
|
| 64 |
+
2025-04-27,01:37:33 | INFO | csv_img_key: filepath
|
| 65 |
+
2025-04-27,01:37:33 | INFO | csv_separator:
|
| 66 |
+
2025-04-27,01:37:33 | INFO | dataset_resampled: False
|
| 67 |
+
2025-04-27,01:37:33 | INFO | dataset_type: synthetic
|
| 68 |
+
2025-04-27,01:37:33 | INFO | ddp_static_graph: False
|
| 69 |
+
2025-04-27,01:37:33 | INFO | debug: False
|
| 70 |
+
2025-04-27,01:37:33 | INFO | delete_previous_checkpoint: False
|
| 71 |
+
2025-04-27,01:37:33 | INFO | device: cuda:0
|
| 72 |
+
2025-04-27,01:37:33 | INFO | dist_backend: None
|
| 73 |
+
2025-04-27,01:37:33 | INFO | dist_url: None
|
| 74 |
+
2025-04-27,01:37:33 | INFO | distill: False
|
| 75 |
+
2025-04-27,01:37:33 | INFO | distill_model: None
|
| 76 |
+
2025-04-27,01:37:33 | INFO | distill_pretrained: None
|
| 77 |
+
2025-04-27,01:37:33 | INFO | distributed: True
|
| 78 |
+
2025-04-27,01:37:33 | INFO | epochs: 10
|
| 79 |
+
2025-04-27,01:37:33 | INFO | epochs_cooldown: None
|
| 80 |
+
2025-04-27,01:37:33 | INFO | eps: 1e-08
|
| 81 |
+
2025-04-27,01:37:33 | INFO | force_custom_text: False
|
| 82 |
+
2025-04-27,01:37:33 | INFO | force_image_size: None
|
| 83 |
+
2025-04-27,01:37:33 | INFO | force_patch_dropout: None
|
| 84 |
+
2025-04-27,01:37:33 | INFO | force_quick_gelu: False
|
| 85 |
+
2025-04-27,01:37:33 | INFO | gather_with_grad: True
|
| 86 |
+
2025-04-27,01:37:33 | INFO | grad_checkpointing: True
|
| 87 |
+
2025-04-27,01:37:33 | INFO | grad_clip_norm: None
|
| 88 |
+
2025-04-27,01:37:33 | INFO | horovod: False
|
| 89 |
+
2025-04-27,01:37:33 | INFO | image_interpolation: None
|
| 90 |
+
2025-04-27,01:37:33 | INFO | image_mean: None
|
| 91 |
+
2025-04-27,01:37:33 | INFO | image_resize_mode: None
|
| 92 |
+
2025-04-27,01:37:33 | INFO | image_std: None
|
| 93 |
+
2025-04-27,01:37:33 | INFO | imagenet_v2: None
|
| 94 |
+
2025-04-27,01:37:33 | INFO | imagenet_val: None
|
| 95 |
+
2025-04-27,01:37:33 | INFO | keep_func_name: keep_random
|
| 96 |
+
2025-04-27,01:37:33 | INFO | local_loss: False
|
| 97 |
+
2025-04-27,01:37:33 | INFO | local_rank: 0
|
| 98 |
+
2025-04-27,01:37:33 | INFO | lock_image: False
|
| 99 |
+
2025-04-27,01:37:33 | INFO | lock_image_freeze_bn_stats: False
|
| 100 |
+
2025-04-27,01:37:33 | INFO | lock_image_unlocked_groups: 0
|
| 101 |
+
2025-04-27,01:37:33 | INFO | lock_text: True
|
| 102 |
+
2025-04-27,01:37:33 | INFO | lock_text_freeze_layer_norm: False
|
| 103 |
+
2025-04-27,01:37:33 | INFO | lock_text_unlocked_layers: 0
|
| 104 |
+
2025-04-27,01:37:33 | INFO | log_every_n_steps: 100
|
| 105 |
+
2025-04-27,01:37:33 | INFO | log_level: 20
|
| 106 |
+
2025-04-27,01:37:33 | INFO | log_local: False
|
| 107 |
+
2025-04-27,01:37:33 | INFO | log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/out.log
|
| 108 |
+
2025-04-27,01:37:33 | INFO | logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text
|
| 109 |
+
2025-04-27,01:37:33 | INFO | loss_dist_impl: None
|
| 110 |
+
2025-04-27,01:37:33 | INFO | lr: 4e-05
|
| 111 |
+
2025-04-27,01:37:33 | INFO | lr_cooldown_end: 0.0
|
| 112 |
+
2025-04-27,01:37:33 | INFO | lr_cooldown_power: 1.0
|
| 113 |
+
2025-04-27,01:37:33 | INFO | lr_scheduler: cosine
|
| 114 |
+
2025-04-27,01:37:33 | INFO | map_func_name: use_all
|
| 115 |
+
2025-04-27,01:37:33 | INFO | model: ViT-B-16
|
| 116 |
+
2025-04-27,01:37:33 | INFO | momentum: None
|
| 117 |
+
2025-04-27,01:37:33 | INFO | name: keep_random
|
| 118 |
+
2025-04-27,01:37:33 | INFO | no_set_device_rank: False
|
| 119 |
+
2025-04-27,01:37:33 | INFO | opt: adamw
|
| 120 |
+
2025-04-27,01:37:33 | INFO | precision: amp
|
| 121 |
+
2025-04-27,01:37:33 | INFO | pretrained: datacomp_xl_s13b_b90k
|
| 122 |
+
2025-04-27,01:37:33 | INFO | pretrained_image: False
|
| 123 |
+
2025-04-27,01:37:33 | INFO | rank: 0
|
| 124 |
+
2025-04-27,01:37:33 | INFO | remote_sync: None
|
| 125 |
+
2025-04-27,01:37:33 | INFO | remote_sync_frequency: 300
|
| 126 |
+
2025-04-27,01:37:33 | INFO | remote_sync_protocol: s3
|
| 127 |
+
2025-04-27,01:37:33 | INFO | report_to: tensorboard,wandb
|
| 128 |
+
2025-04-27,01:37:33 | INFO | resume: None
|
| 129 |
+
2025-04-27,01:37:33 | INFO | save_frequency: 10
|
| 130 |
+
2025-04-27,01:37:33 | INFO | save_most_recent: False
|
| 131 |
+
2025-04-27,01:37:33 | INFO | seed: 0
|
| 132 |
+
2025-04-27,01:37:33 | INFO | siglip: False
|
| 133 |
+
2025-04-27,01:37:33 | INFO | skip_scheduler: False
|
| 134 |
+
2025-04-27,01:37:33 | INFO | tensorboard: True
|
| 135 |
+
2025-04-27,01:37:33 | INFO | tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/tensorboard
|
| 136 |
+
2025-04-27,01:37:33 | INFO | torchcompile: False
|
| 137 |
+
2025-04-27,01:37:33 | INFO | torchscript: False
|
| 138 |
+
2025-04-27,01:37:33 | INFO | trace: False
|
| 139 |
+
2025-04-27,01:37:33 | INFO | train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar
|
| 140 |
+
2025-04-27,01:37:33 | INFO | train_data_upsampling_factors: None
|
| 141 |
+
2025-04-27,01:37:33 | INFO | train_num_samples: 9011874
|
| 142 |
+
2025-04-27,01:37:33 | INFO | use_bn_sync: False
|
| 143 |
+
2025-04-27,01:37:33 | INFO | use_bnb_linear: None
|
| 144 |
+
2025-04-27,01:37:33 | INFO | val_data: None
|
| 145 |
+
2025-04-27,01:37:33 | INFO | val_frequency: 1
|
| 146 |
+
2025-04-27,01:37:33 | INFO | val_num_samples: None
|
| 147 |
+
2025-04-27,01:37:33 | INFO | wandb: True
|
| 148 |
+
2025-04-27,01:37:33 | INFO | wandb_notes:
|
| 149 |
+
2025-04-27,01:37:33 | INFO | wandb_project_name: open-clip
|
| 150 |
+
2025-04-27,01:37:33 | INFO | warmup: 110
|
| 151 |
+
2025-04-27,01:37:33 | INFO | wd: 0.5
|
| 152 |
+
2025-04-27,01:37:33 | INFO | workers: 16
|
| 153 |
+
2025-04-27,01:37:33 | INFO | world_size: 2
|
| 154 |
+
2025-04-27,01:37:33 | INFO | zeroshot_frequency: 2
|
| 155 |
+
2025-04-27,01:37:34 | INFO | Created AdamW (adamw) optimizer: lr: 4e-05, betas: (0.9, 0.98), eps: 1e-08, weight_decay: 0.5, amsgrad: False, foreach: None, maximize: False, capturable: False, differentiable: False, fused: None
|
| 156 |
+
2025-04-27,01:37:47 | INFO | Start epoch 0
|
| 157 |
+
2025-04-27,01:38:39 | INFO | Train Epoch: 0 [ 8192/917504 (1%)] Data (t): 42.123 Batch (t): 51.906, 157.823/s, 78.9117/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 28.618 (28.618) Imm_text: 28.618 (28.618) Isd_image: 3.3252 (3.3252) Isd_text: 3.3252 (3.3252) Contrastive_loss: 1.4192 (1.4192) Loss: 1.4192 (1.4192)
|
| 158 |
+
2025-04-27,01:53:40 | INFO | Train Epoch: 0 [827392/917504 (90%)] Data (t): 0.999 Batch (t): 9.010, 915.892/s, 457.946/s/gpu LR: 0.000037 Logit Scale: 99.934 Imm_image: 28.968 (28.793) Imm_text: 28.968 (28.793) Isd_image: 5.7168 (4.5210) Isd_text: 5.7168 (4.5210) Contrastive_loss: 0.72981 (1.0745) Loss: 0.72981 (1.0745)
|
| 159 |
+
2025-04-27,01:55:19 | INFO | Train Epoch: 0 [917504/917504 (100%)] Data (t): 0.952 Batch (t): 8.936, 914.273/s, 457.137/s/gpu LR: 0.000040 Logit Scale: 99.924 Imm_image: 28.905 (28.830) Imm_text: 28.905 (28.830) Isd_image: 5.5824 (4.8748) Isd_text: 5.5824 (4.8748) Contrastive_loss: 0.71474 (0.95458) Loss: 0.71474 (0.95458)
|
| 160 |
+
2025-04-27,01:55:19 | INFO | Start epoch 1
|
| 161 |
+
2025-04-27,01:56:03 | INFO | Train Epoch: 1 [ 8192/917504 (1%)] Data (t): 35.769 Batch (t): 44.168, 185.473/s, 92.7366/s/gpu LR: 0.000040 Logit Scale: 99.924 Imm_image: 29.056 (29.056) Imm_text: 29.056 (29.056) Isd_image: 5.6160 (5.6160) Isd_text: 5.6160 (5.6160) Contrastive_loss: 0.63035 (0.63035) Loss: 0.63035 (0.63035)
|
| 162 |
+
2025-04-27,02:11:05 | INFO | Train Epoch: 1 [827392/917504 (90%)] Data (t): 1.006 Batch (t): 9.021, 916.703/s, 458.351/s/gpu LR: 0.000039 Logit Scale: 99.879 Imm_image: 28.940 (28.998) Imm_text: 28.940 (28.998) Isd_image: 4.3083 (4.9621) Isd_text: 4.3083 (4.9621) Contrastive_loss: 0.64448 (0.63742) Loss: 0.64448 (0.63742)
|
| 163 |
+
2025-04-27,02:12:43 | INFO | Train Epoch: 1 [917504/917504 (100%)] Data (t): 0.950 Batch (t): 8.937, 918.129/s, 459.065/s/gpu LR: 0.000039 Logit Scale: 99.883 Imm_image: 29.164 (29.053) Imm_text: 29.164 (29.053) Isd_image: 3.7631 (4.5625) Isd_text: 3.7631 (4.5625) Contrastive_loss: 0.51947 (0.59810) Loss: 0.51947 (0.59810)
|
| 164 |
+
2025-04-27,02:12:44 | INFO | Start epoch 2
|
| 165 |
+
2025-04-27,02:13:29 | INFO | Train Epoch: 2 [ 8192/917504 (1%)] Data (t): 37.286 Batch (t): 45.551, 179.841/s, 89.9207/s/gpu LR: 0.000039 Logit Scale: 99.885 Imm_image: 29.259 (29.259) Imm_text: 29.259 (29.259) Isd_image: 3.6721 (3.6721) Isd_text: 3.6721 (3.6721) Contrastive_loss: 0.49092 (0.49092) Loss: 0.49092 (0.49092)
|
| 166 |
+
2025-04-27,02:28:32 | INFO | Train Epoch: 2 [827392/917504 (90%)] Data (t): 1.009 Batch (t): 9.023, 917.789/s, 458.894/s/gpu LR: 0.000036 Logit Scale: 100.000 Imm_image: 29.366 (29.312) Imm_text: 29.366 (29.312) Isd_image: 2.6899 (3.1810) Isd_text: 2.6899 (3.1810) Contrastive_loss: 0.46473 (0.47783) Loss: 0.46473 (0.47783)
|
| 167 |
+
2025-04-27,02:30:10 | INFO | Train Epoch: 2 [917504/917504 (100%)] Data (t): 0.952 Batch (t): 8.936, 915.433/s, 457.716/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.520 (29.382) Imm_text: 29.520 (29.382) Isd_image: 2.5545 (2.9722) Isd_text: 2.5545 (2.9722) Contrastive_loss: 0.39856 (0.45141) Loss: 0.39856 (0.45141)
|
| 168 |
+
2025-04-27,02:30:10 | INFO | Start epoch 3
|
| 169 |
+
2025-04-27,02:30:55 | INFO | Train Epoch: 3 [ 8192/917504 (1%)] Data (t): 36.864 Batch (t): 44.915, 182.389/s, 91.1947/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.511 (29.511) Imm_text: 29.511 (29.511) Isd_image: 2.5891 (2.5891) Isd_text: 2.5891 (2.5891) Contrastive_loss: 0.42152 (0.42152) Loss: 0.42152 (0.42152)
|
| 170 |
+
2025-04-27,02:46:00 | INFO | Train Epoch: 3 [827392/917504 (90%)] Data (t): 1.025 Batch (t): 9.047, 917.102/s, 458.551/s/gpu LR: 0.000031 Logit Scale: 100.000 Imm_image: 29.689 (29.600) Imm_text: 29.689 (29.600) Isd_image: 1.7021 (2.1456) Isd_text: 1.7021 (2.1456) Contrastive_loss: 0.39319 (0.40736) Loss: 0.39319 (0.40736)
|
| 171 |
+
2025-04-27,02:47:38 | INFO | Train Epoch: 3 [917504/917504 (100%)] Data (t): 0.951 Batch (t): 8.934, 916.799/s, 458.399/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.685 (29.628) Imm_text: 29.685 (29.628) Isd_image: 1.5345 (1.9419) Isd_text: 1.5345 (1.9419) Contrastive_loss: 0.34130 (0.38534) Loss: 0.34130 (0.38534)
|
| 172 |
+
2025-04-27,02:47:38 | INFO | Start epoch 4
|
| 173 |
+
2025-04-27,02:48:23 | INFO | Train Epoch: 4 [ 8192/917504 (1%)] Data (t): 36.836 Batch (t): 44.907, 182.422/s, 91.2108/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.707 (29.707) Imm_text: 29.707 (29.707) Isd_image: 1.5228 (1.5228) Isd_text: 1.5228 (1.5228) Contrastive_loss: 0.35253 (0.35253) Loss: 0.35253 (0.35253)
|
| 174 |
+
2025-04-27,03:03:22 | INFO | Train Epoch: 4 [827392/917504 (90%)] Data (t): 0.984 Batch (t): 8.987, 918.381/s, 459.190/s/gpu LR: 0.000024 Logit Scale: 100.000 Imm_image: 29.872 (29.789) Imm_text: 29.872 (29.789) Isd_image: 1.0030 (1.2629) Isd_text: 1.0030 (1.2629) Contrastive_loss: 0.33160 (0.34206) Loss: 0.33160 (0.34206)
|
| 175 |
+
2025-04-27,03:05:00 | INFO | Train Epoch: 4 [917504/917504 (100%)] Data (t): 0.940 Batch (t): 8.925, 915.463/s, 457.731/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.722 (29.767) Imm_text: 29.722 (29.767) Isd_image: 1.1486 (1.2248) Isd_text: 1.1486 (1.2248) Contrastive_loss: 0.31939 (0.33451) Loss: 0.31939 (0.33451)
|
| 176 |
+
2025-04-27,03:05:00 | INFO | Start epoch 5
|
| 177 |
+
2025-04-27,03:05:45 | INFO | Train Epoch: 5 [ 8192/917504 (1%)] Data (t): 35.249 Batch (t): 44.356, 184.686/s, 92.3430/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.820 (29.820) Imm_text: 29.820 (29.820) Isd_image: 1.0999 (1.0999) Isd_text: 1.0999 (1.0999) Contrastive_loss: 0.29852 (0.29852) Loss: 0.29852 (0.29852)
|
| 178 |
+
2025-04-27,03:20:46 | INFO | Train Epoch: 5 [827392/917504 (90%)] Data (t): 1.002 Batch (t): 9.017, 918.400/s, 459.200/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 29.969 (29.895) Imm_text: 29.969 (29.895) Isd_image: 0.86037 (0.98011) Isd_text: 0.86037 (0.98011) Contrastive_loss: 0.27611 (0.28732) Loss: 0.27611 (0.28732)
|
| 179 |
+
2025-04-27,03:22:25 | INFO | Train Epoch: 5 [917504/917504 (100%)] Data (t): 0.949 Batch (t): 8.933, 914.883/s, 457.441/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 29.996 (29.928) Imm_text: 29.996 (29.928) Isd_image: 0.85540 (0.93854) Isd_text: 0.85540 (0.93854) Contrastive_loss: 0.24764 (0.27409) Loss: 0.24764 (0.27409)
|
| 180 |
+
2025-04-27,03:22:25 | INFO | Start epoch 6
|
| 181 |
+
2025-04-27,03:23:10 | INFO | Train Epoch: 6 [ 8192/917504 (1%)] Data (t): 36.590 Batch (t): 44.694, 183.290/s, 91.6450/s/gpu LR: 0.000016 Logit Scale: 100.000 Imm_image: 30.118 (30.118) Imm_text: 30.118 (30.118) Isd_image: 0.77700 (0.77700) Isd_text: 0.77700 (0.77700) Contrastive_loss: 0.24394 (0.24394) Loss: 0.24394 (0.24394)
|
| 182 |
+
2025-04-27,03:38:11 | INFO | Train Epoch: 6 [827392/917504 (90%)] Data (t): 1.004 Batch (t): 9.018, 917.586/s, 458.793/s/gpu LR: 0.000011 Logit Scale: 100.000 Imm_image: 30.208 (30.163) Imm_text: 30.208 (30.163) Isd_image: 0.66865 (0.72282) Isd_text: 0.66865 (0.72282) Contrastive_loss: 0.24086 (0.24240) Loss: 0.24086 (0.24240)
|
| 183 |
+
2025-04-27,03:39:50 | INFO | Train Epoch: 6 [917504/917504 (100%)] Data (t): 0.950 Batch (t): 8.933, 917.061/s, 458.530/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.353 (30.226) Imm_text: 30.353 (30.226) Isd_image: 0.60130 (0.68231) Isd_text: 0.60130 (0.68231) Contrastive_loss: 0.17543 (0.22008) Loss: 0.17543 (0.22008)
|
| 184 |
+
2025-04-27,03:39:50 | INFO | Start epoch 7
|
| 185 |
+
2025-04-27,03:40:34 | INFO | Train Epoch: 7 [ 8192/917504 (1%)] Data (t): 36.419 Batch (t): 44.470, 184.213/s, 92.1063/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.290 (30.290) Imm_text: 30.290 (30.290) Isd_image: 0.66099 (0.66099) Isd_text: 0.66099 (0.66099) Contrastive_loss: 0.21813 (0.21813) Loss: 0.21813 (0.21813)
|
| 186 |
+
2025-04-27,03:55:41 | INFO | Train Epoch: 7 [827392/917504 (90%)] Data (t): 1.037 Batch (t): 9.062, 921.569/s, 460.784/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.350 (30.320) Imm_text: 30.350 (30.320) Isd_image: 0.52269 (0.59184) Isd_text: 0.52269 (0.59184) Contrastive_loss: 0.21930 (0.21872) Loss: 0.21930 (0.21872)
|
| 187 |
+
2025-04-27,03:57:19 | INFO | Train Epoch: 7 [917504/917504 (100%)] Data (t): 0.951 Batch (t): 8.931, 912.708/s, 456.354/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.451 (30.364) Imm_text: 30.451 (30.364) Isd_image: 0.50853 (0.56407) Isd_text: 0.50853 (0.56407) Contrastive_loss: 0.19753 (0.21166) Loss: 0.19753 (0.21166)
|
| 188 |
+
2025-04-27,03:57:19 | INFO | Start epoch 8
|
| 189 |
+
2025-04-27,03:58:04 | INFO | Train Epoch: 8 [ 8192/917504 (1%)] Data (t): 36.493 Batch (t): 44.842, 182.684/s, 91.3421/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.394 (30.394) Imm_text: 30.394 (30.394) Isd_image: 0.56205 (0.56205) Isd_text: 0.56205 (0.56205) Contrastive_loss: 0.19877 (0.19877) Loss: 0.19877 (0.19877)
|
| 190 |
+
2025-04-27,04:13:05 | INFO | Train Epoch: 8 [827392/917504 (90%)] Data (t): 1.001 Batch (t): 9.007, 919.521/s, 459.760/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.305 (30.350) Imm_text: 30.305 (30.350) Isd_image: 0.50703 (0.53454) Isd_text: 0.50703 (0.53454) Contrastive_loss: 0.23204 (0.21540) Loss: 0.23204 (0.21540)
|
| 191 |
+
2025-04-27,04:14:43 | INFO | Train Epoch: 8 [917504/917504 (100%)] Data (t): 0.940 Batch (t): 8.918, 914.019/s, 457.010/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.433 (30.377) Imm_text: 30.433 (30.377) Isd_image: 0.56413 (0.54440) Isd_text: 0.56413 (0.54440) Contrastive_loss: 0.18968 (0.20683) Loss: 0.18968 (0.20683)
|
| 192 |
+
2025-04-27,04:14:43 | INFO | Start epoch 9
|
| 193 |
+
2025-04-27,04:15:26 | INFO | Train Epoch: 9 [ 8192/917504 (1%)] Data (t): 34.886 Batch (t): 43.055, 190.267/s, 95.1333/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.423 (30.423) Imm_text: 30.423 (30.423) Isd_image: 0.50569 (0.50569) Isd_text: 0.50569 (0.50569) Contrastive_loss: 0.20082 (0.20082) Loss: 0.20082 (0.20082)
|
| 194 |
+
2025-04-27,04:30:24 | INFO | Train Epoch: 9 [827392/917504 (90%)] Data (t): 0.980 Batch (t): 8.984, 919.065/s, 459.533/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.383 (30.403) Imm_text: 30.383 (30.403) Isd_image: 0.52638 (0.51604) Isd_text: 0.52638 (0.51604) Contrastive_loss: 0.22643 (0.21362) Loss: 0.22643 (0.21362)
|
| 195 |
+
2025-04-27,04:32:02 | INFO | Train Epoch: 9 [917504/917504 (100%)] Data (t): 0.931 Batch (t): 8.910, 918.487/s, 459.244/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.408 (30.404) Imm_text: 30.408 (30.404) Isd_image: 0.47812 (0.50340) Isd_text: 0.47812 (0.50340) Contrastive_loss: 0.19293 (0.20673) Loss: 0.19293 (0.20673)
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/params.txt
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accum_freq: 2
|
| 2 |
+
aug_cfg: {}
|
| 3 |
+
batch_size: 2048
|
| 4 |
+
beta1: 0.9
|
| 5 |
+
beta2: 0.98
|
| 6 |
+
cache_dir: None
|
| 7 |
+
caption_ratio: 0.1
|
| 8 |
+
checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints
|
| 9 |
+
coca_caption_loss_weight: 2.0
|
| 10 |
+
coca_contrastive_loss_weight: 1.0
|
| 11 |
+
copy_codebase: False
|
| 12 |
+
csv_caption_key: title
|
| 13 |
+
csv_img_key: filepath
|
| 14 |
+
csv_separator:
|
| 15 |
+
dataset_resampled: False
|
| 16 |
+
dataset_type: synthetic
|
| 17 |
+
ddp_static_graph: False
|
| 18 |
+
debug: False
|
| 19 |
+
delete_previous_checkpoint: False
|
| 20 |
+
device: cuda:0
|
| 21 |
+
dist_backend: None
|
| 22 |
+
dist_url: None
|
| 23 |
+
distill: False
|
| 24 |
+
distill_model: None
|
| 25 |
+
distill_pretrained: None
|
| 26 |
+
distributed: True
|
| 27 |
+
epochs: 10
|
| 28 |
+
epochs_cooldown: None
|
| 29 |
+
eps: 1e-08
|
| 30 |
+
force_custom_text: False
|
| 31 |
+
force_image_size: None
|
| 32 |
+
force_patch_dropout: None
|
| 33 |
+
force_quick_gelu: False
|
| 34 |
+
gather_with_grad: True
|
| 35 |
+
grad_checkpointing: True
|
| 36 |
+
grad_clip_norm: None
|
| 37 |
+
horovod: False
|
| 38 |
+
image_interpolation: None
|
| 39 |
+
image_mean: None
|
| 40 |
+
image_resize_mode: None
|
| 41 |
+
image_std: None
|
| 42 |
+
imagenet_v2: None
|
| 43 |
+
imagenet_val: None
|
| 44 |
+
keep_func_name: keep_random
|
| 45 |
+
local_loss: False
|
| 46 |
+
local_rank: 0
|
| 47 |
+
lock_image: False
|
| 48 |
+
lock_image_freeze_bn_stats: False
|
| 49 |
+
lock_image_unlocked_groups: 0
|
| 50 |
+
lock_text: True
|
| 51 |
+
lock_text_freeze_layer_norm: False
|
| 52 |
+
lock_text_unlocked_layers: 0
|
| 53 |
+
log_every_n_steps: 100
|
| 54 |
+
log_level: 20
|
| 55 |
+
log_local: False
|
| 56 |
+
log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/out.log
|
| 57 |
+
logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text
|
| 58 |
+
loss_dist_impl: None
|
| 59 |
+
lr: 4e-05
|
| 60 |
+
lr_cooldown_end: 0.0
|
| 61 |
+
lr_cooldown_power: 1.0
|
| 62 |
+
lr_scheduler: cosine
|
| 63 |
+
map_func_name: use_all
|
| 64 |
+
model: ViT-B-16
|
| 65 |
+
momentum: None
|
| 66 |
+
name: keep_random
|
| 67 |
+
no_set_device_rank: False
|
| 68 |
+
opt: adamw
|
| 69 |
+
precision: amp
|
| 70 |
+
pretrained: datacomp_xl_s13b_b90k
|
| 71 |
+
pretrained_image: False
|
| 72 |
+
rank: 0
|
| 73 |
+
remote_sync: None
|
| 74 |
+
remote_sync_frequency: 300
|
| 75 |
+
remote_sync_protocol: s3
|
| 76 |
+
report_to: tensorboard,wandb
|
| 77 |
+
resume: None
|
| 78 |
+
save_frequency: 10
|
| 79 |
+
save_most_recent: False
|
| 80 |
+
seed: 0
|
| 81 |
+
siglip: False
|
| 82 |
+
skip_scheduler: False
|
| 83 |
+
tensorboard: True
|
| 84 |
+
tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/tensorboard
|
| 85 |
+
torchcompile: False
|
| 86 |
+
torchscript: False
|
| 87 |
+
trace: False
|
| 88 |
+
train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar
|
| 89 |
+
train_data_upsampling_factors: None
|
| 90 |
+
train_num_samples: 9011874
|
| 91 |
+
use_bn_sync: False
|
| 92 |
+
use_bnb_linear: None
|
| 93 |
+
val_data: None
|
| 94 |
+
val_frequency: 1
|
| 95 |
+
val_num_samples: None
|
| 96 |
+
wandb: True
|
| 97 |
+
wandb_notes:
|
| 98 |
+
wandb_project_name: open-clip
|
| 99 |
+
warmup: 110
|
| 100 |
+
wd: 0.5
|
| 101 |
+
workers: 16
|
| 102 |
+
world_size: 2
|
| 103 |
+
zeroshot_frequency: 2
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "caltech101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8618766404199475, "acc5": 0.9681758530183727, "mean_per_class_recall": 0.925570681349594}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "cars", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8362144011938814, "acc5": 0.9920407909463997, "mean_per_class_recall": 0.8370035365963817}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "cifar100", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7489, "acc5": 0.9325, "mean_per_class_recall": 0.7489}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "cifar10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.9405, "acc5": 0.9984, "mean_per_class_recall": 0.9405000000000001}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "country211", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.1942654028436019, "acc5": 0.42255924170616116, "mean_per_class_recall": 0.19436018957345974}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "dtd", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5537234042553192, "acc5": 0.8122340425531915, "mean_per_class_recall": 0.5537234042553191}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "eurosat", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.48733333333333334, "acc5": 0.8834074074074074, "mean_per_class_recall": 0.48750666666666664}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "fgvc_aircraft", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.24152415241524153, "acc5": 0.5781578157815782, "mean_per_class_recall": 0.2407130124777184}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "flickr30k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.652999997138977, "text_retrieval_recall@1": 0.8100000023841858, "image_retrieval_recall@5": 0.8772000074386597, "text_retrieval_recall@5": 0.9539999961853027, "image_retrieval_recall@10": 0.921999990940094, "text_retrieval_recall@10": 0.9779999852180481}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "flowers", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7280858676207513, "acc5": 0.9017726459586924, "mean_per_class_recall": 0.7367525532935172}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "food101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8707722772277228, "acc5": 0.9812277227722772, "mean_per_class_recall": 0.8707326732673267}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "gtsrb", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5553444180522565, "acc5": 0.7724465558194774, "mean_per_class_recall": 0.5145985462408618}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "imagenet1k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6846, "acc5": 0.90768, "mean_per_class_recall": 0.6846}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "mscoco_captions", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.384246289730072, "text_retrieval_recall@1": 0.5473999977111816, "image_retrieval_recall@5": 0.6422231197357178, "text_retrieval_recall@5": 0.7784000039100647, "image_retrieval_recall@10": 0.7440223693847656, "text_retrieval_recall@10": 0.8586000204086304}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "pets", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8975197601526301, "acc5": 0.9964568002180431, "mean_per_class_recall": 0.8970007980600225}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "stl10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.97225, "acc5": 0.9995, "mean_per_class_recall": 0.9721250000000001}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "sun397", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6729039851407764, "acc5": 0.9277635765121283, "mean_per_class_recall": 0.6608765022275734}, "language": "en"}
|