diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..2fff8ca7363f6c73b841e4ceb20dd31c0a82ca3f --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "cifar100", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7589, "acc5": 0.9387, "mean_per_class_recall": 0.7590999999999998}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..ea692e7942b8b8a7f8b7bf5ba1c24aa00e1243dd --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "country211", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.1975829383886256, "acc5": 0.4209952606635071, "mean_per_class_recall": 0.19763033175355452}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..7c79b9eecd4e6683340f8f7db88b4846897e8a64 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "dtd", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5484042553191489, "acc5": 0.8191489361702128, "mean_per_class_recall": 0.547872340425532}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json new file mode 100644 index 0000000000000000000000000000000000000000..7ccadbd02987ce941688511f72f9c9699b98e2c3 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json @@ -0,0 +1 @@ +{"dataset": "flickr30k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.656000018119812, "text_retrieval_recall@1": 0.8009999990463257, "image_retrieval_recall@5": 0.8655999898910522, "text_retrieval_recall@5": 0.9539999961853027, "image_retrieval_recall@10": 0.9193999767303467, "text_retrieval_recall@10": 0.9760000109672546}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..d60968b87b0c5ea294a9e33196ca5110c33d6c70 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "flowers", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7487396324605627, "acc5": 0.897869572288177, "mean_per_class_recall": 0.7396028874215905}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..96e3fef2ed5d4807f11ef4fe9d0b86b903a9a15c --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "food101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8691485148514851, "acc5": 0.980039603960396, "mean_per_class_recall": 0.8692277227722772}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..f64461b5e6ba270be8e1e6d40791e1516fddb156 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "gtsrb", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.510609659540776, "acc5": 0.7530482977038796, "mean_per_class_recall": 0.4616824052066722}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..db63da9beacda4fb322f7e97c56dd5b439599227 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "imagenet1k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.68688, "acc5": 0.90938, "mean_per_class_recall": 0.6868800000000002}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..fb16d15bbe49c01c0cf772e72113f82039502656 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "pets", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8947942218588171, "acc5": 0.9964568002180431, "mean_per_class_recall": 0.8933304274009377}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..cfa13914fb0708c7f1c252f460d2ba2da1d271c0 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "sun397", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6720764293727127, "acc5": 0.928158964267981, "mean_per_class_recall": 0.6638586591957565}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..7cef8f451ce3632fae6b27ded7f217d10ec74cf1 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "vtab/resisc45", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6063492063492063, "acc5": 0.9077777777777778, "mean_per_class_recall": 0.6129633195377677}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/out.log b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/out.log new file mode 100644 index 0000000000000000000000000000000000000000..01cb818bb3aef736011dd5dee232576dc395a336 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/out.log @@ -0,0 +1,195 @@ +2025-04-26,19:20:36 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2. +2025-04-26,19:20:36 | INFO | Loaded ViT-B-16 model config. +2025-04-26,19:20:37 | INFO | Loading pretrained ViT-B-16 weights (datacomp_xl_s13b_b90k). +2025-04-26,19:20:38 | INFO | Model: +2025-04-26,19:20:38 | INFO | CLIP( + (visual): VisionTransformer( + (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False) + (patch_dropout): Identity() + (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (transformer): Transformer( + (resblocks): ModuleList( + (0-11): 12 x ResidualAttentionBlock( + (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (attn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) + ) + (ls_1): Identity() + (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): Sequential( + (c_fc): Linear(in_features=768, out_features=3072, bias=True) + (gelu): GELU(approximate='none') + (c_proj): Linear(in_features=3072, out_features=768, bias=True) + ) + (ls_2): Identity() + ) + ) + ) + (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (transformer): Transformer( + (resblocks): ModuleList( + (0-11): 12 x ResidualAttentionBlock( + (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) + (attn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) + ) + (ls_1): Identity() + (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) + (mlp): Sequential( + (c_fc): Linear(in_features=512, out_features=2048, bias=True) + (gelu): GELU(approximate='none') + (c_proj): Linear(in_features=2048, out_features=512, bias=True) + ) + (ls_2): Identity() + ) + ) + ) + (token_embedding): Embedding(49408, 512) + (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True) +) +2025-04-26,19:20:38 | INFO | Params: +2025-04-26,19:20:38 | INFO | accum_freq: 2 +2025-04-26,19:20:38 | INFO | aug_cfg: {} +2025-04-26,19:20:38 | INFO | batch_size: 2048 +2025-04-26,19:20:38 | INFO | beta1: 0.9 +2025-04-26,19:20:38 | INFO | beta2: 0.98 +2025-04-26,19:20:38 | INFO | cache_dir: None +2025-04-26,19:20:38 | INFO | caption_ratio: 0.1 +2025-04-26,19:20:38 | INFO | checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints +2025-04-26,19:20:38 | INFO | coca_caption_loss_weight: 2.0 +2025-04-26,19:20:38 | INFO | coca_contrastive_loss_weight: 1.0 +2025-04-26,19:20:38 | INFO | copy_codebase: False +2025-04-26,19:20:38 | INFO | csv_caption_key: title +2025-04-26,19:20:38 | INFO | csv_img_key: filepath +2025-04-26,19:20:38 | INFO | csv_separator: +2025-04-26,19:20:38 | INFO | dataset_resampled: False +2025-04-26,19:20:38 | INFO | dataset_type: synthetic +2025-04-26,19:20:38 | INFO | ddp_static_graph: False +2025-04-26,19:20:38 | INFO | debug: False +2025-04-26,19:20:38 | INFO | delete_previous_checkpoint: False +2025-04-26,19:20:38 | INFO | device: cuda:0 +2025-04-26,19:20:38 | INFO | dist_backend: None +2025-04-26,19:20:38 | INFO | dist_url: None +2025-04-26,19:20:38 | INFO | distill: False +2025-04-26,19:20:38 | INFO | distill_model: None +2025-04-26,19:20:38 | INFO | distill_pretrained: None +2025-04-26,19:20:38 | INFO | distributed: True +2025-04-26,19:20:38 | INFO | epochs: 10 +2025-04-26,19:20:38 | INFO | epochs_cooldown: None +2025-04-26,19:20:38 | INFO | eps: 1e-08 +2025-04-26,19:20:38 | INFO | force_custom_text: False +2025-04-26,19:20:38 | INFO | force_image_size: None +2025-04-26,19:20:38 | INFO | force_patch_dropout: None +2025-04-26,19:20:38 | INFO | force_quick_gelu: False +2025-04-26,19:20:38 | INFO | gather_with_grad: True +2025-04-26,19:20:38 | INFO | grad_checkpointing: True +2025-04-26,19:20:38 | INFO | grad_clip_norm: None +2025-04-26,19:20:38 | INFO | horovod: False +2025-04-26,19:20:38 | INFO | image_interpolation: None +2025-04-26,19:20:38 | INFO | image_mean: None +2025-04-26,19:20:38 | INFO | image_resize_mode: None +2025-04-26,19:20:38 | INFO | image_std: None +2025-04-26,19:20:38 | INFO | imagenet_v2: None +2025-04-26,19:20:38 | INFO | imagenet_val: None +2025-04-26,19:20:38 | INFO | keep_func_name: keep_image_farest +2025-04-26,19:20:38 | INFO | local_loss: False +2025-04-26,19:20:38 | INFO | local_rank: 0 +2025-04-26,19:20:38 | INFO | lock_image: False +2025-04-26,19:20:38 | INFO | lock_image_freeze_bn_stats: False +2025-04-26,19:20:38 | INFO | lock_image_unlocked_groups: 0 +2025-04-26,19:20:38 | INFO | lock_text: True +2025-04-26,19:20:38 | INFO | lock_text_freeze_layer_norm: False +2025-04-26,19:20:38 | INFO | lock_text_unlocked_layers: 0 +2025-04-26,19:20:38 | INFO | log_every_n_steps: 100 +2025-04-26,19:20:38 | INFO | log_level: 20 +2025-04-26,19:20:38 | INFO | log_local: False +2025-04-26,19:20:38 | INFO | log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/out.log +2025-04-26,19:20:38 | INFO | logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text +2025-04-26,19:20:38 | INFO | loss_dist_impl: None +2025-04-26,19:20:38 | INFO | lr: 4e-05 +2025-04-26,19:20:38 | INFO | lr_cooldown_end: 0.0 +2025-04-26,19:20:38 | INFO | lr_cooldown_power: 1.0 +2025-04-26,19:20:38 | INFO | lr_scheduler: cosine +2025-04-26,19:20:38 | INFO | map_func_name: use_all +2025-04-26,19:20:38 | INFO | model: ViT-B-16 +2025-04-26,19:20:38 | INFO | momentum: None +2025-04-26,19:20:38 | INFO | name: keep_image_farest +2025-04-26,19:20:38 | INFO | no_set_device_rank: False +2025-04-26,19:20:38 | INFO | opt: adamw +2025-04-26,19:20:38 | INFO | precision: amp +2025-04-26,19:20:38 | INFO | pretrained: datacomp_xl_s13b_b90k +2025-04-26,19:20:38 | INFO | pretrained_image: False +2025-04-26,19:20:38 | INFO | rank: 0 +2025-04-26,19:20:38 | INFO | remote_sync: None +2025-04-26,19:20:38 | INFO | remote_sync_frequency: 300 +2025-04-26,19:20:38 | INFO | remote_sync_protocol: s3 +2025-04-26,19:20:38 | INFO | report_to: tensorboard,wandb +2025-04-26,19:20:38 | INFO | resume: None +2025-04-26,19:20:38 | INFO | save_frequency: 10 +2025-04-26,19:20:38 | INFO | save_most_recent: False +2025-04-26,19:20:38 | INFO | seed: 0 +2025-04-26,19:20:38 | INFO | siglip: False +2025-04-26,19:20:38 | INFO | skip_scheduler: False +2025-04-26,19:20:38 | INFO | tensorboard: True +2025-04-26,19:20:38 | INFO | tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/tensorboard +2025-04-26,19:20:38 | INFO | torchcompile: False +2025-04-26,19:20:38 | INFO | torchscript: False +2025-04-26,19:20:38 | INFO | trace: False +2025-04-26,19:20:38 | INFO | train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar +2025-04-26,19:20:38 | INFO | train_data_upsampling_factors: None +2025-04-26,19:20:38 | INFO | train_num_samples: 9011874 +2025-04-26,19:20:38 | INFO | use_bn_sync: False +2025-04-26,19:20:38 | INFO | use_bnb_linear: None +2025-04-26,19:20:38 | INFO | val_data: None +2025-04-26,19:20:38 | INFO | val_frequency: 1 +2025-04-26,19:20:38 | INFO | val_num_samples: None +2025-04-26,19:20:38 | INFO | wandb: True +2025-04-26,19:20:38 | INFO | wandb_notes: +2025-04-26,19:20:38 | INFO | wandb_project_name: open-clip +2025-04-26,19:20:38 | INFO | warmup: 110 +2025-04-26,19:20:38 | INFO | wd: 0.5 +2025-04-26,19:20:38 | INFO | workers: 16 +2025-04-26,19:20:38 | INFO | world_size: 2 +2025-04-26,19:20:38 | INFO | zeroshot_frequency: 2 +2025-04-26,19:20:39 | INFO | Created AdamW (adamw) optimizer: lr: 4e-05, betas: (0.9, 0.98), eps: 1e-08, weight_decay: 0.5, amsgrad: False, foreach: None, maximize: False, capturable: False, differentiable: False, fused: None +2025-04-26,19:20:57 | INFO | Start epoch 0 +2025-04-26,19:21:51 | INFO | Train Epoch: 0 [ 8192/917504 (1%)] Data (t): 44.342 Batch (t): 53.918, 151.934/s, 75.9669/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 28.554 (28.554) Imm_text: 28.554 (28.554) Isd_image: 3.2214 (3.2214) Isd_text: 3.2214 (3.2214) Contrastive_loss: 1.4326 (1.4326) Loss: 1.4326 (1.4326) +2025-04-26,19:36:52 | INFO | Train Epoch: 0 [827392/917504 (90%)] Data (t): 0.998 Batch (t): 9.004, 917.588/s, 458.794/s/gpu LR: 0.000037 Logit Scale: 99.933 Imm_image: 28.962 (28.758) Imm_text: 28.962 (28.758) Isd_image: 5.8420 (4.5317) Isd_text: 5.8420 (4.5317) Contrastive_loss: 0.73414 (1.0834) Loss: 0.73414 (1.0834) +2025-04-26,19:38:30 | INFO | Train Epoch: 0 [917504/917504 (100%)] Data (t): 0.953 Batch (t): 8.939, 913.115/s, 456.558/s/gpu LR: 0.000040 Logit Scale: 99.924 Imm_image: 28.947 (28.821) Imm_text: 28.947 (28.821) Isd_image: 5.6367 (4.9000) Isd_text: 5.6367 (4.9000) Contrastive_loss: 0.69624 (0.95433) Loss: 0.69624 (0.95433) +2025-04-26,19:38:30 | INFO | Start epoch 1 +2025-04-26,19:39:13 | INFO | Train Epoch: 1 [ 8192/917504 (1%)] Data (t): 33.564 Batch (t): 43.142, 189.886/s, 94.9431/s/gpu LR: 0.000040 Logit Scale: 99.923 Imm_image: 29.097 (29.097) Imm_text: 29.097 (29.097) Isd_image: 5.5602 (5.5602) Isd_text: 5.5602 (5.5602) Contrastive_loss: 0.60997 (0.60997) Loss: 0.60997 (0.60997) +2025-04-26,19:54:15 | INFO | Train Epoch: 1 [827392/917504 (90%)] Data (t): 1.005 Batch (t): 9.013, 917.390/s, 458.695/s/gpu LR: 0.000039 Logit Scale: 99.881 Imm_image: 28.927 (29.012) Imm_text: 28.927 (29.012) Isd_image: 4.1130 (4.8366) Isd_text: 4.1130 (4.8366) Contrastive_loss: 0.62246 (0.61621) Loss: 0.62246 (0.61621) +2025-04-26,19:55:53 | INFO | Train Epoch: 1 [917504/917504 (100%)] Data (t): 0.953 Batch (t): 8.938, 915.751/s, 457.875/s/gpu LR: 0.000039 Logit Scale: 99.881 Imm_image: 29.309 (29.111) Imm_text: 29.309 (29.111) Isd_image: 3.7556 (4.4762) Isd_text: 3.7556 (4.4762) Contrastive_loss: 0.49672 (0.57638) Loss: 0.49672 (0.57638) +2025-04-26,19:55:53 | INFO | Start epoch 2 +2025-04-26,19:56:39 | INFO | Train Epoch: 2 [ 8192/917504 (1%)] Data (t): 35.471 Batch (t): 45.796, 178.882/s, 89.4409/s/gpu LR: 0.000039 Logit Scale: 99.882 Imm_image: 29.290 (29.290) Imm_text: 29.290 (29.290) Isd_image: 3.6671 (3.6671) Isd_text: 3.6671 (3.6671) Contrastive_loss: 0.49962 (0.49962) Loss: 0.49962 (0.49962) +2025-04-26,20:11:40 | INFO | Train Epoch: 2 [827392/917504 (90%)] Data (t): 0.999 Batch (t): 9.008, 913.482/s, 456.741/s/gpu LR: 0.000036 Logit Scale: 100.000 Imm_image: 29.430 (29.360) Imm_text: 29.430 (29.360) Isd_image: 2.6187 (3.1429) Isd_text: 2.6187 (3.1429) Contrastive_loss: 0.45664 (0.47813) Loss: 0.45664 (0.47813) +2025-04-26,20:13:18 | INFO | Train Epoch: 2 [917504/917504 (100%)] Data (t): 0.958 Batch (t): 8.943, 914.170/s, 457.085/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.466 (29.396) Imm_text: 29.466 (29.396) Isd_image: 2.6329 (2.9729) Isd_text: 2.6329 (2.9729) Contrastive_loss: 0.41372 (0.45666) Loss: 0.41372 (0.45666) +2025-04-26,20:13:18 | INFO | Start epoch 3 +2025-04-26,20:14:03 | INFO | Train Epoch: 3 [ 8192/917504 (1%)] Data (t): 36.546 Batch (t): 44.617, 183.607/s, 91.8035/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.504 (29.504) Imm_text: 29.504 (29.504) Isd_image: 2.5284 (2.5284) Isd_text: 2.5284 (2.5284) Contrastive_loss: 0.39021 (0.39021) Loss: 0.39021 (0.39021) +2025-04-26,20:29:06 | INFO | Train Epoch: 3 [827392/917504 (90%)] Data (t): 1.015 Batch (t): 9.027, 917.585/s, 458.792/s/gpu LR: 0.000031 Logit Scale: 100.000 Imm_image: 29.626 (29.565) Imm_text: 29.626 (29.565) Isd_image: 1.6747 (2.1016) Isd_text: 1.6747 (2.1016) Contrastive_loss: 0.41409 (0.40215) Loss: 0.41409 (0.40215) +2025-04-26,20:30:44 | INFO | Train Epoch: 3 [917504/917504 (100%)] Data (t): 0.962 Batch (t): 8.949, 915.455/s, 457.728/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.619 (29.583) Imm_text: 29.619 (29.583) Isd_image: 1.6334 (1.9455) Isd_text: 1.6334 (1.9455) Contrastive_loss: 0.36740 (0.39056) Loss: 0.36740 (0.39056) +2025-04-26,20:30:44 | INFO | Start epoch 4 +2025-04-26,20:31:29 | INFO | Train Epoch: 4 [ 8192/917504 (1%)] Data (t): 35.128 Batch (t): 44.257, 185.102/s, 92.5508/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.757 (29.757) Imm_text: 29.757 (29.757) Isd_image: 1.5362 (1.5362) Isd_text: 1.5362 (1.5362) Contrastive_loss: 0.33672 (0.33672) Loss: 0.33672 (0.33672) +2025-04-26,20:46:29 | INFO | Train Epoch: 4 [827392/917504 (90%)] Data (t): 0.997 Batch (t): 9.004, 917.946/s, 458.973/s/gpu LR: 0.000024 Logit Scale: 100.000 Imm_image: 29.850 (29.804) Imm_text: 29.850 (29.804) Isd_image: 1.1166 (1.3264) Isd_text: 1.1166 (1.3264) Contrastive_loss: 0.33999 (0.33835) Loss: 0.33999 (0.33835) +2025-04-26,20:48:07 | INFO | Train Epoch: 4 [917504/917504 (100%)] Data (t): 0.954 Batch (t): 8.939, 915.946/s, 457.973/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.934 (29.847) Imm_text: 29.934 (29.847) Isd_image: 1.0718 (1.2415) Isd_text: 1.0718 (1.2415) Contrastive_loss: 0.28950 (0.32207) Loss: 0.28950 (0.32207) +2025-04-26,20:48:08 | INFO | Start epoch 5 +2025-04-26,20:48:52 | INFO | Train Epoch: 5 [ 8192/917504 (1%)] Data (t): 36.121 Batch (t): 44.476, 184.189/s, 92.0946/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.804 (29.804) Imm_text: 29.804 (29.804) Isd_image: 1.1836 (1.1836) Isd_text: 1.1836 (1.1836) Contrastive_loss: 0.33592 (0.33592) Loss: 0.33592 (0.33592) +2025-04-26,21:03:54 | INFO | Train Epoch: 5 [827392/917504 (90%)] Data (t): 1.010 Batch (t): 9.022, 916.297/s, 458.148/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 30.026 (29.915) Imm_text: 30.026 (29.915) Isd_image: 0.84884 (1.0162) Isd_text: 0.84884 (1.0162) Contrastive_loss: 0.26665 (0.30128) Loss: 0.26665 (0.30128) +2025-04-26,21:05:33 | INFO | Train Epoch: 5 [917504/917504 (100%)] Data (t): 0.952 Batch (t): 8.936, 915.004/s, 457.502/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 29.985 (29.938) Imm_text: 29.985 (29.938) Isd_image: 0.86452 (0.96566) Isd_text: 0.86452 (0.96566) Contrastive_loss: 0.26700 (0.28985) Loss: 0.26700 (0.28985) +2025-04-26,21:05:33 | INFO | Start epoch 6 +2025-04-26,21:06:18 | INFO | Train Epoch: 6 [ 8192/917504 (1%)] Data (t): 36.805 Batch (t): 44.934, 182.311/s, 91.1555/s/gpu LR: 0.000016 Logit Scale: 100.000 Imm_image: 30.204 (30.204) Imm_text: 30.204 (30.204) Isd_image: 0.70070 (0.70070) Isd_text: 0.70070 (0.70070) Contrastive_loss: 0.23083 (0.23083) Loss: 0.23083 (0.23083) +2025-04-26,21:21:18 | INFO | Train Epoch: 6 [827392/917504 (90%)] Data (t): 0.999 Batch (t): 9.007, 917.786/s, 458.893/s/gpu LR: 0.000011 Logit Scale: 100.000 Imm_image: 30.193 (30.199) Imm_text: 30.193 (30.199) Isd_image: 0.75573 (0.72822) Isd_text: 0.75573 (0.72822) Contrastive_loss: 0.24233 (0.23658) Loss: 0.24233 (0.23658) +2025-04-26,21:22:57 | INFO | Train Epoch: 6 [917504/917504 (100%)] Data (t): 0.954 Batch (t): 8.940, 916.499/s, 458.250/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.249 (30.216) Imm_text: 30.249 (30.216) Isd_image: 0.70645 (0.72096) Isd_text: 0.70645 (0.72096) Contrastive_loss: 0.21447 (0.22921) Loss: 0.21447 (0.22921) +2025-04-26,21:22:57 | INFO | Start epoch 7 +2025-04-26,21:23:42 | INFO | Train Epoch: 7 [ 8192/917504 (1%)] Data (t): 36.516 Batch (t): 44.643, 183.500/s, 91.7499/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.320 (30.320) Imm_text: 30.320 (30.320) Isd_image: 0.57832 (0.57832) Isd_text: 0.57832 (0.57832) Contrastive_loss: 0.22137 (0.22137) Loss: 0.22137 (0.22137) +2025-04-26,21:38:42 | INFO | Train Epoch: 7 [827392/917504 (90%)] Data (t): 0.998 Batch (t): 9.006, 915.734/s, 457.867/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.290 (30.305) Imm_text: 30.290 (30.305) Isd_image: 0.58627 (0.58230) Isd_text: 0.58627 (0.58230) Contrastive_loss: 0.24158 (0.23147) Loss: 0.24158 (0.23147) +2025-04-26,21:40:21 | INFO | Train Epoch: 7 [917504/917504 (100%)] Data (t): 0.959 Batch (t): 8.944, 914.296/s, 457.148/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.336 (30.315) Imm_text: 30.336 (30.315) Isd_image: 0.57076 (0.57845) Isd_text: 0.57076 (0.57845) Contrastive_loss: 0.20793 (0.22363) Loss: 0.20793 (0.22363) +2025-04-26,21:40:21 | INFO | Start epoch 8 +2025-04-26,21:41:05 | INFO | Train Epoch: 8 [ 8192/917504 (1%)] Data (t): 34.989 Batch (t): 44.032, 186.047/s, 93.0235/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.384 (30.384) Imm_text: 30.384 (30.384) Isd_image: 0.50557 (0.50557) Isd_text: 0.50557 (0.50557) Contrastive_loss: 0.19039 (0.19039) Loss: 0.19039 (0.19039) +2025-04-26,21:56:10 | INFO | Train Epoch: 8 [827392/917504 (90%)] Data (t): 1.028 Batch (t): 9.047, 913.226/s, 456.613/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.238 (30.311) Imm_text: 30.238 (30.311) Isd_image: 0.65661 (0.58109) Isd_text: 0.65661 (0.58109) Contrastive_loss: 0.24421 (0.21730) Loss: 0.24421 (0.21730) +2025-04-26,21:57:48 | INFO | Train Epoch: 8 [917504/917504 (100%)] Data (t): 0.951 Batch (t): 8.934, 914.753/s, 457.376/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.413 (30.345) Imm_text: 30.413 (30.345) Isd_image: 0.54766 (0.56995) Isd_text: 0.54766 (0.56995) Contrastive_loss: 0.19143 (0.20867) Loss: 0.19143 (0.20867) +2025-04-26,21:57:48 | INFO | Start epoch 9 +2025-04-26,21:58:33 | INFO | Train Epoch: 9 [ 8192/917504 (1%)] Data (t): 36.704 Batch (t): 44.784, 182.922/s, 91.4612/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.425 (30.425) Imm_text: 30.425 (30.425) Isd_image: 0.52132 (0.52132) Isd_text: 0.52132 (0.52132) Contrastive_loss: 0.20011 (0.20011) Loss: 0.20011 (0.20011) +2025-04-26,22:13:32 | INFO | Train Epoch: 9 [827392/917504 (90%)] Data (t): 0.981 Batch (t): 8.986, 916.852/s, 458.426/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.419 (30.422) Imm_text: 30.419 (30.422) Isd_image: 0.57758 (0.54945) Isd_text: 0.57758 (0.54945) Contrastive_loss: 0.21143 (0.20577) Loss: 0.21143 (0.20577) +2025-04-26,22:15:10 | INFO | Train Epoch: 9 [917504/917504 (100%)] Data (t): 0.936 Batch (t): 8.913, 916.256/s, 458.128/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.455 (30.433) Imm_text: 30.455 (30.433) Isd_image: 0.51568 (0.53820) Isd_text: 0.51568 (0.53820) Contrastive_loss: 0.18257 (0.19804) Loss: 0.18257 (0.19804) diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/params.txt b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..c655c33eabbaf5166d4c82c4bf881bd730547f14 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/params.txt @@ -0,0 +1,103 @@ +accum_freq: 2 +aug_cfg: {} +batch_size: 2048 +beta1: 0.9 +beta2: 0.98 +cache_dir: None +caption_ratio: 0.1 +checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints +coca_caption_loss_weight: 2.0 +coca_contrastive_loss_weight: 1.0 +copy_codebase: False +csv_caption_key: title +csv_img_key: filepath +csv_separator: +dataset_resampled: False +dataset_type: synthetic +ddp_static_graph: False +debug: False +delete_previous_checkpoint: False +device: cuda:0 +dist_backend: None +dist_url: None +distill: False +distill_model: None +distill_pretrained: None +distributed: True +epochs: 10 +epochs_cooldown: None +eps: 1e-08 +force_custom_text: False +force_image_size: None +force_patch_dropout: None +force_quick_gelu: False +gather_with_grad: True +grad_checkpointing: True +grad_clip_norm: None +horovod: False +image_interpolation: None +image_mean: None +image_resize_mode: None +image_std: None +imagenet_v2: None +imagenet_val: None +keep_func_name: keep_image_farest +local_loss: False +local_rank: 0 +lock_image: False +lock_image_freeze_bn_stats: False +lock_image_unlocked_groups: 0 +lock_text: True +lock_text_freeze_layer_norm: False +lock_text_unlocked_layers: 0 +log_every_n_steps: 100 +log_level: 20 +log_local: False +log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/out.log +logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text +loss_dist_impl: None +lr: 4e-05 +lr_cooldown_end: 0.0 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +map_func_name: use_all +model: ViT-B-16 +momentum: None +name: keep_image_farest +no_set_device_rank: False +opt: adamw +precision: amp +pretrained: datacomp_xl_s13b_b90k +pretrained_image: False +rank: 0 +remote_sync: None +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: tensorboard,wandb +resume: None +save_frequency: 10 +save_most_recent: False +seed: 0 +siglip: False +skip_scheduler: False +tensorboard: True +tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/tensorboard +torchcompile: False +torchscript: False +trace: False +train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar +train_data_upsampling_factors: None +train_num_samples: 9011874 +use_bn_sync: False +use_bnb_linear: None +val_data: None +val_frequency: 1 +val_num_samples: None +wandb: True +wandb_notes: +wandb_project_name: open-clip +warmup: 110 +wd: 0.5 +workers: 16 +world_size: 2 +zeroshot_frequency: 2 diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..90bba3441032b5484d2e22f33411609f112a2dd8 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "caltech101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8642825896762905, "acc5": 0.968503937007874, "mean_per_class_recall": 0.9268833112583469}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..a711839787eca035bb9c3f9c944445658c5f08cf --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "cars", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8307424449695312, "acc5": 0.9890560875512996, "mean_per_class_recall": 0.831079202139019}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..6968eb794b5b3b73083c0c246ff84832b0bc5cc8 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "cifar100", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7507, "acc5": 0.9307, "mean_per_class_recall": 0.7508}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..ba46cc239499cef7ebe6eb30073f7e13e2fbc48d --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "cifar10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.9424, "acc5": 0.9983, "mean_per_class_recall": 0.9426}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..da0930cc9b8dd7b882bf84ec321de983c0385edb --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "country211", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.19488151658767772, "acc5": 0.42402843601895734, "mean_per_class_recall": 0.19497630331753554}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..d62927911c8fee6f1b49e6348aa349e29d172fac --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "dtd", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5574468085106383, "acc5": 0.8218085106382979, "mean_per_class_recall": 0.5574468085106383}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..5fbc7b007bdab02f80e2667291750a28040f7f46 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "eurosat", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.4812222222222222, "acc5": 0.9082592592592592, "mean_per_class_recall": 0.49873}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..651143bea48a0b6707322d50abab7282afd579c0 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "fgvc_aircraft", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.24302430243024303, "acc5": 0.5886588658865887, "mean_per_class_recall": 0.24406417112299464}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json new file mode 100644 index 0000000000000000000000000000000000000000..1bb8dff3b2b56bc279f6dc3d2ace823745873929 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json @@ -0,0 +1 @@ +{"dataset": "flickr30k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6453999876976013, "text_retrieval_recall@1": 0.8119999766349792, "image_retrieval_recall@5": 0.8736000061035156, "text_retrieval_recall@5": 0.9599999785423279, "image_retrieval_recall@10": 0.9232000112533569, "text_retrieval_recall@10": 0.9769999980926514}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..08d2574748fdb14c934c7afbc36069f6cccb7522 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "flowers", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7315010570824524, "acc5": 0.9033989266547406, "mean_per_class_recall": 0.7427204309687857}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..9deed0f746a74e70898995bbb2edb2484c1b52d4 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "food101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8706930693069307, "acc5": 0.981069306930693, "mean_per_class_recall": 0.8706138613861385}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..16ac0656d9f372680e590fc8b29b66bd34c19eb5 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "gtsrb", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5311955661124307, "acc5": 0.7740300870942202, "mean_per_class_recall": 0.5028962559086422}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..a4e1b5aaefbb82a71ebb6dc35eef71b1574ec4ca --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "imagenet1k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.68738, "acc5": 0.90974, "mean_per_class_recall": 0.68722}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json new file mode 100644 index 0000000000000000000000000000000000000000..9bc804ca66188e75d449afe1441fdc4de981e394 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json @@ -0,0 +1 @@ +{"dataset": "mscoco_captions", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.38168731331825256, "text_retrieval_recall@1": 0.5389999747276306, "image_retrieval_recall@5": 0.6399040222167969, "text_retrieval_recall@5": 0.7764000296592712, "image_retrieval_recall@10": 0.7417033314704895, "text_retrieval_recall@10": 0.8560000061988831}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..468b8e8cef07d3afd1a87fdd7fd59d23a5d92ef9 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "pets", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8997001907876806, "acc5": 0.9964568002180431, "mean_per_class_recall": 0.8992468186086529}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..32e34a34150ab4850a3b4582c962c0e7e301b0dc --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "stl10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.970375, "acc5": 0.999875, "mean_per_class_recall": 0.9705}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..08ae786ac2d661f93f63a1d1d3e70baa82ffdb2c --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "sun397", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6735016643066002, "acc5": 0.927404969012634, "mean_per_class_recall": 0.6633693099922054}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..9f5f5bcf0d4f86d5bb82627c748e10ac4aaf27eb --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "vtab/resisc45", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6115873015873016, "acc5": 0.9157142857142857, "mean_per_class_recall": 0.6191834588628579}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/out.log b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/out.log new file mode 100644 index 0000000000000000000000000000000000000000..bd124d533cd409dc659c29963a92ef1b5741e784 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/out.log @@ -0,0 +1,195 @@ +2025-04-27,01:37:31 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2. +2025-04-27,01:37:31 | INFO | Loaded ViT-B-16 model config. +2025-04-27,01:37:33 | INFO | Loading pretrained ViT-B-16 weights (datacomp_xl_s13b_b90k). +2025-04-27,01:37:33 | INFO | Model: +2025-04-27,01:37:33 | INFO | CLIP( + (visual): VisionTransformer( + (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False) + (patch_dropout): Identity() + (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (transformer): Transformer( + (resblocks): ModuleList( + (0-11): 12 x ResidualAttentionBlock( + (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (attn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) + ) + (ls_1): Identity() + (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): Sequential( + (c_fc): Linear(in_features=768, out_features=3072, bias=True) + (gelu): GELU(approximate='none') + (c_proj): Linear(in_features=3072, out_features=768, bias=True) + ) + (ls_2): Identity() + ) + ) + ) + (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (transformer): Transformer( + (resblocks): ModuleList( + (0-11): 12 x ResidualAttentionBlock( + (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) + (attn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) + ) + (ls_1): Identity() + (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) + (mlp): Sequential( + (c_fc): Linear(in_features=512, out_features=2048, bias=True) + (gelu): GELU(approximate='none') + (c_proj): Linear(in_features=2048, out_features=512, bias=True) + ) + (ls_2): Identity() + ) + ) + ) + (token_embedding): Embedding(49408, 512) + (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True) +) +2025-04-27,01:37:33 | INFO | Params: +2025-04-27,01:37:33 | INFO | accum_freq: 2 +2025-04-27,01:37:33 | INFO | aug_cfg: {} +2025-04-27,01:37:33 | INFO | batch_size: 2048 +2025-04-27,01:37:33 | INFO | beta1: 0.9 +2025-04-27,01:37:33 | INFO | beta2: 0.98 +2025-04-27,01:37:33 | INFO | cache_dir: None +2025-04-27,01:37:33 | INFO | caption_ratio: 0.1 +2025-04-27,01:37:33 | INFO | checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints +2025-04-27,01:37:33 | INFO | coca_caption_loss_weight: 2.0 +2025-04-27,01:37:33 | INFO | coca_contrastive_loss_weight: 1.0 +2025-04-27,01:37:33 | INFO | copy_codebase: False +2025-04-27,01:37:33 | INFO | csv_caption_key: title +2025-04-27,01:37:33 | INFO | csv_img_key: filepath +2025-04-27,01:37:33 | INFO | csv_separator: +2025-04-27,01:37:33 | INFO | dataset_resampled: False +2025-04-27,01:37:33 | INFO | dataset_type: synthetic +2025-04-27,01:37:33 | INFO | ddp_static_graph: False +2025-04-27,01:37:33 | INFO | debug: False +2025-04-27,01:37:33 | INFO | delete_previous_checkpoint: False +2025-04-27,01:37:33 | INFO | device: cuda:0 +2025-04-27,01:37:33 | INFO | dist_backend: None +2025-04-27,01:37:33 | INFO | dist_url: None +2025-04-27,01:37:33 | INFO | distill: False +2025-04-27,01:37:33 | INFO | distill_model: None +2025-04-27,01:37:33 | INFO | distill_pretrained: None +2025-04-27,01:37:33 | INFO | distributed: True +2025-04-27,01:37:33 | INFO | epochs: 10 +2025-04-27,01:37:33 | INFO | epochs_cooldown: None +2025-04-27,01:37:33 | INFO | eps: 1e-08 +2025-04-27,01:37:33 | INFO | force_custom_text: False +2025-04-27,01:37:33 | INFO | force_image_size: None +2025-04-27,01:37:33 | INFO | force_patch_dropout: None +2025-04-27,01:37:33 | INFO | force_quick_gelu: False +2025-04-27,01:37:33 | INFO | gather_with_grad: True +2025-04-27,01:37:33 | INFO | grad_checkpointing: True +2025-04-27,01:37:33 | INFO | grad_clip_norm: None +2025-04-27,01:37:33 | INFO | horovod: False +2025-04-27,01:37:33 | INFO | image_interpolation: None +2025-04-27,01:37:33 | INFO | image_mean: None +2025-04-27,01:37:33 | INFO | image_resize_mode: None +2025-04-27,01:37:33 | INFO | image_std: None +2025-04-27,01:37:33 | INFO | imagenet_v2: None +2025-04-27,01:37:33 | INFO | imagenet_val: None +2025-04-27,01:37:33 | INFO | keep_func_name: keep_random +2025-04-27,01:37:33 | INFO | local_loss: False +2025-04-27,01:37:33 | INFO | local_rank: 0 +2025-04-27,01:37:33 | INFO | lock_image: False +2025-04-27,01:37:33 | INFO | lock_image_freeze_bn_stats: False +2025-04-27,01:37:33 | INFO | lock_image_unlocked_groups: 0 +2025-04-27,01:37:33 | INFO | lock_text: True +2025-04-27,01:37:33 | INFO | lock_text_freeze_layer_norm: False +2025-04-27,01:37:33 | INFO | lock_text_unlocked_layers: 0 +2025-04-27,01:37:33 | INFO | log_every_n_steps: 100 +2025-04-27,01:37:33 | INFO | log_level: 20 +2025-04-27,01:37:33 | INFO | log_local: False +2025-04-27,01:37:33 | INFO | log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/out.log +2025-04-27,01:37:33 | INFO | logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text +2025-04-27,01:37:33 | INFO | loss_dist_impl: None +2025-04-27,01:37:33 | INFO | lr: 4e-05 +2025-04-27,01:37:33 | INFO | lr_cooldown_end: 0.0 +2025-04-27,01:37:33 | INFO | lr_cooldown_power: 1.0 +2025-04-27,01:37:33 | INFO | lr_scheduler: cosine +2025-04-27,01:37:33 | INFO | map_func_name: use_all +2025-04-27,01:37:33 | INFO | model: ViT-B-16 +2025-04-27,01:37:33 | INFO | momentum: None +2025-04-27,01:37:33 | INFO | name: keep_random +2025-04-27,01:37:33 | INFO | no_set_device_rank: False +2025-04-27,01:37:33 | INFO | opt: adamw +2025-04-27,01:37:33 | INFO | precision: amp +2025-04-27,01:37:33 | INFO | pretrained: datacomp_xl_s13b_b90k +2025-04-27,01:37:33 | INFO | pretrained_image: False +2025-04-27,01:37:33 | INFO | rank: 0 +2025-04-27,01:37:33 | INFO | remote_sync: None +2025-04-27,01:37:33 | INFO | remote_sync_frequency: 300 +2025-04-27,01:37:33 | INFO | remote_sync_protocol: s3 +2025-04-27,01:37:33 | INFO | report_to: tensorboard,wandb +2025-04-27,01:37:33 | INFO | resume: None +2025-04-27,01:37:33 | INFO | save_frequency: 10 +2025-04-27,01:37:33 | INFO | save_most_recent: False +2025-04-27,01:37:33 | INFO | seed: 0 +2025-04-27,01:37:33 | INFO | siglip: False +2025-04-27,01:37:33 | INFO | skip_scheduler: False +2025-04-27,01:37:33 | INFO | tensorboard: True +2025-04-27,01:37:33 | INFO | tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/tensorboard +2025-04-27,01:37:33 | INFO | torchcompile: False +2025-04-27,01:37:33 | INFO | torchscript: False +2025-04-27,01:37:33 | INFO | trace: False +2025-04-27,01:37:33 | INFO | train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar +2025-04-27,01:37:33 | INFO | train_data_upsampling_factors: None +2025-04-27,01:37:33 | INFO | train_num_samples: 9011874 +2025-04-27,01:37:33 | INFO | use_bn_sync: False +2025-04-27,01:37:33 | INFO | use_bnb_linear: None +2025-04-27,01:37:33 | INFO | val_data: None +2025-04-27,01:37:33 | INFO | val_frequency: 1 +2025-04-27,01:37:33 | INFO | val_num_samples: None +2025-04-27,01:37:33 | INFO | wandb: True +2025-04-27,01:37:33 | INFO | wandb_notes: +2025-04-27,01:37:33 | INFO | wandb_project_name: open-clip +2025-04-27,01:37:33 | INFO | warmup: 110 +2025-04-27,01:37:33 | INFO | wd: 0.5 +2025-04-27,01:37:33 | INFO | workers: 16 +2025-04-27,01:37:33 | INFO | world_size: 2 +2025-04-27,01:37:33 | INFO | zeroshot_frequency: 2 +2025-04-27,01:37:34 | INFO | Created AdamW (adamw) optimizer: lr: 4e-05, betas: (0.9, 0.98), eps: 1e-08, weight_decay: 0.5, amsgrad: False, foreach: None, maximize: False, capturable: False, differentiable: False, fused: None +2025-04-27,01:37:47 | INFO | Start epoch 0 +2025-04-27,01:38:39 | INFO | Train Epoch: 0 [ 8192/917504 (1%)] Data (t): 42.123 Batch (t): 51.906, 157.823/s, 78.9117/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 28.618 (28.618) Imm_text: 28.618 (28.618) Isd_image: 3.3252 (3.3252) Isd_text: 3.3252 (3.3252) Contrastive_loss: 1.4192 (1.4192) Loss: 1.4192 (1.4192) +2025-04-27,01:53:40 | INFO | Train Epoch: 0 [827392/917504 (90%)] Data (t): 0.999 Batch (t): 9.010, 915.892/s, 457.946/s/gpu LR: 0.000037 Logit Scale: 99.934 Imm_image: 28.968 (28.793) Imm_text: 28.968 (28.793) Isd_image: 5.7168 (4.5210) Isd_text: 5.7168 (4.5210) Contrastive_loss: 0.72981 (1.0745) Loss: 0.72981 (1.0745) +2025-04-27,01:55:19 | INFO | Train Epoch: 0 [917504/917504 (100%)] Data (t): 0.952 Batch (t): 8.936, 914.273/s, 457.137/s/gpu LR: 0.000040 Logit Scale: 99.924 Imm_image: 28.905 (28.830) Imm_text: 28.905 (28.830) Isd_image: 5.5824 (4.8748) Isd_text: 5.5824 (4.8748) Contrastive_loss: 0.71474 (0.95458) Loss: 0.71474 (0.95458) +2025-04-27,01:55:19 | INFO | Start epoch 1 +2025-04-27,01:56:03 | INFO | Train Epoch: 1 [ 8192/917504 (1%)] Data (t): 35.769 Batch (t): 44.168, 185.473/s, 92.7366/s/gpu LR: 0.000040 Logit Scale: 99.924 Imm_image: 29.056 (29.056) Imm_text: 29.056 (29.056) Isd_image: 5.6160 (5.6160) Isd_text: 5.6160 (5.6160) Contrastive_loss: 0.63035 (0.63035) Loss: 0.63035 (0.63035) +2025-04-27,02:11:05 | INFO | Train Epoch: 1 [827392/917504 (90%)] Data (t): 1.006 Batch (t): 9.021, 916.703/s, 458.351/s/gpu LR: 0.000039 Logit Scale: 99.879 Imm_image: 28.940 (28.998) Imm_text: 28.940 (28.998) Isd_image: 4.3083 (4.9621) Isd_text: 4.3083 (4.9621) Contrastive_loss: 0.64448 (0.63742) Loss: 0.64448 (0.63742) +2025-04-27,02:12:43 | INFO | Train Epoch: 1 [917504/917504 (100%)] Data (t): 0.950 Batch (t): 8.937, 918.129/s, 459.065/s/gpu LR: 0.000039 Logit Scale: 99.883 Imm_image: 29.164 (29.053) Imm_text: 29.164 (29.053) Isd_image: 3.7631 (4.5625) Isd_text: 3.7631 (4.5625) Contrastive_loss: 0.51947 (0.59810) Loss: 0.51947 (0.59810) +2025-04-27,02:12:44 | INFO | Start epoch 2 +2025-04-27,02:13:29 | INFO | Train Epoch: 2 [ 8192/917504 (1%)] Data (t): 37.286 Batch (t): 45.551, 179.841/s, 89.9207/s/gpu LR: 0.000039 Logit Scale: 99.885 Imm_image: 29.259 (29.259) Imm_text: 29.259 (29.259) Isd_image: 3.6721 (3.6721) Isd_text: 3.6721 (3.6721) Contrastive_loss: 0.49092 (0.49092) Loss: 0.49092 (0.49092) +2025-04-27,02:28:32 | INFO | Train Epoch: 2 [827392/917504 (90%)] Data (t): 1.009 Batch (t): 9.023, 917.789/s, 458.894/s/gpu LR: 0.000036 Logit Scale: 100.000 Imm_image: 29.366 (29.312) Imm_text: 29.366 (29.312) Isd_image: 2.6899 (3.1810) Isd_text: 2.6899 (3.1810) Contrastive_loss: 0.46473 (0.47783) Loss: 0.46473 (0.47783) +2025-04-27,02:30:10 | INFO | Train Epoch: 2 [917504/917504 (100%)] Data (t): 0.952 Batch (t): 8.936, 915.433/s, 457.716/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.520 (29.382) Imm_text: 29.520 (29.382) Isd_image: 2.5545 (2.9722) Isd_text: 2.5545 (2.9722) Contrastive_loss: 0.39856 (0.45141) Loss: 0.39856 (0.45141) +2025-04-27,02:30:10 | INFO | Start epoch 3 +2025-04-27,02:30:55 | INFO | Train Epoch: 3 [ 8192/917504 (1%)] Data (t): 36.864 Batch (t): 44.915, 182.389/s, 91.1947/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.511 (29.511) Imm_text: 29.511 (29.511) Isd_image: 2.5891 (2.5891) Isd_text: 2.5891 (2.5891) Contrastive_loss: 0.42152 (0.42152) Loss: 0.42152 (0.42152) +2025-04-27,02:46:00 | INFO | Train Epoch: 3 [827392/917504 (90%)] Data (t): 1.025 Batch (t): 9.047, 917.102/s, 458.551/s/gpu LR: 0.000031 Logit Scale: 100.000 Imm_image: 29.689 (29.600) Imm_text: 29.689 (29.600) Isd_image: 1.7021 (2.1456) Isd_text: 1.7021 (2.1456) Contrastive_loss: 0.39319 (0.40736) Loss: 0.39319 (0.40736) +2025-04-27,02:47:38 | INFO | Train Epoch: 3 [917504/917504 (100%)] Data (t): 0.951 Batch (t): 8.934, 916.799/s, 458.399/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.685 (29.628) Imm_text: 29.685 (29.628) Isd_image: 1.5345 (1.9419) Isd_text: 1.5345 (1.9419) Contrastive_loss: 0.34130 (0.38534) Loss: 0.34130 (0.38534) +2025-04-27,02:47:38 | INFO | Start epoch 4 +2025-04-27,02:48:23 | INFO | Train Epoch: 4 [ 8192/917504 (1%)] Data (t): 36.836 Batch (t): 44.907, 182.422/s, 91.2108/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.707 (29.707) Imm_text: 29.707 (29.707) Isd_image: 1.5228 (1.5228) Isd_text: 1.5228 (1.5228) Contrastive_loss: 0.35253 (0.35253) Loss: 0.35253 (0.35253) +2025-04-27,03:03:22 | INFO | Train Epoch: 4 [827392/917504 (90%)] Data (t): 0.984 Batch (t): 8.987, 918.381/s, 459.190/s/gpu LR: 0.000024 Logit Scale: 100.000 Imm_image: 29.872 (29.789) Imm_text: 29.872 (29.789) Isd_image: 1.0030 (1.2629) Isd_text: 1.0030 (1.2629) Contrastive_loss: 0.33160 (0.34206) Loss: 0.33160 (0.34206) +2025-04-27,03:05:00 | INFO | Train Epoch: 4 [917504/917504 (100%)] Data (t): 0.940 Batch (t): 8.925, 915.463/s, 457.731/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.722 (29.767) Imm_text: 29.722 (29.767) Isd_image: 1.1486 (1.2248) Isd_text: 1.1486 (1.2248) Contrastive_loss: 0.31939 (0.33451) Loss: 0.31939 (0.33451) +2025-04-27,03:05:00 | INFO | Start epoch 5 +2025-04-27,03:05:45 | INFO | Train Epoch: 5 [ 8192/917504 (1%)] Data (t): 35.249 Batch (t): 44.356, 184.686/s, 92.3430/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.820 (29.820) Imm_text: 29.820 (29.820) Isd_image: 1.0999 (1.0999) Isd_text: 1.0999 (1.0999) Contrastive_loss: 0.29852 (0.29852) Loss: 0.29852 (0.29852) +2025-04-27,03:20:46 | INFO | Train Epoch: 5 [827392/917504 (90%)] Data (t): 1.002 Batch (t): 9.017, 918.400/s, 459.200/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 29.969 (29.895) Imm_text: 29.969 (29.895) Isd_image: 0.86037 (0.98011) Isd_text: 0.86037 (0.98011) Contrastive_loss: 0.27611 (0.28732) Loss: 0.27611 (0.28732) +2025-04-27,03:22:25 | INFO | Train Epoch: 5 [917504/917504 (100%)] Data (t): 0.949 Batch (t): 8.933, 914.883/s, 457.441/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 29.996 (29.928) Imm_text: 29.996 (29.928) Isd_image: 0.85540 (0.93854) Isd_text: 0.85540 (0.93854) Contrastive_loss: 0.24764 (0.27409) Loss: 0.24764 (0.27409) +2025-04-27,03:22:25 | INFO | Start epoch 6 +2025-04-27,03:23:10 | INFO | Train Epoch: 6 [ 8192/917504 (1%)] Data (t): 36.590 Batch (t): 44.694, 183.290/s, 91.6450/s/gpu LR: 0.000016 Logit Scale: 100.000 Imm_image: 30.118 (30.118) Imm_text: 30.118 (30.118) Isd_image: 0.77700 (0.77700) Isd_text: 0.77700 (0.77700) Contrastive_loss: 0.24394 (0.24394) Loss: 0.24394 (0.24394) +2025-04-27,03:38:11 | INFO | Train Epoch: 6 [827392/917504 (90%)] Data (t): 1.004 Batch (t): 9.018, 917.586/s, 458.793/s/gpu LR: 0.000011 Logit Scale: 100.000 Imm_image: 30.208 (30.163) Imm_text: 30.208 (30.163) Isd_image: 0.66865 (0.72282) Isd_text: 0.66865 (0.72282) Contrastive_loss: 0.24086 (0.24240) Loss: 0.24086 (0.24240) +2025-04-27,03:39:50 | INFO | Train Epoch: 6 [917504/917504 (100%)] Data (t): 0.950 Batch (t): 8.933, 917.061/s, 458.530/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.353 (30.226) Imm_text: 30.353 (30.226) Isd_image: 0.60130 (0.68231) Isd_text: 0.60130 (0.68231) Contrastive_loss: 0.17543 (0.22008) Loss: 0.17543 (0.22008) +2025-04-27,03:39:50 | INFO | Start epoch 7 +2025-04-27,03:40:34 | INFO | Train Epoch: 7 [ 8192/917504 (1%)] Data (t): 36.419 Batch (t): 44.470, 184.213/s, 92.1063/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.290 (30.290) Imm_text: 30.290 (30.290) Isd_image: 0.66099 (0.66099) Isd_text: 0.66099 (0.66099) Contrastive_loss: 0.21813 (0.21813) Loss: 0.21813 (0.21813) +2025-04-27,03:55:41 | INFO | Train Epoch: 7 [827392/917504 (90%)] Data (t): 1.037 Batch (t): 9.062, 921.569/s, 460.784/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.350 (30.320) Imm_text: 30.350 (30.320) Isd_image: 0.52269 (0.59184) Isd_text: 0.52269 (0.59184) Contrastive_loss: 0.21930 (0.21872) Loss: 0.21930 (0.21872) +2025-04-27,03:57:19 | INFO | Train Epoch: 7 [917504/917504 (100%)] Data (t): 0.951 Batch (t): 8.931, 912.708/s, 456.354/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.451 (30.364) Imm_text: 30.451 (30.364) Isd_image: 0.50853 (0.56407) Isd_text: 0.50853 (0.56407) Contrastive_loss: 0.19753 (0.21166) Loss: 0.19753 (0.21166) +2025-04-27,03:57:19 | INFO | Start epoch 8 +2025-04-27,03:58:04 | INFO | Train Epoch: 8 [ 8192/917504 (1%)] Data (t): 36.493 Batch (t): 44.842, 182.684/s, 91.3421/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.394 (30.394) Imm_text: 30.394 (30.394) Isd_image: 0.56205 (0.56205) Isd_text: 0.56205 (0.56205) Contrastive_loss: 0.19877 (0.19877) Loss: 0.19877 (0.19877) +2025-04-27,04:13:05 | INFO | Train Epoch: 8 [827392/917504 (90%)] Data (t): 1.001 Batch (t): 9.007, 919.521/s, 459.760/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.305 (30.350) Imm_text: 30.305 (30.350) Isd_image: 0.50703 (0.53454) Isd_text: 0.50703 (0.53454) Contrastive_loss: 0.23204 (0.21540) Loss: 0.23204 (0.21540) +2025-04-27,04:14:43 | INFO | Train Epoch: 8 [917504/917504 (100%)] Data (t): 0.940 Batch (t): 8.918, 914.019/s, 457.010/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.433 (30.377) Imm_text: 30.433 (30.377) Isd_image: 0.56413 (0.54440) Isd_text: 0.56413 (0.54440) Contrastive_loss: 0.18968 (0.20683) Loss: 0.18968 (0.20683) +2025-04-27,04:14:43 | INFO | Start epoch 9 +2025-04-27,04:15:26 | INFO | Train Epoch: 9 [ 8192/917504 (1%)] Data (t): 34.886 Batch (t): 43.055, 190.267/s, 95.1333/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.423 (30.423) Imm_text: 30.423 (30.423) Isd_image: 0.50569 (0.50569) Isd_text: 0.50569 (0.50569) Contrastive_loss: 0.20082 (0.20082) Loss: 0.20082 (0.20082) +2025-04-27,04:30:24 | INFO | Train Epoch: 9 [827392/917504 (90%)] Data (t): 0.980 Batch (t): 8.984, 919.065/s, 459.533/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.383 (30.403) Imm_text: 30.383 (30.403) Isd_image: 0.52638 (0.51604) Isd_text: 0.52638 (0.51604) Contrastive_loss: 0.22643 (0.21362) Loss: 0.22643 (0.21362) +2025-04-27,04:32:02 | INFO | Train Epoch: 9 [917504/917504 (100%)] Data (t): 0.931 Batch (t): 8.910, 918.487/s, 459.244/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.408 (30.404) Imm_text: 30.408 (30.404) Isd_image: 0.47812 (0.50340) Isd_text: 0.47812 (0.50340) Contrastive_loss: 0.19293 (0.20673) Loss: 0.19293 (0.20673) diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/params.txt b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..aeb1fce6f322c87ccf6bf24546078b4c3dc6f912 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/params.txt @@ -0,0 +1,103 @@ +accum_freq: 2 +aug_cfg: {} +batch_size: 2048 +beta1: 0.9 +beta2: 0.98 +cache_dir: None +caption_ratio: 0.1 +checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints +coca_caption_loss_weight: 2.0 +coca_contrastive_loss_weight: 1.0 +copy_codebase: False +csv_caption_key: title +csv_img_key: filepath +csv_separator: +dataset_resampled: False +dataset_type: synthetic +ddp_static_graph: False +debug: False +delete_previous_checkpoint: False +device: cuda:0 +dist_backend: None +dist_url: None +distill: False +distill_model: None +distill_pretrained: None +distributed: True +epochs: 10 +epochs_cooldown: None +eps: 1e-08 +force_custom_text: False +force_image_size: None +force_patch_dropout: None +force_quick_gelu: False +gather_with_grad: True +grad_checkpointing: True +grad_clip_norm: None +horovod: False +image_interpolation: None +image_mean: None +image_resize_mode: None +image_std: None +imagenet_v2: None +imagenet_val: None +keep_func_name: keep_random +local_loss: False +local_rank: 0 +lock_image: False +lock_image_freeze_bn_stats: False +lock_image_unlocked_groups: 0 +lock_text: True +lock_text_freeze_layer_norm: False +lock_text_unlocked_layers: 0 +log_every_n_steps: 100 +log_level: 20 +log_local: False +log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/out.log +logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text +loss_dist_impl: None +lr: 4e-05 +lr_cooldown_end: 0.0 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +map_func_name: use_all +model: ViT-B-16 +momentum: None +name: keep_random +no_set_device_rank: False +opt: adamw +precision: amp +pretrained: datacomp_xl_s13b_b90k +pretrained_image: False +rank: 0 +remote_sync: None +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: tensorboard,wandb +resume: None +save_frequency: 10 +save_most_recent: False +seed: 0 +siglip: False +skip_scheduler: False +tensorboard: True +tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/tensorboard +torchcompile: False +torchscript: False +trace: False +train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar +train_data_upsampling_factors: None +train_num_samples: 9011874 +use_bn_sync: False +use_bnb_linear: None +val_data: None +val_frequency: 1 +val_num_samples: None +wandb: True +wandb_notes: +wandb_project_name: open-clip +warmup: 110 +wd: 0.5 +workers: 16 +world_size: 2 +zeroshot_frequency: 2 diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..524685b9c6da651c3ac876f1c8b8f9b928c35c03 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "caltech101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8618766404199475, "acc5": 0.9681758530183727, "mean_per_class_recall": 0.925570681349594}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..9aed8e0207020d6356286ef7e0ec76a679e0255f --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "cars", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8362144011938814, "acc5": 0.9920407909463997, "mean_per_class_recall": 0.8370035365963817}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..741f24300cd1bb16c0038dab3441c4035bb20033 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "cifar100", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7489, "acc5": 0.9325, "mean_per_class_recall": 0.7489}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..2bce76e243a17840a2e6b96a75c7f25faa6840bc --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "cifar10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.9405, "acc5": 0.9984, "mean_per_class_recall": 0.9405000000000001}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..7c73c1b0082d87bac3d09f34303c39948ef7f0e4 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "country211", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.1942654028436019, "acc5": 0.42255924170616116, "mean_per_class_recall": 0.19436018957345974}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..8dc48ad9decb3d072d08f456689cd928159463f8 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "dtd", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5537234042553192, "acc5": 0.8122340425531915, "mean_per_class_recall": 0.5537234042553191}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..0c6041ef290c3d83d5ba6f1ec49e447bd5cc5654 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "eurosat", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.48733333333333334, "acc5": 0.8834074074074074, "mean_per_class_recall": 0.48750666666666664}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..90d7b7588e27ab5bf93e81fff7cd9d53104ffce0 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "fgvc_aircraft", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.24152415241524153, "acc5": 0.5781578157815782, "mean_per_class_recall": 0.2407130124777184}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json new file mode 100644 index 0000000000000000000000000000000000000000..5d883ee9ab8328a6d10dc2cc62ab10cfd0037386 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json @@ -0,0 +1 @@ +{"dataset": "flickr30k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.652999997138977, "text_retrieval_recall@1": 0.8100000023841858, "image_retrieval_recall@5": 0.8772000074386597, "text_retrieval_recall@5": 0.9539999961853027, "image_retrieval_recall@10": 0.921999990940094, "text_retrieval_recall@10": 0.9779999852180481}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..7690088dbc14389f8ec831c3154905ae1f788ae9 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "flowers", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7280858676207513, "acc5": 0.9017726459586924, "mean_per_class_recall": 0.7367525532935172}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..ce9684c219d33a929694f60b33bd2c5c9b5e9609 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "food101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8707722772277228, "acc5": 0.9812277227722772, "mean_per_class_recall": 0.8707326732673267}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..d2a5878389d0256e67f22b77156f7d89f56f8925 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "gtsrb", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5553444180522565, "acc5": 0.7724465558194774, "mean_per_class_recall": 0.5145985462408618}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..57569575aaab4bf69bf09dd5efc77918f325e039 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "imagenet1k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6846, "acc5": 0.90768, "mean_per_class_recall": 0.6846}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json new file mode 100644 index 0000000000000000000000000000000000000000..aa38018b1dd4585ad676ba94f66ca2324bc97e7b --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json @@ -0,0 +1 @@ +{"dataset": "mscoco_captions", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.384246289730072, "text_retrieval_recall@1": 0.5473999977111816, "image_retrieval_recall@5": 0.6422231197357178, "text_retrieval_recall@5": 0.7784000039100647, "image_retrieval_recall@10": 0.7440223693847656, "text_retrieval_recall@10": 0.8586000204086304}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..37109e1ee27037532f2df2cfd3744fbf7e546290 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "pets", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8975197601526301, "acc5": 0.9964568002180431, "mean_per_class_recall": 0.8970007980600225}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..61e146672cd4949888f811d7bf3d06b9f214476e --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "stl10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.97225, "acc5": 0.9995, "mean_per_class_recall": 0.9721250000000001}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..245c993eeccba2da4a7c9bb5930b2697caed0d79 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "sun397", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6729039851407764, "acc5": 0.9277635765121283, "mean_per_class_recall": 0.6608765022275734}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..1aaca95834e598d10b7cf1df6f30dbf26820f196 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "vtab/resisc45", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.577936507936508, "acc5": 0.8958730158730158, "mean_per_class_recall": 0.5856189977237182}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/out.log b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/out.log new file mode 100644 index 0000000000000000000000000000000000000000..255ac029e8e97f29612bc09162240f679b10858a --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/out.log @@ -0,0 +1,195 @@ +2025-04-26,12:38:35 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2. +2025-04-26,12:38:35 | INFO | Loaded ViT-B-16 model config. +2025-04-26,12:38:36 | INFO | Loading pretrained ViT-B-16 weights (datacomp_xl_s13b_b90k). +2025-04-26,12:38:37 | INFO | Model: +2025-04-26,12:38:37 | INFO | CLIP( + (visual): VisionTransformer( + (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False) + (patch_dropout): Identity() + (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (transformer): Transformer( + (resblocks): ModuleList( + (0-11): 12 x ResidualAttentionBlock( + (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (attn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) + ) + (ls_1): Identity() + (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): Sequential( + (c_fc): Linear(in_features=768, out_features=3072, bias=True) + (gelu): GELU(approximate='none') + (c_proj): Linear(in_features=3072, out_features=768, bias=True) + ) + (ls_2): Identity() + ) + ) + ) + (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (transformer): Transformer( + (resblocks): ModuleList( + (0-11): 12 x ResidualAttentionBlock( + (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) + (attn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) + ) + (ls_1): Identity() + (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) + (mlp): Sequential( + (c_fc): Linear(in_features=512, out_features=2048, bias=True) + (gelu): GELU(approximate='none') + (c_proj): Linear(in_features=2048, out_features=512, bias=True) + ) + (ls_2): Identity() + ) + ) + ) + (token_embedding): Embedding(49408, 512) + (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True) +) +2025-04-26,12:38:37 | INFO | Params: +2025-04-26,12:38:37 | INFO | accum_freq: 2 +2025-04-26,12:38:37 | INFO | aug_cfg: {} +2025-04-26,12:38:37 | INFO | batch_size: 2048 +2025-04-26,12:38:37 | INFO | beta1: 0.9 +2025-04-26,12:38:37 | INFO | beta2: 0.98 +2025-04-26,12:38:37 | INFO | cache_dir: None +2025-04-26,12:38:37 | INFO | caption_ratio: 0.1 +2025-04-26,12:38:37 | INFO | checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints +2025-04-26,12:38:37 | INFO | coca_caption_loss_weight: 2.0 +2025-04-26,12:38:37 | INFO | coca_contrastive_loss_weight: 1.0 +2025-04-26,12:38:37 | INFO | copy_codebase: False +2025-04-26,12:38:37 | INFO | csv_caption_key: title +2025-04-26,12:38:37 | INFO | csv_img_key: filepath +2025-04-26,12:38:37 | INFO | csv_separator: +2025-04-26,12:38:37 | INFO | dataset_resampled: False +2025-04-26,12:38:37 | INFO | dataset_type: synthetic +2025-04-26,12:38:37 | INFO | ddp_static_graph: False +2025-04-26,12:38:37 | INFO | debug: False +2025-04-26,12:38:37 | INFO | delete_previous_checkpoint: False +2025-04-26,12:38:37 | INFO | device: cuda:0 +2025-04-26,12:38:37 | INFO | dist_backend: None +2025-04-26,12:38:37 | INFO | dist_url: None +2025-04-26,12:38:37 | INFO | distill: False +2025-04-26,12:38:37 | INFO | distill_model: None +2025-04-26,12:38:37 | INFO | distill_pretrained: None +2025-04-26,12:38:37 | INFO | distributed: True +2025-04-26,12:38:37 | INFO | epochs: 10 +2025-04-26,12:38:37 | INFO | epochs_cooldown: None +2025-04-26,12:38:37 | INFO | eps: 1e-08 +2025-04-26,12:38:37 | INFO | force_custom_text: False +2025-04-26,12:38:37 | INFO | force_image_size: None +2025-04-26,12:38:37 | INFO | force_patch_dropout: None +2025-04-26,12:38:37 | INFO | force_quick_gelu: False +2025-04-26,12:38:37 | INFO | gather_with_grad: True +2025-04-26,12:38:37 | INFO | grad_checkpointing: True +2025-04-26,12:38:37 | INFO | grad_clip_norm: None +2025-04-26,12:38:37 | INFO | horovod: False +2025-04-26,12:38:37 | INFO | image_interpolation: None +2025-04-26,12:38:37 | INFO | image_mean: None +2025-04-26,12:38:37 | INFO | image_resize_mode: None +2025-04-26,12:38:37 | INFO | image_std: None +2025-04-26,12:38:37 | INFO | imagenet_v2: None +2025-04-26,12:38:37 | INFO | imagenet_val: None +2025-04-26,12:38:37 | INFO | keep_func_name: keep_text_closest_image_closest +2025-04-26,12:38:37 | INFO | local_loss: False +2025-04-26,12:38:37 | INFO | local_rank: 0 +2025-04-26,12:38:37 | INFO | lock_image: False +2025-04-26,12:38:37 | INFO | lock_image_freeze_bn_stats: False +2025-04-26,12:38:37 | INFO | lock_image_unlocked_groups: 0 +2025-04-26,12:38:37 | INFO | lock_text: True +2025-04-26,12:38:37 | INFO | lock_text_freeze_layer_norm: False +2025-04-26,12:38:37 | INFO | lock_text_unlocked_layers: 0 +2025-04-26,12:38:37 | INFO | log_every_n_steps: 100 +2025-04-26,12:38:37 | INFO | log_level: 20 +2025-04-26,12:38:37 | INFO | log_local: False +2025-04-26,12:38:37 | INFO | log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/out.log +2025-04-26,12:38:37 | INFO | logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text +2025-04-26,12:38:37 | INFO | loss_dist_impl: None +2025-04-26,12:38:37 | INFO | lr: 4e-05 +2025-04-26,12:38:37 | INFO | lr_cooldown_end: 0.0 +2025-04-26,12:38:37 | INFO | lr_cooldown_power: 1.0 +2025-04-26,12:38:37 | INFO | lr_scheduler: cosine +2025-04-26,12:38:37 | INFO | map_func_name: use_all +2025-04-26,12:38:37 | INFO | model: ViT-B-16 +2025-04-26,12:38:37 | INFO | momentum: None +2025-04-26,12:38:37 | INFO | name: keep_text_closest_image_closest +2025-04-26,12:38:37 | INFO | no_set_device_rank: False +2025-04-26,12:38:37 | INFO | opt: adamw +2025-04-26,12:38:37 | INFO | precision: amp +2025-04-26,12:38:37 | INFO | pretrained: datacomp_xl_s13b_b90k +2025-04-26,12:38:37 | INFO | pretrained_image: False +2025-04-26,12:38:37 | INFO | rank: 0 +2025-04-26,12:38:37 | INFO | remote_sync: None +2025-04-26,12:38:37 | INFO | remote_sync_frequency: 300 +2025-04-26,12:38:37 | INFO | remote_sync_protocol: s3 +2025-04-26,12:38:37 | INFO | report_to: tensorboard,wandb +2025-04-26,12:38:37 | INFO | resume: None +2025-04-26,12:38:37 | INFO | save_frequency: 10 +2025-04-26,12:38:37 | INFO | save_most_recent: False +2025-04-26,12:38:37 | INFO | seed: 0 +2025-04-26,12:38:37 | INFO | siglip: False +2025-04-26,12:38:37 | INFO | skip_scheduler: False +2025-04-26,12:38:37 | INFO | tensorboard: True +2025-04-26,12:38:37 | INFO | tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/tensorboard +2025-04-26,12:38:37 | INFO | torchcompile: False +2025-04-26,12:38:37 | INFO | torchscript: False +2025-04-26,12:38:37 | INFO | trace: False +2025-04-26,12:38:37 | INFO | train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar +2025-04-26,12:38:37 | INFO | train_data_upsampling_factors: None +2025-04-26,12:38:37 | INFO | train_num_samples: 9011874 +2025-04-26,12:38:37 | INFO | use_bn_sync: False +2025-04-26,12:38:37 | INFO | use_bnb_linear: None +2025-04-26,12:38:37 | INFO | val_data: None +2025-04-26,12:38:37 | INFO | val_frequency: 1 +2025-04-26,12:38:37 | INFO | val_num_samples: None +2025-04-26,12:38:37 | INFO | wandb: True +2025-04-26,12:38:37 | INFO | wandb_notes: +2025-04-26,12:38:37 | INFO | wandb_project_name: open-clip +2025-04-26,12:38:37 | INFO | warmup: 110 +2025-04-26,12:38:37 | INFO | wd: 0.5 +2025-04-26,12:38:37 | INFO | workers: 16 +2025-04-26,12:38:37 | INFO | world_size: 2 +2025-04-26,12:38:37 | INFO | zeroshot_frequency: 2 +2025-04-26,12:38:38 | INFO | Created AdamW (adamw) optimizer: lr: 4e-05, betas: (0.9, 0.98), eps: 1e-08, weight_decay: 0.5, amsgrad: False, foreach: None, maximize: False, capturable: False, differentiable: False, fused: None +2025-04-26,12:39:00 | INFO | Start epoch 0 +2025-04-26,12:40:22 | INFO | Train Epoch: 0 [ 8192/917504 (1%)] Data (t): 72.281 Batch (t): 81.989, 99.9154/s, 49.9577/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 28.671 (28.671) Imm_text: 28.671 (28.671) Isd_image: 3.2045 (3.2045) Isd_text: 3.2045 (3.2045) Contrastive_loss: 1.4190 (1.4190) Loss: 1.4190 (1.4190) +2025-04-26,12:55:55 | INFO | Train Epoch: 0 [827392/917504 (90%)] Data (t): 1.208 Batch (t): 9.333, 906.918/s, 453.459/s/gpu LR: 0.000037 Logit Scale: 99.934 Imm_image: 28.998 (28.834) Imm_text: 28.998 (28.834) Isd_image: 5.7709 (4.4877) Isd_text: 5.7709 (4.4877) Contrastive_loss: 0.72784 (1.0734) Loss: 0.72784 (1.0734) +2025-04-26,12:57:33 | INFO | Train Epoch: 0 [917504/917504 (100%)] Data (t): 0.945 Batch (t): 8.929, 918.534/s, 459.267/s/gpu LR: 0.000040 Logit Scale: 99.925 Imm_image: 29.027 (28.899) Imm_text: 29.027 (28.899) Isd_image: 5.5162 (4.8305) Isd_text: 5.5162 (4.8305) Contrastive_loss: 0.69763 (0.94815) Loss: 0.69763 (0.94815) +2025-04-26,12:57:34 | INFO | Start epoch 1 +2025-04-26,12:58:47 | INFO | Train Epoch: 1 [ 8192/917504 (1%)] Data (t): 64.114 Batch (t): 73.005, 112.212/s, 56.1058/s/gpu LR: 0.000040 Logit Scale: 99.925 Imm_image: 29.101 (29.101) Imm_text: 29.101 (29.101) Isd_image: 5.5721 (5.5721) Isd_text: 5.5721 (5.5721) Contrastive_loss: 0.64322 (0.64322) Loss: 0.64322 (0.64322) +2025-04-26,13:14:32 | INFO | Train Epoch: 1 [827392/917504 (90%)] Data (t): 1.298 Batch (t): 9.455, 903.658/s, 451.829/s/gpu LR: 0.000039 Logit Scale: 99.883 Imm_image: 28.957 (29.029) Imm_text: 28.957 (29.029) Isd_image: 4.1823 (4.8772) Isd_text: 4.1823 (4.8772) Contrastive_loss: 0.61476 (0.62899) Loss: 0.61476 (0.62899) +2025-04-26,13:16:11 | INFO | Train Epoch: 1 [917504/917504 (100%)] Data (t): 0.968 Batch (t): 8.957, 911.494/s, 455.747/s/gpu LR: 0.000039 Logit Scale: 99.885 Imm_image: 29.341 (29.133) Imm_text: 29.341 (29.133) Isd_image: 3.7455 (4.5000) Isd_text: 3.7455 (4.5000) Contrastive_loss: 0.48650 (0.58149) Loss: 0.48650 (0.58149) +2025-04-26,13:16:11 | INFO | Start epoch 2 +2025-04-26,13:17:23 | INFO | Train Epoch: 2 [ 8192/917504 (1%)] Data (t): 61.286 Batch (t): 71.709, 114.239/s, 57.1197/s/gpu LR: 0.000039 Logit Scale: 99.887 Imm_image: 29.303 (29.303) Imm_text: 29.303 (29.303) Isd_image: 3.6748 (3.6748) Isd_text: 3.6748 (3.6748) Contrastive_loss: 0.53455 (0.53455) Loss: 0.53455 (0.53455) +2025-04-26,13:33:03 | INFO | Train Epoch: 2 [827392/917504 (90%)] Data (t): 1.273 Batch (t): 9.404, 912.364/s, 456.182/s/gpu LR: 0.000036 Logit Scale: 100.000 Imm_image: 29.434 (29.369) Imm_text: 29.434 (29.369) Isd_image: 2.7623 (3.2186) Isd_text: 2.7623 (3.2186) Contrastive_loss: 0.50407 (0.51931) Loss: 0.50407 (0.51931) +2025-04-26,13:34:41 | INFO | Train Epoch: 2 [917504/917504 (100%)] Data (t): 0.931 Batch (t): 8.909, 919.759/s, 459.880/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.531 (29.423) Imm_text: 29.531 (29.423) Isd_image: 2.3814 (2.9395) Isd_text: 2.3814 (2.9395) Contrastive_loss: 0.40017 (0.47960) Loss: 0.40017 (0.47960) +2025-04-26,13:34:41 | INFO | Start epoch 3 +2025-04-26,13:36:00 | INFO | Train Epoch: 3 [ 8192/917504 (1%)] Data (t): 70.778 Batch (t): 78.815, 103.940/s, 51.9701/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.527 (29.527) Imm_text: 29.527 (29.527) Isd_image: 2.4397 (2.4397) Isd_text: 2.4397 (2.4397) Contrastive_loss: 0.41395 (0.41395) Loss: 0.41395 (0.41395) +2025-04-26,13:51:43 | INFO | Train Epoch: 3 [827392/917504 (90%)] Data (t): 1.281 Batch (t): 9.427, 909.020/s, 454.510/s/gpu LR: 0.000031 Logit Scale: 100.000 Imm_image: 29.546 (29.537) Imm_text: 29.546 (29.537) Isd_image: 1.7331 (2.0864) Isd_text: 1.7331 (2.0864) Contrastive_loss: 0.42032 (0.41714) Loss: 0.42032 (0.41714) +2025-04-26,13:53:21 | INFO | Train Epoch: 3 [917504/917504 (100%)] Data (t): 0.960 Batch (t): 8.946, 916.436/s, 458.218/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.730 (29.601) Imm_text: 29.730 (29.601) Isd_image: 1.6935 (1.9554) Isd_text: 1.6935 (1.9554) Contrastive_loss: 0.36120 (0.39849) Loss: 0.36120 (0.39849) +2025-04-26,13:53:21 | INFO | Start epoch 4 +2025-04-26,13:54:34 | INFO | Train Epoch: 4 [ 8192/917504 (1%)] Data (t): 64.098 Batch (t): 72.528, 112.949/s, 56.4747/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.787 (29.787) Imm_text: 29.787 (29.787) Isd_image: 1.7499 (1.7499) Isd_text: 1.7499 (1.7499) Contrastive_loss: 0.36089 (0.36089) Loss: 0.36089 (0.36089) +2025-04-26,14:10:21 | INFO | Train Epoch: 4 [827392/917504 (90%)] Data (t): 1.307 Batch (t): 9.472, 914.504/s, 457.252/s/gpu LR: 0.000024 Logit Scale: 100.000 Imm_image: 29.817 (29.802) Imm_text: 29.817 (29.802) Isd_image: 1.1292 (1.4396) Isd_text: 1.1292 (1.4396) Contrastive_loss: 0.33858 (0.34973) Loss: 0.33858 (0.34973) +2025-04-26,14:11:59 | INFO | Train Epoch: 4 [917504/917504 (100%)] Data (t): 0.949 Batch (t): 8.931, 913.614/s, 456.807/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.829 (29.811) Imm_text: 29.829 (29.811) Isd_image: 1.1127 (1.3306) Isd_text: 1.1127 (1.3306) Contrastive_loss: 0.26743 (0.32230) Loss: 0.26743 (0.32230) +2025-04-26,14:12:00 | INFO | Start epoch 5 +2025-04-26,14:13:20 | INFO | Train Epoch: 5 [ 8192/917504 (1%)] Data (t): 66.902 Batch (t): 80.486, 101.782/s, 50.8911/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.840 (29.840) Imm_text: 29.840 (29.840) Isd_image: 1.0238 (1.0238) Isd_text: 1.0238 (1.0238) Contrastive_loss: 0.31508 (0.31508) Loss: 0.31508 (0.31508) +2025-04-26,14:29:01 | INFO | Train Epoch: 5 [827392/917504 (90%)] Data (t): 1.269 Batch (t): 9.412, 916.341/s, 458.171/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 30.069 (29.955) Imm_text: 30.069 (29.955) Isd_image: 0.81852 (0.92114) Isd_text: 0.81852 (0.92114) Contrastive_loss: 0.27484 (0.29496) Loss: 0.27484 (0.29496) +2025-04-26,14:30:40 | INFO | Train Epoch: 5 [917504/917504 (100%)] Data (t): 0.954 Batch (t): 8.938, 914.340/s, 457.170/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 30.128 (30.013) Imm_text: 30.128 (30.013) Isd_image: 0.85922 (0.90050) Isd_text: 0.85922 (0.90050) Contrastive_loss: 0.24429 (0.27807) Loss: 0.24429 (0.27807) +2025-04-26,14:30:40 | INFO | Start epoch 6 +2025-04-26,14:31:58 | INFO | Train Epoch: 6 [ 8192/917504 (1%)] Data (t): 69.745 Batch (t): 77.923, 105.129/s, 52.5644/s/gpu LR: 0.000016 Logit Scale: 100.000 Imm_image: 30.089 (30.089) Imm_text: 30.089 (30.089) Isd_image: 0.89953 (0.89953) Isd_text: 0.89953 (0.89953) Contrastive_loss: 0.26197 (0.26197) Loss: 0.26197 (0.26197) +2025-04-26,14:47:43 | INFO | Train Epoch: 6 [827392/917504 (90%)] Data (t): 1.288 Batch (t): 9.449, 906.285/s, 453.142/s/gpu LR: 0.000011 Logit Scale: 100.000 Imm_image: 30.241 (30.165) Imm_text: 30.241 (30.165) Isd_image: 0.59977 (0.74965) Isd_text: 0.59977 (0.74965) Contrastive_loss: 0.24411 (0.25304) Loss: 0.24411 (0.25304) +2025-04-26,14:49:21 | INFO | Train Epoch: 6 [917504/917504 (100%)] Data (t): 0.965 Batch (t): 8.950, 914.437/s, 457.218/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.268 (30.200) Imm_text: 30.268 (30.200) Isd_image: 0.70007 (0.73312) Isd_text: 0.70007 (0.73312) Contrastive_loss: 0.21752 (0.24120) Loss: 0.21752 (0.24120) +2025-04-26,14:49:21 | INFO | Start epoch 7 +2025-04-26,14:50:38 | INFO | Train Epoch: 7 [ 8192/917504 (1%)] Data (t): 67.920 Batch (t): 76.353, 107.292/s, 53.6458/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.308 (30.308) Imm_text: 30.308 (30.308) Isd_image: 0.84575 (0.84575) Isd_text: 0.84575 (0.84575) Contrastive_loss: 0.22420 (0.22420) Loss: 0.22420 (0.22420) +2025-04-26,15:06:22 | INFO | Train Epoch: 7 [827392/917504 (90%)] Data (t): 1.285 Batch (t): 9.441, 915.218/s, 457.609/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.315 (30.311) Imm_text: 30.315 (30.311) Isd_image: 0.56450 (0.70513) Isd_text: 0.56450 (0.70513) Contrastive_loss: 0.22563 (0.22491) Loss: 0.22563 (0.22491) +2025-04-26,15:08:00 | INFO | Train Epoch: 7 [917504/917504 (100%)] Data (t): 0.955 Batch (t): 8.937, 916.960/s, 458.480/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.454 (30.359) Imm_text: 30.454 (30.359) Isd_image: 0.54316 (0.65114) Isd_text: 0.54316 (0.65114) Contrastive_loss: 0.18511 (0.21164) Loss: 0.18511 (0.21164) +2025-04-26,15:08:00 | INFO | Start epoch 8 +2025-04-26,15:09:09 | INFO | Train Epoch: 8 [ 8192/917504 (1%)] Data (t): 60.600 Batch (t): 68.822, 119.032/s, 59.5161/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.370 (30.370) Imm_text: 30.370 (30.370) Isd_image: 0.56477 (0.56477) Isd_text: 0.56477 (0.56477) Contrastive_loss: 0.21653 (0.21653) Loss: 0.21653 (0.21653) +2025-04-26,15:24:57 | INFO | Train Epoch: 8 [827392/917504 (90%)] Data (t): 1.316 Batch (t): 9.476, 909.587/s, 454.793/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.324 (30.347) Imm_text: 30.324 (30.347) Isd_image: 0.53257 (0.54867) Isd_text: 0.53257 (0.54867) Contrastive_loss: 0.22996 (0.22325) Loss: 0.22996 (0.22325) +2025-04-26,15:26:35 | INFO | Train Epoch: 8 [917504/917504 (100%)] Data (t): 0.940 Batch (t): 8.916, 917.854/s, 458.927/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.447 (30.380) Imm_text: 30.447 (30.380) Isd_image: 0.55862 (0.55199) Isd_text: 0.55862 (0.55199) Contrastive_loss: 0.19649 (0.21433) Loss: 0.19649 (0.21433) +2025-04-26,15:26:35 | INFO | Start epoch 9 +2025-04-26,15:27:47 | INFO | Train Epoch: 9 [ 8192/917504 (1%)] Data (t): 64.048 Batch (t): 72.196, 113.469/s, 56.7345/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.532 (30.532) Imm_text: 30.532 (30.532) Isd_image: 0.51891 (0.51891) Isd_text: 0.51891 (0.51891) Contrastive_loss: 0.20214 (0.20214) Loss: 0.20214 (0.20214) +2025-04-26,15:43:31 | INFO | Train Epoch: 9 [827392/917504 (90%)] Data (t): 1.287 Batch (t): 9.438, 907.802/s, 453.901/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.392 (30.462) Imm_text: 30.392 (30.462) Isd_image: 0.61651 (0.56771) Isd_text: 0.61651 (0.56771) Contrastive_loss: 0.21764 (0.20989) Loss: 0.21764 (0.20989) +2025-04-26,15:45:10 | INFO | Train Epoch: 9 [917504/917504 (100%)] Data (t): 0.957 Batch (t): 8.946, 915.633/s, 457.817/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.416 (30.447) Imm_text: 30.416 (30.447) Isd_image: 0.55681 (0.56408) Isd_text: 0.55681 (0.56408) Contrastive_loss: 0.21648 (0.21209) Loss: 0.21648 (0.21209) diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/params.txt b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..5666869893f6307afca95e72131f5ba9fcb978c9 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/params.txt @@ -0,0 +1,103 @@ +accum_freq: 2 +aug_cfg: {} +batch_size: 2048 +beta1: 0.9 +beta2: 0.98 +cache_dir: None +caption_ratio: 0.1 +checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints +coca_caption_loss_weight: 2.0 +coca_contrastive_loss_weight: 1.0 +copy_codebase: False +csv_caption_key: title +csv_img_key: filepath +csv_separator: +dataset_resampled: False +dataset_type: synthetic +ddp_static_graph: False +debug: False +delete_previous_checkpoint: False +device: cuda:0 +dist_backend: None +dist_url: None +distill: False +distill_model: None +distill_pretrained: None +distributed: True +epochs: 10 +epochs_cooldown: None +eps: 1e-08 +force_custom_text: False +force_image_size: None +force_patch_dropout: None +force_quick_gelu: False +gather_with_grad: True +grad_checkpointing: True +grad_clip_norm: None +horovod: False +image_interpolation: None +image_mean: None +image_resize_mode: None +image_std: None +imagenet_v2: None +imagenet_val: None +keep_func_name: keep_text_closest_image_closest +local_loss: False +local_rank: 0 +lock_image: False +lock_image_freeze_bn_stats: False +lock_image_unlocked_groups: 0 +lock_text: True +lock_text_freeze_layer_norm: False +lock_text_unlocked_layers: 0 +log_every_n_steps: 100 +log_level: 20 +log_local: False +log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/out.log +logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text +loss_dist_impl: None +lr: 4e-05 +lr_cooldown_end: 0.0 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +map_func_name: use_all +model: ViT-B-16 +momentum: None +name: keep_text_closest_image_closest +no_set_device_rank: False +opt: adamw +precision: amp +pretrained: datacomp_xl_s13b_b90k +pretrained_image: False +rank: 0 +remote_sync: None +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: tensorboard,wandb +resume: None +save_frequency: 10 +save_most_recent: False +seed: 0 +siglip: False +skip_scheduler: False +tensorboard: True +tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/tensorboard +torchcompile: False +torchscript: False +trace: False +train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar +train_data_upsampling_factors: None +train_num_samples: 9011874 +use_bn_sync: False +use_bnb_linear: None +val_data: None +val_frequency: 1 +val_num_samples: None +wandb: True +wandb_notes: +wandb_project_name: open-clip +warmup: 110 +wd: 0.5 +workers: 16 +world_size: 2 +zeroshot_frequency: 2 diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..cf4b0a29057b1c776e6974437a4d480fc51bd556 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "caltech101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8613298337707787, "acc5": 0.9704724409448819, "mean_per_class_recall": 0.9258855588155612}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..f8fa4ddc1fa31fb253a07a2d1489cc61c3e38739 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "cars", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8357169506280313, "acc5": 0.9885586369854495, "mean_per_class_recall": 0.8352126945246118}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..40e0c3e3a24fbf65918cda9633a246ae205d0438 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "cifar100", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7519, "acc5": 0.936, "mean_per_class_recall": 0.7517999999999999}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..7093cb22a7dd68614717a1bcdbbf51e44aa30bbe --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "cifar10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.9374, "acc5": 0.9977, "mean_per_class_recall": 0.9373999999999999}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4e24acbec65c66796ef4502982e673b427e1d7 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "country211", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.1941706161137441, "acc5": 0.4203791469194313, "mean_per_class_recall": 0.19393364928909954}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..5b09537e11917ab7e8afc208559d7b934ed3fc75 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "dtd", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5595744680851064, "acc5": 0.8207446808510638, "mean_per_class_recall": 0.5590425531914894}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..2d76533e8b5e366dad0d07f7d65b35377c5a8593 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "eurosat", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5228148148148148, "acc5": 0.9313703703703704, "mean_per_class_recall": 0.5255733333333333}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..05d6c0ba251d5992bdf4eb33982cb4a4c51fb422 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "fgvc_aircraft", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.25802580258025803, "acc5": 0.6054605460546054, "mean_per_class_recall": 0.2579144385026738}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json new file mode 100644 index 0000000000000000000000000000000000000000..d76d4499ef7576a51c21cc594d800850f62208aa --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json @@ -0,0 +1 @@ +{"dataset": "flickr30k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6575999855995178, "text_retrieval_recall@1": 0.8180000185966492, "image_retrieval_recall@5": 0.8712000250816345, "text_retrieval_recall@5": 0.9549999833106995, "image_retrieval_recall@10": 0.920199990272522, "text_retrieval_recall@10": 0.9739999771118164}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..cac829caa07916f7e42039ad9fa5af0d88b8b8da --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "flowers", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7384940640754595, "acc5": 0.8990079687754107, "mean_per_class_recall": 0.7369568492674381}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..9d22b617fc8e168a6e8ef668c65c90c22700b830 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "food101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8704554455445545, "acc5": 0.9786534653465346, "mean_per_class_recall": 0.8703762376237623}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..444af3135d683bce1b3e9eb1a61e6b2a13360469 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "gtsrb", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5228028503562946, "acc5": 0.7722882026920032, "mean_per_class_recall": 0.4904283170860522}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..65e3fa358812125f48a404e2a54a7fdd60e2d7c5 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "imagenet1k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.68496, "acc5": 0.90966, "mean_per_class_recall": 0.68498}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json new file mode 100644 index 0000000000000000000000000000000000000000..e1f1c663a8519555e666885fe98ee0c8a66afd7b --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json @@ -0,0 +1 @@ +{"dataset": "mscoco_captions", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.3837265074253082, "text_retrieval_recall@1": 0.5424000024795532, "image_retrieval_recall@5": 0.6377848982810974, "text_retrieval_recall@5": 0.772599995136261, "image_retrieval_recall@10": 0.7367852926254272, "text_retrieval_recall@10": 0.8532000184059143}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..6cb14fc389e5b3701e200629727f180237b9bd5c --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "pets", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8967020986644862, "acc5": 0.9970019078768056, "mean_per_class_recall": 0.8959154999105521}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..15c2380e0b8d33754de9fc063beb6179dd3dbc21 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "stl10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.973125, "acc5": 0.99975, "mean_per_class_recall": 0.97325}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..a54cc49e229d38121d0f65ef34b4d595c2234f21 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "sun397", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6660628574581164, "acc5": 0.9277543814480387, "mean_per_class_recall": 0.6574215875086653}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..5181b404249ab62d6fc5d824b82759d27c35b51a --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "vtab/resisc45", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5852380952380952, "acc5": 0.8898412698412699, "mean_per_class_recall": 0.5932641405860534}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/out.log b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/out.log new file mode 100644 index 0000000000000000000000000000000000000000..64466e73bb1e7a60cdf9db2b7b44144451c120e7 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/out.log @@ -0,0 +1,195 @@ +2025-04-27,14:10:41 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2. +2025-04-27,14:10:41 | INFO | Loaded ViT-B-16 model config. +2025-04-27,14:10:43 | INFO | Loading pretrained ViT-B-16 weights (datacomp_xl_s13b_b90k). +2025-04-27,14:10:43 | INFO | Model: +2025-04-27,14:10:43 | INFO | CLIP( + (visual): VisionTransformer( + (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False) + (patch_dropout): Identity() + (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (transformer): Transformer( + (resblocks): ModuleList( + (0-11): 12 x ResidualAttentionBlock( + (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (attn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) + ) + (ls_1): Identity() + (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): Sequential( + (c_fc): Linear(in_features=768, out_features=3072, bias=True) + (gelu): GELU(approximate='none') + (c_proj): Linear(in_features=3072, out_features=768, bias=True) + ) + (ls_2): Identity() + ) + ) + ) + (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (transformer): Transformer( + (resblocks): ModuleList( + (0-11): 12 x ResidualAttentionBlock( + (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) + (attn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) + ) + (ls_1): Identity() + (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) + (mlp): Sequential( + (c_fc): Linear(in_features=512, out_features=2048, bias=True) + (gelu): GELU(approximate='none') + (c_proj): Linear(in_features=2048, out_features=512, bias=True) + ) + (ls_2): Identity() + ) + ) + ) + (token_embedding): Embedding(49408, 512) + (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True) +) +2025-04-27,14:10:43 | INFO | Params: +2025-04-27,14:10:43 | INFO | accum_freq: 2 +2025-04-27,14:10:43 | INFO | aug_cfg: {} +2025-04-27,14:10:43 | INFO | batch_size: 2048 +2025-04-27,14:10:43 | INFO | beta1: 0.9 +2025-04-27,14:10:43 | INFO | beta2: 0.98 +2025-04-27,14:10:43 | INFO | cache_dir: None +2025-04-27,14:10:43 | INFO | caption_ratio: 0.1 +2025-04-27,14:10:43 | INFO | checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints +2025-04-27,14:10:43 | INFO | coca_caption_loss_weight: 2.0 +2025-04-27,14:10:43 | INFO | coca_contrastive_loss_weight: 1.0 +2025-04-27,14:10:43 | INFO | copy_codebase: False +2025-04-27,14:10:43 | INFO | csv_caption_key: title +2025-04-27,14:10:43 | INFO | csv_img_key: filepath +2025-04-27,14:10:43 | INFO | csv_separator: +2025-04-27,14:10:43 | INFO | dataset_resampled: False +2025-04-27,14:10:43 | INFO | dataset_type: synthetic +2025-04-27,14:10:43 | INFO | ddp_static_graph: False +2025-04-27,14:10:43 | INFO | debug: False +2025-04-27,14:10:43 | INFO | delete_previous_checkpoint: False +2025-04-27,14:10:43 | INFO | device: cuda:0 +2025-04-27,14:10:43 | INFO | dist_backend: None +2025-04-27,14:10:43 | INFO | dist_url: None +2025-04-27,14:10:43 | INFO | distill: False +2025-04-27,14:10:43 | INFO | distill_model: None +2025-04-27,14:10:43 | INFO | distill_pretrained: None +2025-04-27,14:10:43 | INFO | distributed: True +2025-04-27,14:10:43 | INFO | epochs: 10 +2025-04-27,14:10:43 | INFO | epochs_cooldown: None +2025-04-27,14:10:43 | INFO | eps: 1e-08 +2025-04-27,14:10:43 | INFO | force_custom_text: False +2025-04-27,14:10:43 | INFO | force_image_size: None +2025-04-27,14:10:43 | INFO | force_patch_dropout: None +2025-04-27,14:10:43 | INFO | force_quick_gelu: False +2025-04-27,14:10:43 | INFO | gather_with_grad: True +2025-04-27,14:10:43 | INFO | grad_checkpointing: True +2025-04-27,14:10:43 | INFO | grad_clip_norm: None +2025-04-27,14:10:43 | INFO | horovod: False +2025-04-27,14:10:43 | INFO | image_interpolation: None +2025-04-27,14:10:43 | INFO | image_mean: None +2025-04-27,14:10:43 | INFO | image_resize_mode: None +2025-04-27,14:10:43 | INFO | image_std: None +2025-04-27,14:10:43 | INFO | imagenet_v2: None +2025-04-27,14:10:43 | INFO | imagenet_val: None +2025-04-27,14:10:43 | INFO | keep_func_name: keep_text_farest_image_farest +2025-04-27,14:10:43 | INFO | local_loss: False +2025-04-27,14:10:43 | INFO | local_rank: 0 +2025-04-27,14:10:43 | INFO | lock_image: False +2025-04-27,14:10:43 | INFO | lock_image_freeze_bn_stats: False +2025-04-27,14:10:43 | INFO | lock_image_unlocked_groups: 0 +2025-04-27,14:10:43 | INFO | lock_text: True +2025-04-27,14:10:43 | INFO | lock_text_freeze_layer_norm: False +2025-04-27,14:10:43 | INFO | lock_text_unlocked_layers: 0 +2025-04-27,14:10:43 | INFO | log_every_n_steps: 100 +2025-04-27,14:10:43 | INFO | log_level: 20 +2025-04-27,14:10:43 | INFO | log_local: False +2025-04-27,14:10:43 | INFO | log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/out.log +2025-04-27,14:10:43 | INFO | logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text +2025-04-27,14:10:43 | INFO | loss_dist_impl: None +2025-04-27,14:10:43 | INFO | lr: 4e-05 +2025-04-27,14:10:43 | INFO | lr_cooldown_end: 0.0 +2025-04-27,14:10:43 | INFO | lr_cooldown_power: 1.0 +2025-04-27,14:10:43 | INFO | lr_scheduler: cosine +2025-04-27,14:10:43 | INFO | map_func_name: use_all +2025-04-27,14:10:43 | INFO | model: ViT-B-16 +2025-04-27,14:10:43 | INFO | momentum: None +2025-04-27,14:10:43 | INFO | name: keep_text_farest_image_farest +2025-04-27,14:10:43 | INFO | no_set_device_rank: False +2025-04-27,14:10:43 | INFO | opt: adamw +2025-04-27,14:10:43 | INFO | precision: amp +2025-04-27,14:10:43 | INFO | pretrained: datacomp_xl_s13b_b90k +2025-04-27,14:10:43 | INFO | pretrained_image: False +2025-04-27,14:10:43 | INFO | rank: 0 +2025-04-27,14:10:43 | INFO | remote_sync: None +2025-04-27,14:10:43 | INFO | remote_sync_frequency: 300 +2025-04-27,14:10:43 | INFO | remote_sync_protocol: s3 +2025-04-27,14:10:43 | INFO | report_to: tensorboard,wandb +2025-04-27,14:10:43 | INFO | resume: None +2025-04-27,14:10:43 | INFO | save_frequency: 10 +2025-04-27,14:10:43 | INFO | save_most_recent: False +2025-04-27,14:10:43 | INFO | seed: 0 +2025-04-27,14:10:43 | INFO | siglip: False +2025-04-27,14:10:43 | INFO | skip_scheduler: False +2025-04-27,14:10:43 | INFO | tensorboard: True +2025-04-27,14:10:43 | INFO | tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/tensorboard +2025-04-27,14:10:43 | INFO | torchcompile: False +2025-04-27,14:10:43 | INFO | torchscript: False +2025-04-27,14:10:43 | INFO | trace: False +2025-04-27,14:10:43 | INFO | train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar +2025-04-27,14:10:43 | INFO | train_data_upsampling_factors: None +2025-04-27,14:10:43 | INFO | train_num_samples: 9011874 +2025-04-27,14:10:43 | INFO | use_bn_sync: False +2025-04-27,14:10:43 | INFO | use_bnb_linear: None +2025-04-27,14:10:43 | INFO | val_data: None +2025-04-27,14:10:43 | INFO | val_frequency: 1 +2025-04-27,14:10:43 | INFO | val_num_samples: None +2025-04-27,14:10:43 | INFO | wandb: True +2025-04-27,14:10:43 | INFO | wandb_notes: +2025-04-27,14:10:43 | INFO | wandb_project_name: open-clip +2025-04-27,14:10:43 | INFO | warmup: 110 +2025-04-27,14:10:43 | INFO | wd: 0.5 +2025-04-27,14:10:43 | INFO | workers: 16 +2025-04-27,14:10:43 | INFO | world_size: 2 +2025-04-27,14:10:43 | INFO | zeroshot_frequency: 2 +2025-04-27,14:10:44 | INFO | Created AdamW (adamw) optimizer: lr: 4e-05, betas: (0.9, 0.98), eps: 1e-08, weight_decay: 0.5, amsgrad: False, foreach: None, maximize: False, capturable: False, differentiable: False, fused: None +2025-04-27,14:11:06 | INFO | Start epoch 0 +2025-04-27,14:12:29 | INFO | Train Epoch: 0 [ 8192/917504 (1%)] Data (t): 73.574 Batch (t): 83.453, 98.1633/s, 49.0816/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 28.493 (28.493) Imm_text: 28.493 (28.493) Isd_image: 3.2680 (3.2680) Isd_text: 3.2680 (3.2680) Contrastive_loss: 1.4798 (1.4798) Loss: 1.4798 (1.4798) +2025-04-27,14:28:09 | INFO | Train Epoch: 0 [827392/917504 (90%)] Data (t): 1.261 Batch (t): 9.398, 906.234/s, 453.117/s/gpu LR: 0.000037 Logit Scale: 99.933 Imm_image: 28.917 (28.705) Imm_text: 28.917 (28.705) Isd_image: 5.8390 (4.5535) Isd_text: 5.8390 (4.5535) Contrastive_loss: 0.74036 (1.1101) Loss: 0.74036 (1.1101) +2025-04-27,14:29:47 | INFO | Train Epoch: 0 [917504/917504 (100%)] Data (t): 0.960 Batch (t): 8.946, 916.148/s, 458.074/s/gpu LR: 0.000040 Logit Scale: 99.924 Imm_image: 29.020 (28.810) Imm_text: 29.020 (28.810) Isd_image: 5.7111 (4.9394) Isd_text: 5.7111 (4.9394) Contrastive_loss: 0.66954 (0.96322) Loss: 0.66954 (0.96322) +2025-04-27,14:29:48 | INFO | Start epoch 1 +2025-04-27,14:31:02 | INFO | Train Epoch: 1 [ 8192/917504 (1%)] Data (t): 65.003 Batch (t): 74.908, 109.361/s, 54.6804/s/gpu LR: 0.000040 Logit Scale: 99.923 Imm_image: 28.999 (28.999) Imm_text: 28.999 (28.999) Isd_image: 5.7604 (5.7604) Isd_text: 5.7604 (5.7604) Contrastive_loss: 0.63022 (0.63022) Loss: 0.63022 (0.63022) +2025-04-27,14:46:50 | INFO | Train Epoch: 1 [827392/917504 (90%)] Data (t): 1.323 Batch (t): 9.478, 904.316/s, 452.158/s/gpu LR: 0.000039 Logit Scale: 99.879 Imm_image: 29.042 (29.020) Imm_text: 29.042 (29.020) Isd_image: 4.1819 (4.9712) Isd_text: 4.1819 (4.9712) Contrastive_loss: 0.63138 (0.63080) Loss: 0.63138 (0.63080) +2025-04-27,14:48:29 | INFO | Train Epoch: 1 [917504/917504 (100%)] Data (t): 0.974 Batch (t): 8.966, 914.640/s, 457.320/s/gpu LR: 0.000039 Logit Scale: 99.880 Imm_image: 29.415 (29.152) Imm_text: 29.415 (29.152) Isd_image: 3.7846 (4.5757) Isd_text: 3.7846 (4.5757) Contrastive_loss: 0.49720 (0.58627) Loss: 0.49720 (0.58627) +2025-04-27,14:48:29 | INFO | Start epoch 2 +2025-04-27,14:49:48 | INFO | Train Epoch: 2 [ 8192/917504 (1%)] Data (t): 67.665 Batch (t): 79.290, 103.317/s, 51.6584/s/gpu LR: 0.000039 Logit Scale: 99.881 Imm_image: 29.362 (29.362) Imm_text: 29.362 (29.362) Isd_image: 3.7103 (3.7103) Isd_text: 3.7103 (3.7103) Contrastive_loss: 0.49970 (0.49970) Loss: 0.49970 (0.49970) +2025-04-27,15:05:31 | INFO | Train Epoch: 2 [827392/917504 (90%)] Data (t): 1.286 Batch (t): 9.430, 910.998/s, 455.499/s/gpu LR: 0.000036 Logit Scale: 100.000 Imm_image: 29.406 (29.384) Imm_text: 29.406 (29.384) Isd_image: 2.7587 (3.2345) Isd_text: 2.7587 (3.2345) Contrastive_loss: 0.49095 (0.49533) Loss: 0.49095 (0.49533) +2025-04-27,15:07:10 | INFO | Train Epoch: 2 [917504/917504 (100%)] Data (t): 0.959 Batch (t): 8.946, 915.745/s, 457.872/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.600 (29.456) Imm_text: 29.600 (29.456) Isd_image: 2.6386 (3.0359) Isd_text: 2.6386 (3.0359) Contrastive_loss: 0.41630 (0.46898) Loss: 0.41630 (0.46898) +2025-04-27,15:07:10 | INFO | Start epoch 3 +2025-04-27,15:08:34 | INFO | Train Epoch: 3 [ 8192/917504 (1%)] Data (t): 75.308 Batch (t): 83.463, 98.1507/s, 49.0754/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.487 (29.487) Imm_text: 29.487 (29.487) Isd_image: 2.6624 (2.6624) Isd_text: 2.6624 (2.6624) Contrastive_loss: 0.40202 (0.40202) Loss: 0.40202 (0.40202) +2025-04-27,15:24:21 | INFO | Train Epoch: 3 [827392/917504 (90%)] Data (t): 1.309 Batch (t): 9.471, 891.132/s, 445.566/s/gpu LR: 0.000031 Logit Scale: 100.000 Imm_image: 29.683 (29.585) Imm_text: 29.683 (29.585) Isd_image: 1.6883 (2.1753) Isd_text: 1.6883 (2.1753) Contrastive_loss: 0.39447 (0.39825) Loss: 0.39447 (0.39825) +2025-04-27,15:25:58 | INFO | Train Epoch: 3 [917504/917504 (100%)] Data (t): 0.916 Batch (t): 8.888, 922.127/s, 461.064/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.677 (29.616) Imm_text: 29.677 (29.616) Isd_image: 1.5397 (1.9635) Isd_text: 1.5397 (1.9635) Contrastive_loss: 0.33807 (0.37819) Loss: 0.33807 (0.37819) +2025-04-27,15:25:59 | INFO | Start epoch 4 +2025-04-27,15:27:14 | INFO | Train Epoch: 4 [ 8192/917504 (1%)] Data (t): 67.313 Batch (t): 75.469, 108.548/s, 54.2740/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.732 (29.732) Imm_text: 29.732 (29.732) Isd_image: 1.5992 (1.5992) Isd_text: 1.5992 (1.5992) Contrastive_loss: 0.36937 (0.36937) Loss: 0.36937 (0.36937) +2025-04-27,15:42:56 | INFO | Train Epoch: 4 [827392/917504 (90%)] Data (t): 1.263 Batch (t): 9.418, 887.250/s, 443.625/s/gpu LR: 0.000024 Logit Scale: 100.000 Imm_image: 29.832 (29.782) Imm_text: 29.832 (29.782) Isd_image: 1.0603 (1.3298) Isd_text: 1.0603 (1.3298) Contrastive_loss: 0.35375 (0.36156) Loss: 0.35375 (0.36156) +2025-04-27,15:44:34 | INFO | Train Epoch: 4 [917504/917504 (100%)] Data (t): 0.958 Batch (t): 8.944, 917.607/s, 458.804/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.912 (29.825) Imm_text: 29.912 (29.825) Isd_image: 1.2332 (1.2976) Isd_text: 1.2332 (1.2976) Contrastive_loss: 0.30779 (0.34364) Loss: 0.30779 (0.34364) +2025-04-27,15:44:35 | INFO | Start epoch 5 +2025-04-27,15:45:50 | INFO | Train Epoch: 5 [ 8192/917504 (1%)] Data (t): 66.719 Batch (t): 75.066, 109.131/s, 54.5655/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.906 (29.906) Imm_text: 29.906 (29.906) Isd_image: 1.2436 (1.2436) Isd_text: 1.2436 (1.2436) Contrastive_loss: 0.31235 (0.31235) Loss: 0.31235 (0.31235) +2025-04-27,16:01:33 | INFO | Train Epoch: 5 [827392/917504 (90%)] Data (t): 1.294 Batch (t): 9.438, 910.314/s, 455.157/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 30.076 (29.991) Imm_text: 30.076 (29.991) Isd_image: 0.80181 (1.0227) Isd_text: 0.80181 (1.0227) Contrastive_loss: 0.26279 (0.28757) Loss: 0.26279 (0.28757) +2025-04-27,16:03:12 | INFO | Train Epoch: 5 [917504/917504 (100%)] Data (t): 0.961 Batch (t): 8.947, 913.622/s, 456.811/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 30.112 (30.031) Imm_text: 30.112 (30.031) Isd_image: 0.77558 (0.94034) Isd_text: 0.77558 (0.94034) Contrastive_loss: 0.24435 (0.27316) Loss: 0.24435 (0.27316) +2025-04-27,16:03:12 | INFO | Start epoch 6 +2025-04-27,16:04:21 | INFO | Train Epoch: 6 [ 8192/917504 (1%)] Data (t): 61.018 Batch (t): 69.241, 118.312/s, 59.1561/s/gpu LR: 0.000016 Logit Scale: 100.000 Imm_image: 30.126 (30.126) Imm_text: 30.126 (30.126) Isd_image: 0.82938 (0.82938) Isd_text: 0.82938 (0.82938) Contrastive_loss: 0.24072 (0.24072) Loss: 0.24072 (0.24072) +2025-04-27,16:20:07 | INFO | Train Epoch: 6 [827392/917504 (90%)] Data (t): 1.307 Batch (t): 9.455, 904.537/s, 452.268/s/gpu LR: 0.000011 Logit Scale: 100.000 Imm_image: 30.169 (30.147) Imm_text: 30.169 (30.147) Isd_image: 0.75930 (0.79434) Isd_text: 0.75930 (0.79434) Contrastive_loss: 0.22887 (0.23480) Loss: 0.22887 (0.23480) +2025-04-27,16:21:45 | INFO | Train Epoch: 6 [917504/917504 (100%)] Data (t): 0.978 Batch (t): 8.970, 913.069/s, 456.535/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.280 (30.192) Imm_text: 30.280 (30.192) Isd_image: 0.61379 (0.73416) Isd_text: 0.61379 (0.73416) Contrastive_loss: 0.20299 (0.22419) Loss: 0.20299 (0.22419) +2025-04-27,16:21:46 | INFO | Start epoch 7 +2025-04-27,16:23:00 | INFO | Train Epoch: 7 [ 8192/917504 (1%)] Data (t): 65.399 Batch (t): 74.159, 110.465/s, 55.2326/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.246 (30.246) Imm_text: 30.246 (30.246) Isd_image: 0.63558 (0.63558) Isd_text: 0.63558 (0.63558) Contrastive_loss: 0.23207 (0.23207) Loss: 0.23207 (0.23207) +2025-04-27,16:38:45 | INFO | Train Epoch: 7 [827392/917504 (90%)] Data (t): 1.312 Batch (t): 9.454, 903.345/s, 451.673/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.382 (30.314) Imm_text: 30.382 (30.314) Isd_image: 0.57732 (0.60645) Isd_text: 0.57732 (0.60645) Contrastive_loss: 0.24257 (0.23732) Loss: 0.24257 (0.23732) +2025-04-27,16:40:24 | INFO | Train Epoch: 7 [917504/917504 (100%)] Data (t): 0.967 Batch (t): 8.956, 914.599/s, 457.300/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.388 (30.339) Imm_text: 30.388 (30.339) Isd_image: 0.62119 (0.61136) Isd_text: 0.62119 (0.61136) Contrastive_loss: 0.20538 (0.22667) Loss: 0.20538 (0.22667) +2025-04-27,16:40:24 | INFO | Start epoch 8 +2025-04-27,16:41:41 | INFO | Train Epoch: 8 [ 8192/917504 (1%)] Data (t): 68.560 Batch (t): 76.671, 106.847/s, 53.4233/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.292 (30.292) Imm_text: 30.292 (30.292) Isd_image: 0.62705 (0.62705) Isd_text: 0.62705 (0.62705) Contrastive_loss: 0.22492 (0.22492) Loss: 0.22492 (0.22492) +2025-04-27,16:57:24 | INFO | Train Epoch: 8 [827392/917504 (90%)] Data (t): 1.272 Batch (t): 9.430, 907.430/s, 453.715/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.287 (30.289) Imm_text: 30.287 (30.289) Isd_image: 0.73358 (0.68032) Isd_text: 0.73358 (0.68032) Contrastive_loss: 0.23181 (0.22837) Loss: 0.23181 (0.22837) +2025-04-27,16:59:02 | INFO | Train Epoch: 8 [917504/917504 (100%)] Data (t): 0.965 Batch (t): 8.953, 916.282/s, 458.141/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.380 (30.320) Imm_text: 30.380 (30.320) Isd_image: 0.63722 (0.66595) Isd_text: 0.63722 (0.66595) Contrastive_loss: 0.21031 (0.22235) Loss: 0.21031 (0.22235) +2025-04-27,16:59:02 | INFO | Start epoch 9 +2025-04-27,17:00:13 | INFO | Train Epoch: 9 [ 8192/917504 (1%)] Data (t): 62.650 Batch (t): 70.931, 115.493/s, 57.7463/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.446 (30.446) Imm_text: 30.446 (30.446) Isd_image: 0.56550 (0.56550) Isd_text: 0.56550 (0.56550) Contrastive_loss: 0.19375 (0.19375) Loss: 0.19375 (0.19375) +2025-04-27,17:15:58 | INFO | Train Epoch: 9 [827392/917504 (90%)] Data (t): 1.296 Batch (t): 9.448, 899.411/s, 449.705/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.356 (30.401) Imm_text: 30.356 (30.401) Isd_image: 0.50091 (0.53320) Isd_text: 0.50091 (0.53320) Contrastive_loss: 0.22096 (0.20735) Loss: 0.22096 (0.20735) +2025-04-27,17:17:37 | INFO | Train Epoch: 9 [917504/917504 (100%)] Data (t): 0.967 Batch (t): 8.951, 914.908/s, 457.454/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.451 (30.418) Imm_text: 30.451 (30.418) Isd_image: 0.52989 (0.53210) Isd_text: 0.52989 (0.53210) Contrastive_loss: 0.19882 (0.20451) Loss: 0.19882 (0.20451) diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/params.txt b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..22a44908e1f2d75b3510a8bf19bffcd4d5858c79 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/params.txt @@ -0,0 +1,103 @@ +accum_freq: 2 +aug_cfg: {} +batch_size: 2048 +beta1: 0.9 +beta2: 0.98 +cache_dir: None +caption_ratio: 0.1 +checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/checkpoints +coca_caption_loss_weight: 2.0 +coca_contrastive_loss_weight: 1.0 +copy_codebase: False +csv_caption_key: title +csv_img_key: filepath +csv_separator: +dataset_resampled: False +dataset_type: synthetic +ddp_static_graph: False +debug: False +delete_previous_checkpoint: False +device: cuda:0 +dist_backend: None +dist_url: None +distill: False +distill_model: None +distill_pretrained: None +distributed: True +epochs: 10 +epochs_cooldown: None +eps: 1e-08 +force_custom_text: False +force_image_size: None +force_patch_dropout: None +force_quick_gelu: False +gather_with_grad: True +grad_checkpointing: True +grad_clip_norm: None +horovod: False +image_interpolation: None +image_mean: None +image_resize_mode: None +image_std: None +imagenet_v2: None +imagenet_val: None +keep_func_name: keep_text_farest_image_farest +local_loss: False +local_rank: 0 +lock_image: False +lock_image_freeze_bn_stats: False +lock_image_unlocked_groups: 0 +lock_text: True +lock_text_freeze_layer_norm: False +lock_text_unlocked_layers: 0 +log_every_n_steps: 100 +log_level: 20 +log_local: False +log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/out.log +logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text +loss_dist_impl: None +lr: 4e-05 +lr_cooldown_end: 0.0 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +map_func_name: use_all +model: ViT-B-16 +momentum: None +name: keep_text_farest_image_farest +no_set_device_rank: False +opt: adamw +precision: amp +pretrained: datacomp_xl_s13b_b90k +pretrained_image: False +rank: 0 +remote_sync: None +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: tensorboard,wandb +resume: None +save_frequency: 10 +save_most_recent: False +seed: 0 +siglip: False +skip_scheduler: False +tensorboard: True +tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/tensorboard +torchcompile: False +torchscript: False +trace: False +train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar +train_data_upsampling_factors: None +train_num_samples: 9011874 +use_bn_sync: False +use_bnb_linear: None +val_data: None +val_frequency: 1 +val_num_samples: None +wandb: True +wandb_notes: +wandb_project_name: open-clip +warmup: 110 +wd: 0.5 +workers: 16 +world_size: 2 +zeroshot_frequency: 2 diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..c4b268727efa11b2453215c218082a2c30f34772 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json @@ -0,0 +1 @@ +{"dataset": "fgvc_aircraft", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.2589258925892589, "acc5": 0.6201620162016201, "mean_per_class_recall": 0.25701426024955437}, "language": "en"} \ No newline at end of file diff --git a/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/params.txt b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..9804b08e919f82b7e66e9d0c2879c8b3c4a366e6 --- /dev/null +++ b/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/params.txt @@ -0,0 +1,103 @@ +accum_freq: 2 +aug_cfg: {} +batch_size: 2048 +beta1: 0.9 +beta2: 0.98 +cache_dir: None +caption_ratio: 0.1 +checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints +coca_caption_loss_weight: 2.0 +coca_contrastive_loss_weight: 1.0 +copy_codebase: False +csv_caption_key: title +csv_img_key: filepath +csv_separator: +dataset_resampled: False +dataset_type: synthetic +ddp_static_graph: False +debug: False +delete_previous_checkpoint: False +device: cuda:0 +dist_backend: None +dist_url: None +distill: False +distill_model: None +distill_pretrained: None +distributed: True +epochs: 10 +epochs_cooldown: None +eps: 1e-08 +force_custom_text: False +force_image_size: None +force_patch_dropout: None +force_quick_gelu: False +gather_with_grad: True +grad_checkpointing: True +grad_clip_norm: None +horovod: False +image_interpolation: None +image_mean: None +image_resize_mode: None +image_std: None +imagenet_v2: None +imagenet_val: None +keep_func_name: low_inter_only +local_loss: False +local_rank: 0 +lock_image: False +lock_image_freeze_bn_stats: False +lock_image_unlocked_groups: 0 +lock_text: True +lock_text_freeze_layer_norm: False +lock_text_unlocked_layers: 0 +log_every_n_steps: 100 +log_level: 20 +log_local: False +log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/out.log +logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text +loss_dist_impl: None +lr: 4e-05 +lr_cooldown_end: 0.0 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +map_func_name: use_all +model: ViT-B-16 +momentum: None +name: low_inter_only +no_set_device_rank: False +opt: adamw +precision: amp +pretrained: datacomp_xl_s13b_b90k +pretrained_image: False +rank: 0 +remote_sync: None +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: tensorboard,wandb +resume: None +save_frequency: 10 +save_most_recent: False +seed: 0 +siglip: False +skip_scheduler: False +tensorboard: True +tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/tensorboard +torchcompile: False +torchscript: False +trace: False +train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar +train_data_upsampling_factors: None +train_num_samples: 9011874 +use_bn_sync: False +use_bnb_linear: None +val_data: None +val_frequency: 1 +val_num_samples: None +wandb: True +wandb_notes: +wandb_project_name: open-clip +warmup: 110 +wd: 0.5 +workers: 16 +world_size: 2 +zeroshot_frequency: 2