Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +6 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/tensorboard/events.out.tfevents.1745688045.g12.2586430.0 +3 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/out.log +195 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/params.txt +103 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/tensorboard/events.out.tfevents.1745733263.g12.2655775.0 +3 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/tensorboard/events.out.tfevents.1745710659.g12.2628499.0 +3 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/tensorboard/events.out.tfevents.1745663928.g12.2393971.0 +3 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/tensorboard/events.out.tfevents.1745755854.g12.2682891.0 +3 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/out.log +195 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/tensorboard/events.out.tfevents.1745780012.g12.2713681.0 +3 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/clusters/image_I_closest_0.1_SFR-Embedding-Code-2B_R_dinov2-large.npy +3 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/clusters/image_farest.jsonl +3 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/clusters/text_farest.jsonl +3 -0
- SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/clusters/text_uniform.jsonl +3 -0
- captions.tsv +3 -0
.gitattributes
CHANGED
|
@@ -36,3 +36,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 36 |
SFR-Embedding-Code-2B_R\#0.8\#0.6\#dinov2-large\#0.0\#0.2\#rouge_0.2\#top_8\#inter_0.4/clusters/image_uniform.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
SFR-Embedding-Code-2B_R\#0.8\#0.6\#dinov2-large\#0.0\#0.2\#rouge_0.2\#top_8\#inter_0.4/clusters/image_closest.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 38 |
SFR-Embedding-Code-2B_R\#0.8\#0.6\#dinov2-large\#0.0\#0.2\#rouge_0.2\#top_8\#inter_0.4/clusters/text_closest.jsonl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
SFR-Embedding-Code-2B_R\#0.8\#0.6\#dinov2-large\#0.0\#0.2\#rouge_0.2\#top_8\#inter_0.4/clusters/image_uniform.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
SFR-Embedding-Code-2B_R\#0.8\#0.6\#dinov2-large\#0.0\#0.2\#rouge_0.2\#top_8\#inter_0.4/clusters/image_closest.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 38 |
SFR-Embedding-Code-2B_R\#0.8\#0.6\#dinov2-large\#0.0\#0.2\#rouge_0.2\#top_8\#inter_0.4/clusters/text_closest.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
SFR-Embedding-Code-2B_R\#0.8\#0.6\#dinov2-large\#0.0\#0.2\#rouge_0.2\#top_8\#inter_0.4/clusters/text_uniform.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
SFR-Embedding-Code-2B_R\#0.8\#0.6\#dinov2-large\#0.0\#0.2\#rouge_0.2\#top_8\#inter_0.4/clusters/image_farest.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
SFR-Embedding-Code-2B_R\#0.8\#0.6\#dinov2-large\#0.0\#0.2\#rouge_0.2\#top_8\#inter_0.4/clusters/text_farest.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
siglip-so400m-patch14-384\#0.8\#0.6\#siglip-so400m-patch14-384\#0.0\#0.2\#rouge_0.2\#top_8\#inter_0.4/clusters/text_closest.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
siglip-so400m-patch14-384\#0.8\#0.6\#siglip-so400m-patch14-384\#0.0\#0.2\#rouge_0.2\#top_8\#inter_0.4/clusters/text_uniform.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
captions.tsv filter=lfs diff=lfs merge=lfs -text
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/tensorboard/events.out.tfevents.1745688045.g12.2586430.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:72c633d46b0620be846fb5b5afdb9b32eb3b3f502669dd38665578d573a7a097
|
| 3 |
+
size 19936
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "caltech101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8613298337707787, "acc5": 0.9757217847769029, "mean_per_class_recall": 0.9243666325650888}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "cars", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8311155328939187, "acc5": 0.9896779007586121, "mean_per_class_recall": 0.8300859230489636}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "cifar100", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7575, "acc5": 0.9361, "mean_per_class_recall": 0.7575}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "cifar10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.9358, "acc5": 0.9976, "mean_per_class_recall": 0.9359000000000002}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "country211", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.19450236966824644, "acc5": 0.42265402843601896, "mean_per_class_recall": 0.19445497630331754}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "dtd", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5531914893617021, "acc5": 0.8308510638297872, "mean_per_class_recall": 0.5531914893617021}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "eurosat", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5212962962962963, "acc5": 0.8868148148148148, "mean_per_class_recall": 0.5254933333333334}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "fgvc_aircraft", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.2652265226522652, "acc5": 0.6126612661266126, "mean_per_class_recall": 0.26398395721925133}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "flickr30k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6549999713897705, "text_retrieval_recall@1": 0.8149999976158142, "image_retrieval_recall@5": 0.8682000041007996, "text_retrieval_recall@5": 0.9520000219345093, "image_retrieval_recall@10": 0.921999990940094, "text_retrieval_recall@10": 0.9810000061988831}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "flowers", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7440234184420231, "acc5": 0.9011221336802732, "mean_per_class_recall": 0.7362584000701315}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "food101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.867049504950495, "acc5": 0.9799603960396039, "mean_per_class_recall": 0.867009900990099}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "gtsrb", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5332541567695962, "acc5": 0.7577988915281076, "mean_per_class_recall": 0.4841045961397327}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "imagenet1k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.68376, "acc5": 0.90904, "mean_per_class_recall": 0.68382}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "mscoco_captions", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.38504597544670105, "text_retrieval_recall@1": 0.5383999943733215, "image_retrieval_recall@5": 0.6431427597999573, "text_retrieval_recall@5": 0.7778000235557556, "image_retrieval_recall@10": 0.7439424395561218, "text_retrieval_recall@10": 0.8586000204086304}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "pets", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.9078768056691197, "acc5": 0.9961842463886618, "mean_per_class_recall": 0.9070997688018754}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "stl10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.973375, "acc5": 1.0, "mean_per_class_recall": 0.9733749999999999}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "sun397", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6725453776412822, "acc5": 0.9295841992018684, "mean_per_class_recall": 0.6641095198145022}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "vtab/resisc45", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6007936507936508, "acc5": 0.9041269841269841, "mean_per_class_recall": 0.6084713941217861}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/out.log
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-04-27,07:54:14 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
|
| 2 |
+
2025-04-27,07:54:14 | INFO | Loaded ViT-B-16 model config.
|
| 3 |
+
2025-04-27,07:54:15 | INFO | Loading pretrained ViT-B-16 weights (datacomp_xl_s13b_b90k).
|
| 4 |
+
2025-04-27,07:54:15 | INFO | Model:
|
| 5 |
+
2025-04-27,07:54:15 | INFO | CLIP(
|
| 6 |
+
(visual): VisionTransformer(
|
| 7 |
+
(conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
|
| 8 |
+
(patch_dropout): Identity()
|
| 9 |
+
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 10 |
+
(transformer): Transformer(
|
| 11 |
+
(resblocks): ModuleList(
|
| 12 |
+
(0-11): 12 x ResidualAttentionBlock(
|
| 13 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 14 |
+
(attn): MultiheadAttention(
|
| 15 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 16 |
+
)
|
| 17 |
+
(ls_1): Identity()
|
| 18 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 19 |
+
(mlp): Sequential(
|
| 20 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 21 |
+
(gelu): GELU(approximate='none')
|
| 22 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 23 |
+
)
|
| 24 |
+
(ls_2): Identity()
|
| 25 |
+
)
|
| 26 |
+
)
|
| 27 |
+
)
|
| 28 |
+
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 29 |
+
)
|
| 30 |
+
(transformer): Transformer(
|
| 31 |
+
(resblocks): ModuleList(
|
| 32 |
+
(0-11): 12 x ResidualAttentionBlock(
|
| 33 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 34 |
+
(attn): MultiheadAttention(
|
| 35 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 36 |
+
)
|
| 37 |
+
(ls_1): Identity()
|
| 38 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 39 |
+
(mlp): Sequential(
|
| 40 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 41 |
+
(gelu): GELU(approximate='none')
|
| 42 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 43 |
+
)
|
| 44 |
+
(ls_2): Identity()
|
| 45 |
+
)
|
| 46 |
+
)
|
| 47 |
+
)
|
| 48 |
+
(token_embedding): Embedding(49408, 512)
|
| 49 |
+
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 50 |
+
)
|
| 51 |
+
2025-04-27,07:54:15 | INFO | Params:
|
| 52 |
+
2025-04-27,07:54:15 | INFO | accum_freq: 2
|
| 53 |
+
2025-04-27,07:54:15 | INFO | aug_cfg: {}
|
| 54 |
+
2025-04-27,07:54:15 | INFO | batch_size: 2048
|
| 55 |
+
2025-04-27,07:54:15 | INFO | beta1: 0.9
|
| 56 |
+
2025-04-27,07:54:15 | INFO | beta2: 0.98
|
| 57 |
+
2025-04-27,07:54:15 | INFO | cache_dir: None
|
| 58 |
+
2025-04-27,07:54:15 | INFO | caption_ratio: 0.1
|
| 59 |
+
2025-04-27,07:54:15 | INFO | checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints
|
| 60 |
+
2025-04-27,07:54:15 | INFO | coca_caption_loss_weight: 2.0
|
| 61 |
+
2025-04-27,07:54:15 | INFO | coca_contrastive_loss_weight: 1.0
|
| 62 |
+
2025-04-27,07:54:15 | INFO | copy_codebase: False
|
| 63 |
+
2025-04-27,07:54:15 | INFO | csv_caption_key: title
|
| 64 |
+
2025-04-27,07:54:15 | INFO | csv_img_key: filepath
|
| 65 |
+
2025-04-27,07:54:15 | INFO | csv_separator:
|
| 66 |
+
2025-04-27,07:54:15 | INFO | dataset_resampled: False
|
| 67 |
+
2025-04-27,07:54:15 | INFO | dataset_type: synthetic
|
| 68 |
+
2025-04-27,07:54:15 | INFO | ddp_static_graph: False
|
| 69 |
+
2025-04-27,07:54:15 | INFO | debug: False
|
| 70 |
+
2025-04-27,07:54:15 | INFO | delete_previous_checkpoint: False
|
| 71 |
+
2025-04-27,07:54:15 | INFO | device: cuda:0
|
| 72 |
+
2025-04-27,07:54:15 | INFO | dist_backend: None
|
| 73 |
+
2025-04-27,07:54:15 | INFO | dist_url: None
|
| 74 |
+
2025-04-27,07:54:15 | INFO | distill: False
|
| 75 |
+
2025-04-27,07:54:15 | INFO | distill_model: None
|
| 76 |
+
2025-04-27,07:54:15 | INFO | distill_pretrained: None
|
| 77 |
+
2025-04-27,07:54:15 | INFO | distributed: True
|
| 78 |
+
2025-04-27,07:54:15 | INFO | epochs: 10
|
| 79 |
+
2025-04-27,07:54:15 | INFO | epochs_cooldown: None
|
| 80 |
+
2025-04-27,07:54:15 | INFO | eps: 1e-08
|
| 81 |
+
2025-04-27,07:54:15 | INFO | force_custom_text: False
|
| 82 |
+
2025-04-27,07:54:15 | INFO | force_image_size: None
|
| 83 |
+
2025-04-27,07:54:15 | INFO | force_patch_dropout: None
|
| 84 |
+
2025-04-27,07:54:15 | INFO | force_quick_gelu: False
|
| 85 |
+
2025-04-27,07:54:15 | INFO | gather_with_grad: True
|
| 86 |
+
2025-04-27,07:54:15 | INFO | grad_checkpointing: True
|
| 87 |
+
2025-04-27,07:54:15 | INFO | grad_clip_norm: None
|
| 88 |
+
2025-04-27,07:54:15 | INFO | horovod: False
|
| 89 |
+
2025-04-27,07:54:15 | INFO | image_interpolation: None
|
| 90 |
+
2025-04-27,07:54:15 | INFO | image_mean: None
|
| 91 |
+
2025-04-27,07:54:15 | INFO | image_resize_mode: None
|
| 92 |
+
2025-04-27,07:54:15 | INFO | image_std: None
|
| 93 |
+
2025-04-27,07:54:15 | INFO | imagenet_v2: None
|
| 94 |
+
2025-04-27,07:54:15 | INFO | imagenet_val: None
|
| 95 |
+
2025-04-27,07:54:15 | INFO | keep_func_name: keep_image_uniform
|
| 96 |
+
2025-04-27,07:54:15 | INFO | local_loss: False
|
| 97 |
+
2025-04-27,07:54:15 | INFO | local_rank: 0
|
| 98 |
+
2025-04-27,07:54:15 | INFO | lock_image: False
|
| 99 |
+
2025-04-27,07:54:15 | INFO | lock_image_freeze_bn_stats: False
|
| 100 |
+
2025-04-27,07:54:15 | INFO | lock_image_unlocked_groups: 0
|
| 101 |
+
2025-04-27,07:54:15 | INFO | lock_text: True
|
| 102 |
+
2025-04-27,07:54:15 | INFO | lock_text_freeze_layer_norm: False
|
| 103 |
+
2025-04-27,07:54:15 | INFO | lock_text_unlocked_layers: 0
|
| 104 |
+
2025-04-27,07:54:15 | INFO | log_every_n_steps: 100
|
| 105 |
+
2025-04-27,07:54:15 | INFO | log_level: 20
|
| 106 |
+
2025-04-27,07:54:15 | INFO | log_local: False
|
| 107 |
+
2025-04-27,07:54:15 | INFO | log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/out.log
|
| 108 |
+
2025-04-27,07:54:15 | INFO | logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text
|
| 109 |
+
2025-04-27,07:54:15 | INFO | loss_dist_impl: None
|
| 110 |
+
2025-04-27,07:54:15 | INFO | lr: 4e-05
|
| 111 |
+
2025-04-27,07:54:15 | INFO | lr_cooldown_end: 0.0
|
| 112 |
+
2025-04-27,07:54:15 | INFO | lr_cooldown_power: 1.0
|
| 113 |
+
2025-04-27,07:54:15 | INFO | lr_scheduler: cosine
|
| 114 |
+
2025-04-27,07:54:15 | INFO | map_func_name: use_all
|
| 115 |
+
2025-04-27,07:54:15 | INFO | model: ViT-B-16
|
| 116 |
+
2025-04-27,07:54:15 | INFO | momentum: None
|
| 117 |
+
2025-04-27,07:54:15 | INFO | name: keep_image_uniform
|
| 118 |
+
2025-04-27,07:54:15 | INFO | no_set_device_rank: False
|
| 119 |
+
2025-04-27,07:54:15 | INFO | opt: adamw
|
| 120 |
+
2025-04-27,07:54:15 | INFO | precision: amp
|
| 121 |
+
2025-04-27,07:54:15 | INFO | pretrained: datacomp_xl_s13b_b90k
|
| 122 |
+
2025-04-27,07:54:15 | INFO | pretrained_image: False
|
| 123 |
+
2025-04-27,07:54:15 | INFO | rank: 0
|
| 124 |
+
2025-04-27,07:54:15 | INFO | remote_sync: None
|
| 125 |
+
2025-04-27,07:54:15 | INFO | remote_sync_frequency: 300
|
| 126 |
+
2025-04-27,07:54:15 | INFO | remote_sync_protocol: s3
|
| 127 |
+
2025-04-27,07:54:15 | INFO | report_to: tensorboard,wandb
|
| 128 |
+
2025-04-27,07:54:15 | INFO | resume: None
|
| 129 |
+
2025-04-27,07:54:15 | INFO | save_frequency: 10
|
| 130 |
+
2025-04-27,07:54:15 | INFO | save_most_recent: False
|
| 131 |
+
2025-04-27,07:54:15 | INFO | seed: 0
|
| 132 |
+
2025-04-27,07:54:15 | INFO | siglip: False
|
| 133 |
+
2025-04-27,07:54:15 | INFO | skip_scheduler: False
|
| 134 |
+
2025-04-27,07:54:15 | INFO | tensorboard: True
|
| 135 |
+
2025-04-27,07:54:15 | INFO | tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/tensorboard
|
| 136 |
+
2025-04-27,07:54:15 | INFO | torchcompile: False
|
| 137 |
+
2025-04-27,07:54:15 | INFO | torchscript: False
|
| 138 |
+
2025-04-27,07:54:15 | INFO | trace: False
|
| 139 |
+
2025-04-27,07:54:15 | INFO | train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar
|
| 140 |
+
2025-04-27,07:54:15 | INFO | train_data_upsampling_factors: None
|
| 141 |
+
2025-04-27,07:54:15 | INFO | train_num_samples: 9011874
|
| 142 |
+
2025-04-27,07:54:15 | INFO | use_bn_sync: False
|
| 143 |
+
2025-04-27,07:54:15 | INFO | use_bnb_linear: None
|
| 144 |
+
2025-04-27,07:54:15 | INFO | val_data: None
|
| 145 |
+
2025-04-27,07:54:15 | INFO | val_frequency: 1
|
| 146 |
+
2025-04-27,07:54:15 | INFO | val_num_samples: None
|
| 147 |
+
2025-04-27,07:54:15 | INFO | wandb: True
|
| 148 |
+
2025-04-27,07:54:15 | INFO | wandb_notes:
|
| 149 |
+
2025-04-27,07:54:15 | INFO | wandb_project_name: open-clip
|
| 150 |
+
2025-04-27,07:54:15 | INFO | warmup: 110
|
| 151 |
+
2025-04-27,07:54:15 | INFO | wd: 0.5
|
| 152 |
+
2025-04-27,07:54:15 | INFO | workers: 16
|
| 153 |
+
2025-04-27,07:54:15 | INFO | world_size: 2
|
| 154 |
+
2025-04-27,07:54:15 | INFO | zeroshot_frequency: 2
|
| 155 |
+
2025-04-27,07:54:16 | INFO | Created AdamW (adamw) optimizer: lr: 4e-05, betas: (0.9, 0.98), eps: 1e-08, weight_decay: 0.5, amsgrad: False, foreach: None, maximize: False, capturable: False, differentiable: False, fused: None
|
| 156 |
+
2025-04-27,07:54:35 | INFO | Start epoch 0
|
| 157 |
+
2025-04-27,07:55:28 | INFO | Train Epoch: 0 [ 8192/917504 (1%)] Data (t): 43.129 Batch (t): 52.904, 154.846/s, 77.4229/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 28.541 (28.541) Imm_text: 28.541 (28.541) Isd_image: 3.2232 (3.2232) Isd_text: 3.2232 (3.2232) Contrastive_loss: 1.4352 (1.4352) Loss: 1.4352 (1.4352)
|
| 158 |
+
2025-04-27,08:10:27 | INFO | Train Epoch: 0 [827392/917504 (90%)] Data (t): 0.991 Batch (t): 8.995, 919.591/s, 459.795/s/gpu LR: 0.000037 Logit Scale: 99.933 Imm_image: 28.938 (28.739) Imm_text: 28.938 (28.739) Isd_image: 5.8971 (4.5602) Isd_text: 5.8971 (4.5602) Contrastive_loss: 0.73869 (1.0870) Loss: 0.73869 (1.0870)
|
| 159 |
+
2025-04-27,08:12:05 | INFO | Train Epoch: 0 [917504/917504 (100%)] Data (t): 0.934 Batch (t): 8.914, 916.232/s, 458.116/s/gpu LR: 0.000040 Logit Scale: 99.925 Imm_image: 29.008 (28.829) Imm_text: 29.008 (28.829) Isd_image: 5.3500 (4.8234) Isd_text: 5.3500 (4.8234) Contrastive_loss: 0.70851 (0.96081) Loss: 0.70851 (0.96081)
|
| 160 |
+
2025-04-27,08:12:05 | INFO | Start epoch 1
|
| 161 |
+
2025-04-27,08:12:51 | INFO | Train Epoch: 1 [ 8192/917504 (1%)] Data (t): 37.414 Batch (t): 45.501, 180.041/s, 90.0203/s/gpu LR: 0.000040 Logit Scale: 99.924 Imm_image: 29.122 (29.122) Imm_text: 29.122 (29.122) Isd_image: 5.4319 (5.4319) Isd_text: 5.4319 (5.4319) Contrastive_loss: 0.63045 (0.63045) Loss: 0.63045 (0.63045)
|
| 162 |
+
2025-04-27,08:27:54 | INFO | Train Epoch: 1 [827392/917504 (90%)] Data (t): 1.012 Batch (t): 9.027, 920.444/s, 460.222/s/gpu LR: 0.000039 Logit Scale: 99.879 Imm_image: 29.092 (29.107) Imm_text: 29.092 (29.107) Isd_image: 4.1551 (4.7935) Isd_text: 4.1551 (4.7935) Contrastive_loss: 0.62187 (0.62616) Loss: 0.62187 (0.62616)
|
| 163 |
+
2025-04-27,08:29:32 | INFO | Train Epoch: 1 [917504/917504 (100%)] Data (t): 0.942 Batch (t): 8.924, 919.590/s, 459.795/s/gpu LR: 0.000039 Logit Scale: 99.882 Imm_image: 29.122 (29.112) Imm_text: 29.122 (29.112) Isd_image: 3.7083 (4.4318) Isd_text: 3.7083 (4.4318) Contrastive_loss: 0.52793 (0.59342) Loss: 0.52793 (0.59342)
|
| 164 |
+
2025-04-27,08:29:32 | INFO | Start epoch 2
|
| 165 |
+
2025-04-27,08:30:16 | INFO | Train Epoch: 2 [ 8192/917504 (1%)] Data (t): 34.892 Batch (t): 43.754, 187.231/s, 93.6153/s/gpu LR: 0.000039 Logit Scale: 99.884 Imm_image: 29.246 (29.246) Imm_text: 29.246 (29.246) Isd_image: 3.5515 (3.5515) Isd_text: 3.5515 (3.5515) Contrastive_loss: 0.50764 (0.50764) Loss: 0.50764 (0.50764)
|
| 166 |
+
2025-04-27,08:45:16 | INFO | Train Epoch: 2 [827392/917504 (90%)] Data (t): 0.995 Batch (t): 9.003, 918.046/s, 459.023/s/gpu LR: 0.000036 Logit Scale: 100.000 Imm_image: 29.356 (29.301) Imm_text: 29.356 (29.301) Isd_image: 2.5848 (3.0682) Isd_text: 2.5848 (3.0682) Contrastive_loss: 0.47861 (0.49313) Loss: 0.47861 (0.49313)
|
| 167 |
+
2025-04-27,08:46:54 | INFO | Train Epoch: 2 [917504/917504 (100%)] Data (t): 0.955 Batch (t): 8.942, 913.227/s, 456.613/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.512 (29.371) Imm_text: 29.512 (29.371) Isd_image: 2.4185 (2.8516) Isd_text: 2.4185 (2.8516) Contrastive_loss: 0.40780 (0.46468) Loss: 0.40780 (0.46468)
|
| 168 |
+
2025-04-27,08:46:55 | INFO | Start epoch 3
|
| 169 |
+
2025-04-27,08:47:39 | INFO | Train Epoch: 3 [ 8192/917504 (1%)] Data (t): 36.440 Batch (t): 44.632, 183.546/s, 91.7728/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.499 (29.499) Imm_text: 29.499 (29.499) Isd_image: 2.4591 (2.4591) Isd_text: 2.4591 (2.4591) Contrastive_loss: 0.39220 (0.39220) Loss: 0.39220 (0.39220)
|
| 170 |
+
2025-04-27,09:02:42 | INFO | Train Epoch: 3 [827392/917504 (90%)] Data (t): 1.015 Batch (t): 9.030, 917.801/s, 458.901/s/gpu LR: 0.000031 Logit Scale: 100.000 Imm_image: 29.522 (29.510) Imm_text: 29.522 (29.510) Isd_image: 1.6210 (2.0401) Isd_text: 1.6210 (2.0401) Contrastive_loss: 0.41654 (0.40437) Loss: 0.41654 (0.40437)
|
| 171 |
+
2025-04-27,09:04:21 | INFO | Train Epoch: 3 [917504/917504 (100%)] Data (t): 0.950 Batch (t): 8.931, 914.190/s, 457.095/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.597 (29.539) Imm_text: 29.597 (29.539) Isd_image: 1.4161 (1.8321) Isd_text: 1.4161 (1.8321) Contrastive_loss: 0.36959 (0.39277) Loss: 0.36959 (0.39277)
|
| 172 |
+
2025-04-27,09:04:21 | INFO | Start epoch 4
|
| 173 |
+
2025-04-27,09:05:04 | INFO | Train Epoch: 4 [ 8192/917504 (1%)] Data (t): 35.511 Batch (t): 43.616, 187.822/s, 93.9110/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.727 (29.727) Imm_text: 29.727 (29.727) Isd_image: 1.5082 (1.5082) Isd_text: 1.5082 (1.5082) Contrastive_loss: 0.34478 (0.34478) Loss: 0.34478 (0.34478)
|
| 174 |
+
2025-04-27,09:20:06 | INFO | Train Epoch: 4 [827392/917504 (90%)] Data (t): 1.004 Batch (t): 9.014, 916.202/s, 458.101/s/gpu LR: 0.000024 Logit Scale: 100.000 Imm_image: 29.862 (29.794) Imm_text: 29.862 (29.794) Isd_image: 0.93800 (1.2231) Isd_text: 0.93800 (1.2231) Contrastive_loss: 0.30456 (0.32467) Loss: 0.30456 (0.32467)
|
| 175 |
+
2025-04-27,09:21:44 | INFO | Train Epoch: 4 [917504/917504 (100%)] Data (t): 0.954 Batch (t): 8.936, 915.496/s, 457.748/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.837 (29.809) Imm_text: 29.837 (29.809) Isd_image: 1.1397 (1.1953) Isd_text: 1.1397 (1.1953) Contrastive_loss: 0.28918 (0.31284) Loss: 0.28918 (0.31284)
|
| 176 |
+
2025-04-27,09:21:44 | INFO | Start epoch 5
|
| 177 |
+
2025-04-27,09:22:29 | INFO | Train Epoch: 5 [ 8192/917504 (1%)] Data (t): 33.353 Batch (t): 44.506, 184.065/s, 92.0325/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.815 (29.815) Imm_text: 29.815 (29.815) Isd_image: 1.1481 (1.1481) Isd_text: 1.1481 (1.1481) Contrastive_loss: 0.30516 (0.30516) Loss: 0.30516 (0.30516)
|
| 178 |
+
2025-04-27,09:37:30 | INFO | Train Epoch: 5 [827392/917504 (90%)] Data (t): 1.007 Batch (t): 9.014, 918.615/s, 459.307/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 29.975 (29.895) Imm_text: 29.975 (29.895) Isd_image: 0.74998 (0.94903) Isd_text: 0.74998 (0.94903) Contrastive_loss: 0.28071 (0.29293) Loss: 0.28071 (0.29293)
|
| 179 |
+
2025-04-27,09:39:09 | INFO | Train Epoch: 5 [917504/917504 (100%)] Data (t): 0.950 Batch (t): 8.933, 914.077/s, 457.039/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 30.192 (29.994) Imm_text: 30.192 (29.994) Isd_image: 0.69866 (0.86557) Isd_text: 0.69866 (0.86557) Contrastive_loss: 0.22392 (0.26993) Loss: 0.22392 (0.26993)
|
| 180 |
+
2025-04-27,09:39:09 | INFO | Start epoch 6
|
| 181 |
+
2025-04-27,09:39:54 | INFO | Train Epoch: 6 [ 8192/917504 (1%)] Data (t): 35.636 Batch (t): 45.368, 180.569/s, 90.2846/s/gpu LR: 0.000016 Logit Scale: 100.000 Imm_image: 30.142 (30.142) Imm_text: 30.142 (30.142) Isd_image: 0.83195 (0.83195) Isd_text: 0.83195 (0.83195) Contrastive_loss: 0.24683 (0.24683) Loss: 0.24683 (0.24683)
|
| 182 |
+
2025-04-27,09:54:56 | INFO | Train Epoch: 6 [827392/917504 (90%)] Data (t): 1.009 Batch (t): 9.016, 917.514/s, 458.757/s/gpu LR: 0.000011 Logit Scale: 100.000 Imm_image: 30.182 (30.162) Imm_text: 30.182 (30.162) Isd_image: 0.66375 (0.74785) Isd_text: 0.66375 (0.74785) Contrastive_loss: 0.25392 (0.25038) Loss: 0.25392 (0.25038)
|
| 183 |
+
2025-04-27,09:56:34 | INFO | Train Epoch: 6 [917504/917504 (100%)] Data (t): 0.951 Batch (t): 8.934, 914.233/s, 457.117/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.260 (30.194) Imm_text: 30.260 (30.194) Isd_image: 0.58846 (0.69472) Isd_text: 0.58846 (0.69472) Contrastive_loss: 0.21603 (0.23893) Loss: 0.21603 (0.23893)
|
| 184 |
+
2025-04-27,09:56:34 | INFO | Start epoch 7
|
| 185 |
+
2025-04-27,09:57:20 | INFO | Train Epoch: 7 [ 8192/917504 (1%)] Data (t): 36.683 Batch (t): 45.466, 180.179/s, 90.0895/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.232 (30.232) Imm_text: 30.232 (30.232) Isd_image: 0.56494 (0.56494) Isd_text: 0.56494 (0.56494) Contrastive_loss: 0.21279 (0.21279) Loss: 0.21279 (0.21279)
|
| 186 |
+
2025-04-27,10:12:22 | INFO | Train Epoch: 7 [827392/917504 (90%)] Data (t): 1.006 Batch (t): 9.018, 918.150/s, 459.075/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.290 (30.261) Imm_text: 30.290 (30.261) Isd_image: 0.56392 (0.56443) Isd_text: 0.56392 (0.56443) Contrastive_loss: 0.24375 (0.22827) Loss: 0.24375 (0.22827)
|
| 187 |
+
2025-04-27,10:14:03 | INFO | Train Epoch: 7 [917504/917504 (100%)] Data (t): 1.142 Batch (t): 9.181, 904.205/s, 452.103/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.414 (30.312) Imm_text: 30.414 (30.312) Isd_image: 0.57138 (0.56675) Isd_text: 0.57138 (0.56675) Contrastive_loss: 0.21433 (0.22362) Loss: 0.21433 (0.22362)
|
| 188 |
+
2025-04-27,10:14:03 | INFO | Start epoch 8
|
| 189 |
+
2025-04-27,10:14:46 | INFO | Train Epoch: 8 [ 8192/917504 (1%)] Data (t): 34.882 Batch (t): 43.296, 189.209/s, 94.6043/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.263 (30.263) Imm_text: 30.263 (30.263) Isd_image: 0.63063 (0.63063) Isd_text: 0.63063 (0.63063) Contrastive_loss: 0.21246 (0.21246) Loss: 0.21246 (0.21246)
|
| 190 |
+
2025-04-27,10:29:43 | INFO | Train Epoch: 8 [827392/917504 (90%)] Data (t): 0.969 Batch (t): 8.967, 920.783/s, 460.391/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.228 (30.245) Imm_text: 30.228 (30.245) Isd_image: 0.59837 (0.61450) Isd_text: 0.59837 (0.61450) Contrastive_loss: 0.24290 (0.22768) Loss: 0.24290 (0.22768)
|
| 191 |
+
2025-04-27,10:31:21 | INFO | Train Epoch: 8 [917504/917504 (100%)] Data (t): 0.919 Batch (t): 8.886, 919.279/s, 459.640/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.411 (30.301) Imm_text: 30.411 (30.301) Isd_image: 0.56638 (0.59846) Isd_text: 0.56638 (0.59846) Contrastive_loss: 0.22206 (0.22581) Loss: 0.22206 (0.22581)
|
| 192 |
+
2025-04-27,10:31:21 | INFO | Start epoch 9
|
| 193 |
+
2025-04-27,10:32:04 | INFO | Train Epoch: 9 [ 8192/917504 (1%)] Data (t): 35.369 Batch (t): 43.449, 188.542/s, 94.2708/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.428 (30.428) Imm_text: 30.428 (30.428) Isd_image: 0.43297 (0.43297) Isd_text: 0.43297 (0.43297) Contrastive_loss: 0.18715 (0.18715) Loss: 0.18715 (0.18715)
|
| 194 |
+
2025-04-27,10:47:03 | INFO | Train Epoch: 9 [827392/917504 (90%)] Data (t): 0.984 Batch (t): 8.989, 919.087/s, 459.543/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.323 (30.375) Imm_text: 30.323 (30.375) Isd_image: 0.59950 (0.51624) Isd_text: 0.59950 (0.51624) Contrastive_loss: 0.22314 (0.20515) Loss: 0.22314 (0.20515)
|
| 195 |
+
2025-04-27,10:48:41 | INFO | Train Epoch: 9 [917504/917504 (100%)] Data (t): 0.937 Batch (t): 8.914, 919.208/s, 459.604/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.394 (30.382) Imm_text: 30.394 (30.382) Isd_image: 0.49815 (0.51021) Isd_text: 0.49815 (0.51021) Contrastive_loss: 0.20625 (0.20551) Loss: 0.20625 (0.20551)
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/params.txt
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accum_freq: 2
|
| 2 |
+
aug_cfg: {}
|
| 3 |
+
batch_size: 2048
|
| 4 |
+
beta1: 0.9
|
| 5 |
+
beta2: 0.98
|
| 6 |
+
cache_dir: None
|
| 7 |
+
caption_ratio: 0.1
|
| 8 |
+
checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/checkpoints
|
| 9 |
+
coca_caption_loss_weight: 2.0
|
| 10 |
+
coca_contrastive_loss_weight: 1.0
|
| 11 |
+
copy_codebase: False
|
| 12 |
+
csv_caption_key: title
|
| 13 |
+
csv_img_key: filepath
|
| 14 |
+
csv_separator:
|
| 15 |
+
dataset_resampled: False
|
| 16 |
+
dataset_type: synthetic
|
| 17 |
+
ddp_static_graph: False
|
| 18 |
+
debug: False
|
| 19 |
+
delete_previous_checkpoint: False
|
| 20 |
+
device: cuda:0
|
| 21 |
+
dist_backend: None
|
| 22 |
+
dist_url: None
|
| 23 |
+
distill: False
|
| 24 |
+
distill_model: None
|
| 25 |
+
distill_pretrained: None
|
| 26 |
+
distributed: True
|
| 27 |
+
epochs: 10
|
| 28 |
+
epochs_cooldown: None
|
| 29 |
+
eps: 1e-08
|
| 30 |
+
force_custom_text: False
|
| 31 |
+
force_image_size: None
|
| 32 |
+
force_patch_dropout: None
|
| 33 |
+
force_quick_gelu: False
|
| 34 |
+
gather_with_grad: True
|
| 35 |
+
grad_checkpointing: True
|
| 36 |
+
grad_clip_norm: None
|
| 37 |
+
horovod: False
|
| 38 |
+
image_interpolation: None
|
| 39 |
+
image_mean: None
|
| 40 |
+
image_resize_mode: None
|
| 41 |
+
image_std: None
|
| 42 |
+
imagenet_v2: None
|
| 43 |
+
imagenet_val: None
|
| 44 |
+
keep_func_name: keep_image_uniform
|
| 45 |
+
local_loss: False
|
| 46 |
+
local_rank: 0
|
| 47 |
+
lock_image: False
|
| 48 |
+
lock_image_freeze_bn_stats: False
|
| 49 |
+
lock_image_unlocked_groups: 0
|
| 50 |
+
lock_text: True
|
| 51 |
+
lock_text_freeze_layer_norm: False
|
| 52 |
+
lock_text_unlocked_layers: 0
|
| 53 |
+
log_every_n_steps: 100
|
| 54 |
+
log_level: 20
|
| 55 |
+
log_local: False
|
| 56 |
+
log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/out.log
|
| 57 |
+
logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text
|
| 58 |
+
loss_dist_impl: None
|
| 59 |
+
lr: 4e-05
|
| 60 |
+
lr_cooldown_end: 0.0
|
| 61 |
+
lr_cooldown_power: 1.0
|
| 62 |
+
lr_scheduler: cosine
|
| 63 |
+
map_func_name: use_all
|
| 64 |
+
model: ViT-B-16
|
| 65 |
+
momentum: None
|
| 66 |
+
name: keep_image_uniform
|
| 67 |
+
no_set_device_rank: False
|
| 68 |
+
opt: adamw
|
| 69 |
+
precision: amp
|
| 70 |
+
pretrained: datacomp_xl_s13b_b90k
|
| 71 |
+
pretrained_image: False
|
| 72 |
+
rank: 0
|
| 73 |
+
remote_sync: None
|
| 74 |
+
remote_sync_frequency: 300
|
| 75 |
+
remote_sync_protocol: s3
|
| 76 |
+
report_to: tensorboard,wandb
|
| 77 |
+
resume: None
|
| 78 |
+
save_frequency: 10
|
| 79 |
+
save_most_recent: False
|
| 80 |
+
seed: 0
|
| 81 |
+
siglip: False
|
| 82 |
+
skip_scheduler: False
|
| 83 |
+
tensorboard: True
|
| 84 |
+
tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/tensorboard
|
| 85 |
+
torchcompile: False
|
| 86 |
+
torchscript: False
|
| 87 |
+
trace: False
|
| 88 |
+
train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar
|
| 89 |
+
train_data_upsampling_factors: None
|
| 90 |
+
train_num_samples: 9011874
|
| 91 |
+
use_bn_sync: False
|
| 92 |
+
use_bnb_linear: None
|
| 93 |
+
val_data: None
|
| 94 |
+
val_frequency: 1
|
| 95 |
+
val_num_samples: None
|
| 96 |
+
wandb: True
|
| 97 |
+
wandb_notes:
|
| 98 |
+
wandb_project_name: open-clip
|
| 99 |
+
warmup: 110
|
| 100 |
+
wd: 0.5
|
| 101 |
+
workers: 16
|
| 102 |
+
world_size: 2
|
| 103 |
+
zeroshot_frequency: 2
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_uniform/tensorboard/events.out.tfevents.1745733263.g12.2655775.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d083cee4ab2f2fb5312adf93f6583505476bcc1b1f2fc69ab2462355cb6aec4
|
| 3 |
+
size 19936
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/tensorboard/events.out.tfevents.1745710659.g12.2628499.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5f17f7467451bdea01cb70bda9f19c3d10fc6fa0852805af8a7ec1333a87945a
|
| 3 |
+
size 19936
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/tensorboard/events.out.tfevents.1745663928.g12.2393971.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7461fb18c1b53d1ed330cdf217a4ce913fd2348163ec9e311e36493041bd03db
|
| 3 |
+
size 19936
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_farest_image_farest/tensorboard/events.out.tfevents.1745755854.g12.2682891.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3498cb2e8566f663433bd058a744f1565c5d2ba308c6658b9965266bc5cd1f4d
|
| 3 |
+
size 19936
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "caltech101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8611111111111112, "acc5": 0.9732064741907261, "mean_per_class_recall": 0.9224088045513618}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "cars", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8303693570451436, "acc5": 0.988931724909837, "mean_per_class_recall": 0.8305479995448934}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "cifar100", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7475, "acc5": 0.9333, "mean_per_class_recall": 0.7476000000000002}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "cifar10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.9345, "acc5": 0.999, "mean_per_class_recall": 0.9345000000000001}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "country211", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.19374407582938388, "acc5": 0.4218483412322275, "mean_per_class_recall": 0.1937914691943128}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "dtd", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5340425531914894, "acc5": 0.8228723404255319, "mean_per_class_recall": 0.5329787234042552}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "eurosat", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.4602222222222222, "acc5": 0.8801111111111111, "mean_per_class_recall": 0.47197999999999996}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "flickr30k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6621999740600586, "text_retrieval_recall@1": 0.8259999752044678, "image_retrieval_recall@5": 0.8751999735832214, "text_retrieval_recall@5": 0.9580000042915344, "image_retrieval_recall@10": 0.9215999841690063, "text_retrieval_recall@10": 0.9800000190734863}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "flowers", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7518295657830542, "acc5": 0.9006342494714588, "mean_per_class_recall": 0.7590609398935276}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "food101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8712475247524752, "acc5": 0.9801584158415841, "mean_per_class_recall": 0.8710891089108911}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "gtsrb", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5334916864608076, "acc5": 0.7653206650831353, "mean_per_class_recall": 0.49353806950268025}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "imagenet1k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.68076, "acc5": 0.90792, "mean_per_class_recall": 0.68056}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "mscoco_captions", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.3862854838371277, "text_retrieval_recall@1": 0.5411999821662903, "image_retrieval_recall@5": 0.6445421576499939, "text_retrieval_recall@5": 0.7820000052452087, "image_retrieval_recall@10": 0.745781660079956, "text_retrieval_recall@10": 0.8583999872207642}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "pets", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8997001907876806, "acc5": 0.9967293540474244, "mean_per_class_recall": 0.8993521290462428}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "stl10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.977, "acc5": 1.0, "mean_per_class_recall": 0.977}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "sun397", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6715707008477849, "acc5": 0.9273681887562756, "mean_per_class_recall": 0.6589770233905725}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"dataset": "vtab/resisc45", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6215873015873016, "acc5": 0.9153968253968254, "mean_per_class_recall": 0.6280168420204852}, "language": "en"}
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/out.log
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-04-27,20:53:01 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
|
| 2 |
+
2025-04-27,20:53:01 | INFO | Loaded ViT-B-16 model config.
|
| 3 |
+
2025-04-27,20:53:02 | INFO | Loading pretrained ViT-B-16 weights (datacomp_xl_s13b_b90k).
|
| 4 |
+
2025-04-27,20:53:03 | INFO | Model:
|
| 5 |
+
2025-04-27,20:53:03 | INFO | CLIP(
|
| 6 |
+
(visual): VisionTransformer(
|
| 7 |
+
(conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
|
| 8 |
+
(patch_dropout): Identity()
|
| 9 |
+
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 10 |
+
(transformer): Transformer(
|
| 11 |
+
(resblocks): ModuleList(
|
| 12 |
+
(0-11): 12 x ResidualAttentionBlock(
|
| 13 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 14 |
+
(attn): MultiheadAttention(
|
| 15 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 16 |
+
)
|
| 17 |
+
(ls_1): Identity()
|
| 18 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 19 |
+
(mlp): Sequential(
|
| 20 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 21 |
+
(gelu): GELU(approximate='none')
|
| 22 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 23 |
+
)
|
| 24 |
+
(ls_2): Identity()
|
| 25 |
+
)
|
| 26 |
+
)
|
| 27 |
+
)
|
| 28 |
+
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 29 |
+
)
|
| 30 |
+
(transformer): Transformer(
|
| 31 |
+
(resblocks): ModuleList(
|
| 32 |
+
(0-11): 12 x ResidualAttentionBlock(
|
| 33 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 34 |
+
(attn): MultiheadAttention(
|
| 35 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 36 |
+
)
|
| 37 |
+
(ls_1): Identity()
|
| 38 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 39 |
+
(mlp): Sequential(
|
| 40 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 41 |
+
(gelu): GELU(approximate='none')
|
| 42 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 43 |
+
)
|
| 44 |
+
(ls_2): Identity()
|
| 45 |
+
)
|
| 46 |
+
)
|
| 47 |
+
)
|
| 48 |
+
(token_embedding): Embedding(49408, 512)
|
| 49 |
+
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 50 |
+
)
|
| 51 |
+
2025-04-27,20:53:03 | INFO | Params:
|
| 52 |
+
2025-04-27,20:53:03 | INFO | accum_freq: 2
|
| 53 |
+
2025-04-27,20:53:03 | INFO | aug_cfg: {}
|
| 54 |
+
2025-04-27,20:53:03 | INFO | batch_size: 2048
|
| 55 |
+
2025-04-27,20:53:03 | INFO | beta1: 0.9
|
| 56 |
+
2025-04-27,20:53:03 | INFO | beta2: 0.98
|
| 57 |
+
2025-04-27,20:53:03 | INFO | cache_dir: None
|
| 58 |
+
2025-04-27,20:53:03 | INFO | caption_ratio: 0.1
|
| 59 |
+
2025-04-27,20:53:03 | INFO | checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/checkpoints
|
| 60 |
+
2025-04-27,20:53:03 | INFO | coca_caption_loss_weight: 2.0
|
| 61 |
+
2025-04-27,20:53:03 | INFO | coca_contrastive_loss_weight: 1.0
|
| 62 |
+
2025-04-27,20:53:03 | INFO | copy_codebase: False
|
| 63 |
+
2025-04-27,20:53:03 | INFO | csv_caption_key: title
|
| 64 |
+
2025-04-27,20:53:03 | INFO | csv_img_key: filepath
|
| 65 |
+
2025-04-27,20:53:03 | INFO | csv_separator:
|
| 66 |
+
2025-04-27,20:53:03 | INFO | dataset_resampled: False
|
| 67 |
+
2025-04-27,20:53:03 | INFO | dataset_type: synthetic
|
| 68 |
+
2025-04-27,20:53:03 | INFO | ddp_static_graph: False
|
| 69 |
+
2025-04-27,20:53:03 | INFO | debug: False
|
| 70 |
+
2025-04-27,20:53:03 | INFO | delete_previous_checkpoint: False
|
| 71 |
+
2025-04-27,20:53:03 | INFO | device: cuda:0
|
| 72 |
+
2025-04-27,20:53:03 | INFO | dist_backend: None
|
| 73 |
+
2025-04-27,20:53:03 | INFO | dist_url: None
|
| 74 |
+
2025-04-27,20:53:03 | INFO | distill: False
|
| 75 |
+
2025-04-27,20:53:03 | INFO | distill_model: None
|
| 76 |
+
2025-04-27,20:53:03 | INFO | distill_pretrained: None
|
| 77 |
+
2025-04-27,20:53:03 | INFO | distributed: True
|
| 78 |
+
2025-04-27,20:53:03 | INFO | epochs: 10
|
| 79 |
+
2025-04-27,20:53:03 | INFO | epochs_cooldown: None
|
| 80 |
+
2025-04-27,20:53:03 | INFO | eps: 1e-08
|
| 81 |
+
2025-04-27,20:53:03 | INFO | force_custom_text: False
|
| 82 |
+
2025-04-27,20:53:03 | INFO | force_image_size: None
|
| 83 |
+
2025-04-27,20:53:03 | INFO | force_patch_dropout: None
|
| 84 |
+
2025-04-27,20:53:03 | INFO | force_quick_gelu: False
|
| 85 |
+
2025-04-27,20:53:03 | INFO | gather_with_grad: True
|
| 86 |
+
2025-04-27,20:53:03 | INFO | grad_checkpointing: True
|
| 87 |
+
2025-04-27,20:53:03 | INFO | grad_clip_norm: None
|
| 88 |
+
2025-04-27,20:53:03 | INFO | horovod: False
|
| 89 |
+
2025-04-27,20:53:03 | INFO | image_interpolation: None
|
| 90 |
+
2025-04-27,20:53:03 | INFO | image_mean: None
|
| 91 |
+
2025-04-27,20:53:03 | INFO | image_resize_mode: None
|
| 92 |
+
2025-04-27,20:53:03 | INFO | image_std: None
|
| 93 |
+
2025-04-27,20:53:03 | INFO | imagenet_v2: None
|
| 94 |
+
2025-04-27,20:53:03 | INFO | imagenet_val: None
|
| 95 |
+
2025-04-27,20:53:03 | INFO | keep_func_name: low_inter_only
|
| 96 |
+
2025-04-27,20:53:03 | INFO | local_loss: False
|
| 97 |
+
2025-04-27,20:53:03 | INFO | local_rank: 0
|
| 98 |
+
2025-04-27,20:53:03 | INFO | lock_image: False
|
| 99 |
+
2025-04-27,20:53:03 | INFO | lock_image_freeze_bn_stats: False
|
| 100 |
+
2025-04-27,20:53:03 | INFO | lock_image_unlocked_groups: 0
|
| 101 |
+
2025-04-27,20:53:03 | INFO | lock_text: True
|
| 102 |
+
2025-04-27,20:53:03 | INFO | lock_text_freeze_layer_norm: False
|
| 103 |
+
2025-04-27,20:53:03 | INFO | lock_text_unlocked_layers: 0
|
| 104 |
+
2025-04-27,20:53:03 | INFO | log_every_n_steps: 100
|
| 105 |
+
2025-04-27,20:53:03 | INFO | log_level: 20
|
| 106 |
+
2025-04-27,20:53:03 | INFO | log_local: False
|
| 107 |
+
2025-04-27,20:53:03 | INFO | log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/out.log
|
| 108 |
+
2025-04-27,20:53:03 | INFO | logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text
|
| 109 |
+
2025-04-27,20:53:03 | INFO | loss_dist_impl: None
|
| 110 |
+
2025-04-27,20:53:03 | INFO | lr: 4e-05
|
| 111 |
+
2025-04-27,20:53:03 | INFO | lr_cooldown_end: 0.0
|
| 112 |
+
2025-04-27,20:53:03 | INFO | lr_cooldown_power: 1.0
|
| 113 |
+
2025-04-27,20:53:03 | INFO | lr_scheduler: cosine
|
| 114 |
+
2025-04-27,20:53:03 | INFO | map_func_name: use_all
|
| 115 |
+
2025-04-27,20:53:03 | INFO | model: ViT-B-16
|
| 116 |
+
2025-04-27,20:53:03 | INFO | momentum: None
|
| 117 |
+
2025-04-27,20:53:03 | INFO | name: low_inter_only
|
| 118 |
+
2025-04-27,20:53:03 | INFO | no_set_device_rank: False
|
| 119 |
+
2025-04-27,20:53:03 | INFO | opt: adamw
|
| 120 |
+
2025-04-27,20:53:03 | INFO | precision: amp
|
| 121 |
+
2025-04-27,20:53:03 | INFO | pretrained: datacomp_xl_s13b_b90k
|
| 122 |
+
2025-04-27,20:53:03 | INFO | pretrained_image: False
|
| 123 |
+
2025-04-27,20:53:03 | INFO | rank: 0
|
| 124 |
+
2025-04-27,20:53:03 | INFO | remote_sync: None
|
| 125 |
+
2025-04-27,20:53:03 | INFO | remote_sync_frequency: 300
|
| 126 |
+
2025-04-27,20:53:03 | INFO | remote_sync_protocol: s3
|
| 127 |
+
2025-04-27,20:53:03 | INFO | report_to: tensorboard,wandb
|
| 128 |
+
2025-04-27,20:53:03 | INFO | resume: None
|
| 129 |
+
2025-04-27,20:53:03 | INFO | save_frequency: 10
|
| 130 |
+
2025-04-27,20:53:03 | INFO | save_most_recent: False
|
| 131 |
+
2025-04-27,20:53:03 | INFO | seed: 0
|
| 132 |
+
2025-04-27,20:53:03 | INFO | siglip: False
|
| 133 |
+
2025-04-27,20:53:03 | INFO | skip_scheduler: False
|
| 134 |
+
2025-04-27,20:53:03 | INFO | tensorboard: True
|
| 135 |
+
2025-04-27,20:53:03 | INFO | tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/tensorboard
|
| 136 |
+
2025-04-27,20:53:03 | INFO | torchcompile: False
|
| 137 |
+
2025-04-27,20:53:03 | INFO | torchscript: False
|
| 138 |
+
2025-04-27,20:53:03 | INFO | trace: False
|
| 139 |
+
2025-04-27,20:53:03 | INFO | train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar
|
| 140 |
+
2025-04-27,20:53:03 | INFO | train_data_upsampling_factors: None
|
| 141 |
+
2025-04-27,20:53:03 | INFO | train_num_samples: 9011874
|
| 142 |
+
2025-04-27,20:53:03 | INFO | use_bn_sync: False
|
| 143 |
+
2025-04-27,20:53:03 | INFO | use_bnb_linear: None
|
| 144 |
+
2025-04-27,20:53:03 | INFO | val_data: None
|
| 145 |
+
2025-04-27,20:53:03 | INFO | val_frequency: 1
|
| 146 |
+
2025-04-27,20:53:03 | INFO | val_num_samples: None
|
| 147 |
+
2025-04-27,20:53:03 | INFO | wandb: True
|
| 148 |
+
2025-04-27,20:53:03 | INFO | wandb_notes:
|
| 149 |
+
2025-04-27,20:53:03 | INFO | wandb_project_name: open-clip
|
| 150 |
+
2025-04-27,20:53:03 | INFO | warmup: 110
|
| 151 |
+
2025-04-27,20:53:03 | INFO | wd: 0.5
|
| 152 |
+
2025-04-27,20:53:03 | INFO | workers: 16
|
| 153 |
+
2025-04-27,20:53:03 | INFO | world_size: 2
|
| 154 |
+
2025-04-27,20:53:03 | INFO | zeroshot_frequency: 2
|
| 155 |
+
2025-04-27,20:53:03 | INFO | Created AdamW (adamw) optimizer: lr: 4e-05, betas: (0.9, 0.98), eps: 1e-08, weight_decay: 0.5, amsgrad: False, foreach: None, maximize: False, capturable: False, differentiable: False, fused: None
|
| 156 |
+
2025-04-27,20:53:45 | INFO | Start epoch 0
|
| 157 |
+
2025-04-27,20:54:39 | INFO | Train Epoch: 0 [ 8192/917504 (1%)] Data (t): 44.740 Batch (t): 54.300, 150.864/s, 75.4322/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 28.004 (28.004) Imm_text: 28.004 (28.004) Isd_image: 0.48300 (0.48300) Isd_text: 0.48300 (0.48300) Contrastive_loss: 1.3863 (1.3863) Loss: 1.3863 (1.3863)
|
| 158 |
+
2025-04-27,21:09:40 | INFO | Train Epoch: 0 [827392/917504 (90%)] Data (t): 1.004 Batch (t): 9.011, 913.635/s, 456.818/s/gpu LR: 0.000037 Logit Scale: 99.932 Imm_image: 28.809 (28.407) Imm_text: 28.809 (28.407) Isd_image: 4.0077 (2.2454) Isd_text: 4.0077 (2.2454) Contrastive_loss: 0.64345 (1.0149) Loss: 0.64345 (1.0149)
|
| 159 |
+
2025-04-27,21:11:19 | INFO | Train Epoch: 0 [917504/917504 (100%)] Data (t): 0.966 Batch (t): 8.956, 911.469/s, 455.734/s/gpu LR: 0.000040 Logit Scale: 99.925 Imm_image: 28.850 (28.554) Imm_text: 28.850 (28.554) Isd_image: 3.6929 (2.7279) Isd_text: 3.6929 (2.7279) Contrastive_loss: 0.61486 (0.88152) Loss: 0.61486 (0.88152)
|
| 160 |
+
2025-04-27,21:11:19 | INFO | Start epoch 1
|
| 161 |
+
2025-04-27,21:12:04 | INFO | Train Epoch: 1 [ 8192/917504 (1%)] Data (t): 37.221 Batch (t): 45.300, 180.838/s, 90.4192/s/gpu LR: 0.000040 Logit Scale: 99.924 Imm_image: 28.894 (28.894) Imm_text: 28.894 (28.894) Isd_image: 3.6459 (3.6459) Isd_text: 3.6459 (3.6459) Contrastive_loss: 0.56427 (0.56427) Loss: 0.56427 (0.56427)
|
| 162 |
+
2025-04-27,21:27:11 | INFO | Train Epoch: 1 [827392/917504 (90%)] Data (t): 1.045 Batch (t): 9.067, 913.108/s, 456.554/s/gpu LR: 0.000039 Logit Scale: 99.876 Imm_image: 28.777 (28.835) Imm_text: 28.777 (28.835) Isd_image: 2.0186 (2.8323) Isd_text: 2.0186 (2.8323) Contrastive_loss: 0.54482 (0.55454) Loss: 0.54482 (0.55454)
|
| 163 |
+
2025-04-27,21:28:50 | INFO | Train Epoch: 1 [917504/917504 (100%)] Data (t): 0.990 Batch (t): 8.986, 910.140/s, 455.070/s/gpu LR: 0.000039 Logit Scale: 99.875 Imm_image: 28.908 (28.860) Imm_text: 28.908 (28.860) Isd_image: 1.5264 (2.3970) Isd_text: 1.5264 (2.3970) Contrastive_loss: 0.44167 (0.51692) Loss: 0.44167 (0.51692)
|
| 164 |
+
2025-04-27,21:28:50 | INFO | Start epoch 2
|
| 165 |
+
2025-04-27,21:29:35 | INFO | Train Epoch: 2 [ 8192/917504 (1%)] Data (t): 36.369 Batch (t): 44.535, 183.945/s, 91.9726/s/gpu LR: 0.000039 Logit Scale: 99.876 Imm_image: 28.971 (28.971) Imm_text: 28.971 (28.971) Isd_image: 1.5085 (1.5085) Isd_text: 1.5085 (1.5085) Contrastive_loss: 0.46002 (0.46002) Loss: 0.46002 (0.46002)
|
| 166 |
+
2025-04-27,21:44:31 | INFO | Train Epoch: 2 [827392/917504 (90%)] Data (t): 0.966 Batch (t): 8.965, 921.609/s, 460.804/s/gpu LR: 0.000036 Logit Scale: 99.957 Imm_image: 29.224 (29.097) Imm_text: 29.224 (29.097) Isd_image: 0.16710 (0.83778) Isd_text: 0.16710 (0.83778) Contrastive_loss: 0.42271 (0.44137) Loss: 0.42271 (0.44137)
|
| 167 |
+
2025-04-27,21:46:09 | INFO | Train Epoch: 2 [917504/917504 (100%)] Data (t): 0.923 Batch (t): 8.896, 916.982/s, 458.491/s/gpu LR: 0.000035 Logit Scale: 99.971 Imm_image: 29.325 (29.173) Imm_text: 29.325 (29.173) Isd_image: 0.038107 (0.57122) Isd_text: 0.038107 (0.57122) Contrastive_loss: 0.36076 (0.41450) Loss: 0.36076 (0.41450)
|
| 168 |
+
2025-04-27,21:46:09 | INFO | Start epoch 3
|
| 169 |
+
2025-04-27,21:46:54 | INFO | Train Epoch: 3 [ 8192/917504 (1%)] Data (t): 36.596 Batch (t): 44.687, 183.320/s, 91.6598/s/gpu LR: 0.000035 Logit Scale: 99.975 Imm_image: 29.349 (29.349) Imm_text: 29.349 (29.349) Isd_image: -0.042467 (-0.042467) Isd_text: -0.042467 (-0.042467) Contrastive_loss: 0.37044 (0.37044) Loss: 0.37044 (0.37044)
|
| 170 |
+
2025-04-27,22:01:54 | INFO | Train Epoch: 3 [827392/917504 (90%)] Data (t): 0.992 Batch (t): 8.996, 919.788/s, 459.894/s/gpu LR: 0.000031 Logit Scale: 100.000 Imm_image: 29.476 (29.412) Imm_text: 29.476 (29.412) Isd_image: -0.75747 (-0.39997) Isd_text: -0.75747 (-0.39997) Contrastive_loss: 0.37558 (0.37301) Loss: 0.37558 (0.37301)
|
| 171 |
+
2025-04-27,22:03:32 | INFO | Train Epoch: 3 [917504/917504 (100%)] Data (t): 0.935 Batch (t): 8.913, 917.124/s, 458.562/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.571 (29.465) Imm_text: 29.571 (29.465) Isd_image: -0.97516 (-0.59170) Isd_text: -0.97516 (-0.59170) Contrastive_loss: 0.31964 (0.35522) Loss: 0.31964 (0.35522)
|
| 172 |
+
2025-04-27,22:03:32 | INFO | Start epoch 4
|
| 173 |
+
2025-04-27,22:04:17 | INFO | Train Epoch: 4 [ 8192/917504 (1%)] Data (t): 36.657 Batch (t): 44.719, 183.187/s, 91.5936/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.600 (29.600) Imm_text: 29.600 (29.600) Isd_image: -0.96800 (-0.96800) Isd_text: -0.96800 (-0.96800) Contrastive_loss: 0.31170 (0.31170) Loss: 0.31170 (0.31170)
|
| 174 |
+
2025-04-27,22:19:17 | INFO | Train Epoch: 4 [827392/917504 (90%)] Data (t): 0.995 Batch (t): 9.002, 919.149/s, 459.574/s/gpu LR: 0.000024 Logit Scale: 100.000 Imm_image: 29.778 (29.689) Imm_text: 29.778 (29.689) Isd_image: -1.3826 (-1.1753) Isd_text: -1.3826 (-1.1753) Contrastive_loss: 0.30417 (0.30793) Loss: 0.30417 (0.30793)
|
| 175 |
+
2025-04-27,22:20:55 | INFO | Train Epoch: 4 [917504/917504 (100%)] Data (t): 0.948 Batch (t): 8.931, 914.345/s, 457.172/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.832 (29.737) Imm_text: 29.832 (29.737) Isd_image: -1.6278 (-1.3261) Isd_text: -1.6278 (-1.3261) Contrastive_loss: 0.25928 (0.29172) Loss: 0.25928 (0.29172)
|
| 176 |
+
2025-04-27,22:20:55 | INFO | Start epoch 5
|
| 177 |
+
2025-04-27,22:21:40 | INFO | Train Epoch: 5 [ 8192/917504 (1%)] Data (t): 35.569 Batch (t): 44.337, 184.767/s, 92.3833/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.864 (29.864) Imm_text: 29.864 (29.864) Isd_image: -1.5245 (-1.5245) Isd_text: -1.5245 (-1.5245) Contrastive_loss: 0.28583 (0.28583) Loss: 0.28583 (0.28583)
|
| 178 |
+
2025-04-27,22:36:41 | INFO | Train Epoch: 5 [827392/917504 (90%)] Data (t): 1.005 Batch (t): 9.014, 915.904/s, 457.952/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 30.039 (29.951) Imm_text: 30.039 (29.951) Isd_image: -1.7178 (-1.6211) Isd_text: -1.7178 (-1.6211) Contrastive_loss: 0.26470 (0.27526) Loss: 0.26470 (0.27526)
|
| 179 |
+
2025-04-27,22:38:19 | INFO | Train Epoch: 5 [917504/917504 (100%)] Data (t): 0.956 Batch (t): 8.938, 913.227/s, 456.613/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 29.983 (29.962) Imm_text: 29.983 (29.962) Isd_image: -1.5348 (-1.5924) Isd_text: -1.5348 (-1.5924) Contrastive_loss: 0.25894 (0.26982) Loss: 0.25894 (0.26982)
|
| 180 |
+
2025-04-27,22:38:20 | INFO | Start epoch 6
|
| 181 |
+
2025-04-27,22:39:05 | INFO | Train Epoch: 6 [ 8192/917504 (1%)] Data (t): 36.913 Batch (t): 45.213, 181.186/s, 90.5929/s/gpu LR: 0.000016 Logit Scale: 100.000 Imm_image: 30.205 (30.205) Imm_text: 30.205 (30.205) Isd_image: -1.5840 (-1.5840) Isd_text: -1.5840 (-1.5840) Contrastive_loss: 0.21889 (0.21889) Loss: 0.21889 (0.21889)
|
| 182 |
+
2025-04-27,22:54:06 | INFO | Train Epoch: 6 [827392/917504 (90%)] Data (t): 1.001 Batch (t): 9.008, 918.420/s, 459.210/s/gpu LR: 0.000011 Logit Scale: 100.000 Imm_image: 30.129 (30.167) Imm_text: 30.129 (30.167) Isd_image: -1.8268 (-1.7054) Isd_text: -1.8268 (-1.7054) Contrastive_loss: 0.24040 (0.22965) Loss: 0.24040 (0.22965)
|
| 183 |
+
2025-04-27,22:55:44 | INFO | Train Epoch: 6 [917504/917504 (100%)] Data (t): 0.956 Batch (t): 8.938, 914.360/s, 457.180/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.250 (30.195) Imm_text: 30.250 (30.195) Isd_image: -1.7475 (-1.7195) Isd_text: -1.7475 (-1.7195) Contrastive_loss: 0.22036 (0.22655) Loss: 0.22036 (0.22655)
|
| 184 |
+
2025-04-27,22:55:44 | INFO | Start epoch 7
|
| 185 |
+
2025-04-27,22:56:29 | INFO | Train Epoch: 7 [ 8192/917504 (1%)] Data (t): 36.050 Batch (t): 44.469, 184.219/s, 92.1094/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.307 (30.307) Imm_text: 30.307 (30.307) Isd_image: -1.7967 (-1.7967) Isd_text: -1.7967 (-1.7967) Contrastive_loss: 0.22148 (0.22148) Loss: 0.22148 (0.22148)
|
| 186 |
+
2025-04-27,23:11:28 | INFO | Train Epoch: 7 [827392/917504 (90%)] Data (t): 0.991 Batch (t): 8.997, 918.481/s, 459.241/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.379 (30.343) Imm_text: 30.379 (30.343) Isd_image: -1.9151 (-1.8559) Isd_text: -1.9151 (-1.8559) Contrastive_loss: 0.21597 (0.21872) Loss: 0.21597 (0.21872)
|
| 187 |
+
2025-04-27,23:13:07 | INFO | Train Epoch: 7 [917504/917504 (100%)] Data (t): 0.950 Batch (t): 8.932, 914.317/s, 457.159/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.325 (30.337) Imm_text: 30.325 (30.337) Isd_image: -1.7899 (-1.8339) Isd_text: -1.7899 (-1.8339) Contrastive_loss: 0.21086 (0.21610) Loss: 0.21086 (0.21610)
|
| 188 |
+
2025-04-27,23:13:07 | INFO | Start epoch 8
|
| 189 |
+
2025-04-27,23:13:52 | INFO | Train Epoch: 8 [ 8192/917504 (1%)] Data (t): 36.134 Batch (t): 44.652, 183.463/s, 91.7315/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.386 (30.386) Imm_text: 30.386 (30.386) Isd_image: -1.8206 (-1.8206) Isd_text: -1.8206 (-1.8206) Contrastive_loss: 0.21730 (0.21730) Loss: 0.21730 (0.21730)
|
| 190 |
+
2025-04-27,23:28:51 | INFO | Train Epoch: 8 [827392/917504 (90%)] Data (t): 0.992 Batch (t): 8.997, 919.420/s, 459.710/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.363 (30.374) Imm_text: 30.363 (30.374) Isd_image: -1.8669 (-1.8437) Isd_text: -1.8669 (-1.8437) Contrastive_loss: 0.22544 (0.22137) Loss: 0.22544 (0.22137)
|
| 191 |
+
2025-04-27,23:30:29 | INFO | Train Epoch: 8 [917504/917504 (100%)] Data (t): 0.946 Batch (t): 8.925, 915.526/s, 457.763/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.382 (30.377) Imm_text: 30.382 (30.377) Isd_image: -1.8523 (-1.8466) Isd_text: -1.8523 (-1.8466) Contrastive_loss: 0.20832 (0.21702) Loss: 0.20832 (0.21702)
|
| 192 |
+
2025-04-27,23:30:30 | INFO | Start epoch 9
|
| 193 |
+
2025-04-27,23:31:14 | INFO | Train Epoch: 9 [ 8192/917504 (1%)] Data (t): 36.011 Batch (t): 44.621, 183.592/s, 91.7960/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.424 (30.424) Imm_text: 30.424 (30.424) Isd_image: -1.8386 (-1.8386) Isd_text: -1.8386 (-1.8386) Contrastive_loss: 0.20019 (0.20019) Loss: 0.20019 (0.20019)
|
| 194 |
+
2025-04-27,23:46:16 | INFO | Train Epoch: 9 [827392/917504 (90%)] Data (t): 1.009 Batch (t): 9.018, 919.090/s, 459.545/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.359 (30.392) Imm_text: 30.359 (30.392) Isd_image: -1.8356 (-1.8371) Isd_text: -1.8356 (-1.8371) Contrastive_loss: 0.22642 (0.21331) Loss: 0.22642 (0.21331)
|
| 195 |
+
2025-04-27,23:47:54 | INFO | Train Epoch: 9 [917504/917504 (100%)] Data (t): 0.949 Batch (t): 8.931, 917.092/s, 458.546/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.506 (30.430) Imm_text: 30.506 (30.430) Isd_image: -1.8817 (-1.8520) Isd_text: -1.8817 (-1.8520) Contrastive_loss: 0.18029 (0.20230) Loss: 0.18029 (0.20230)
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/low_inter_only/tensorboard/events.out.tfevents.1745780012.g12.2713681.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:265d4b9d7c4756414318fe99f7568a80b7462cd2872802cdd17f1c9d6f497c9a
|
| 3 |
+
size 19936
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/clusters/image_I_closest_0.1_SFR-Embedding-Code-2B_R_dinov2-large.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca2641da262316bb2b895abae3224ab6cc166b82a2dd4f65b1fe1f67cb907350
|
| 3 |
+
size 228262016
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/clusters/image_farest.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b60d6b08a008e194de90c00df9913a0203d8898fd75763bf2d6a84840c64192e
|
| 3 |
+
size 124009441
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/clusters/text_farest.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51013bcfe4a79c5ee87068f55c96d5a0cb0bfb0c1899f48a634c42240d5a2e2f
|
| 3 |
+
size 124084962
|
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/clusters/text_uniform.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7c39234fd0396c1f512ea29d8efbb4e717734421d7132a2dcf073af3dabb23ce
|
| 3 |
+
size 124061847
|
captions.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38b710ad65f51fd80e92d8ea2e1aabf1df625f9720d7626d9724e48d4d09b3d8
|
| 3 |
+
size 771831271
|