cornuHGF commited on
Commit
f9a2f4e
·
verified ·
1 Parent(s): 5502320

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  2. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  3. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  4. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json +1 -0
  5. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  6. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  7. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  8. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  9. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  10. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  11. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  12. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/out.log +195 -0
  13. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/params.txt +103 -0
  14. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  15. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  16. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  17. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  18. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  19. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  20. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  21. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  22. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json +1 -0
  23. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  24. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  25. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  26. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  27. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json +1 -0
  28. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  29. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  30. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  31. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  32. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/out.log +195 -0
  33. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/params.txt +103 -0
  34. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  35. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  36. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  37. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  38. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  39. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  40. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  41. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  42. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json +1 -0
  43. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  44. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  45. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  46. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  47. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json +1 -0
  48. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  49. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
  50. SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json +1 -0
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "cifar100", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7589, "acc5": 0.9387, "mean_per_class_recall": 0.7590999999999998}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "country211", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.1975829383886256, "acc5": 0.4209952606635071, "mean_per_class_recall": 0.19763033175355452}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "dtd", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5484042553191489, "acc5": 0.8191489361702128, "mean_per_class_recall": 0.547872340425532}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "flickr30k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.656000018119812, "text_retrieval_recall@1": 0.8009999990463257, "image_retrieval_recall@5": 0.8655999898910522, "text_retrieval_recall@5": 0.9539999961853027, "image_retrieval_recall@10": 0.9193999767303467, "text_retrieval_recall@10": 0.9760000109672546}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "flowers", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7487396324605627, "acc5": 0.897869572288177, "mean_per_class_recall": 0.7396028874215905}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "food101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8691485148514851, "acc5": 0.980039603960396, "mean_per_class_recall": 0.8692277227722772}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "gtsrb", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.510609659540776, "acc5": 0.7530482977038796, "mean_per_class_recall": 0.4616824052066722}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "imagenet1k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.68688, "acc5": 0.90938, "mean_per_class_recall": 0.6868800000000002}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "pets", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8947942218588171, "acc5": 0.9964568002180431, "mean_per_class_recall": 0.8933304274009377}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "sun397", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6720764293727127, "acc5": 0.928158964267981, "mean_per_class_recall": 0.6638586591957565}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "vtab/resisc45", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6063492063492063, "acc5": 0.9077777777777778, "mean_per_class_recall": 0.6129633195377677}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/out.log ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-04-26,19:20:36 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
2
+ 2025-04-26,19:20:36 | INFO | Loaded ViT-B-16 model config.
3
+ 2025-04-26,19:20:37 | INFO | Loading pretrained ViT-B-16 weights (datacomp_xl_s13b_b90k).
4
+ 2025-04-26,19:20:38 | INFO | Model:
5
+ 2025-04-26,19:20:38 | INFO | CLIP(
6
+ (visual): VisionTransformer(
7
+ (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
8
+ (patch_dropout): Identity()
9
+ (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
10
+ (transformer): Transformer(
11
+ (resblocks): ModuleList(
12
+ (0-11): 12 x ResidualAttentionBlock(
13
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
14
+ (attn): MultiheadAttention(
15
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
16
+ )
17
+ (ls_1): Identity()
18
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
19
+ (mlp): Sequential(
20
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
21
+ (gelu): GELU(approximate='none')
22
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
23
+ )
24
+ (ls_2): Identity()
25
+ )
26
+ )
27
+ )
28
+ (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
29
+ )
30
+ (transformer): Transformer(
31
+ (resblocks): ModuleList(
32
+ (0-11): 12 x ResidualAttentionBlock(
33
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
34
+ (attn): MultiheadAttention(
35
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
36
+ )
37
+ (ls_1): Identity()
38
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
39
+ (mlp): Sequential(
40
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
41
+ (gelu): GELU(approximate='none')
42
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
43
+ )
44
+ (ls_2): Identity()
45
+ )
46
+ )
47
+ )
48
+ (token_embedding): Embedding(49408, 512)
49
+ (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
50
+ )
51
+ 2025-04-26,19:20:38 | INFO | Params:
52
+ 2025-04-26,19:20:38 | INFO | accum_freq: 2
53
+ 2025-04-26,19:20:38 | INFO | aug_cfg: {}
54
+ 2025-04-26,19:20:38 | INFO | batch_size: 2048
55
+ 2025-04-26,19:20:38 | INFO | beta1: 0.9
56
+ 2025-04-26,19:20:38 | INFO | beta2: 0.98
57
+ 2025-04-26,19:20:38 | INFO | cache_dir: None
58
+ 2025-04-26,19:20:38 | INFO | caption_ratio: 0.1
59
+ 2025-04-26,19:20:38 | INFO | checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints
60
+ 2025-04-26,19:20:38 | INFO | coca_caption_loss_weight: 2.0
61
+ 2025-04-26,19:20:38 | INFO | coca_contrastive_loss_weight: 1.0
62
+ 2025-04-26,19:20:38 | INFO | copy_codebase: False
63
+ 2025-04-26,19:20:38 | INFO | csv_caption_key: title
64
+ 2025-04-26,19:20:38 | INFO | csv_img_key: filepath
65
+ 2025-04-26,19:20:38 | INFO | csv_separator:
66
+ 2025-04-26,19:20:38 | INFO | dataset_resampled: False
67
+ 2025-04-26,19:20:38 | INFO | dataset_type: synthetic
68
+ 2025-04-26,19:20:38 | INFO | ddp_static_graph: False
69
+ 2025-04-26,19:20:38 | INFO | debug: False
70
+ 2025-04-26,19:20:38 | INFO | delete_previous_checkpoint: False
71
+ 2025-04-26,19:20:38 | INFO | device: cuda:0
72
+ 2025-04-26,19:20:38 | INFO | dist_backend: None
73
+ 2025-04-26,19:20:38 | INFO | dist_url: None
74
+ 2025-04-26,19:20:38 | INFO | distill: False
75
+ 2025-04-26,19:20:38 | INFO | distill_model: None
76
+ 2025-04-26,19:20:38 | INFO | distill_pretrained: None
77
+ 2025-04-26,19:20:38 | INFO | distributed: True
78
+ 2025-04-26,19:20:38 | INFO | epochs: 10
79
+ 2025-04-26,19:20:38 | INFO | epochs_cooldown: None
80
+ 2025-04-26,19:20:38 | INFO | eps: 1e-08
81
+ 2025-04-26,19:20:38 | INFO | force_custom_text: False
82
+ 2025-04-26,19:20:38 | INFO | force_image_size: None
83
+ 2025-04-26,19:20:38 | INFO | force_patch_dropout: None
84
+ 2025-04-26,19:20:38 | INFO | force_quick_gelu: False
85
+ 2025-04-26,19:20:38 | INFO | gather_with_grad: True
86
+ 2025-04-26,19:20:38 | INFO | grad_checkpointing: True
87
+ 2025-04-26,19:20:38 | INFO | grad_clip_norm: None
88
+ 2025-04-26,19:20:38 | INFO | horovod: False
89
+ 2025-04-26,19:20:38 | INFO | image_interpolation: None
90
+ 2025-04-26,19:20:38 | INFO | image_mean: None
91
+ 2025-04-26,19:20:38 | INFO | image_resize_mode: None
92
+ 2025-04-26,19:20:38 | INFO | image_std: None
93
+ 2025-04-26,19:20:38 | INFO | imagenet_v2: None
94
+ 2025-04-26,19:20:38 | INFO | imagenet_val: None
95
+ 2025-04-26,19:20:38 | INFO | keep_func_name: keep_image_farest
96
+ 2025-04-26,19:20:38 | INFO | local_loss: False
97
+ 2025-04-26,19:20:38 | INFO | local_rank: 0
98
+ 2025-04-26,19:20:38 | INFO | lock_image: False
99
+ 2025-04-26,19:20:38 | INFO | lock_image_freeze_bn_stats: False
100
+ 2025-04-26,19:20:38 | INFO | lock_image_unlocked_groups: 0
101
+ 2025-04-26,19:20:38 | INFO | lock_text: True
102
+ 2025-04-26,19:20:38 | INFO | lock_text_freeze_layer_norm: False
103
+ 2025-04-26,19:20:38 | INFO | lock_text_unlocked_layers: 0
104
+ 2025-04-26,19:20:38 | INFO | log_every_n_steps: 100
105
+ 2025-04-26,19:20:38 | INFO | log_level: 20
106
+ 2025-04-26,19:20:38 | INFO | log_local: False
107
+ 2025-04-26,19:20:38 | INFO | log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/out.log
108
+ 2025-04-26,19:20:38 | INFO | logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text
109
+ 2025-04-26,19:20:38 | INFO | loss_dist_impl: None
110
+ 2025-04-26,19:20:38 | INFO | lr: 4e-05
111
+ 2025-04-26,19:20:38 | INFO | lr_cooldown_end: 0.0
112
+ 2025-04-26,19:20:38 | INFO | lr_cooldown_power: 1.0
113
+ 2025-04-26,19:20:38 | INFO | lr_scheduler: cosine
114
+ 2025-04-26,19:20:38 | INFO | map_func_name: use_all
115
+ 2025-04-26,19:20:38 | INFO | model: ViT-B-16
116
+ 2025-04-26,19:20:38 | INFO | momentum: None
117
+ 2025-04-26,19:20:38 | INFO | name: keep_image_farest
118
+ 2025-04-26,19:20:38 | INFO | no_set_device_rank: False
119
+ 2025-04-26,19:20:38 | INFO | opt: adamw
120
+ 2025-04-26,19:20:38 | INFO | precision: amp
121
+ 2025-04-26,19:20:38 | INFO | pretrained: datacomp_xl_s13b_b90k
122
+ 2025-04-26,19:20:38 | INFO | pretrained_image: False
123
+ 2025-04-26,19:20:38 | INFO | rank: 0
124
+ 2025-04-26,19:20:38 | INFO | remote_sync: None
125
+ 2025-04-26,19:20:38 | INFO | remote_sync_frequency: 300
126
+ 2025-04-26,19:20:38 | INFO | remote_sync_protocol: s3
127
+ 2025-04-26,19:20:38 | INFO | report_to: tensorboard,wandb
128
+ 2025-04-26,19:20:38 | INFO | resume: None
129
+ 2025-04-26,19:20:38 | INFO | save_frequency: 10
130
+ 2025-04-26,19:20:38 | INFO | save_most_recent: False
131
+ 2025-04-26,19:20:38 | INFO | seed: 0
132
+ 2025-04-26,19:20:38 | INFO | siglip: False
133
+ 2025-04-26,19:20:38 | INFO | skip_scheduler: False
134
+ 2025-04-26,19:20:38 | INFO | tensorboard: True
135
+ 2025-04-26,19:20:38 | INFO | tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/tensorboard
136
+ 2025-04-26,19:20:38 | INFO | torchcompile: False
137
+ 2025-04-26,19:20:38 | INFO | torchscript: False
138
+ 2025-04-26,19:20:38 | INFO | trace: False
139
+ 2025-04-26,19:20:38 | INFO | train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar
140
+ 2025-04-26,19:20:38 | INFO | train_data_upsampling_factors: None
141
+ 2025-04-26,19:20:38 | INFO | train_num_samples: 9011874
142
+ 2025-04-26,19:20:38 | INFO | use_bn_sync: False
143
+ 2025-04-26,19:20:38 | INFO | use_bnb_linear: None
144
+ 2025-04-26,19:20:38 | INFO | val_data: None
145
+ 2025-04-26,19:20:38 | INFO | val_frequency: 1
146
+ 2025-04-26,19:20:38 | INFO | val_num_samples: None
147
+ 2025-04-26,19:20:38 | INFO | wandb: True
148
+ 2025-04-26,19:20:38 | INFO | wandb_notes:
149
+ 2025-04-26,19:20:38 | INFO | wandb_project_name: open-clip
150
+ 2025-04-26,19:20:38 | INFO | warmup: 110
151
+ 2025-04-26,19:20:38 | INFO | wd: 0.5
152
+ 2025-04-26,19:20:38 | INFO | workers: 16
153
+ 2025-04-26,19:20:38 | INFO | world_size: 2
154
+ 2025-04-26,19:20:38 | INFO | zeroshot_frequency: 2
155
+ 2025-04-26,19:20:39 | INFO | Created AdamW (adamw) optimizer: lr: 4e-05, betas: (0.9, 0.98), eps: 1e-08, weight_decay: 0.5, amsgrad: False, foreach: None, maximize: False, capturable: False, differentiable: False, fused: None
156
+ 2025-04-26,19:20:57 | INFO | Start epoch 0
157
+ 2025-04-26,19:21:51 | INFO | Train Epoch: 0 [ 8192/917504 (1%)] Data (t): 44.342 Batch (t): 53.918, 151.934/s, 75.9669/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 28.554 (28.554) Imm_text: 28.554 (28.554) Isd_image: 3.2214 (3.2214) Isd_text: 3.2214 (3.2214) Contrastive_loss: 1.4326 (1.4326) Loss: 1.4326 (1.4326)
158
+ 2025-04-26,19:36:52 | INFO | Train Epoch: 0 [827392/917504 (90%)] Data (t): 0.998 Batch (t): 9.004, 917.588/s, 458.794/s/gpu LR: 0.000037 Logit Scale: 99.933 Imm_image: 28.962 (28.758) Imm_text: 28.962 (28.758) Isd_image: 5.8420 (4.5317) Isd_text: 5.8420 (4.5317) Contrastive_loss: 0.73414 (1.0834) Loss: 0.73414 (1.0834)
159
+ 2025-04-26,19:38:30 | INFO | Train Epoch: 0 [917504/917504 (100%)] Data (t): 0.953 Batch (t): 8.939, 913.115/s, 456.558/s/gpu LR: 0.000040 Logit Scale: 99.924 Imm_image: 28.947 (28.821) Imm_text: 28.947 (28.821) Isd_image: 5.6367 (4.9000) Isd_text: 5.6367 (4.9000) Contrastive_loss: 0.69624 (0.95433) Loss: 0.69624 (0.95433)
160
+ 2025-04-26,19:38:30 | INFO | Start epoch 1
161
+ 2025-04-26,19:39:13 | INFO | Train Epoch: 1 [ 8192/917504 (1%)] Data (t): 33.564 Batch (t): 43.142, 189.886/s, 94.9431/s/gpu LR: 0.000040 Logit Scale: 99.923 Imm_image: 29.097 (29.097) Imm_text: 29.097 (29.097) Isd_image: 5.5602 (5.5602) Isd_text: 5.5602 (5.5602) Contrastive_loss: 0.60997 (0.60997) Loss: 0.60997 (0.60997)
162
+ 2025-04-26,19:54:15 | INFO | Train Epoch: 1 [827392/917504 (90%)] Data (t): 1.005 Batch (t): 9.013, 917.390/s, 458.695/s/gpu LR: 0.000039 Logit Scale: 99.881 Imm_image: 28.927 (29.012) Imm_text: 28.927 (29.012) Isd_image: 4.1130 (4.8366) Isd_text: 4.1130 (4.8366) Contrastive_loss: 0.62246 (0.61621) Loss: 0.62246 (0.61621)
163
+ 2025-04-26,19:55:53 | INFO | Train Epoch: 1 [917504/917504 (100%)] Data (t): 0.953 Batch (t): 8.938, 915.751/s, 457.875/s/gpu LR: 0.000039 Logit Scale: 99.881 Imm_image: 29.309 (29.111) Imm_text: 29.309 (29.111) Isd_image: 3.7556 (4.4762) Isd_text: 3.7556 (4.4762) Contrastive_loss: 0.49672 (0.57638) Loss: 0.49672 (0.57638)
164
+ 2025-04-26,19:55:53 | INFO | Start epoch 2
165
+ 2025-04-26,19:56:39 | INFO | Train Epoch: 2 [ 8192/917504 (1%)] Data (t): 35.471 Batch (t): 45.796, 178.882/s, 89.4409/s/gpu LR: 0.000039 Logit Scale: 99.882 Imm_image: 29.290 (29.290) Imm_text: 29.290 (29.290) Isd_image: 3.6671 (3.6671) Isd_text: 3.6671 (3.6671) Contrastive_loss: 0.49962 (0.49962) Loss: 0.49962 (0.49962)
166
+ 2025-04-26,20:11:40 | INFO | Train Epoch: 2 [827392/917504 (90%)] Data (t): 0.999 Batch (t): 9.008, 913.482/s, 456.741/s/gpu LR: 0.000036 Logit Scale: 100.000 Imm_image: 29.430 (29.360) Imm_text: 29.430 (29.360) Isd_image: 2.6187 (3.1429) Isd_text: 2.6187 (3.1429) Contrastive_loss: 0.45664 (0.47813) Loss: 0.45664 (0.47813)
167
+ 2025-04-26,20:13:18 | INFO | Train Epoch: 2 [917504/917504 (100%)] Data (t): 0.958 Batch (t): 8.943, 914.170/s, 457.085/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.466 (29.396) Imm_text: 29.466 (29.396) Isd_image: 2.6329 (2.9729) Isd_text: 2.6329 (2.9729) Contrastive_loss: 0.41372 (0.45666) Loss: 0.41372 (0.45666)
168
+ 2025-04-26,20:13:18 | INFO | Start epoch 3
169
+ 2025-04-26,20:14:03 | INFO | Train Epoch: 3 [ 8192/917504 (1%)] Data (t): 36.546 Batch (t): 44.617, 183.607/s, 91.8035/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.504 (29.504) Imm_text: 29.504 (29.504) Isd_image: 2.5284 (2.5284) Isd_text: 2.5284 (2.5284) Contrastive_loss: 0.39021 (0.39021) Loss: 0.39021 (0.39021)
170
+ 2025-04-26,20:29:06 | INFO | Train Epoch: 3 [827392/917504 (90%)] Data (t): 1.015 Batch (t): 9.027, 917.585/s, 458.792/s/gpu LR: 0.000031 Logit Scale: 100.000 Imm_image: 29.626 (29.565) Imm_text: 29.626 (29.565) Isd_image: 1.6747 (2.1016) Isd_text: 1.6747 (2.1016) Contrastive_loss: 0.41409 (0.40215) Loss: 0.41409 (0.40215)
171
+ 2025-04-26,20:30:44 | INFO | Train Epoch: 3 [917504/917504 (100%)] Data (t): 0.962 Batch (t): 8.949, 915.455/s, 457.728/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.619 (29.583) Imm_text: 29.619 (29.583) Isd_image: 1.6334 (1.9455) Isd_text: 1.6334 (1.9455) Contrastive_loss: 0.36740 (0.39056) Loss: 0.36740 (0.39056)
172
+ 2025-04-26,20:30:44 | INFO | Start epoch 4
173
+ 2025-04-26,20:31:29 | INFO | Train Epoch: 4 [ 8192/917504 (1%)] Data (t): 35.128 Batch (t): 44.257, 185.102/s, 92.5508/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.757 (29.757) Imm_text: 29.757 (29.757) Isd_image: 1.5362 (1.5362) Isd_text: 1.5362 (1.5362) Contrastive_loss: 0.33672 (0.33672) Loss: 0.33672 (0.33672)
174
+ 2025-04-26,20:46:29 | INFO | Train Epoch: 4 [827392/917504 (90%)] Data (t): 0.997 Batch (t): 9.004, 917.946/s, 458.973/s/gpu LR: 0.000024 Logit Scale: 100.000 Imm_image: 29.850 (29.804) Imm_text: 29.850 (29.804) Isd_image: 1.1166 (1.3264) Isd_text: 1.1166 (1.3264) Contrastive_loss: 0.33999 (0.33835) Loss: 0.33999 (0.33835)
175
+ 2025-04-26,20:48:07 | INFO | Train Epoch: 4 [917504/917504 (100%)] Data (t): 0.954 Batch (t): 8.939, 915.946/s, 457.973/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.934 (29.847) Imm_text: 29.934 (29.847) Isd_image: 1.0718 (1.2415) Isd_text: 1.0718 (1.2415) Contrastive_loss: 0.28950 (0.32207) Loss: 0.28950 (0.32207)
176
+ 2025-04-26,20:48:08 | INFO | Start epoch 5
177
+ 2025-04-26,20:48:52 | INFO | Train Epoch: 5 [ 8192/917504 (1%)] Data (t): 36.121 Batch (t): 44.476, 184.189/s, 92.0946/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.804 (29.804) Imm_text: 29.804 (29.804) Isd_image: 1.1836 (1.1836) Isd_text: 1.1836 (1.1836) Contrastive_loss: 0.33592 (0.33592) Loss: 0.33592 (0.33592)
178
+ 2025-04-26,21:03:54 | INFO | Train Epoch: 5 [827392/917504 (90%)] Data (t): 1.010 Batch (t): 9.022, 916.297/s, 458.148/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 30.026 (29.915) Imm_text: 30.026 (29.915) Isd_image: 0.84884 (1.0162) Isd_text: 0.84884 (1.0162) Contrastive_loss: 0.26665 (0.30128) Loss: 0.26665 (0.30128)
179
+ 2025-04-26,21:05:33 | INFO | Train Epoch: 5 [917504/917504 (100%)] Data (t): 0.952 Batch (t): 8.936, 915.004/s, 457.502/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 29.985 (29.938) Imm_text: 29.985 (29.938) Isd_image: 0.86452 (0.96566) Isd_text: 0.86452 (0.96566) Contrastive_loss: 0.26700 (0.28985) Loss: 0.26700 (0.28985)
180
+ 2025-04-26,21:05:33 | INFO | Start epoch 6
181
+ 2025-04-26,21:06:18 | INFO | Train Epoch: 6 [ 8192/917504 (1%)] Data (t): 36.805 Batch (t): 44.934, 182.311/s, 91.1555/s/gpu LR: 0.000016 Logit Scale: 100.000 Imm_image: 30.204 (30.204) Imm_text: 30.204 (30.204) Isd_image: 0.70070 (0.70070) Isd_text: 0.70070 (0.70070) Contrastive_loss: 0.23083 (0.23083) Loss: 0.23083 (0.23083)
182
+ 2025-04-26,21:21:18 | INFO | Train Epoch: 6 [827392/917504 (90%)] Data (t): 0.999 Batch (t): 9.007, 917.786/s, 458.893/s/gpu LR: 0.000011 Logit Scale: 100.000 Imm_image: 30.193 (30.199) Imm_text: 30.193 (30.199) Isd_image: 0.75573 (0.72822) Isd_text: 0.75573 (0.72822) Contrastive_loss: 0.24233 (0.23658) Loss: 0.24233 (0.23658)
183
+ 2025-04-26,21:22:57 | INFO | Train Epoch: 6 [917504/917504 (100%)] Data (t): 0.954 Batch (t): 8.940, 916.499/s, 458.250/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.249 (30.216) Imm_text: 30.249 (30.216) Isd_image: 0.70645 (0.72096) Isd_text: 0.70645 (0.72096) Contrastive_loss: 0.21447 (0.22921) Loss: 0.21447 (0.22921)
184
+ 2025-04-26,21:22:57 | INFO | Start epoch 7
185
+ 2025-04-26,21:23:42 | INFO | Train Epoch: 7 [ 8192/917504 (1%)] Data (t): 36.516 Batch (t): 44.643, 183.500/s, 91.7499/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.320 (30.320) Imm_text: 30.320 (30.320) Isd_image: 0.57832 (0.57832) Isd_text: 0.57832 (0.57832) Contrastive_loss: 0.22137 (0.22137) Loss: 0.22137 (0.22137)
186
+ 2025-04-26,21:38:42 | INFO | Train Epoch: 7 [827392/917504 (90%)] Data (t): 0.998 Batch (t): 9.006, 915.734/s, 457.867/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.290 (30.305) Imm_text: 30.290 (30.305) Isd_image: 0.58627 (0.58230) Isd_text: 0.58627 (0.58230) Contrastive_loss: 0.24158 (0.23147) Loss: 0.24158 (0.23147)
187
+ 2025-04-26,21:40:21 | INFO | Train Epoch: 7 [917504/917504 (100%)] Data (t): 0.959 Batch (t): 8.944, 914.296/s, 457.148/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.336 (30.315) Imm_text: 30.336 (30.315) Isd_image: 0.57076 (0.57845) Isd_text: 0.57076 (0.57845) Contrastive_loss: 0.20793 (0.22363) Loss: 0.20793 (0.22363)
188
+ 2025-04-26,21:40:21 | INFO | Start epoch 8
189
+ 2025-04-26,21:41:05 | INFO | Train Epoch: 8 [ 8192/917504 (1%)] Data (t): 34.989 Batch (t): 44.032, 186.047/s, 93.0235/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.384 (30.384) Imm_text: 30.384 (30.384) Isd_image: 0.50557 (0.50557) Isd_text: 0.50557 (0.50557) Contrastive_loss: 0.19039 (0.19039) Loss: 0.19039 (0.19039)
190
+ 2025-04-26,21:56:10 | INFO | Train Epoch: 8 [827392/917504 (90%)] Data (t): 1.028 Batch (t): 9.047, 913.226/s, 456.613/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.238 (30.311) Imm_text: 30.238 (30.311) Isd_image: 0.65661 (0.58109) Isd_text: 0.65661 (0.58109) Contrastive_loss: 0.24421 (0.21730) Loss: 0.24421 (0.21730)
191
+ 2025-04-26,21:57:48 | INFO | Train Epoch: 8 [917504/917504 (100%)] Data (t): 0.951 Batch (t): 8.934, 914.753/s, 457.376/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.413 (30.345) Imm_text: 30.413 (30.345) Isd_image: 0.54766 (0.56995) Isd_text: 0.54766 (0.56995) Contrastive_loss: 0.19143 (0.20867) Loss: 0.19143 (0.20867)
192
+ 2025-04-26,21:57:48 | INFO | Start epoch 9
193
+ 2025-04-26,21:58:33 | INFO | Train Epoch: 9 [ 8192/917504 (1%)] Data (t): 36.704 Batch (t): 44.784, 182.922/s, 91.4612/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.425 (30.425) Imm_text: 30.425 (30.425) Isd_image: 0.52132 (0.52132) Isd_text: 0.52132 (0.52132) Contrastive_loss: 0.20011 (0.20011) Loss: 0.20011 (0.20011)
194
+ 2025-04-26,22:13:32 | INFO | Train Epoch: 9 [827392/917504 (90%)] Data (t): 0.981 Batch (t): 8.986, 916.852/s, 458.426/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.419 (30.422) Imm_text: 30.419 (30.422) Isd_image: 0.57758 (0.54945) Isd_text: 0.57758 (0.54945) Contrastive_loss: 0.21143 (0.20577) Loss: 0.21143 (0.20577)
195
+ 2025-04-26,22:15:10 | INFO | Train Epoch: 9 [917504/917504 (100%)] Data (t): 0.936 Batch (t): 8.913, 916.256/s, 458.128/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.455 (30.433) Imm_text: 30.455 (30.433) Isd_image: 0.51568 (0.53820) Isd_text: 0.51568 (0.53820) Contrastive_loss: 0.18257 (0.19804) Loss: 0.18257 (0.19804)
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/params.txt ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 2
2
+ aug_cfg: {}
3
+ batch_size: 2048
4
+ beta1: 0.9
5
+ beta2: 0.98
6
+ cache_dir: None
7
+ caption_ratio: 0.1
8
+ checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/checkpoints
9
+ coca_caption_loss_weight: 2.0
10
+ coca_contrastive_loss_weight: 1.0
11
+ copy_codebase: False
12
+ csv_caption_key: title
13
+ csv_img_key: filepath
14
+ csv_separator:
15
+ dataset_resampled: False
16
+ dataset_type: synthetic
17
+ ddp_static_graph: False
18
+ debug: False
19
+ delete_previous_checkpoint: False
20
+ device: cuda:0
21
+ dist_backend: None
22
+ dist_url: None
23
+ distill: False
24
+ distill_model: None
25
+ distill_pretrained: None
26
+ distributed: True
27
+ epochs: 10
28
+ epochs_cooldown: None
29
+ eps: 1e-08
30
+ force_custom_text: False
31
+ force_image_size: None
32
+ force_patch_dropout: None
33
+ force_quick_gelu: False
34
+ gather_with_grad: True
35
+ grad_checkpointing: True
36
+ grad_clip_norm: None
37
+ horovod: False
38
+ image_interpolation: None
39
+ image_mean: None
40
+ image_resize_mode: None
41
+ image_std: None
42
+ imagenet_v2: None
43
+ imagenet_val: None
44
+ keep_func_name: keep_image_farest
45
+ local_loss: False
46
+ local_rank: 0
47
+ lock_image: False
48
+ lock_image_freeze_bn_stats: False
49
+ lock_image_unlocked_groups: 0
50
+ lock_text: True
51
+ lock_text_freeze_layer_norm: False
52
+ lock_text_unlocked_layers: 0
53
+ log_every_n_steps: 100
54
+ log_level: 20
55
+ log_local: False
56
+ log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/out.log
57
+ logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text
58
+ loss_dist_impl: None
59
+ lr: 4e-05
60
+ lr_cooldown_end: 0.0
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ map_func_name: use_all
64
+ model: ViT-B-16
65
+ momentum: None
66
+ name: keep_image_farest
67
+ no_set_device_rank: False
68
+ opt: adamw
69
+ precision: amp
70
+ pretrained: datacomp_xl_s13b_b90k
71
+ pretrained_image: False
72
+ rank: 0
73
+ remote_sync: None
74
+ remote_sync_frequency: 300
75
+ remote_sync_protocol: s3
76
+ report_to: tensorboard,wandb
77
+ resume: None
78
+ save_frequency: 10
79
+ save_most_recent: False
80
+ seed: 0
81
+ siglip: False
82
+ skip_scheduler: False
83
+ tensorboard: True
84
+ tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_image_farest/tensorboard
85
+ torchcompile: False
86
+ torchscript: False
87
+ trace: False
88
+ train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar
89
+ train_data_upsampling_factors: None
90
+ train_num_samples: 9011874
91
+ use_bn_sync: False
92
+ use_bnb_linear: None
93
+ val_data: None
94
+ val_frequency: 1
95
+ val_num_samples: None
96
+ wandb: True
97
+ wandb_notes:
98
+ wandb_project_name: open-clip
99
+ warmup: 110
100
+ wd: 0.5
101
+ workers: 16
102
+ world_size: 2
103
+ zeroshot_frequency: 2
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "caltech101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8642825896762905, "acc5": 0.968503937007874, "mean_per_class_recall": 0.9268833112583469}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "cars", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8307424449695312, "acc5": 0.9890560875512996, "mean_per_class_recall": 0.831079202139019}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "cifar100", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7507, "acc5": 0.9307, "mean_per_class_recall": 0.7508}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "cifar10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.9424, "acc5": 0.9983, "mean_per_class_recall": 0.9426}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "country211", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.19488151658767772, "acc5": 0.42402843601895734, "mean_per_class_recall": 0.19497630331753554}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "dtd", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5574468085106383, "acc5": 0.8218085106382979, "mean_per_class_recall": 0.5574468085106383}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "eurosat", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.4812222222222222, "acc5": 0.9082592592592592, "mean_per_class_recall": 0.49873}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "fgvc_aircraft", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.24302430243024303, "acc5": 0.5886588658865887, "mean_per_class_recall": 0.24406417112299464}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "flickr30k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6453999876976013, "text_retrieval_recall@1": 0.8119999766349792, "image_retrieval_recall@5": 0.8736000061035156, "text_retrieval_recall@5": 0.9599999785423279, "image_retrieval_recall@10": 0.9232000112533569, "text_retrieval_recall@10": 0.9769999980926514}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "flowers", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7315010570824524, "acc5": 0.9033989266547406, "mean_per_class_recall": 0.7427204309687857}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "food101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8706930693069307, "acc5": 0.981069306930693, "mean_per_class_recall": 0.8706138613861385}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "gtsrb", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5311955661124307, "acc5": 0.7740300870942202, "mean_per_class_recall": 0.5028962559086422}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "imagenet1k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.68738, "acc5": 0.90974, "mean_per_class_recall": 0.68722}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "mscoco_captions", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.38168731331825256, "text_retrieval_recall@1": 0.5389999747276306, "image_retrieval_recall@5": 0.6399040222167969, "text_retrieval_recall@5": 0.7764000296592712, "image_retrieval_recall@10": 0.7417033314704895, "text_retrieval_recall@10": 0.8560000061988831}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "pets", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8997001907876806, "acc5": 0.9964568002180431, "mean_per_class_recall": 0.8992468186086529}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "stl10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.970375, "acc5": 0.999875, "mean_per_class_recall": 0.9705}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "sun397", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6735016643066002, "acc5": 0.927404969012634, "mean_per_class_recall": 0.6633693099922054}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/benchmark_vtab_resisc45_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "vtab/resisc45", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6115873015873016, "acc5": 0.9157142857142857, "mean_per_class_recall": 0.6191834588628579}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/out.log ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-04-27,01:37:31 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
2
+ 2025-04-27,01:37:31 | INFO | Loaded ViT-B-16 model config.
3
+ 2025-04-27,01:37:33 | INFO | Loading pretrained ViT-B-16 weights (datacomp_xl_s13b_b90k).
4
+ 2025-04-27,01:37:33 | INFO | Model:
5
+ 2025-04-27,01:37:33 | INFO | CLIP(
6
+ (visual): VisionTransformer(
7
+ (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
8
+ (patch_dropout): Identity()
9
+ (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
10
+ (transformer): Transformer(
11
+ (resblocks): ModuleList(
12
+ (0-11): 12 x ResidualAttentionBlock(
13
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
14
+ (attn): MultiheadAttention(
15
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
16
+ )
17
+ (ls_1): Identity()
18
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
19
+ (mlp): Sequential(
20
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
21
+ (gelu): GELU(approximate='none')
22
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
23
+ )
24
+ (ls_2): Identity()
25
+ )
26
+ )
27
+ )
28
+ (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
29
+ )
30
+ (transformer): Transformer(
31
+ (resblocks): ModuleList(
32
+ (0-11): 12 x ResidualAttentionBlock(
33
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
34
+ (attn): MultiheadAttention(
35
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
36
+ )
37
+ (ls_1): Identity()
38
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
39
+ (mlp): Sequential(
40
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
41
+ (gelu): GELU(approximate='none')
42
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
43
+ )
44
+ (ls_2): Identity()
45
+ )
46
+ )
47
+ )
48
+ (token_embedding): Embedding(49408, 512)
49
+ (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
50
+ )
51
+ 2025-04-27,01:37:33 | INFO | Params:
52
+ 2025-04-27,01:37:33 | INFO | accum_freq: 2
53
+ 2025-04-27,01:37:33 | INFO | aug_cfg: {}
54
+ 2025-04-27,01:37:33 | INFO | batch_size: 2048
55
+ 2025-04-27,01:37:33 | INFO | beta1: 0.9
56
+ 2025-04-27,01:37:33 | INFO | beta2: 0.98
57
+ 2025-04-27,01:37:33 | INFO | cache_dir: None
58
+ 2025-04-27,01:37:33 | INFO | caption_ratio: 0.1
59
+ 2025-04-27,01:37:33 | INFO | checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints
60
+ 2025-04-27,01:37:33 | INFO | coca_caption_loss_weight: 2.0
61
+ 2025-04-27,01:37:33 | INFO | coca_contrastive_loss_weight: 1.0
62
+ 2025-04-27,01:37:33 | INFO | copy_codebase: False
63
+ 2025-04-27,01:37:33 | INFO | csv_caption_key: title
64
+ 2025-04-27,01:37:33 | INFO | csv_img_key: filepath
65
+ 2025-04-27,01:37:33 | INFO | csv_separator:
66
+ 2025-04-27,01:37:33 | INFO | dataset_resampled: False
67
+ 2025-04-27,01:37:33 | INFO | dataset_type: synthetic
68
+ 2025-04-27,01:37:33 | INFO | ddp_static_graph: False
69
+ 2025-04-27,01:37:33 | INFO | debug: False
70
+ 2025-04-27,01:37:33 | INFO | delete_previous_checkpoint: False
71
+ 2025-04-27,01:37:33 | INFO | device: cuda:0
72
+ 2025-04-27,01:37:33 | INFO | dist_backend: None
73
+ 2025-04-27,01:37:33 | INFO | dist_url: None
74
+ 2025-04-27,01:37:33 | INFO | distill: False
75
+ 2025-04-27,01:37:33 | INFO | distill_model: None
76
+ 2025-04-27,01:37:33 | INFO | distill_pretrained: None
77
+ 2025-04-27,01:37:33 | INFO | distributed: True
78
+ 2025-04-27,01:37:33 | INFO | epochs: 10
79
+ 2025-04-27,01:37:33 | INFO | epochs_cooldown: None
80
+ 2025-04-27,01:37:33 | INFO | eps: 1e-08
81
+ 2025-04-27,01:37:33 | INFO | force_custom_text: False
82
+ 2025-04-27,01:37:33 | INFO | force_image_size: None
83
+ 2025-04-27,01:37:33 | INFO | force_patch_dropout: None
84
+ 2025-04-27,01:37:33 | INFO | force_quick_gelu: False
85
+ 2025-04-27,01:37:33 | INFO | gather_with_grad: True
86
+ 2025-04-27,01:37:33 | INFO | grad_checkpointing: True
87
+ 2025-04-27,01:37:33 | INFO | grad_clip_norm: None
88
+ 2025-04-27,01:37:33 | INFO | horovod: False
89
+ 2025-04-27,01:37:33 | INFO | image_interpolation: None
90
+ 2025-04-27,01:37:33 | INFO | image_mean: None
91
+ 2025-04-27,01:37:33 | INFO | image_resize_mode: None
92
+ 2025-04-27,01:37:33 | INFO | image_std: None
93
+ 2025-04-27,01:37:33 | INFO | imagenet_v2: None
94
+ 2025-04-27,01:37:33 | INFO | imagenet_val: None
95
+ 2025-04-27,01:37:33 | INFO | keep_func_name: keep_random
96
+ 2025-04-27,01:37:33 | INFO | local_loss: False
97
+ 2025-04-27,01:37:33 | INFO | local_rank: 0
98
+ 2025-04-27,01:37:33 | INFO | lock_image: False
99
+ 2025-04-27,01:37:33 | INFO | lock_image_freeze_bn_stats: False
100
+ 2025-04-27,01:37:33 | INFO | lock_image_unlocked_groups: 0
101
+ 2025-04-27,01:37:33 | INFO | lock_text: True
102
+ 2025-04-27,01:37:33 | INFO | lock_text_freeze_layer_norm: False
103
+ 2025-04-27,01:37:33 | INFO | lock_text_unlocked_layers: 0
104
+ 2025-04-27,01:37:33 | INFO | log_every_n_steps: 100
105
+ 2025-04-27,01:37:33 | INFO | log_level: 20
106
+ 2025-04-27,01:37:33 | INFO | log_local: False
107
+ 2025-04-27,01:37:33 | INFO | log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/out.log
108
+ 2025-04-27,01:37:33 | INFO | logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text
109
+ 2025-04-27,01:37:33 | INFO | loss_dist_impl: None
110
+ 2025-04-27,01:37:33 | INFO | lr: 4e-05
111
+ 2025-04-27,01:37:33 | INFO | lr_cooldown_end: 0.0
112
+ 2025-04-27,01:37:33 | INFO | lr_cooldown_power: 1.0
113
+ 2025-04-27,01:37:33 | INFO | lr_scheduler: cosine
114
+ 2025-04-27,01:37:33 | INFO | map_func_name: use_all
115
+ 2025-04-27,01:37:33 | INFO | model: ViT-B-16
116
+ 2025-04-27,01:37:33 | INFO | momentum: None
117
+ 2025-04-27,01:37:33 | INFO | name: keep_random
118
+ 2025-04-27,01:37:33 | INFO | no_set_device_rank: False
119
+ 2025-04-27,01:37:33 | INFO | opt: adamw
120
+ 2025-04-27,01:37:33 | INFO | precision: amp
121
+ 2025-04-27,01:37:33 | INFO | pretrained: datacomp_xl_s13b_b90k
122
+ 2025-04-27,01:37:33 | INFO | pretrained_image: False
123
+ 2025-04-27,01:37:33 | INFO | rank: 0
124
+ 2025-04-27,01:37:33 | INFO | remote_sync: None
125
+ 2025-04-27,01:37:33 | INFO | remote_sync_frequency: 300
126
+ 2025-04-27,01:37:33 | INFO | remote_sync_protocol: s3
127
+ 2025-04-27,01:37:33 | INFO | report_to: tensorboard,wandb
128
+ 2025-04-27,01:37:33 | INFO | resume: None
129
+ 2025-04-27,01:37:33 | INFO | save_frequency: 10
130
+ 2025-04-27,01:37:33 | INFO | save_most_recent: False
131
+ 2025-04-27,01:37:33 | INFO | seed: 0
132
+ 2025-04-27,01:37:33 | INFO | siglip: False
133
+ 2025-04-27,01:37:33 | INFO | skip_scheduler: False
134
+ 2025-04-27,01:37:33 | INFO | tensorboard: True
135
+ 2025-04-27,01:37:33 | INFO | tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/tensorboard
136
+ 2025-04-27,01:37:33 | INFO | torchcompile: False
137
+ 2025-04-27,01:37:33 | INFO | torchscript: False
138
+ 2025-04-27,01:37:33 | INFO | trace: False
139
+ 2025-04-27,01:37:33 | INFO | train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar
140
+ 2025-04-27,01:37:33 | INFO | train_data_upsampling_factors: None
141
+ 2025-04-27,01:37:33 | INFO | train_num_samples: 9011874
142
+ 2025-04-27,01:37:33 | INFO | use_bn_sync: False
143
+ 2025-04-27,01:37:33 | INFO | use_bnb_linear: None
144
+ 2025-04-27,01:37:33 | INFO | val_data: None
145
+ 2025-04-27,01:37:33 | INFO | val_frequency: 1
146
+ 2025-04-27,01:37:33 | INFO | val_num_samples: None
147
+ 2025-04-27,01:37:33 | INFO | wandb: True
148
+ 2025-04-27,01:37:33 | INFO | wandb_notes:
149
+ 2025-04-27,01:37:33 | INFO | wandb_project_name: open-clip
150
+ 2025-04-27,01:37:33 | INFO | warmup: 110
151
+ 2025-04-27,01:37:33 | INFO | wd: 0.5
152
+ 2025-04-27,01:37:33 | INFO | workers: 16
153
+ 2025-04-27,01:37:33 | INFO | world_size: 2
154
+ 2025-04-27,01:37:33 | INFO | zeroshot_frequency: 2
155
+ 2025-04-27,01:37:34 | INFO | Created AdamW (adamw) optimizer: lr: 4e-05, betas: (0.9, 0.98), eps: 1e-08, weight_decay: 0.5, amsgrad: False, foreach: None, maximize: False, capturable: False, differentiable: False, fused: None
156
+ 2025-04-27,01:37:47 | INFO | Start epoch 0
157
+ 2025-04-27,01:38:39 | INFO | Train Epoch: 0 [ 8192/917504 (1%)] Data (t): 42.123 Batch (t): 51.906, 157.823/s, 78.9117/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 28.618 (28.618) Imm_text: 28.618 (28.618) Isd_image: 3.3252 (3.3252) Isd_text: 3.3252 (3.3252) Contrastive_loss: 1.4192 (1.4192) Loss: 1.4192 (1.4192)
158
+ 2025-04-27,01:53:40 | INFO | Train Epoch: 0 [827392/917504 (90%)] Data (t): 0.999 Batch (t): 9.010, 915.892/s, 457.946/s/gpu LR: 0.000037 Logit Scale: 99.934 Imm_image: 28.968 (28.793) Imm_text: 28.968 (28.793) Isd_image: 5.7168 (4.5210) Isd_text: 5.7168 (4.5210) Contrastive_loss: 0.72981 (1.0745) Loss: 0.72981 (1.0745)
159
+ 2025-04-27,01:55:19 | INFO | Train Epoch: 0 [917504/917504 (100%)] Data (t): 0.952 Batch (t): 8.936, 914.273/s, 457.137/s/gpu LR: 0.000040 Logit Scale: 99.924 Imm_image: 28.905 (28.830) Imm_text: 28.905 (28.830) Isd_image: 5.5824 (4.8748) Isd_text: 5.5824 (4.8748) Contrastive_loss: 0.71474 (0.95458) Loss: 0.71474 (0.95458)
160
+ 2025-04-27,01:55:19 | INFO | Start epoch 1
161
+ 2025-04-27,01:56:03 | INFO | Train Epoch: 1 [ 8192/917504 (1%)] Data (t): 35.769 Batch (t): 44.168, 185.473/s, 92.7366/s/gpu LR: 0.000040 Logit Scale: 99.924 Imm_image: 29.056 (29.056) Imm_text: 29.056 (29.056) Isd_image: 5.6160 (5.6160) Isd_text: 5.6160 (5.6160) Contrastive_loss: 0.63035 (0.63035) Loss: 0.63035 (0.63035)
162
+ 2025-04-27,02:11:05 | INFO | Train Epoch: 1 [827392/917504 (90%)] Data (t): 1.006 Batch (t): 9.021, 916.703/s, 458.351/s/gpu LR: 0.000039 Logit Scale: 99.879 Imm_image: 28.940 (28.998) Imm_text: 28.940 (28.998) Isd_image: 4.3083 (4.9621) Isd_text: 4.3083 (4.9621) Contrastive_loss: 0.64448 (0.63742) Loss: 0.64448 (0.63742)
163
+ 2025-04-27,02:12:43 | INFO | Train Epoch: 1 [917504/917504 (100%)] Data (t): 0.950 Batch (t): 8.937, 918.129/s, 459.065/s/gpu LR: 0.000039 Logit Scale: 99.883 Imm_image: 29.164 (29.053) Imm_text: 29.164 (29.053) Isd_image: 3.7631 (4.5625) Isd_text: 3.7631 (4.5625) Contrastive_loss: 0.51947 (0.59810) Loss: 0.51947 (0.59810)
164
+ 2025-04-27,02:12:44 | INFO | Start epoch 2
165
+ 2025-04-27,02:13:29 | INFO | Train Epoch: 2 [ 8192/917504 (1%)] Data (t): 37.286 Batch (t): 45.551, 179.841/s, 89.9207/s/gpu LR: 0.000039 Logit Scale: 99.885 Imm_image: 29.259 (29.259) Imm_text: 29.259 (29.259) Isd_image: 3.6721 (3.6721) Isd_text: 3.6721 (3.6721) Contrastive_loss: 0.49092 (0.49092) Loss: 0.49092 (0.49092)
166
+ 2025-04-27,02:28:32 | INFO | Train Epoch: 2 [827392/917504 (90%)] Data (t): 1.009 Batch (t): 9.023, 917.789/s, 458.894/s/gpu LR: 0.000036 Logit Scale: 100.000 Imm_image: 29.366 (29.312) Imm_text: 29.366 (29.312) Isd_image: 2.6899 (3.1810) Isd_text: 2.6899 (3.1810) Contrastive_loss: 0.46473 (0.47783) Loss: 0.46473 (0.47783)
167
+ 2025-04-27,02:30:10 | INFO | Train Epoch: 2 [917504/917504 (100%)] Data (t): 0.952 Batch (t): 8.936, 915.433/s, 457.716/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.520 (29.382) Imm_text: 29.520 (29.382) Isd_image: 2.5545 (2.9722) Isd_text: 2.5545 (2.9722) Contrastive_loss: 0.39856 (0.45141) Loss: 0.39856 (0.45141)
168
+ 2025-04-27,02:30:10 | INFO | Start epoch 3
169
+ 2025-04-27,02:30:55 | INFO | Train Epoch: 3 [ 8192/917504 (1%)] Data (t): 36.864 Batch (t): 44.915, 182.389/s, 91.1947/s/gpu LR: 0.000035 Logit Scale: 100.000 Imm_image: 29.511 (29.511) Imm_text: 29.511 (29.511) Isd_image: 2.5891 (2.5891) Isd_text: 2.5891 (2.5891) Contrastive_loss: 0.42152 (0.42152) Loss: 0.42152 (0.42152)
170
+ 2025-04-27,02:46:00 | INFO | Train Epoch: 3 [827392/917504 (90%)] Data (t): 1.025 Batch (t): 9.047, 917.102/s, 458.551/s/gpu LR: 0.000031 Logit Scale: 100.000 Imm_image: 29.689 (29.600) Imm_text: 29.689 (29.600) Isd_image: 1.7021 (2.1456) Isd_text: 1.7021 (2.1456) Contrastive_loss: 0.39319 (0.40736) Loss: 0.39319 (0.40736)
171
+ 2025-04-27,02:47:38 | INFO | Train Epoch: 3 [917504/917504 (100%)] Data (t): 0.951 Batch (t): 8.934, 916.799/s, 458.399/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.685 (29.628) Imm_text: 29.685 (29.628) Isd_image: 1.5345 (1.9419) Isd_text: 1.5345 (1.9419) Contrastive_loss: 0.34130 (0.38534) Loss: 0.34130 (0.38534)
172
+ 2025-04-27,02:47:38 | INFO | Start epoch 4
173
+ 2025-04-27,02:48:23 | INFO | Train Epoch: 4 [ 8192/917504 (1%)] Data (t): 36.836 Batch (t): 44.907, 182.422/s, 91.2108/s/gpu LR: 0.000030 Logit Scale: 100.000 Imm_image: 29.707 (29.707) Imm_text: 29.707 (29.707) Isd_image: 1.5228 (1.5228) Isd_text: 1.5228 (1.5228) Contrastive_loss: 0.35253 (0.35253) Loss: 0.35253 (0.35253)
174
+ 2025-04-27,03:03:22 | INFO | Train Epoch: 4 [827392/917504 (90%)] Data (t): 0.984 Batch (t): 8.987, 918.381/s, 459.190/s/gpu LR: 0.000024 Logit Scale: 100.000 Imm_image: 29.872 (29.789) Imm_text: 29.872 (29.789) Isd_image: 1.0030 (1.2629) Isd_text: 1.0030 (1.2629) Contrastive_loss: 0.33160 (0.34206) Loss: 0.33160 (0.34206)
175
+ 2025-04-27,03:05:00 | INFO | Train Epoch: 4 [917504/917504 (100%)] Data (t): 0.940 Batch (t): 8.925, 915.463/s, 457.731/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.722 (29.767) Imm_text: 29.722 (29.767) Isd_image: 1.1486 (1.2248) Isd_text: 1.1486 (1.2248) Contrastive_loss: 0.31939 (0.33451) Loss: 0.31939 (0.33451)
176
+ 2025-04-27,03:05:00 | INFO | Start epoch 5
177
+ 2025-04-27,03:05:45 | INFO | Train Epoch: 5 [ 8192/917504 (1%)] Data (t): 35.249 Batch (t): 44.356, 184.686/s, 92.3430/s/gpu LR: 0.000023 Logit Scale: 100.000 Imm_image: 29.820 (29.820) Imm_text: 29.820 (29.820) Isd_image: 1.0999 (1.0999) Isd_text: 1.0999 (1.0999) Contrastive_loss: 0.29852 (0.29852) Loss: 0.29852 (0.29852)
178
+ 2025-04-27,03:20:46 | INFO | Train Epoch: 5 [827392/917504 (90%)] Data (t): 1.002 Batch (t): 9.017, 918.400/s, 459.200/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 29.969 (29.895) Imm_text: 29.969 (29.895) Isd_image: 0.86037 (0.98011) Isd_text: 0.86037 (0.98011) Contrastive_loss: 0.27611 (0.28732) Loss: 0.27611 (0.28732)
179
+ 2025-04-27,03:22:25 | INFO | Train Epoch: 5 [917504/917504 (100%)] Data (t): 0.949 Batch (t): 8.933, 914.883/s, 457.441/s/gpu LR: 0.000017 Logit Scale: 100.000 Imm_image: 29.996 (29.928) Imm_text: 29.996 (29.928) Isd_image: 0.85540 (0.93854) Isd_text: 0.85540 (0.93854) Contrastive_loss: 0.24764 (0.27409) Loss: 0.24764 (0.27409)
180
+ 2025-04-27,03:22:25 | INFO | Start epoch 6
181
+ 2025-04-27,03:23:10 | INFO | Train Epoch: 6 [ 8192/917504 (1%)] Data (t): 36.590 Batch (t): 44.694, 183.290/s, 91.6450/s/gpu LR: 0.000016 Logit Scale: 100.000 Imm_image: 30.118 (30.118) Imm_text: 30.118 (30.118) Isd_image: 0.77700 (0.77700) Isd_text: 0.77700 (0.77700) Contrastive_loss: 0.24394 (0.24394) Loss: 0.24394 (0.24394)
182
+ 2025-04-27,03:38:11 | INFO | Train Epoch: 6 [827392/917504 (90%)] Data (t): 1.004 Batch (t): 9.018, 917.586/s, 458.793/s/gpu LR: 0.000011 Logit Scale: 100.000 Imm_image: 30.208 (30.163) Imm_text: 30.208 (30.163) Isd_image: 0.66865 (0.72282) Isd_text: 0.66865 (0.72282) Contrastive_loss: 0.24086 (0.24240) Loss: 0.24086 (0.24240)
183
+ 2025-04-27,03:39:50 | INFO | Train Epoch: 6 [917504/917504 (100%)] Data (t): 0.950 Batch (t): 8.933, 917.061/s, 458.530/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.353 (30.226) Imm_text: 30.353 (30.226) Isd_image: 0.60130 (0.68231) Isd_text: 0.60130 (0.68231) Contrastive_loss: 0.17543 (0.22008) Loss: 0.17543 (0.22008)
184
+ 2025-04-27,03:39:50 | INFO | Start epoch 7
185
+ 2025-04-27,03:40:34 | INFO | Train Epoch: 7 [ 8192/917504 (1%)] Data (t): 36.419 Batch (t): 44.470, 184.213/s, 92.1063/s/gpu LR: 0.000010 Logit Scale: 100.000 Imm_image: 30.290 (30.290) Imm_text: 30.290 (30.290) Isd_image: 0.66099 (0.66099) Isd_text: 0.66099 (0.66099) Contrastive_loss: 0.21813 (0.21813) Loss: 0.21813 (0.21813)
186
+ 2025-04-27,03:55:41 | INFO | Train Epoch: 7 [827392/917504 (90%)] Data (t): 1.037 Batch (t): 9.062, 921.569/s, 460.784/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.350 (30.320) Imm_text: 30.350 (30.320) Isd_image: 0.52269 (0.59184) Isd_text: 0.52269 (0.59184) Contrastive_loss: 0.21930 (0.21872) Loss: 0.21930 (0.21872)
187
+ 2025-04-27,03:57:19 | INFO | Train Epoch: 7 [917504/917504 (100%)] Data (t): 0.951 Batch (t): 8.931, 912.708/s, 456.354/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.451 (30.364) Imm_text: 30.451 (30.364) Isd_image: 0.50853 (0.56407) Isd_text: 0.50853 (0.56407) Contrastive_loss: 0.19753 (0.21166) Loss: 0.19753 (0.21166)
188
+ 2025-04-27,03:57:19 | INFO | Start epoch 8
189
+ 2025-04-27,03:58:04 | INFO | Train Epoch: 8 [ 8192/917504 (1%)] Data (t): 36.493 Batch (t): 44.842, 182.684/s, 91.3421/s/gpu LR: 0.000005 Logit Scale: 100.000 Imm_image: 30.394 (30.394) Imm_text: 30.394 (30.394) Isd_image: 0.56205 (0.56205) Isd_text: 0.56205 (0.56205) Contrastive_loss: 0.19877 (0.19877) Loss: 0.19877 (0.19877)
190
+ 2025-04-27,04:13:05 | INFO | Train Epoch: 8 [827392/917504 (90%)] Data (t): 1.001 Batch (t): 9.007, 919.521/s, 459.760/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.305 (30.350) Imm_text: 30.305 (30.350) Isd_image: 0.50703 (0.53454) Isd_text: 0.50703 (0.53454) Contrastive_loss: 0.23204 (0.21540) Loss: 0.23204 (0.21540)
191
+ 2025-04-27,04:14:43 | INFO | Train Epoch: 8 [917504/917504 (100%)] Data (t): 0.940 Batch (t): 8.918, 914.019/s, 457.010/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.433 (30.377) Imm_text: 30.433 (30.377) Isd_image: 0.56413 (0.54440) Isd_text: 0.56413 (0.54440) Contrastive_loss: 0.18968 (0.20683) Loss: 0.18968 (0.20683)
192
+ 2025-04-27,04:14:43 | INFO | Start epoch 9
193
+ 2025-04-27,04:15:26 | INFO | Train Epoch: 9 [ 8192/917504 (1%)] Data (t): 34.886 Batch (t): 43.055, 190.267/s, 95.1333/s/gpu LR: 0.000001 Logit Scale: 100.000 Imm_image: 30.423 (30.423) Imm_text: 30.423 (30.423) Isd_image: 0.50569 (0.50569) Isd_text: 0.50569 (0.50569) Contrastive_loss: 0.20082 (0.20082) Loss: 0.20082 (0.20082)
194
+ 2025-04-27,04:30:24 | INFO | Train Epoch: 9 [827392/917504 (90%)] Data (t): 0.980 Batch (t): 8.984, 919.065/s, 459.533/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.383 (30.403) Imm_text: 30.383 (30.403) Isd_image: 0.52638 (0.51604) Isd_text: 0.52638 (0.51604) Contrastive_loss: 0.22643 (0.21362) Loss: 0.22643 (0.21362)
195
+ 2025-04-27,04:32:02 | INFO | Train Epoch: 9 [917504/917504 (100%)] Data (t): 0.931 Batch (t): 8.910, 918.487/s, 459.244/s/gpu LR: 0.000000 Logit Scale: 100.000 Imm_image: 30.408 (30.404) Imm_text: 30.408 (30.404) Isd_image: 0.47812 (0.50340) Isd_text: 0.47812 (0.50340) Contrastive_loss: 0.19293 (0.20673) Loss: 0.19293 (0.20673)
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/params.txt ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 2
2
+ aug_cfg: {}
3
+ batch_size: 2048
4
+ beta1: 0.9
5
+ beta2: 0.98
6
+ cache_dir: None
7
+ caption_ratio: 0.1
8
+ checkpoint_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/checkpoints
9
+ coca_caption_loss_weight: 2.0
10
+ coca_contrastive_loss_weight: 1.0
11
+ copy_codebase: False
12
+ csv_caption_key: title
13
+ csv_img_key: filepath
14
+ csv_separator:
15
+ dataset_resampled: False
16
+ dataset_type: synthetic
17
+ ddp_static_graph: False
18
+ debug: False
19
+ delete_previous_checkpoint: False
20
+ device: cuda:0
21
+ dist_backend: None
22
+ dist_url: None
23
+ distill: False
24
+ distill_model: None
25
+ distill_pretrained: None
26
+ distributed: True
27
+ epochs: 10
28
+ epochs_cooldown: None
29
+ eps: 1e-08
30
+ force_custom_text: False
31
+ force_image_size: None
32
+ force_patch_dropout: None
33
+ force_quick_gelu: False
34
+ gather_with_grad: True
35
+ grad_checkpointing: True
36
+ grad_clip_norm: None
37
+ horovod: False
38
+ image_interpolation: None
39
+ image_mean: None
40
+ image_resize_mode: None
41
+ image_std: None
42
+ imagenet_v2: None
43
+ imagenet_val: None
44
+ keep_func_name: keep_random
45
+ local_loss: False
46
+ local_rank: 0
47
+ lock_image: False
48
+ lock_image_freeze_bn_stats: False
49
+ lock_image_unlocked_groups: 0
50
+ lock_text: True
51
+ lock_text_freeze_layer_norm: False
52
+ lock_text_unlocked_layers: 0
53
+ log_every_n_steps: 100
54
+ log_level: 20
55
+ log_local: False
56
+ log_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/out.log
57
+ logs: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text
58
+ loss_dist_impl: None
59
+ lr: 4e-05
60
+ lr_cooldown_end: 0.0
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ map_func_name: use_all
64
+ model: ViT-B-16
65
+ momentum: None
66
+ name: keep_random
67
+ no_set_device_rank: False
68
+ opt: adamw
69
+ precision: amp
70
+ pretrained: datacomp_xl_s13b_b90k
71
+ pretrained_image: False
72
+ rank: 0
73
+ remote_sync: None
74
+ remote_sync_frequency: 300
75
+ remote_sync_protocol: s3
76
+ report_to: tensorboard,wandb
77
+ resume: None
78
+ save_frequency: 10
79
+ save_most_recent: False
80
+ seed: 0
81
+ siglip: False
82
+ skip_scheduler: False
83
+ tensorboard: True
84
+ tensorboard_path: /mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_random/tensorboard
85
+ torchcompile: False
86
+ torchscript: False
87
+ trace: False
88
+ train_data: /mnt/personal/zhudongy/recap-datacomp-3m-wds/{00376..01507}.tar
89
+ train_data_upsampling_factors: None
90
+ train_num_samples: 9011874
91
+ use_bn_sync: False
92
+ use_bnb_linear: None
93
+ val_data: None
94
+ val_frequency: 1
95
+ val_num_samples: None
96
+ wandb: True
97
+ wandb_notes:
98
+ wandb_project_name: open-clip
99
+ warmup: 110
100
+ wd: 0.5
101
+ workers: 16
102
+ world_size: 2
103
+ zeroshot_frequency: 2
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_caltech101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "caltech101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8618766404199475, "acc5": 0.9681758530183727, "mean_per_class_recall": 0.925570681349594}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cars_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "cars", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8362144011938814, "acc5": 0.9920407909463997, "mean_per_class_recall": 0.8370035365963817}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cifar100_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "cifar100", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7489, "acc5": 0.9325, "mean_per_class_recall": 0.7489}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_cifar10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "cifar10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.9405, "acc5": 0.9984, "mean_per_class_recall": 0.9405000000000001}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_country211_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "country211", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.1942654028436019, "acc5": 0.42255924170616116, "mean_per_class_recall": 0.19436018957345974}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_dtd_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "dtd", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5537234042553192, "acc5": 0.8122340425531915, "mean_per_class_recall": 0.5537234042553191}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_eurosat_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "eurosat", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.48733333333333334, "acc5": 0.8834074074074074, "mean_per_class_recall": 0.48750666666666664}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_fgvc_aircraft_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "fgvc_aircraft", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.24152415241524153, "acc5": 0.5781578157815782, "mean_per_class_recall": 0.2407130124777184}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_flickr30k_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "flickr30k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.652999997138977, "text_retrieval_recall@1": 0.8100000023841858, "image_retrieval_recall@5": 0.8772000074386597, "text_retrieval_recall@5": 0.9539999961853027, "image_retrieval_recall@10": 0.921999990940094, "text_retrieval_recall@10": 0.9779999852180481}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_flowers_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "flowers", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.7280858676207513, "acc5": 0.9017726459586924, "mean_per_class_recall": 0.7367525532935172}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_food101_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "food101", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8707722772277228, "acc5": 0.9812277227722772, "mean_per_class_recall": 0.8707326732673267}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_gtsrb_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "gtsrb", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.5553444180522565, "acc5": 0.7724465558194774, "mean_per_class_recall": 0.5145985462408618}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_imagenet1k_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "imagenet1k", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6846, "acc5": 0.90768, "mean_per_class_recall": 0.6846}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_mscoco_captions_epoch_10.pt_ViT-B-16_en_zeroshot_retrieval.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "mscoco_captions", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.384246289730072, "text_retrieval_recall@1": 0.5473999977111816, "image_retrieval_recall@5": 0.6422231197357178, "text_retrieval_recall@5": 0.7784000039100647, "image_retrieval_recall@10": 0.7440223693847656, "text_retrieval_recall@10": 0.8586000204086304}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_pets_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "pets", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.8975197601526301, "acc5": 0.9964568002180431, "mean_per_class_recall": 0.8970007980600225}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_stl10_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "stl10", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.97225, "acc5": 0.9995, "mean_per_class_recall": 0.9721250000000001}, "language": "en"}
SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/benchmark_sun397_epoch_10.pt_ViT-B-16_en_zeroshot_classification.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset": "sun397", "model": "ViT-B-16", "pretrained": "/mnt/personal/zhudongy/rdc9m_results/SFR-Embedding-Code-2B_R#0.8#0.6#dinov2-large#0.0#0.2#rouge_0.2#top_8#inter_0.4/ViT-B-16-datacomp_xl_s13b_b90k-010-4e5-e10-recaption-finetune-lock-text/keep_text_closest_image_closest/checkpoints/epoch_10.pt", "task": "zeroshot_classification", "metrics": {"acc1": 0.6729039851407764, "acc5": 0.9277635765121283, "mean_per_class_recall": 0.6608765022275734}, "language": "en"}