---
# Experiment configuration: shared data/training settings followed by
# dedicated sections for captioning, classification and latent-space
# visualisation.
#
# NOTE(review): a few nesting levels in this file are assumptions. Each one
# is marked individually below -- confirm them against the code that loads
# this file.

project:
  name: imagenet-project

data:
  raw_dir: ./data/raw/
  captions_file: ./data/captioning/annotations/train.json
  dataset_version: cls_raw-20260525-v2
  # dataset_version: raw-20260509-v1

# Train / validation / test fractions (they sum to 1.0).
# NOTE(review): assumed to be a top-level section rather than `data.split`.
split:
  train_ratio: 0.7
  val_ratio: 0.15
  test_ratio: 0.15

train:
  seed: 42
  # repeated experiment
  # seed: 7
  # seed: 21
  epochs: 20
  batch_size: 32
  num_workers: 4
  device: cuda
  optimizer: adam

preprocess:
  image_size: 224
  normalize: true

loss:
  name: cross_entropy
  ignore_index: pad_token

evaluate:
  batch_size: 32
  metrics:
    - bleu
    - rouge_l
    - meteor

logging:
  use_wandb: true
  project_name: imagenet-project
  log_interval: 10

outputs:
  base_dir: outputs

demo:
  # Quoted so the address is unambiguously a string.
  host: '0.0.0.0'
  port: 7860
  share: false
  top_k: 5
  show_gradcam: true
  # NOTE(review): assumed to belong to `demo`; move to the top level if the
  # loader reads `class_names` from the document root.
  class_names: [airplane, apple, aster, banana, bicycle, bracelet, bulldog,
    bus, butterfly, car, carrot, cucumber, cup-cake, daisy, dandelion,
    dumpling, earrings, elephant, glasses, golden-retriever, hamburger,
    horse, iris, lavender, lily, marigold, motorcycle, necklace, orange,
    orchid, pants, pasta, penguin, persian-cat, pizza, rose, salad,
    sandwich, sheep, siamese-cat, sneakers, squirrel, steak, strawberry,
    sunflower, sushi, tomato, t-shirt, tulip, waffle]

cnn:
  backbone: resnet18
  pretrained: true
  freeze: true
  output_dim: 512
  dropout: 0.3
  pooling: avg

captioning:
  # Exactly one encoder / decoder line is active; the rest are alternatives.
  # encoder: resnet18
  encoder: swin
  # encoder: vit
  decoder: transformer
  # decoder: lstm
  # decoder: gru
  version: final
  epochs: 25
  learning_rate: 0.0001
  batch_size: 32
  optimizer: adamw
  max_caption_length: 30
  train_num_caption: 2
  debug: false

  # Per-decoder hyperparameters.
  lstm:
    embed_dim: 256
    hidden_dim: 512
    num_layers: 1
  gru:
    embed_dim: 256
    hidden_dim: 512
    num_layers: 1
  transformer:
    n_layers: 6
    nhead: 8
    d_model: 512
    drop_p: 0.3
    # NOTE(review): the next two keys are assumed to sit under `transformer`;
    # they may instead belong directly under `captioning`.
    label_smoothing: 0
    weight_decay: 0.001

  data:
    dataset_version: cap_raw-20260524-v1
    train_img: ./data/captioning/raw/train/
    train_caption: ./data/captioning/annotations/train.json
    val_img: ./data/captioning/raw/val/
    val_caption: ./data/captioning/annotations/val.json
    test_img: ./data/captioning/raw/test/
    test_caption: ./data/captioning/annotations/test.json

  tokenizer:
    min_freq: 3
    max_vocab_size: 10000
    sp_vocab_size: 2000
    use_subword: false
    sp_model_path: ./src/dataset/sub_tokenizer2000.model

  checkpoint:
    save_dir: ./outputs/captioning
    final_checkpoint: swin-transformer_final_best.pt
    resume: false

  heatmap:
    # NOTE(review): both attention directories point at the same path.
    dec_atten_dir: /workspace/outputs/captioning/heatmap/
    enc_dec_atten_dir: /workspace/outputs/captioning/heatmap/
    layer: 6  # which layer to visualise
    # which samples (batches) to use for caption & heatmap
    sample: [0, 410, 820, 1230, 1640]

  scheduler:
    use_scheduler: false
    warmup_step: 500
    lr_scale: 0.5

  beam_search:
    use_beam_search: true
    beam_size: 3

classification:
  # Exactly one model_name line is active; the rest are alternatives.
  # model_name: resnet18
  # model_name: efficientnet_b0
  # model_name: convnext_tiny
  # model_name: mobilenet_v3_small
  # model_name: vit_b_16
  model_name: swin_t
  # model_name: deit_tiny_patch16_224
  final_checkpoint: ./outputs/classification/cls_swin-t_base_cls_raw-20260525-v2_lr-0005_bs-32_adamw_none_wdc-0.05_ls-0.0_best.pth
  epochs: 50

  # Learning rate keyed by model family.
  learning_rate:
    # baseline
    cnn: 0.001
    transformer: 0.0005
    # hyperparameter tuning
    # cnn: 0.0005
    # transformer: 0.0001

  # optimizer: adam
  # optimizer: sgd
  optimizer: adamw

  # default
  # weight_decay: 0.01
  # tuning
  weight_decay: 0.05

  scheduler:
    use: false
    # use: true
    # name: cosineannealinglr

  augmentation:
    # baseline
    use_aug: false
    type: none
    # mixup
    # use_aug: true
    # type: mixup
    # cutmix
    # use_aug: true
    # type: cutmix

  # NOTE(review): assumed to sit directly under `classification`, not under
  # `augmentation`.
  label_smoothing: 0.0
  # label smoothing experiment
  # label_smoothing: 0.05
  # label_smoothing: 0.1

  # Metrics listed per phase.
  metrics:
    train:
      - loss
      - accuracy
    validation:
      - loss
      - accuracy
      - macro_f1
    final_test:
      - accuracy
      - macro_f1
      - precision
      - recall
      - confusion_matrix

  checkpoint:
    save_dir: /workspace/outputs/classification

latent_space:
  # NOTE(review): this section uses absolute /workspace paths while other
  # sections use ./-relative paths -- confirm that is intended.
  data_dir: /workspace/data/raw
  checkpoint: /workspace/outputs/classification/cls_swin-t_base_cls_raw-20260525-v2_lr-0005_bs-32_adamw_none_wdc-0.05_ls-0.0_best.pth
  output_dir: /workspace/outputs/latent_space
  output_umap_npy: cls_swin-t_best_umap_2d_test_nb10_md05
  output_umap_png: cls_swin-t_best_umap_plt_test_nb10_md05
  output_meta_csv: cls_swin-t_best_metadata_test_nb10_md05
  split: test
  batch_size: 32
  num_workers: 4
  device: cuda
  seed: 42
  save_meta: true
  use_wandb: true
  wandb_name: latent_space_umap
  umap:
    n_neighbors: 10
    min_dist: 0.5
    metric: cosine