# Extraction residue from the file viewer, kept as a comment (not configuration):
# File size: 5,059 Bytes / c1596ac
---
# Project-level metadata.
project:
  # The same string is used for logging.project_name.
  name: imagenet-project

# Dataset locations and the active dataset version tag.
data:
  raw_dir: ./data/raw/
  # Same path as captioning.data.train_caption.
  captions_file: ./data/captioning/annotations/train.json
  # Active version tag; this exact string also appears inside the
  # classification.final_checkpoint and latent_space.checkpoint file names.
  dataset_version: cls_raw-20260525-v2
  # Alternative version tag (disabled):
  # dataset_version: raw-20260509-v1

# Train/validation/test split fractions; the three values sum to 1.0.
split:
  train_ratio: 0.7
  val_ratio: 0.15
  test_ratio: 0.15

# General training settings.
# NOTE(review): the captioning and classification sections define their own
# epochs/optimizer (and captioning its own batch_size); which value wins is
# decided by the consuming code — confirm.
train:
  seed: 42
  # Alternative seeds for repeated experiments. Enable at most one `seed`
  # line at a time: duplicate keys are invalid YAML.
  # seed: 7
  # seed: 21
  epochs: 20
  batch_size: 32
  num_workers: 4
  device: cuda
  optimizer: adam

# Image preprocessing settings.
preprocess:
  # NOTE(review): presumably the input side length in pixels — confirm.
  image_size: 224
  normalize: true

# Loss configuration.
loss:
  name: cross_entropy
  # NOTE(review): this value is the string "pad_token", not an integer index;
  # the consumer presumably resolves it to the padding token id — confirm.
  ignore_index: pad_token

# Evaluation settings.
evaluate:
  batch_size: 32
  # Metric names to compute; classification metrics are listed separately
  # under classification.metrics.
  metrics:
    - bleu
    - rouge_l
    - meteor

# Experiment logging settings.
logging:
  use_wandb: true
  # Same string as project.name.
  project_name: imagenet-project
  # NOTE(review): the unit (steps, batches, ...) is not stated here — confirm.
  log_interval: 10

# Base output directory, given as a relative path.
# NOTE(review): other sections hard-code ./outputs/... and
# /workspace/outputs/... paths; whether they are derived from this value is
# not visible here — confirm.
outputs:
  base_dir: outputs

# Demo application settings.
demo:
  # NOTE(review): 0.0.0.0 conventionally means "listen on all interfaces";
  # confirm that this exposure is intended.
  host: 0.0.0.0
  port: 7860
  share: false
  top_k: 5
  show_gradcam: true
  # 50 class labels. The original order is preserved exactly (note that
  # "tomato" precedes "t-shirt"); presumably list position i corresponds to
  # model output index i, so do not re-sort — confirm.
  class_names:
    - airplane
    - apple
    - aster
    - banana
    - bicycle
    - bracelet
    - bulldog
    - bus
    - butterfly
    - car
    - carrot
    - cucumber
    - cup-cake
    - daisy
    - dandelion
    - dumpling
    - earrings
    - elephant
    - glasses
    - golden-retriever
    - hamburger
    - horse
    - iris
    - lavender
    - lily
    - marigold
    - motorcycle
    - necklace
    - orange
    - orchid
    - pants
    - pasta
    - penguin
    - persian-cat
    - pizza
    - rose
    - salad
    - sandwich
    - sheep
    - siamese-cat
    - sneakers
    - squirrel
    - steak
    - strawberry
    - sunflower
    - sushi
    - tomato
    - t-shirt
    - tulip
    - waffle

# CNN backbone settings.
cnn:
  backbone: resnet18
  pretrained: true
  freeze: true
  # NOTE(review): 512 equals captioning.lstm/gru.hidden_dim and
  # captioning.transformer.d_model; whether they must agree — confirm.
  output_dim: 512
  dropout: 0.3
  pooling: avg

# Image-captioning model, data, and training settings.
# Booleans in this section are written in canonical lowercase (true/false),
# matching the rest of the file.
captioning:

  # Active encoder; alternatives are kept commented out.
  # encoder: resnet18
  encoder: swin
  # encoder: vit
  # Active decoder; presumably selects which of the lstm/gru/transformer
  # sub-sections below is used — confirm.
  decoder: transformer
  # decoder: lstm
  # decoder: gru
  version: final

  epochs: 25
  learning_rate: 0.0001
  batch_size: 32
  optimizer: adamw
  max_caption_length: 30
  train_num_caption: 2

  debug: false

  lstm:
    embed_dim: 256
    hidden_dim: 512
    num_layers: 1

  gru:
    embed_dim: 256
    hidden_dim: 512
    num_layers: 1

  transformer:
    n_layers: 6
    nhead: 8
    d_model: 512
    drop_p: 0.3
    # Integer 0 here, whereas classification.label_smoothing is the float 0.0.
    label_smoothing: 0
    weight_decay: 0.001

  data:
    dataset_version: cap_raw-20260524-v1
    train_img: ./data/captioning/raw/train/
    # Same path as data.captions_file at the top level.
    train_caption: ./data/captioning/annotations/train.json
    val_img: ./data/captioning/raw/val/
    val_caption: ./data/captioning/annotations/val.json
    test_img: ./data/captioning/raw/test/
    test_caption: ./data/captioning/annotations/test.json

  tokenizer:
    min_freq: 3
    max_vocab_size: 10000
    # NOTE(review): the sp_* keys presumably apply only when use_subword is
    # true — confirm.
    sp_vocab_size: 2000
    use_subword: false
    sp_model_path: ./src/dataset/sub_tokenizer2000.model

  checkpoint:
    save_dir: ./outputs/captioning
    # File name contains the encoder, decoder and version values set above.
    final_checkpoint: swin-transformer_final_best.pt
    resume: false

  heatmap:
    # NOTE(review): both directories are the same absolute path; confirm the
    # two kinds of output cannot overwrite each other.
    dec_atten_dir: /workspace/outputs/captioning/heatmap/
    enc_dec_atten_dir: /workspace/outputs/captioning/heatmap/
    # Which layer to visualize.
    # NOTE(review): equals transformer.n_layers; 0- vs 1-based is not stated.
    layer: 6
    # Which samples (batch) to produce captions & heatmaps for.
    sample: [0, 410, 820, 1230, 1640]

  scheduler:
    use_scheduler: false
    warmup_step: 500
    lr_scale: 0.5

  beam_search:
    use_beam_search: true
    beam_size: 3


# Image-classification model, training, and metric settings.
classification:

  # Active model; alternatives are kept commented out (enable exactly one).
  # model_name: resnet18
  # model_name: efficientnet_b0
  # model_name: convnext_tiny
  # model_name: mobilenet_v3_small
  # model_name: vit_b_16
  model_name: swin_t
  # model_name: deit_tiny_patch16_224

  # The file name contains the values of data.dataset_version, optimizer,
  # augmentation.type, weight_decay and label_smoothing as set in this file.
  # NOTE(review): relative path, while checkpoint.save_dir below is absolute
  # (/workspace/...) — confirm both resolve to the same directory.
  final_checkpoint: ./outputs/classification/cls_swin-t_base_cls_raw-20260525-v2_lr-0005_bs-32_adamw_none_wdc-0.05_ls-0.0_best.pth

  epochs: 50

  # NOTE(review): presumably the cnn or transformer rate is chosen according
  # to the family of model_name — confirm.
  learning_rate:

    # baseline
    cnn: 0.001
    transformer: 0.0005

    # hyperparameter tuning
    # cnn: 0.0005
    # transformer: 0.0001

  # Active optimizer; alternatives are kept commented out.
  # optimizer: adam
  # optimizer: sgd
  optimizer: adamw

  # default
  # weight_decay: 0.01

  # tuning
  weight_decay: 0.05

  scheduler:
    use: false

    # To enable, replace `use: false` above with the two lines below.
    # use: true
    # name: cosineannealinglr

  augmentation:

    # baseline
    # `none` is the plain string "none", not a YAML null.
    use_aug: false
    type: none

    # mixup
    # use_aug: true
    # type: mixup

    # cutmix
    # use_aug: true
    # type: cutmix

  label_smoothing: 0.0

  # label smoothing experiment
  # label_smoothing: 0.05
  # label_smoothing: 0.1


  # Metric names listed per phase.
  metrics:

    train:
      - loss
      - accuracy

    validation:
      - loss
      - accuracy
      - macro_f1

    final_test:
      - accuracy
      - macro_f1
      - precision
      - recall
      - confusion_matrix

  checkpoint:
    save_dir: /workspace/outputs/classification


# Latent-space (UMAP) visualization settings.
latent_space:
  data_dir: /workspace/data/raw
  # Same checkpoint file name as classification.final_checkpoint, given here
  # as an absolute path.
  checkpoint: /workspace/outputs/classification/cls_swin-t_base_cls_raw-20260525-v2_lr-0005_bs-32_adamw_none_wdc-0.05_ls-0.0_best.pth
  output_dir: /workspace/outputs/latent_space
  # Output names carry no file extension here. The "nb10"/"md05" suffixes
  # appear to encode umap.n_neighbors (10) and umap.min_dist (0.5) below;
  # update them if those values change — confirm.
  output_umap_npy: cls_swin-t_best_umap_2d_test_nb10_md05
  output_umap_png: cls_swin-t_best_umap_plt_test_nb10_md05
  output_meta_csv: cls_swin-t_best_metadata_test_nb10_md05
  # The "test" in the output names above matches this value.
  split: test
  batch_size: 32
  num_workers: 4
  device: cuda
  seed: 42
  save_meta: true
  use_wandb: true
  wandb_name: latent_space_umap

  umap:
    n_neighbors: 10
    min_dist: 0.5
    metric: cosine