Shawon16 committed on
Commit
5016ec0
·
verified ·
1 Parent(s): 949793b

Upload trained VideoMAE with metrics and all figures

Browse files
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ confusion_matrix_test_50.png filter=lfs diff=lfs merge=lfs -text
37
+ confusion_matrix_train_50.png filter=lfs diff=lfs merge=lfs -text
38
+ confusion_matrix_valid_50.png filter=lfs diff=lfs merge=lfs -text
39
+ longtail_f1_vs_freq.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: cc-by-nc-4.0
4
+ base_model: MCG-NJU/videomae-base
5
+ tags:
6
+ - generated_from_trainer
7
+ metrics:
8
+ - accuracy
9
+ model-index:
10
+ - name: VideoMAE_WLASL_2000__200_epoch_p20_longtail
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # VideoMAE_WLASL_2000__200_epoch_p20_longtail
18
+
19
+ This model is a fine-tuned version of [MCG-NJU/videomae-base](https://huggingface.co/MCG-NJU/videomae-base) on an unspecified dataset (the model name suggests WLASL; the accompanying config defines 100 class labels).
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 4.6155
22
+ - Accuracy: 0.0207
23
+ - Top 1 Accuracy: 0.0207
24
+ - Top 5 Accuracy: 0.0592
25
+ - Top 10 Accuracy: 0.1124
26
+ - Macro Precision: 0.0005
27
+ - Macro Recall: 0.0117
28
+ - Macro F1: 0.0010
29
+ - Pearson Corr: nan
30
+ - Spearman Corr: nan
31
+
32
+ ## Model description
33
+
34
+ More information needed
35
+
36
+ ## Intended uses & limitations
37
+
38
+ More information needed
39
+
40
+ ## Training and evaluation data
41
+
42
+ More information needed
43
+
44
+ ## Training procedure
45
+
46
+ ### Training hyperparameters
47
+
48
+ The following hyperparameters were used during training:
49
+ - learning_rate: 5e-05
50
+ - train_batch_size: 2
51
+ - eval_batch_size: 2
52
+ - seed: 42
53
+ - gradient_accumulation_steps: 4
54
+ - total_train_batch_size: 8
55
+ - optimizer: adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
56
+ - lr_scheduler_type: linear
57
+ - lr_scheduler_warmup_ratio: 0.1
58
+ - training_steps: 180
59
+ - mixed_precision_training: Native AMP
60
+
61
+ ### Training results
62
+
63
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy | Top 1 Accuracy | Top 5 Accuracy | Top 10 Accuracy | Macro Precision | Macro Recall | Macro F1 | Pearson Corr | Spearman Corr |
64
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|:--------------:|:--------------:|:---------------:|:---------------:|:------------:|:--------:|:------------:|:-------------:|
65
+ | 18.6831 | 1.0 | 180 | 4.6155 | 0.0207 | 0.0207 | 0.0592 | 0.1124 | 0.0005 | 0.0117 | 0.0010 | nan | nan |
66
+
67
+
68
+ ### Framework versions
69
+
70
+ - Transformers 4.46.1
71
+ - Pytorch 2.5.1+cu124
72
+ - Datasets 3.1.0
73
+ - Tokenizers 0.20.1
config.json ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "MCG-NJU/videomae-base",
3
+ "architectures": [
4
+ "VideoMAEForVideoClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "decoder_hidden_size": 384,
8
+ "decoder_intermediate_size": 1536,
9
+ "decoder_num_attention_heads": 6,
10
+ "decoder_num_hidden_layers": 4,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.0,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "accident",
16
+ "1": "africa",
17
+ "2": "all",
18
+ "3": "apple",
19
+ "4": "basketball",
20
+ "5": "bed",
21
+ "6": "before",
22
+ "7": "bird",
23
+ "8": "birthday",
24
+ "9": "black",
25
+ "10": "blue",
26
+ "11": "book",
27
+ "12": "bowling",
28
+ "13": "brown",
29
+ "14": "but",
30
+ "15": "can",
31
+ "16": "candy",
32
+ "17": "chair",
33
+ "18": "change",
34
+ "19": "cheat",
35
+ "20": "city",
36
+ "21": "clothes",
37
+ "22": "color",
38
+ "23": "computer",
39
+ "24": "cook",
40
+ "25": "cool",
41
+ "26": "corn",
42
+ "27": "cousin",
43
+ "28": "cow",
44
+ "29": "dance",
45
+ "30": "dark",
46
+ "31": "deaf",
47
+ "32": "decide",
48
+ "33": "doctor",
49
+ "34": "dog",
50
+ "35": "drink",
51
+ "36": "eat",
52
+ "37": "enjoy",
53
+ "38": "family",
54
+ "39": "fine",
55
+ "40": "finish",
56
+ "41": "fish",
57
+ "42": "forget",
58
+ "43": "full",
59
+ "44": "give",
60
+ "45": "go",
61
+ "46": "graduate",
62
+ "47": "hat",
63
+ "48": "hearing",
64
+ "49": "help",
65
+ "50": "hot",
66
+ "51": "how",
67
+ "52": "jacket",
68
+ "53": "kiss",
69
+ "54": "language",
70
+ "55": "last",
71
+ "56": "later",
72
+ "57": "letter",
73
+ "58": "like",
74
+ "59": "man",
75
+ "60": "many",
76
+ "61": "medicine",
77
+ "62": "meet",
78
+ "63": "mother",
79
+ "64": "need",
80
+ "65": "no",
81
+ "66": "now",
82
+ "67": "orange",
83
+ "68": "paint",
84
+ "69": "paper",
85
+ "70": "pink",
86
+ "71": "pizza",
87
+ "72": "play",
88
+ "73": "pull",
89
+ "74": "purple",
90
+ "75": "right",
91
+ "76": "same",
92
+ "77": "school",
93
+ "78": "secretary",
94
+ "79": "shirt",
95
+ "80": "short",
96
+ "81": "son",
97
+ "82": "study",
98
+ "83": "table",
99
+ "84": "tall",
100
+ "85": "tell",
101
+ "86": "thanksgiving",
102
+ "87": "thin",
103
+ "88": "thursday",
104
+ "89": "time",
105
+ "90": "walk",
106
+ "91": "want",
107
+ "92": "what",
108
+ "93": "white",
109
+ "94": "who",
110
+ "95": "woman",
111
+ "96": "work",
112
+ "97": "wrong",
113
+ "98": "year",
114
+ "99": "yes"
115
+ },
116
+ "image_size": 224,
117
+ "initializer_range": 0.02,
118
+ "intermediate_size": 3072,
119
+ "label2id": {
120
+ "accident": 0,
121
+ "africa": 1,
122
+ "all": 2,
123
+ "apple": 3,
124
+ "basketball": 4,
125
+ "bed": 5,
126
+ "before": 6,
127
+ "bird": 7,
128
+ "birthday": 8,
129
+ "black": 9,
130
+ "blue": 10,
131
+ "book": 11,
132
+ "bowling": 12,
133
+ "brown": 13,
134
+ "but": 14,
135
+ "can": 15,
136
+ "candy": 16,
137
+ "chair": 17,
138
+ "change": 18,
139
+ "cheat": 19,
140
+ "city": 20,
141
+ "clothes": 21,
142
+ "color": 22,
143
+ "computer": 23,
144
+ "cook": 24,
145
+ "cool": 25,
146
+ "corn": 26,
147
+ "cousin": 27,
148
+ "cow": 28,
149
+ "dance": 29,
150
+ "dark": 30,
151
+ "deaf": 31,
152
+ "decide": 32,
153
+ "doctor": 33,
154
+ "dog": 34,
155
+ "drink": 35,
156
+ "eat": 36,
157
+ "enjoy": 37,
158
+ "family": 38,
159
+ "fine": 39,
160
+ "finish": 40,
161
+ "fish": 41,
162
+ "forget": 42,
163
+ "full": 43,
164
+ "give": 44,
165
+ "go": 45,
166
+ "graduate": 46,
167
+ "hat": 47,
168
+ "hearing": 48,
169
+ "help": 49,
170
+ "hot": 50,
171
+ "how": 51,
172
+ "jacket": 52,
173
+ "kiss": 53,
174
+ "language": 54,
175
+ "last": 55,
176
+ "later": 56,
177
+ "letter": 57,
178
+ "like": 58,
179
+ "man": 59,
180
+ "many": 60,
181
+ "medicine": 61,
182
+ "meet": 62,
183
+ "mother": 63,
184
+ "need": 64,
185
+ "no": 65,
186
+ "now": 66,
187
+ "orange": 67,
188
+ "paint": 68,
189
+ "paper": 69,
190
+ "pink": 70,
191
+ "pizza": 71,
192
+ "play": 72,
193
+ "pull": 73,
194
+ "purple": 74,
195
+ "right": 75,
196
+ "same": 76,
197
+ "school": 77,
198
+ "secretary": 78,
199
+ "shirt": 79,
200
+ "short": 80,
201
+ "son": 81,
202
+ "study": 82,
203
+ "table": 83,
204
+ "tall": 84,
205
+ "tell": 85,
206
+ "thanksgiving": 86,
207
+ "thin": 87,
208
+ "thursday": 88,
209
+ "time": 89,
210
+ "walk": 90,
211
+ "want": 91,
212
+ "what": 92,
213
+ "white": 93,
214
+ "who": 94,
215
+ "woman": 95,
216
+ "work": 96,
217
+ "wrong": 97,
218
+ "year": 98,
219
+ "yes": 99
220
+ },
221
+ "layer_norm_eps": 1e-12,
222
+ "model_type": "videomae",
223
+ "norm_pix_loss": true,
224
+ "num_attention_heads": 12,
225
+ "num_channels": 3,
226
+ "num_frames": 16,
227
+ "num_hidden_layers": 12,
228
+ "patch_size": 16,
229
+ "problem_type": "single_label_classification",
230
+ "qkv_bias": true,
231
+ "torch_dtype": "float32",
232
+ "transformers_version": "4.46.1",
233
+ "tubelet_size": 2,
234
+ "use_mean_pooling": false
235
+ }
confusion_matrix_test_50.png ADDED

Git LFS Details

  • SHA256: 2618f021a4ed470a1aaf463e4680797158d36abf308607d719f45073102a445f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.45 MB
confusion_matrix_train_50.png ADDED

Git LFS Details

  • SHA256: 52757928f3bdc501844bacdf198ece067c1897a4925bafe741c522043d640e34
  • Pointer size: 132 Bytes
  • Size of remote file: 1.62 MB
confusion_matrix_valid_50.png ADDED

Git LFS Details

  • SHA256: 72451e4fa94f84670b518ef42df1fd97d737e7c160b6b00db6cc0ef3e2f00d9c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.45 MB
longtail_f1_vs_freq.png ADDED

Git LFS Details

  • SHA256: 972be5f7534d538418ae2ef4d86c8c89fac92e31c5ea5a7837d1abb75fcd1f0e
  • Pointer size: 131 Bytes
  • Size of remote file: 317 kB
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb41e99ca1aa4af41540e5e25f3d54535bae1353169b90b63e8305f5fc2ccfae
3
+ size 345238832
per_class_report.csv ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,precision,recall,f1-score,support,train_freq,pearson_corr,spearman_corr
2
+ accident,0.0,0.0,0.0,3,1.0,,
3
+ africa,0.0,0.0,0.0,3,1.0,,
4
+ all,0.0,0.0,0.0,3,1.0,,
5
+ apple,0.0,0.0,0.0,2,1.0,,
6
+ basketball,0.0,0.0,0.0,2,1.0,,
7
+ bed,0.0,0.0,0.0,2,1.0,,
8
+ before,0.0,0.0,0.0,4,1.0,,
9
+ bird,0.0,0.0,0.0,2,1.0,,
10
+ birthday,0.0,0.0,0.0,3,1.0,,
11
+ black,0.0,0.0,0.0,3,1.0,,
12
+ blue,0.0,0.0,0.0,2,1.0,,
13
+ book,0.018691588785046728,1.0,0.03669724770642202,4,1.0,,
14
+ bowling,0.0,0.0,0.0,2,1.0,,
15
+ brown,0.0,0.0,0.0,2,1.0,,
16
+ but,0.0,0.0,0.0,3,1.0,,
17
+ can,0.0,0.0,0.0,2,1.0,,
18
+ candy,0.0,0.0,0.0,3,1.0,,
19
+ chair,0.0,0.0,0.0,3,1.0,,
20
+ change,0.0,0.0,0.0,2,1.0,,
21
+ cheat,0.0,0.0,0.0,2,1.0,,
22
+ city,0.0,0.0,0.0,2,1.0,,
23
+ clothes,0.0,0.0,0.0,3,1.0,,
24
+ color,0.0,0.0,0.0,2,1.0,,
25
+ computer,0.0,0.0,0.0,5,1.0,,
26
+ cook,0.0,0.0,0.0,2,1.0,,
27
+ cool,0.0,0.0,0.0,3,1.0,,
28
+ corn,0.0,0.0,0.0,3,1.0,,
29
+ cousin,0.0,0.0,0.0,3,1.0,,
30
+ cow,0.0,0.0,0.0,3,1.0,,
31
+ dance,0.0,0.0,0.0,2,1.0,,
32
+ dark,0.0,0.0,0.0,3,1.0,,
33
+ deaf,0.0,0.0,0.0,3,1.0,,
34
+ decide,0.0,0.0,0.0,2,1.0,,
35
+ doctor,0.0,0.0,0.0,3,1.0,,
36
+ dog,0.0,0.0,0.0,3,1.0,,
37
+ drink,0.0,0.0,0.0,4,1.0,,
38
+ eat,0.0,0.0,0.0,2,1.0,,
39
+ enjoy,0.0,0.0,0.0,2,1.0,,
40
+ family,0.0,0.0,0.0,2,1.0,,
41
+ fine,0.0,0.0,0.0,3,1.0,,
42
+ finish,0.0,0.0,0.0,3,1.0,,
43
+ fish,0.0,0.0,0.0,3,1.0,,
44
+ forget,0.0,0.0,0.0,2,1.0,,
45
+ full,0.0,0.0,0.0,2,1.0,,
46
+ give,0.0,0.0,0.0,2,1.0,,
47
+ go,0.0,0.0,0.0,3,1.0,,
48
+ graduate,0.0,0.0,0.0,2,1.0,,
49
+ hat,0.0,0.0,0.0,2,1.0,,
50
+ hearing,0.0,0.0,0.0,2,1.0,,
51
+ help,0.0,0.0,0.0,3,1.0,,
52
+ hot,0.0,0.0,0.0,3,1.0,,
53
+ how,0.0,0.0,0.0,3,1.0,,
54
+ jacket,0.0,0.0,0.0,2,1.0,,
55
+ kiss,0.0,0.0,0.0,3,1.0,,
56
+ language,0.0,0.0,0.0,3,1.0,,
57
+ last,0.0,0.0,0.0,3,1.0,,
58
+ later,0.0,0.0,0.0,2,1.0,,
59
+ letter,0.0,0.0,0.0,3,1.0,,
60
+ like,0.0,0.0,0.0,3,1.0,,
61
+ man,0.0,0.0,0.0,2,1.0,,
62
+ many,0.0,0.0,0.0,3,1.0,,
63
+ medicine,0.0,0.0,0.0,2,1.0,,
64
+ meet,0.0,0.0,0.0,2,1.0,,
65
+ mother,0.0,0.0,0.0,3,1.0,,
66
+ need,0.0,0.0,0.0,2,1.0,,
67
+ no,0.0,0.0,0.0,3,1.0,,
68
+ now,0.0,0.0,0.0,3,1.0,,
69
+ orange,0.0,0.0,0.0,3,1.0,,
70
+ paint,0.0,0.0,0.0,2,1.0,,
71
+ paper,0.0,0.0,0.0,2,1.0,,
72
+ pink,0.0,0.0,0.0,3,1.0,,
73
+ pizza,0.0,0.0,0.0,3,1.0,,
74
+ play,0.0,0.0,0.0,2,1.0,,
75
+ pull,0.0,0.0,0.0,2,1.0,,
76
+ purple,0.0,0.0,0.0,2,1.0,,
77
+ right,0.0,0.0,0.0,2,1.0,,
78
+ same,0.0,0.0,0.0,3,1.0,,
79
+ school,0.0,0.0,0.0,2,1.0,,
80
+ secretary,0.0,0.0,0.0,3,1.0,,
81
+ shirt,0.0,0.0,0.0,3,1.0,,
82
+ short,0.0,0.0,0.0,2,1.0,,
83
+ son,0.0,0.0,0.0,2,1.0,,
84
+ study,0.0,0.0,0.0,2,1.0,,
85
+ table,0.0,0.0,0.0,3,1.0,,
86
+ tall,0.0,0.0,0.0,3,1.0,,
87
+ tell,0.0,0.0,0.0,2,1.0,,
88
+ thanksgiving,0.0,0.0,0.0,3,1.0,,
89
+ thin,0.0,0.0,0.0,3,1.0,,
90
+ thursday,0.0,0.0,0.0,2,1.0,,
91
+ time,0.0,0.0,0.0,2,1.0,,
92
+ walk,0.0,0.0,0.0,3,1.0,,
93
+ want,0.0,0.0,0.0,2,1.0,,
94
+ what,0.0,0.0,0.0,3,1.0,,
95
+ white,0.0,0.0,0.0,2,1.0,,
96
+ who,0.0,0.0,0.0,3,1.0,,
97
+ woman,0.0,0.0,0.0,3,1.0,,
98
+ work,0.0,0.0,0.0,2,1.0,,
99
+ wrong,0.0,0.0,0.0,2,1.0,,
100
+ year,0.0,0.0,0.0,3,1.0,,
101
+ yes,0.0,0.0,0.0,3,1.0,,
102
+ macro avg,0.00018691588785046728,0.01,0.0003669724770642202,258,,,
103
+ weighted avg,0.00028979207418677096,0.015503875968992248,0.0005689495768437523,258,,,
preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_normalize": true,
8
+ "do_rescale": true,
9
+ "do_resize": true,
10
+ "image_mean": [
11
+ 0.485,
12
+ 0.456,
13
+ 0.406
14
+ ],
15
+ "image_processor_type": "VideoMAEImageProcessor",
16
+ "image_std": [
17
+ 0.229,
18
+ 0.224,
19
+ 0.225
20
+ ],
21
+ "resample": 2,
22
+ "rescale_factor": 0.00392156862745098,
23
+ "size": {
24
+ "shortest_edge": 224
25
+ }
26
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:622a4b2c480ab98f4218ea94d80223d25e7e66f749f9449f80783b9d4a8e973c
3
+ size 5368