Image-to-Text
Transformers
PyTorch
English
Geo-Localization
File size: 5,770 Bytes
085b09f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
{
    "ResumeFromPreviousRun": "/home/tobias.rothlin/data/TrainingSnapshots/Regression_2",
    "DatasetConfig": {
        "base_model": "openai/clip-vit-large-patch14-336",
        "augmentaion_pipeline": [
            {
                "name": "RandomRotation",
                "params": {
                    "degrees": 10
                }
            },
            {
                "name": "ColorJitter",
                "params": {
                    "brightness": 0.5,
                    "contrast": 0.2,
                    "saturation": 0.4,
                    "hue": 0.01
                }
            },
            {
                "name": "RandomPerspective",
                "params": {
                    "distortion_scale": 0.2,
                    "p": 0.5
                }
            }
        ],
        "normalize_labels": true,
        "use_cached_dataloader": false,
        "load_for_contrast_learning": false,
        "use_pre_calculated_embeddings": false,
        "load_pooling_output": false,
        "use_gaussian_smoothing": true,
        "workers": 4
    },
    "DataLoaderConfig": {
        "Train": {
            "batch_size": 64,
            "shuffle": true,
            "num_workers": 8,
            "pin_memory": true,
            "prefetch_factor": 20
        },
        "Test": {
            "batch_size": 64,
            "shuffle": true,
            "num_workers": 8,
            "pin_memory": true,
            "prefetch_factor": 20
        }
    },
    "ModelConfig": {
        "use_location_head": true,
        "use_similarity_head": false,
        "freeze_base_model": true,
        "LocationHeadClip": {
            "mean_locatation_head_output": false,
            "layers": [
                {
                    "d_model": 1024,
                    "nhead": 8
                },
                {
                    "d_model": 1024,
                    "nhead": 8
                }
            ],
            "linear_layer_mapping": {
                "in_features": 1024,
                "out_features": 1024
            }
        },
        "RegressionHead": {
            "layer_group": [
                [
                    {
                        "type": "Linear",
                        "in_features": 1024,
                        "out_features": 1024
                    },
                    {
                        "type": "Dropout",
                        "p": 0.3
                    },
                    {
                        "type": "LayerNorm",
                        "normalized_shape": 1024
                    },
                    {
                        "type": "ReLU"
                    }
                ],
                [
                    {
                        "type": "Linear",
                        "in_features": 1024,
                        "out_features": 512
                    },
                    {
                        "type": "Dropout",
                        "p": 0.2
                    },
                    {
                        "type": "LayerNorm",
                        "normalized_shape": 512
                    },
                    {
                        "type": "ReLU"
                    }
                ],
                [
                    {
                        "type": "Linear",
                        "in_features": 512,
                        "out_features": 256
                    },
                    {
                        "type": "Dropout",
                        "p": 0.1
                    },
                    {
                        "type": "LayerNorm",
                        "normalized_shape": 256
                    },
                    {
                        "type": "ReLU"
                    }
                ],
                [
                    {
                        "type": "Linear",
                        "in_features": 256,
                        "out_features": 64
                    },
                    {
                        "type": "Dropout",
                        "p": 0.1
                    },
                    {
                        "type": "LayerNorm",
                        "normalized_shape": 64
                    },
                    {
                        "type": "ReLU"
                    }
                ],
                [
                    {
                        "type": "Linear",
                        "in_features": 64,
                        "out_features": 32
                    },
                    {
                        "type": "Dropout",
                        "p": 0.1
                    },
                    {
                        "type": "LayerNorm",
                        "normalized_shape": 32
                    },
                    {
                        "type": "ReLU"
                    }
                ],
                [
                    {
                        "type": "Linear",
                        "in_features": 32,
                        "out_features": 2
                    },
                    {
                        "type": "Tanh"
                    }
                ]
            ]
        }
    },
    "TrainingConfig": {
        "Epochs": 6,
        "SaveEvery": 10000,
        "RunName": "Regression_Best_Long",
        "SnapshotPath": "/home/tobias.rothlin/data/TrainingSnapshots",
        "LogMLFlow": false,
        "MLFlowExperimentName": "ClipLocationDecoder",
        "GradientAccumulationSteps": 1,
        "ContrastLearningStrategy": null,
        "LearningRate": 5e-05,
        "Amsgrad": true,
        "WeightDecay": 0.0001,
        "Betas": [
            0.9,
            0.98
        ],
        "Gamma": 0.9
    }
}