robgreenberg3 committed on
Commit
c8bcdd4
·
verified ·
1 Parent(s): 907b1bf

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1,34 @@
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
4
  *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
  *.ftz filter=lfs diff=lfs merge=lfs -text
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
  *.ot filter=lfs diff=lfs merge=lfs -text
18
  *.parquet filter=lfs diff=lfs merge=lfs -text
19
  *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
  *.bz2 filter=lfs diff=lfs merge=lfs -text
 
6
  *.ftz filter=lfs diff=lfs merge=lfs -text
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
 
11
  *.model filter=lfs diff=lfs merge=lfs -text
12
  *.msgpack filter=lfs diff=lfs merge=lfs -text
 
 
13
  *.onnx filter=lfs diff=lfs merge=lfs -text
14
  *.ot filter=lfs diff=lfs merge=lfs -text
15
  *.parquet filter=lfs diff=lfs merge=lfs -text
16
  *.pb filter=lfs diff=lfs merge=lfs -text
 
 
17
  *.pt filter=lfs diff=lfs merge=lfs -text
18
  *.pth filter=lfs diff=lfs merge=lfs -text
19
  *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 
21
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
22
  *.tflite filter=lfs diff=lfs merge=lfs -text
23
  *.tgz filter=lfs diff=lfs merge=lfs -text
 
24
  *.xz filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
29
+ examples/India_900498_S2Hand.tif filter=lfs diff=lfs merge=lfs -text
30
+ examples/Spain_7370579_S2Hand.tif filter=lfs diff=lfs merge=lfs -text
31
+ examples/USA_430764_S2Hand.tif filter=lfs diff=lfs merge=lfs -text
32
+ examples/subsetted_512x512_HLS.S30.T10SEH.2018190.v1.4_merged.tif filter=lfs diff=lfs merge=lfs -text
33
+ examples/subsetted_512x512_HLS.S30.T10SFF.2018190.v1.4_merged.tif filter=lfs diff=lfs merge=lfs -text
34
+ examples/subsetted_512x512_HLS.S30.T10SGF.2020217.v1.4_merged.tif filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 384,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false
7
+ }
Prithvi-EO-V2-300M-TL-Sen1Floods11.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3675e9c2b52547de8ff8a19f4881c28573e6d4d2f0805d866f4fc48c1e517d60
3
+ size 1276843350
Prithvi_EO_V2_300M_BurnScars.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c5f9334be9a75c9006387ab8f3dc05a55ea7fb5ef7956717316be57c62954d3
3
+ size 1297798380
README.md ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: apache-2.0
4
+ library_name: sentence-transformers
5
+ tags:
6
+ - sentence-transformers
7
+ - feature-extraction
8
+ - sentence-similarity
9
+ - transformers
10
+ datasets:
11
+ - s2orc
12
+ - flax-sentence-embeddings/stackexchange_xml
13
+ - ms_marco
14
+ - gooaq
15
+ - yahoo_answers_topics
16
+ - code_search_net
17
+ - search_qa
18
+ - eli5
19
+ - snli
20
+ - multi_nli
21
+ - wikihow
22
+ - natural_questions
23
+ - trivia_qa
24
+ - embedding-data/sentence-compression
25
+ - embedding-data/flickr30k-captions
26
+ - embedding-data/altlex
27
+ - embedding-data/simple-wiki
28
+ - embedding-data/QQP
29
+ - embedding-data/SPECTER
30
+ - embedding-data/PAQ_pairs
31
+ - embedding-data/WikiAnswers
32
+ pipeline_tag: sentence-similarity
33
+ ---
34
+
35
+
36
+ # all-MiniLM-L6-v2
37
+ This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.
38
+
39
+ ## Usage (Sentence-Transformers)
40
+ Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
41
+
42
+ ```
43
+ pip install -U sentence-transformers
44
+ ```
45
+
46
+ Then you can use the model like this:
47
+ ```python
48
+ from sentence_transformers import SentenceTransformer
49
+ sentences = ["This is an example sentence", "Each sentence is converted"]
50
+
51
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
52
+ embeddings = model.encode(sentences)
53
+ print(embeddings)
54
+ ```
55
+
56
+ ## Usage (HuggingFace Transformers)
57
+ Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling operation on top of the contextualized word embeddings.
58
+
59
+ ```python
60
+ from transformers import AutoTokenizer, AutoModel
61
+ import torch
62
+ import torch.nn.functional as F
63
+
64
+ #Mean Pooling - Take attention mask into account for correct averaging
65
+ def mean_pooling(model_output, attention_mask):
66
+ token_embeddings = model_output[0] #First element of model_output contains all token embeddings
67
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
68
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
69
+
70
+
71
+ # Sentences we want sentence embeddings for
72
+ sentences = ['This is an example sentence', 'Each sentence is converted']
73
+
74
+ # Load model from HuggingFace Hub
75
+ tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
76
+ model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
77
+
78
+ # Tokenize sentences
79
+ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
80
+
81
+ # Compute token embeddings
82
+ with torch.no_grad():
83
+ model_output = model(**encoded_input)
84
+
85
+ # Perform pooling
86
+ sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
87
+
88
+ # Normalize embeddings
89
+ sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
90
+
91
+ print("Sentence embeddings:")
92
+ print(sentence_embeddings)
93
+ ```
94
+
95
+ ------
96
+
97
+ ## Background
98
+
99
+ The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised
100
+ contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned it on a
101
+ 1B sentence-pair dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which of a set of randomly sampled other sentences was actually paired with it in our dataset.
102
+
103
+ We developed this model during the
104
+ [Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
105
+ organized by Hugging Face. We developed this model as part of the project:
106
+ [Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as guidance from Google's Flax, JAX, and Cloud team members about efficient deep learning frameworks.
107
+
108
+ ## Intended uses
109
+
110
+ Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
111
+ the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
112
+
113
+ By default, input text longer than 256 word pieces is truncated.
114
+
115
+
116
+ ## Training procedure
117
+
118
+ ### Pre-training
119
+
120
+ We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
121
+
122
+ ### Fine-tuning
123
+
124
+ We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity for each possible sentence pair in the batch.
125
+ We then apply the cross entropy loss by comparing with true pairs.
126
+
127
+ #### Hyper parameters
128
+
129
+ We trained our model on a TPU v3-8. We trained the model for 100k steps using a batch size of 1024 (128 per TPU core).
130
+ We use a learning rate warm up of 500. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
131
+ a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
132
+
133
+ #### Training data
134
+
135
+ We use the concatenation of multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion.
136
+ We sampled each dataset with a weighted probability; the configuration is detailed in the `data_config.json` file.
137
+
138
+
139
+ | Dataset | Paper | Number of training tuples |
140
+ |--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
141
+ | [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
142
+ | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
143
+ | [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
144
+ | [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
145
+ | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
146
+ | [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
147
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
148
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
149
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
150
+ | [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
151
+ | [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
152
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
153
+ | [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
154
+ | [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395|
155
+ | [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
156
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
157
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
158
+ | [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
159
+ | [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
160
+ | [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
161
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
162
+ | AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/)) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
163
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
164
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
165
+ | [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
166
+ | [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
167
+ | [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
168
+ | [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
169
+ | [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
170
+ | [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
171
+ | [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
172
+ | [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
173
+ | **Total** | | **1,170,060,424** |
burn_scars_config.yaml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # lightning.pytorch==2.4.0
2
+ seed_everything: 2
3
+ trainer:
4
+ logger: true
5
+ max_epochs: 100
6
+ log_every_n_steps: 1
7
+ callbacks:
8
+ - class_path: EarlyStopping
9
+ init_args:
10
+ monitor: val/loss
11
+ patience: 15
12
+ - class_path: LearningRateMonitor
13
+ init_args:
14
+ logging_interval: epoch
15
+ enable_progress_bar: false
16
+ precision: bf16-mixed
17
+
18
+ model:
19
+ class_path: terratorch.tasks.SemanticSegmentationTask
20
+ init_args:
21
+ model_factory: EncoderDecoderFactory
22
+ model_args:
23
+ backbone: prithvi_eo_v2_300
24
+ backbone_pretrained: true
25
+ backbone_bands: ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"]
26
+ necks:
27
+ - name: SelectIndices
28
+ indices: [5, 11, 17, 23]
29
+ - name: ReshapeTokensToImage
30
+ - name: LearnedInterpolateToPyramidal
31
+ decoder: UNetDecoder
32
+ decoder_channels: [512, 256, 128, 64]
33
+ num_classes: 2
34
+ loss: ce
35
+ ignore_index: -1
36
+ freeze_backbone: false
37
+ plot_on_val: false
38
+ class_names: [Not burned, Burn scar]
39
+
40
+ optimizer:
41
+ class_path: torch.optim.AdamW
42
+ init_args:
43
+ lr: 1.e-4
44
+ lr_scheduler:
45
+ class_path: ReduceLROnPlateau
46
+ init_args:
47
+ monitor: val/loss
48
+ factor: 0.5
49
+ patience: 4
50
+
51
+ data:
52
+ class_path: GenericNonGeoSegmentationDataModule
53
+ init_args:
54
+ batch_size: 8
55
+ num_workers: 8
56
+ dataset_bands: # Dataset bands
57
+ - BLUE
58
+ - GREEN
59
+ - RED
60
+ - NIR_NARROW
61
+ - SWIR_1
62
+ - SWIR_2
63
+ output_bands: # Model input bands
64
+ - BLUE
65
+ - GREEN
66
+ - RED
67
+ - NIR_NARROW
68
+ - SWIR_1
69
+ - SWIR_2
70
+ rgb_indices:
71
+ - 2
72
+ - 1
73
+ - 0
74
+ train_data_root: hls_burn_scars/data
75
+ val_data_root: hls_burn_scars/data
76
+ test_data_root: hls_burn_scars/data
77
+ train_split: hls_burn_scars/splits/train.txt
78
+ val_split: hls_burn_scars/splits/val.txt
79
+ test_split: hls_burn_scars/splits/test.txt
80
+ img_grep: "*_merged.tif"
81
+ label_grep: "*.mask.tif"
82
+ means:
83
+ - 0.033349706741586264
84
+ - 0.05701185520536176
85
+ - 0.05889748132001316
86
+ - 0.2323245113436119
87
+ - 0.1972854853760658
88
+ - 0.11944914225186566
89
+ stds:
90
+ - 0.02269135568823774
91
+ - 0.026807560223070237
92
+ - 0.04004109844362779
93
+ - 0.07791732423672691
94
+ - 0.08708738838140137
95
+ - 0.07241979477437814
96
+ num_classes: 2
97
+ train_transform:
98
+ - class_path: albumentations.D4
99
+ - class_path: ToTensorV2
100
+ test_transform:
101
+ - class_path: ToTensorV2
102
+
103
+ no_data_replace: 0
104
+ no_label_replace: -1
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
3
+ "architectures": [
4
+ "BertModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 384,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 1536,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 6,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "transformers_version": "4.8.2",
21
+ "type_vocab_size": 2,
22
+ "use_cache": true,
23
+ "vocab_size": 30522
24
+ }
config.yaml ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # lightning.pytorch==2.4.0
2
+ seed_everything: 0
3
+ trainer:
4
+ accelerator: auto
5
+ strategy: auto
6
+ devices: auto
7
+ num_nodes: 1
8
+ precision: 16-mixed
9
+ logger: true
10
+ callbacks:
11
+ - class_path: lightning.pytorch.callbacks.RichProgressBar
12
+ init_args:
13
+ refresh_rate: 1
14
+ leave: false
15
+ theme:
16
+ description: white
17
+ progress_bar: '#6206E0'
18
+ progress_bar_finished: '#6206E0'
19
+ progress_bar_pulse: '#6206E0'
20
+ batch_progress: white
21
+ time: grey54
22
+ processing_speed: grey70
23
+ metrics: white
24
+ metrics_text_delimiter: ' '
25
+ metrics_format: .3f
26
+ - class_path: lightning.pytorch.callbacks.LearningRateMonitor
27
+ init_args:
28
+ logging_interval: epoch
29
+ log_momentum: false
30
+ log_weight_decay: false
31
+ - class_path: lightning.pytorch.callbacks.EarlyStopping
32
+ init_args:
33
+ monitor: val/loss
34
+ min_delta: 0.0
35
+ patience: 20
36
+ verbose: false
37
+ mode: min
38
+ strict: true
39
+ check_finite: true
40
+ log_rank_zero_only: false
41
+ fast_dev_run: false
42
+ max_epochs: 50
43
+ max_steps: -1
44
+ overfit_batches: 0.0
45
+ check_val_every_n_epoch: 2
46
+ log_every_n_steps: 10
47
+ enable_checkpointing: true
48
+ accumulate_grad_batches: 1
49
+ inference_mode: true
50
+ use_distributed_sampler: true
51
+ detect_anomaly: false
52
+ barebones: false
53
+ sync_batchnorm: false
54
+ reload_dataloaders_every_n_epochs: 0
55
+ default_root_dir: /dccstor/geofm-finetuning/benchmark-geo-bench-paolo/
56
+ model:
57
+ class_path: terratorch.tasks.SemanticSegmentationTask
58
+ init_args:
59
+ model_args:
60
+ backbone_pretrained: true
61
+ backbone: prithvi_eo_v2_300_tl
62
+ decoder: UperNetDecoder
63
+ decoder_channels: 256
64
+ decoder_scale_modules: true
65
+ num_classes: 2
66
+ rescale: true
67
+ backbone_bands:
68
+ - BLUE
69
+ - GREEN
70
+ - RED
71
+ - NIR_NARROW
72
+ - SWIR_1
73
+ - SWIR_2
74
+ head_dropout: 0.1
75
+ necks:
76
+ - name: SelectIndices
77
+ indices:
78
+ - 5
79
+ - 11
80
+ - 17
81
+ - 23
82
+ - name: ReshapeTokensToImage
83
+ model_factory: EncoderDecoderFactory
84
+ loss: ce
85
+ ignore_index: -1
86
+ lr: 0.001
87
+ freeze_backbone: false
88
+ freeze_decoder: false
89
+ plot_on_val: 10
90
+ data:
91
+ class_path: terratorch.datamodules.Sen1Floods11NonGeoDataModule
92
+ init_args:
93
+ data_root: /dccstor/geofm-finetuning/datasets/sen1floods11
94
+ batch_size: 16
95
+ num_workers: 8
96
+ bands:
97
+ - BLUE
98
+ - GREEN
99
+ - RED
100
+ - NIR_NARROW
101
+ - SWIR_1
102
+ - SWIR_2
103
+ train_transform:
104
+ - class_path: albumentations.RandomCrop
105
+ init_args:
106
+ height: 224
107
+ width: 224
108
+ p: 1.0
109
+ - class_path: albumentations.HorizontalFlip
110
+ init_args:
111
+ p: 0.5
112
+ - class_path: albumentations.VerticalFlip
113
+ init_args:
114
+ p: 0.5
115
+ - class_path: albumentations.pytorch.ToTensorV2
116
+ init_args:
117
+ transpose_mask: false
118
+ p: 1.0
119
+ val_transform:
120
+ - class_path: albumentations.pytorch.ToTensorV2
121
+ init_args:
122
+ transpose_mask: false
123
+ p: 1.0
124
+ test_transform:
125
+ - class_path: albumentations.pytorch.ToTensorV2
126
+ init_args:
127
+ transpose_mask: false
128
+ p: 1.0
129
+ drop_last: true
130
+ constant_scale: 0.0001
131
+ no_data_replace: 0.0
132
+ no_label_replace: -1
133
+ use_metadata: false
134
+ out_dtype: int16
135
+ deploy_config_file: true
136
+ optimizer:
137
+ class_path: torch.optim.AdamW
138
+ init_args:
139
+ lr: 5.0e-05
140
+ betas:
141
+ - 0.9
142
+ - 0.999
143
+ eps: 1.0e-08
144
+ weight_decay: 0.05
145
+ amsgrad: false
146
+ maximize: false
147
+ capturable: false
148
+ differentiable: false
149
+ lr_scheduler:
150
+ class_path: torch.optim.lr_scheduler.CosineAnnealingLR
151
+ init_args:
152
+ T_max: 50
153
+ eta_min: 0
154
+ last_epoch: -1
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "2.0.0",
4
+ "transformers": "4.6.1",
5
+ "pytorch": "1.8.1"
6
+ }
7
+ }
data_config.json ADDED
@@ -0,0 +1,1452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "name": "stackexchange_title_body/skeptics.stackexchange.com.jsonl.gz",
4
+ "lines": 10009,
5
+ "weight": 1
6
+ },
7
+ {
8
+ "name": "stackexchange_TitleBody_Answer/islam.stackexchange.com.jsonl.gz",
9
+ "lines": 10052,
10
+ "weight": 1
11
+ },
12
+ {
13
+ "name": "stackexchange_Title_Answer/islam.stackexchange.com.jsonl.gz",
14
+ "lines": 10052,
15
+ "weight": 1
16
+ },
17
+ {
18
+ "name": "stackexchange_TitleBody_Answer/anime.stackexchange.com.jsonl.gz",
19
+ "lines": 10131,
20
+ "weight": 1
21
+ },
22
+ {
23
+ "name": "stackexchange_Title_Answer/anime.stackexchange.com.jsonl.gz",
24
+ "lines": 10131,
25
+ "weight": 1
26
+ },
27
+ {
28
+ "name": "stackexchange_title_body/writers.stackexchange.com.jsonl.gz",
29
+ "lines": 10157,
30
+ "weight": 1
31
+ },
32
+ {
33
+ "name": "stackexchange_title_body/astronomy.stackexchange.com.jsonl.gz",
34
+ "lines": 10462,
35
+ "weight": 1
36
+ },
37
+ {
38
+ "name": "stackexchange_title_body/vi.stackexchange.com.jsonl.gz",
39
+ "lines": 10551,
40
+ "weight": 1
41
+ },
42
+ {
43
+ "name": "stackexchange_TitleBody_Answer/french.stackexchange.com.jsonl.gz",
44
+ "lines": 10578,
45
+ "weight": 1
46
+ },
47
+ {
48
+ "name": "stackexchange_Title_Answer/french.stackexchange.com.jsonl.gz",
49
+ "lines": 10578,
50
+ "weight": 1
51
+ },
52
+ {
53
+ "name": "stackexchange_title_body/cstheory.stackexchange.com.jsonl.gz",
54
+ "lines": 10642,
55
+ "weight": 1
56
+ },
57
+ {
58
+ "name": "stackexchange_TitleBody_Answer/civicrm.stackexchange.com.jsonl.gz",
59
+ "lines": 10648,
60
+ "weight": 1
61
+ },
62
+ {
63
+ "name": "stackexchange_Title_Answer/civicrm.stackexchange.com.jsonl.gz",
64
+ "lines": 10648,
65
+ "weight": 1
66
+ },
67
+ {
68
+ "name": "stackexchange_TitleBody_Answer/expressionengine.stackexchange.com.jsonl.gz",
69
+ "lines": 10742,
70
+ "weight": 1
71
+ },
72
+ {
73
+ "name": "stackexchange_Title_Answer/expressionengine.stackexchange.com.jsonl.gz",
74
+ "lines": 10742,
75
+ "weight": 1
76
+ },
77
+ {
78
+ "name": "stackexchange_title_body/engineering.stackexchange.com.jsonl.gz",
79
+ "lines": 10753,
80
+ "weight": 1
81
+ },
82
+ {
83
+ "name": "stackexchange_TitleBody_Answer/history.stackexchange.com.jsonl.gz",
84
+ "lines": 10766,
85
+ "weight": 1
86
+ },
87
+ {
88
+ "name": "stackexchange_Title_Answer/history.stackexchange.com.jsonl.gz",
89
+ "lines": 10766,
90
+ "weight": 1
91
+ },
92
+ {
93
+ "name": "stackexchange_title_body/french.stackexchange.com.jsonl.gz",
94
+ "lines": 10794,
95
+ "weight": 1
96
+ },
97
+ {
98
+ "name": "stackexchange_TitleBody_Answer/politics.stackexchange.com.jsonl.gz",
99
+ "lines": 11047,
100
+ "weight": 1
101
+ },
102
+ {
103
+ "name": "stackexchange_Title_Answer/politics.stackexchange.com.jsonl.gz",
104
+ "lines": 11047,
105
+ "weight": 1
106
+ },
107
+ {
108
+ "name": "stackexchange_title_body/economics.stackexchange.com.jsonl.gz",
109
+ "lines": 11115,
110
+ "weight": 1
111
+ },
112
+ {
113
+ "name": "stackexchange_TitleBody_Answer/craftcms.stackexchange.com.jsonl.gz",
114
+ "lines": 11236,
115
+ "weight": 1
116
+ },
117
+ {
118
+ "name": "stackexchange_Title_Answer/craftcms.stackexchange.com.jsonl.gz",
119
+ "lines": 11236,
120
+ "weight": 1
121
+ },
122
+ {
123
+ "name": "stackexchange_title_body/anime.stackexchange.com.jsonl.gz",
124
+ "lines": 11444,
125
+ "weight": 1
126
+ },
127
+ {
128
+ "name": "stackexchange_TitleBody_Answer/christianity.stackexchange.com.jsonl.gz",
129
+ "lines": 11498,
130
+ "weight": 1
131
+ },
132
+ {
133
+ "name": "stackexchange_Title_Answer/christianity.stackexchange.com.jsonl.gz",
134
+ "lines": 11498,
135
+ "weight": 1
136
+ },
137
+ {
138
+ "name": "stackexchange_TitleBody_Answer/softwarerecs.stackexchange.com.jsonl.gz",
139
+ "lines": 11761,
140
+ "weight": 1
141
+ },
142
+ {
143
+ "name": "stackexchange_Title_Answer/softwarerecs.stackexchange.com.jsonl.gz",
144
+ "lines": 11761,
145
+ "weight": 1
146
+ },
147
+ {
148
+ "name": "stackexchange_TitleBody_Answer/boardgames.stackexchange.com.jsonl.gz",
149
+ "lines": 11805,
150
+ "weight": 1
151
+ },
152
+ {
153
+ "name": "stackexchange_Title_Answer/boardgames.stackexchange.com.jsonl.gz",
154
+ "lines": 11805,
155
+ "weight": 1
156
+ },
157
+ {
158
+ "name": "stackexchange_title_body/islam.stackexchange.com.jsonl.gz",
159
+ "lines": 11853,
160
+ "weight": 1
161
+ },
162
+ {
163
+ "name": "stackexchange_title_body/expressionengine.stackexchange.com.jsonl.gz",
164
+ "lines": 11866,
165
+ "weight": 1
166
+ },
167
+ {
168
+ "name": "stackexchange_title_body/politics.stackexchange.com.jsonl.gz",
169
+ "lines": 11894,
170
+ "weight": 1
171
+ },
172
+ {
173
+ "name": "stackexchange_title_body/history.stackexchange.com.jsonl.gz",
174
+ "lines": 12021,
175
+ "weight": 1
176
+ },
177
+ {
178
+ "name": "stackexchange_title_body/christianity.stackexchange.com.jsonl.gz",
179
+ "lines": 12108,
180
+ "weight": 1
181
+ },
182
+ {
183
+ "name": "stackexchange_title_body/boardgames.stackexchange.com.jsonl.gz",
184
+ "lines": 12149,
185
+ "weight": 1
186
+ },
187
+ {
188
+ "name": "flickr30k_captions.jsonl.gz",
189
+ "lines": 317695,
190
+ "weight": 1
191
+ },
192
+ {
193
+ "name": "coco_captions.jsonl.gz",
194
+ "lines": 828395,
195
+ "weight": 1
196
+ },
197
+ {
198
+ "name": "codesearchnet.jsonl.gz",
199
+ "lines": 1151414,
200
+ "weight": 1
201
+ },
202
+ {
203
+ "name": "stackexchange_title_body/civicrm.stackexchange.com.jsonl.gz",
204
+ "lines": 12543,
205
+ "weight": 2
206
+ },
207
+ {
208
+ "name": "stackexchange_title_body/craftcms.stackexchange.com.jsonl.gz",
209
+ "lines": 12574,
210
+ "weight": 2
211
+ },
212
+ {
213
+ "name": "stackexchange_TitleBody_Answer/networkengineering.stackexchange.com.jsonl.gz",
214
+ "lines": 12590,
215
+ "weight": 2
216
+ },
217
+ {
218
+ "name": "stackexchange_Title_Answer/networkengineering.stackexchange.com.jsonl.gz",
219
+ "lines": 12590,
220
+ "weight": 2
221
+ },
222
+ {
223
+ "name": "stackexchange_TitleBody_Answer/space.stackexchange.com.jsonl.gz",
224
+ "lines": 12893,
225
+ "weight": 2
226
+ },
227
+ {
228
+ "name": "stackexchange_Title_Answer/space.stackexchange.com.jsonl.gz",
229
+ "lines": 12893,
230
+ "weight": 2
231
+ },
232
+ {
233
+ "name": "stackexchange_TitleBody_Answer/quant.stackexchange.com.jsonl.gz",
234
+ "lines": 12933,
235
+ "weight": 2
236
+ },
237
+ {
238
+ "name": "stackexchange_Title_Answer/quant.stackexchange.com.jsonl.gz",
239
+ "lines": 12933,
240
+ "weight": 2
241
+ },
242
+ {
243
+ "name": "stackexchange_TitleBody_Answer/philosophy.stackexchange.com.jsonl.gz",
244
+ "lines": 13114,
245
+ "weight": 2
246
+ },
247
+ {
248
+ "name": "stackexchange_Title_Answer/philosophy.stackexchange.com.jsonl.gz",
249
+ "lines": 13114,
250
+ "weight": 2
251
+ },
252
+ {
253
+ "name": "stackexchange_TitleBody_Answer/gardening.stackexchange.com.jsonl.gz",
254
+ "lines": 13246,
255
+ "weight": 2
256
+ },
257
+ {
258
+ "name": "stackexchange_Title_Answer/gardening.stackexchange.com.jsonl.gz",
259
+ "lines": 13246,
260
+ "weight": 2
261
+ },
262
+ {
263
+ "name": "stackexchange_title_body/hinduism.stackexchange.com.jsonl.gz",
264
+ "lines": 13450,
265
+ "weight": 2
266
+ },
267
+ {
268
+ "name": "stackexchange_title_body/networkengineering.stackexchange.com.jsonl.gz",
269
+ "lines": 13454,
270
+ "weight": 2
271
+ },
272
+ {
273
+ "name": "stackexchange_TitleBody_Answer/german.stackexchange.com.jsonl.gz",
274
+ "lines": 13733,
275
+ "weight": 2
276
+ },
277
+ {
278
+ "name": "stackexchange_Title_Answer/german.stackexchange.com.jsonl.gz",
279
+ "lines": 13733,
280
+ "weight": 2
281
+ },
282
+ {
283
+ "name": "stackexchange_title_body/german.stackexchange.com.jsonl.gz",
284
+ "lines": 13950,
285
+ "weight": 2
286
+ },
287
+ {
288
+ "name": "stackexchange_title_body/philosophy.stackexchange.com.jsonl.gz",
289
+ "lines": 14829,
290
+ "weight": 2
291
+ },
292
+ {
293
+ "name": "stackexchange_title_body/gardening.stackexchange.com.jsonl.gz",
294
+ "lines": 15136,
295
+ "weight": 2
296
+ },
297
+ {
298
+ "name": "stackexchange_title_body/space.stackexchange.com.jsonl.gz",
299
+ "lines": 15142,
300
+ "weight": 2
301
+ },
302
+ {
303
+ "name": "stackexchange_TitleBody_Answer/bicycles.stackexchange.com.jsonl.gz",
304
+ "lines": 15708,
305
+ "weight": 2
306
+ },
307
+ {
308
+ "name": "stackexchange_Title_Answer/bicycles.stackexchange.com.jsonl.gz",
309
+ "lines": 15708,
310
+ "weight": 2
311
+ },
312
+ {
313
+ "name": "stackexchange_TitleBody_Answer/law.stackexchange.com.jsonl.gz",
314
+ "lines": 16133,
315
+ "weight": 2
316
+ },
317
+ {
318
+ "name": "stackexchange_Title_Answer/law.stackexchange.com.jsonl.gz",
319
+ "lines": 16133,
320
+ "weight": 2
321
+ },
322
+ {
323
+ "name": "stackexchange_TitleBody_Answer/arduino.stackexchange.com.jsonl.gz",
324
+ "lines": 16281,
325
+ "weight": 2
326
+ },
327
+ {
328
+ "name": "stackexchange_Title_Answer/arduino.stackexchange.com.jsonl.gz",
329
+ "lines": 16281,
330
+ "weight": 2
331
+ },
332
+ {
333
+ "name": "stackexchange_title_body/bicycles.stackexchange.com.jsonl.gz",
334
+ "lines": 16353,
335
+ "weight": 2
336
+ },
337
+ {
338
+ "name": "stackexchange_TitleBody_Answer/emacs.stackexchange.com.jsonl.gz",
339
+ "lines": 16830,
340
+ "weight": 2
341
+ },
342
+ {
343
+ "name": "stackexchange_Title_Answer/emacs.stackexchange.com.jsonl.gz",
344
+ "lines": 16830,
345
+ "weight": 2
346
+ },
347
+ {
348
+ "name": "stackexchange_title_body/quant.stackexchange.com.jsonl.gz",
349
+ "lines": 17261,
350
+ "weight": 2
351
+ },
352
+ {
353
+ "name": "stackexchange_TitleBody_Answer/dsp.stackexchange.com.jsonl.gz",
354
+ "lines": 17430,
355
+ "weight": 2
356
+ },
357
+ {
358
+ "name": "stackexchange_Title_Answer/dsp.stackexchange.com.jsonl.gz",
359
+ "lines": 17430,
360
+ "weight": 2
361
+ },
362
+ {
363
+ "name": "stackexchange_TitleBody_Answer/puzzling.stackexchange.com.jsonl.gz",
364
+ "lines": 17448,
365
+ "weight": 2
366
+ },
367
+ {
368
+ "name": "stackexchange_Title_Answer/puzzling.stackexchange.com.jsonl.gz",
369
+ "lines": 17448,
370
+ "weight": 2
371
+ },
372
+ {
373
+ "name": "stackexchange_title_body/puzzling.stackexchange.com.jsonl.gz",
374
+ "lines": 17851,
375
+ "weight": 2
376
+ },
377
+ {
378
+ "name": "stackexchange_title_body/law.stackexchange.com.jsonl.gz",
379
+ "lines": 17941,
380
+ "weight": 2
381
+ },
382
+ {
383
+ "name": "stackexchange_TitleBody_Answer/movies.stackexchange.com.jsonl.gz",
384
+ "lines": 18243,
385
+ "weight": 2
386
+ },
387
+ {
388
+ "name": "stackexchange_Title_Answer/movies.stackexchange.com.jsonl.gz",
389
+ "lines": 18243,
390
+ "weight": 2
391
+ },
392
+ {
393
+ "name": "stackexchange_TitleBody_Answer/mechanics.stackexchange.com.jsonl.gz",
394
+ "lines": 18613,
395
+ "weight": 2
396
+ },
397
+ {
398
+ "name": "stackexchange_Title_Answer/mechanics.stackexchange.com.jsonl.gz",
399
+ "lines": 18613,
400
+ "weight": 2
401
+ },
402
+ {
403
+ "name": "stackexchange_TitleBody_Answer/aviation.stackexchange.com.jsonl.gz",
404
+ "lines": 18755,
405
+ "weight": 2
406
+ },
407
+ {
408
+ "name": "stackexchange_Title_Answer/aviation.stackexchange.com.jsonl.gz",
409
+ "lines": 18755,
410
+ "weight": 2
411
+ },
412
+ {
413
+ "name": "stackexchange_TitleBody_Answer/biology.stackexchange.com.jsonl.gz",
414
+ "lines": 19277,
415
+ "weight": 2
416
+ },
417
+ {
418
+ "name": "stackexchange_Title_Answer/biology.stackexchange.com.jsonl.gz",
419
+ "lines": 19277,
420
+ "weight": 2
421
+ },
422
+ {
423
+ "name": "stackexchange_TitleBody_Answer/crypto.stackexchange.com.jsonl.gz",
424
+ "lines": 19404,
425
+ "weight": 2
426
+ },
427
+ {
428
+ "name": "stackexchange_Title_Answer/crypto.stackexchange.com.jsonl.gz",
429
+ "lines": 19404,
430
+ "weight": 2
431
+ },
432
+ {
433
+ "name": "stackexchange_title_body/arduino.stackexchange.com.jsonl.gz",
434
+ "lines": 19553,
435
+ "weight": 2
436
+ },
437
+ {
438
+ "name": "stackexchange_TitleBody_Answer/music.stackexchange.com.jsonl.gz",
439
+ "lines": 19936,
440
+ "weight": 2
441
+ },
442
+ {
443
+ "name": "stackexchange_Title_Answer/music.stackexchange.com.jsonl.gz",
444
+ "lines": 19936,
445
+ "weight": 2
446
+ },
447
+ {
448
+ "name": "stackexchange_title_body/aviation.stackexchange.com.jsonl.gz",
449
+ "lines": 20139,
450
+ "weight": 2
451
+ },
452
+ {
453
+ "name": "stackexchange_title_body/softwarerecs.stackexchange.com.jsonl.gz",
454
+ "lines": 20142,
455
+ "weight": 2
456
+ },
457
+ {
458
+ "name": "stackexchange_title_body/movies.stackexchange.com.jsonl.gz",
459
+ "lines": 20181,
460
+ "weight": 2
461
+ },
462
+ {
463
+ "name": "stackexchange_TitleBody_Answer/datascience.stackexchange.com.jsonl.gz",
464
+ "lines": 20503,
465
+ "weight": 2
466
+ },
467
+ {
468
+ "name": "stackexchange_Title_Answer/datascience.stackexchange.com.jsonl.gz",
469
+ "lines": 20503,
470
+ "weight": 2
471
+ },
472
+ {
473
+ "name": "stackexchange_title_body/music.stackexchange.com.jsonl.gz",
474
+ "lines": 20636,
475
+ "weight": 2
476
+ },
477
+ {
478
+ "name": "stackexchange_TitleBody_Answer/japanese.stackexchange.com.jsonl.gz",
479
+ "lines": 20948,
480
+ "weight": 2
481
+ },
482
+ {
483
+ "name": "stackexchange_Title_Answer/japanese.stackexchange.com.jsonl.gz",
484
+ "lines": 20948,
485
+ "weight": 2
486
+ },
487
+ {
488
+ "name": "stackexchange_title_body/emacs.stackexchange.com.jsonl.gz",
489
+ "lines": 21055,
490
+ "weight": 2
491
+ },
492
+ {
493
+ "name": "stackexchange_title_body/dsp.stackexchange.com.jsonl.gz",
494
+ "lines": 21252,
495
+ "weight": 2
496
+ },
497
+ {
498
+ "name": "stackexchange_title_body/japanese.stackexchange.com.jsonl.gz",
499
+ "lines": 22056,
500
+ "weight": 2
501
+ },
502
+ {
503
+ "name": "stackexchange_TitleBody_Answer/bitcoin.stackexchange.com.jsonl.gz",
504
+ "lines": 22474,
505
+ "weight": 2
506
+ },
507
+ {
508
+ "name": "stackexchange_Title_Answer/bitcoin.stackexchange.com.jsonl.gz",
509
+ "lines": 22474,
510
+ "weight": 2
511
+ },
512
+ {
513
+ "name": "stackexchange_TitleBody_Answer/cooking.stackexchange.com.jsonl.gz",
514
+ "lines": 22641,
515
+ "weight": 2
516
+ },
517
+ {
518
+ "name": "stackexchange_Title_Answer/cooking.stackexchange.com.jsonl.gz",
519
+ "lines": 22641,
520
+ "weight": 2
521
+ },
522
+ {
523
+ "name": "stackexchange_title_body/mechanics.stackexchange.com.jsonl.gz",
524
+ "lines": 22868,
525
+ "weight": 2
526
+ },
527
+ {
528
+ "name": "stackexchange_TitleBody_Answer/photo.stackexchange.com.jsonl.gz",
529
+ "lines": 23204,
530
+ "weight": 2
531
+ },
532
+ {
533
+ "name": "stackexchange_Title_Answer/photo.stackexchange.com.jsonl.gz",
534
+ "lines": 23204,
535
+ "weight": 2
536
+ },
537
+ {
538
+ "name": "stackexchange_title_body/crypto.stackexchange.com.jsonl.gz",
539
+ "lines": 23231,
540
+ "weight": 2
541
+ },
542
+ {
543
+ "name": "stackexchange_title_body/cooking.stackexchange.com.jsonl.gz",
544
+ "lines": 23705,
545
+ "weight": 2
546
+ },
547
+ {
548
+ "name": "stackexchange_title_body/photo.stackexchange.com.jsonl.gz",
549
+ "lines": 23753,
550
+ "weight": 2
551
+ },
552
+ {
553
+ "name": "stackexchange_TitleBody_Answer/workplace.stackexchange.com.jsonl.gz",
554
+ "lines": 24012,
555
+ "weight": 2
556
+ },
557
+ {
558
+ "name": "stackexchange_Title_Answer/workplace.stackexchange.com.jsonl.gz",
559
+ "lines": 24012,
560
+ "weight": 2
561
+ },
562
+ {
563
+ "name": "stackexchange_TitleBody_Answer/meta.stackoverflow.com.jsonl.gz",
564
+ "lines": 24044,
565
+ "weight": 2
566
+ },
567
+ {
568
+ "name": "stackexchange_Title_Answer/meta.stackoverflow.com.jsonl.gz",
569
+ "lines": 24044,
570
+ "weight": 2
571
+ },
572
+ {
573
+ "name": "stackexchange_TitleBody_Answer/raspberrypi.stackexchange.com.jsonl.gz",
574
+ "lines": 24143,
575
+ "weight": 2
576
+ },
577
+ {
578
+ "name": "stackexchange_Title_Answer/raspberrypi.stackexchange.com.jsonl.gz",
579
+ "lines": 24143,
580
+ "weight": 2
581
+ },
582
+ {
583
+ "name": "stackexchange_title_body/workplace.stackexchange.com.jsonl.gz",
584
+ "lines": 24189,
585
+ "weight": 2
586
+ },
587
+ {
588
+ "name": "stackexchange_title_body/biology.stackexchange.com.jsonl.gz",
589
+ "lines": 24447,
590
+ "weight": 3
591
+ },
592
+ {
593
+ "name": "stackexchange_TitleBody_Answer/webapps.stackexchange.com.jsonl.gz",
594
+ "lines": 24867,
595
+ "weight": 3
596
+ },
597
+ {
598
+ "name": "stackexchange_Title_Answer/webapps.stackexchange.com.jsonl.gz",
599
+ "lines": 24867,
600
+ "weight": 3
601
+ },
602
+ {
603
+ "name": "stackexchange_title_body/bitcoin.stackexchange.com.jsonl.gz",
604
+ "lines": 25374,
605
+ "weight": 3
606
+ },
607
+ {
608
+ "name": "stackexchange_TitleBody_Answer/judaism.stackexchange.com.jsonl.gz",
609
+ "lines": 26085,
610
+ "weight": 3
611
+ },
612
+ {
613
+ "name": "stackexchange_Title_Answer/judaism.stackexchange.com.jsonl.gz",
614
+ "lines": 26085,
615
+ "weight": 3
616
+ },
617
+ {
618
+ "name": "stackexchange_TitleBody_Answer/ethereum.stackexchange.com.jsonl.gz",
619
+ "lines": 26124,
620
+ "weight": 3
621
+ },
622
+ {
623
+ "name": "stackexchange_Title_Answer/ethereum.stackexchange.com.jsonl.gz",
624
+ "lines": 26124,
625
+ "weight": 3
626
+ },
627
+ {
628
+ "name": "stackexchange_TitleBody_Answer/worldbuilding.stackexchange.com.jsonl.gz",
629
+ "lines": 26210,
630
+ "weight": 3
631
+ },
632
+ {
633
+ "name": "stackexchange_Title_Answer/worldbuilding.stackexchange.com.jsonl.gz",
634
+ "lines": 26210,
635
+ "weight": 3
636
+ },
637
+ {
638
+ "name": "stackexchange_title_body/worldbuilding.stackexchange.com.jsonl.gz",
639
+ "lines": 26763,
640
+ "weight": 3
641
+ },
642
+ {
643
+ "name": "stackexchange_TitleBody_Answer/chemistry.stackexchange.com.jsonl.gz",
644
+ "lines": 27061,
645
+ "weight": 3
646
+ },
647
+ {
648
+ "name": "stackexchange_Title_Answer/chemistry.stackexchange.com.jsonl.gz",
649
+ "lines": 27061,
650
+ "weight": 3
651
+ },
652
+ {
653
+ "name": "stackexchange_title_body/datascience.stackexchange.com.jsonl.gz",
654
+ "lines": 27397,
655
+ "weight": 3
656
+ },
657
+ {
658
+ "name": "stackexchange_TitleBody_Answer/graphicdesign.stackexchange.com.jsonl.gz",
659
+ "lines": 28083,
660
+ "weight": 3
661
+ },
662
+ {
663
+ "name": "stackexchange_Title_Answer/graphicdesign.stackexchange.com.jsonl.gz",
664
+ "lines": 28083,
665
+ "weight": 3
666
+ },
667
+ {
668
+ "name": "stackexchange_TitleBody_Answer/ux.stackexchange.com.jsonl.gz",
669
+ "lines": 28901,
670
+ "weight": 3
671
+ },
672
+ {
673
+ "name": "stackexchange_Title_Answer/ux.stackexchange.com.jsonl.gz",
674
+ "lines": 28901,
675
+ "weight": 3
676
+ },
677
+ {
678
+ "name": "stackexchange_title_body/ux.stackexchange.com.jsonl.gz",
679
+ "lines": 29403,
680
+ "weight": 3
681
+ },
682
+ {
683
+ "name": "stackexchange_TitleBody_Answer/money.stackexchange.com.jsonl.gz",
684
+ "lines": 29404,
685
+ "weight": 3
686
+ },
687
+ {
688
+ "name": "stackexchange_Title_Answer/money.stackexchange.com.jsonl.gz",
689
+ "lines": 29404,
690
+ "weight": 3
691
+ },
692
+ {
693
+ "name": "stackexchange_title_body/webapps.stackexchange.com.jsonl.gz",
694
+ "lines": 29697,
695
+ "weight": 3
696
+ },
697
+ {
698
+ "name": "stackexchange_TitleBody_Answer/cs.stackexchange.com.jsonl.gz",
699
+ "lines": 30010,
700
+ "weight": 3
701
+ },
702
+ {
703
+ "name": "stackexchange_Title_Answer/cs.stackexchange.com.jsonl.gz",
704
+ "lines": 30010,
705
+ "weight": 3
706
+ },
707
+ {
708
+ "name": "stackexchange_title_body/graphicdesign.stackexchange.com.jsonl.gz",
709
+ "lines": 30233,
710
+ "weight": 3
711
+ },
712
+ {
713
+ "name": "stackexchange_TitleBody_Answer/webmasters.stackexchange.com.jsonl.gz",
714
+ "lines": 30370,
715
+ "weight": 3
716
+ },
717
+ {
718
+ "name": "stackexchange_Title_Answer/webmasters.stackexchange.com.jsonl.gz",
719
+ "lines": 30370,
720
+ "weight": 3
721
+ },
722
+ {
723
+ "name": "stackexchange_title_body/raspberrypi.stackexchange.com.jsonl.gz",
724
+ "lines": 30625,
725
+ "weight": 3
726
+ },
727
+ {
728
+ "name": "stackexchange_title_body/money.stackexchange.com.jsonl.gz",
729
+ "lines": 32021,
730
+ "weight": 3
731
+ },
732
+ {
733
+ "name": "stackexchange_title_body/judaism.stackexchange.com.jsonl.gz",
734
+ "lines": 32028,
735
+ "weight": 3
736
+ },
737
+ {
738
+ "name": "stackexchange_TitleBody_Answer/academia.stackexchange.com.jsonl.gz",
739
+ "lines": 32137,
740
+ "weight": 3
741
+ },
742
+ {
743
+ "name": "stackexchange_Title_Answer/academia.stackexchange.com.jsonl.gz",
744
+ "lines": 32137,
745
+ "weight": 3
746
+ },
747
+ {
748
+ "name": "stackexchange_title_body/ethereum.stackexchange.com.jsonl.gz",
749
+ "lines": 32760,
750
+ "weight": 3
751
+ },
752
+ {
753
+ "name": "stackexchange_title_body/academia.stackexchange.com.jsonl.gz",
754
+ "lines": 34331,
755
+ "weight": 3
756
+ },
757
+ {
758
+ "name": "stackexchange_title_body/chemistry.stackexchange.com.jsonl.gz",
759
+ "lines": 34506,
760
+ "weight": 3
761
+ },
762
+ {
763
+ "name": "stackexchange_title_body/webmasters.stackexchange.com.jsonl.gz",
764
+ "lines": 34559,
765
+ "weight": 3
766
+ },
767
+ {
768
+ "name": "stackexchange_title_body/meta.stackoverflow.com.jsonl.gz",
769
+ "lines": 36456,
770
+ "weight": 3
771
+ },
772
+ {
773
+ "name": "stackexchange_TitleBody_Answer/travel.stackexchange.com.jsonl.gz",
774
+ "lines": 36533,
775
+ "weight": 4
776
+ },
777
+ {
778
+ "name": "stackexchange_Title_Answer/travel.stackexchange.com.jsonl.gz",
779
+ "lines": 36533,
780
+ "weight": 4
781
+ },
782
+ {
783
+ "name": "stackexchange_TitleBody_Answer/android.stackexchange.com.jsonl.gz",
784
+ "lines": 38077,
785
+ "weight": 4
786
+ },
787
+ {
788
+ "name": "stackexchange_Title_Answer/android.stackexchange.com.jsonl.gz",
789
+ "lines": 38077,
790
+ "weight": 4
791
+ },
792
+ {
793
+ "name": "stackexchange_title_body/cs.stackexchange.com.jsonl.gz",
794
+ "lines": 38314,
795
+ "weight": 4
796
+ },
797
+ {
798
+ "name": "stackexchange_TitleBody_Answer/gamedev.stackexchange.com.jsonl.gz",
799
+ "lines": 40154,
800
+ "weight": 4
801
+ },
802
+ {
803
+ "name": "stackexchange_Title_Answer/gamedev.stackexchange.com.jsonl.gz",
804
+ "lines": 40154,
805
+ "weight": 4
806
+ },
807
+ {
808
+ "name": "stackexchange_TitleBody_Answer/rpg.stackexchange.com.jsonl.gz",
809
+ "lines": 40435,
810
+ "weight": 4
811
+ },
812
+ {
813
+ "name": "stackexchange_Title_Answer/rpg.stackexchange.com.jsonl.gz",
814
+ "lines": 40435,
815
+ "weight": 4
816
+ },
817
+ {
818
+ "name": "stackexchange_title_body/travel.stackexchange.com.jsonl.gz",
819
+ "lines": 41227,
820
+ "weight": 4
821
+ },
822
+ {
823
+ "name": "stackexchange_TitleBody_Answer/codereview.stackexchange.com.jsonl.gz",
824
+ "lines": 41748,
825
+ "weight": 4
826
+ },
827
+ {
828
+ "name": "stackexchange_Title_Answer/codereview.stackexchange.com.jsonl.gz",
829
+ "lines": 41748,
830
+ "weight": 4
831
+ },
832
+ {
833
+ "name": "stackexchange_title_body/rpg.stackexchange.com.jsonl.gz",
834
+ "lines": 42303,
835
+ "weight": 4
836
+ },
837
+ {
838
+ "name": "stackexchange_title_body/codereview.stackexchange.com.jsonl.gz",
839
+ "lines": 45765,
840
+ "weight": 4
841
+ },
842
+ {
843
+ "name": "stackexchange_title_body/gamedev.stackexchange.com.jsonl.gz",
844
+ "lines": 46485,
845
+ "weight": 4
846
+ },
847
+ {
848
+ "name": "stackexchange_TitleBody_Answer/softwareengineering.stackexchange.com.jsonl.gz",
849
+ "lines": 51326,
850
+ "weight": 5
851
+ },
852
+ {
853
+ "name": "stackexchange_Title_Answer/softwareengineering.stackexchange.com.jsonl.gz",
854
+ "lines": 51326,
855
+ "weight": 5
856
+ },
857
+ {
858
+ "name": "stackexchange_TitleBody_Answer/security.stackexchange.com.jsonl.gz",
859
+ "lines": 51355,
860
+ "weight": 5
861
+ },
862
+ {
863
+ "name": "stackexchange_Title_Answer/security.stackexchange.com.jsonl.gz",
864
+ "lines": 51355,
865
+ "weight": 5
866
+ },
867
+ {
868
+ "name": "stackexchange_title_body/android.stackexchange.com.jsonl.gz",
869
+ "lines": 51608,
870
+ "weight": 5
871
+ },
872
+ {
873
+ "name": "stackexchange_TitleBody_Answer/diy.stackexchange.com.jsonl.gz",
874
+ "lines": 52896,
875
+ "weight": 5
876
+ },
877
+ {
878
+ "name": "stackexchange_Title_Answer/diy.stackexchange.com.jsonl.gz",
879
+ "lines": 52896,
880
+ "weight": 5
881
+ },
882
+ {
883
+ "name": "stackexchange_title_body/softwareengineering.stackexchange.com.jsonl.gz",
884
+ "lines": 53942,
885
+ "weight": 5
886
+ },
887
+ {
888
+ "name": "stackexchange_TitleBody_Answer/blender.stackexchange.com.jsonl.gz",
889
+ "lines": 54153,
890
+ "weight": 5
891
+ },
892
+ {
893
+ "name": "stackexchange_Title_Answer/blender.stackexchange.com.jsonl.gz",
894
+ "lines": 54153,
895
+ "weight": 5
896
+ },
897
+ {
898
+ "name": "stackexchange_TitleBody_Answer/scifi.stackexchange.com.jsonl.gz",
899
+ "lines": 54805,
900
+ "weight": 5
901
+ },
902
+ {
903
+ "name": "stackexchange_Title_Answer/scifi.stackexchange.com.jsonl.gz",
904
+ "lines": 54805,
905
+ "weight": 5
906
+ },
907
+ {
908
+ "name": "stackexchange_title_body/security.stackexchange.com.jsonl.gz",
909
+ "lines": 58000,
910
+ "weight": 5
911
+ },
912
+ {
913
+ "name": "stackexchange_TitleBody_Answer/mathematica.stackexchange.com.jsonl.gz",
914
+ "lines": 59895,
915
+ "weight": 5
916
+ },
917
+ {
918
+ "name": "stackexchange_Title_Answer/mathematica.stackexchange.com.jsonl.gz",
919
+ "lines": 59895,
920
+ "weight": 5
921
+ },
922
+ {
923
+ "name": "stackexchange_title_body/diy.stackexchange.com.jsonl.gz",
924
+ "lines": 60083,
925
+ "weight": 5
926
+ },
927
+ {
928
+ "name": "stackexchange_TitleBody_Answer/meta.stackexchange.com.jsonl.gz",
929
+ "lines": 60744,
930
+ "weight": 5
931
+ },
932
+ {
933
+ "name": "stackexchange_Title_Answer/meta.stackexchange.com.jsonl.gz",
934
+ "lines": 60744,
935
+ "weight": 5
936
+ },
937
+ {
938
+ "name": "stackexchange_title_body/scifi.stackexchange.com.jsonl.gz",
939
+ "lines": 61528,
940
+ "weight": 6
941
+ },
942
+ {
943
+ "name": "stackexchange_TitleBody_Answer/drupal.stackexchange.com.jsonl.gz",
944
+ "lines": 67817,
945
+ "weight": 6
946
+ },
947
+ {
948
+ "name": "stackexchange_Title_Answer/drupal.stackexchange.com.jsonl.gz",
949
+ "lines": 67817,
950
+ "weight": 6
951
+ },
952
+ {
953
+ "name": "stackexchange_TitleBody_Answer/dba.stackexchange.com.jsonl.gz",
954
+ "lines": 71449,
955
+ "weight": 6
956
+ },
957
+ {
958
+ "name": "stackexchange_Title_Answer/dba.stackexchange.com.jsonl.gz",
959
+ "lines": 71449,
960
+ "weight": 6
961
+ },
962
+ {
963
+ "name": "stackexchange_title_body/mathematica.stackexchange.com.jsonl.gz",
964
+ "lines": 73131,
965
+ "weight": 7
966
+ },
967
+ {
968
+ "name": "stackexchange_TitleBody_Answer/ell.stackexchange.com.jsonl.gz",
969
+ "lines": 77892,
970
+ "weight": 7
971
+ },
972
+ {
973
+ "name": "stackexchange_Title_Answer/ell.stackexchange.com.jsonl.gz",
974
+ "lines": 77892,
975
+ "weight": 7
976
+ },
977
+ {
978
+ "name": "stackexchange_TitleBody_Answer/magento.stackexchange.com.jsonl.gz",
979
+ "lines": 79241,
980
+ "weight": 7
981
+ },
982
+ {
983
+ "name": "stackexchange_Title_Answer/magento.stackexchange.com.jsonl.gz",
984
+ "lines": 79241,
985
+ "weight": 7
986
+ },
987
+ {
988
+ "name": "stackexchange_title_body/drupal.stackexchange.com.jsonl.gz",
989
+ "lines": 79717,
990
+ "weight": 7
991
+ },
992
+ {
993
+ "name": "stackexchange_TitleBody_Answer/sharepoint.stackexchange.com.jsonl.gz",
994
+ "lines": 80420,
995
+ "weight": 7
996
+ },
997
+ {
998
+ "name": "stackexchange_Title_Answer/sharepoint.stackexchange.com.jsonl.gz",
999
+ "lines": 80420,
1000
+ "weight": 7
1001
+ },
1002
+ {
1003
+ "name": "stackexchange_title_body/blender.stackexchange.com.jsonl.gz",
1004
+ "lines": 80766,
1005
+ "weight": 7
1006
+ },
1007
+ {
1008
+ "name": "stackexchange_title_body/dba.stackexchange.com.jsonl.gz",
1009
+ "lines": 81871,
1010
+ "weight": 7
1011
+ },
1012
+ {
1013
+ "name": "stackexchange_TitleBody_Answer/gaming.stackexchange.com.jsonl.gz",
1014
+ "lines": 82887,
1015
+ "weight": 7
1016
+ },
1017
+ {
1018
+ "name": "stackexchange_Title_Answer/gaming.stackexchange.com.jsonl.gz",
1019
+ "lines": 82887,
1020
+ "weight": 7
1021
+ },
1022
+ {
1023
+ "name": "stackexchange_title_body/ell.stackexchange.com.jsonl.gz",
1024
+ "lines": 83271,
1025
+ "weight": 7
1026
+ },
1027
+ {
1028
+ "name": "stackexchange_title_body/meta.stackexchange.com.jsonl.gz",
1029
+ "lines": 83510,
1030
+ "weight": 7
1031
+ },
1032
+ {
1033
+ "name": "stackexchange_TitleBody_Answer/wordpress.stackexchange.com.jsonl.gz",
1034
+ "lines": 83621,
1035
+ "weight": 7
1036
+ },
1037
+ {
1038
+ "name": "stackexchange_Title_Answer/wordpress.stackexchange.com.jsonl.gz",
1039
+ "lines": 83621,
1040
+ "weight": 7
1041
+ },
1042
+ {
1043
+ "name": "stackexchange_TitleBody_Answer/mathoverflow.net.jsonl.gz",
1044
+ "lines": 85289,
1045
+ "weight": 8
1046
+ },
1047
+ {
1048
+ "name": "stackexchange_Title_Answer/mathoverflow.net.jsonl.gz",
1049
+ "lines": 85289,
1050
+ "weight": 8
1051
+ },
1052
+ {
1053
+ "name": "stackexchange_TitleBody_Answer/salesforce.stackexchange.com.jsonl.gz",
1054
+ "lines": 87272,
1055
+ "weight": 8
1056
+ },
1057
+ {
1058
+ "name": "stackexchange_Title_Answer/salesforce.stackexchange.com.jsonl.gz",
1059
+ "lines": 87272,
1060
+ "weight": 8
1061
+ },
1062
+ {
1063
+ "name": "stackexchange_title_body/gaming.stackexchange.com.jsonl.gz",
1064
+ "lines": 88912,
1065
+ "weight": 8
1066
+ },
1067
+ {
1068
+ "name": "stackexchange_TitleBody_Answer/apple.stackexchange.com.jsonl.gz",
1069
+ "lines": 92487,
1070
+ "weight": 8
1071
+ },
1072
+ {
1073
+ "name": "stackexchange_Title_Answer/apple.stackexchange.com.jsonl.gz",
1074
+ "lines": 92487,
1075
+ "weight": 8
1076
+ },
1077
+ {
1078
+ "name": "stackexchange_title_body/sharepoint.stackexchange.com.jsonl.gz",
1079
+ "lines": 94011,
1080
+ "weight": 8
1081
+ },
1082
+ {
1083
+ "name": "stackexchange_title_body/magento.stackexchange.com.jsonl.gz",
1084
+ "lines": 99991,
1085
+ "weight": 9
1086
+ },
1087
+ {
1088
+ "name": "stackexchange_TitleBody_Answer/gis.stackexchange.com.jsonl.gz",
1089
+ "lines": 100254,
1090
+ "weight": 9
1091
+ },
1092
+ {
1093
+ "name": "stackexchange_Title_Answer/gis.stackexchange.com.jsonl.gz",
1094
+ "lines": 100254,
1095
+ "weight": 9
1096
+ },
1097
+ {
1098
+ "name": "stackexchange_title_body/wordpress.stackexchange.com.jsonl.gz",
1099
+ "lines": 100474,
1100
+ "weight": 9
1101
+ },
1102
+ {
1103
+ "name": "stackexchange_TitleBody_Answer/english.stackexchange.com.jsonl.gz",
1104
+ "lines": 100640,
1105
+ "weight": 9
1106
+ },
1107
+ {
1108
+ "name": "stackexchange_Title_Answer/english.stackexchange.com.jsonl.gz",
1109
+ "lines": 100640,
1110
+ "weight": 9
1111
+ },
1112
+ {
1113
+ "name": "stackexchange_title_body/salesforce.stackexchange.com.jsonl.gz",
1114
+ "lines": 105260,
1115
+ "weight": 9
1116
+ },
1117
+ {
1118
+ "name": "stackexchange_title_body/english.stackexchange.com.jsonl.gz",
1119
+ "lines": 109522,
1120
+ "weight": 10
1121
+ },
1122
+ {
1123
+ "name": "stackexchange_title_body/apple.stackexchange.com.jsonl.gz",
1124
+ "lines": 110622,
1125
+ "weight": 10
1126
+ },
1127
+ {
1128
+ "name": "stackexchange_TitleBody_Answer/stats.stackexchange.com.jsonl.gz",
1129
+ "lines": 115679,
1130
+ "weight": 10
1131
+ },
1132
+ {
1133
+ "name": "stackexchange_Title_Answer/stats.stackexchange.com.jsonl.gz",
1134
+ "lines": 115679,
1135
+ "weight": 10
1136
+ },
1137
+ {
1138
+ "name": "stackexchange_title_body/mathoverflow.net.jsonl.gz",
1139
+ "lines": 120851,
1140
+ "weight": 10
1141
+ },
1142
+ {
1143
+ "name": "stackexchange_TitleBody_Answer/electronics.stackexchange.com.jsonl.gz",
1144
+ "lines": 129494,
1145
+ "weight": 11
1146
+ },
1147
+ {
1148
+ "name": "stackexchange_Title_Answer/electronics.stackexchange.com.jsonl.gz",
1149
+ "lines": 129494,
1150
+ "weight": 11
1151
+ },
1152
+ {
1153
+ "name": "stackexchange_title_body/gis.stackexchange.com.jsonl.gz",
1154
+ "lines": 131000,
1155
+ "weight": 11
1156
+ },
1157
+ {
1158
+ "name": "stackexchange_TitleBody_Answer/physics.stackexchange.com.jsonl.gz",
1159
+ "lines": 141230,
1160
+ "weight": 12
1161
+ },
1162
+ {
1163
+ "name": "stackexchange_Title_Answer/physics.stackexchange.com.jsonl.gz",
1164
+ "lines": 141230,
1165
+ "weight": 12
1166
+ },
1167
+ {
1168
+ "name": "stackexchange_title_body/electronics.stackexchange.com.jsonl.gz",
1169
+ "lines": 143582,
1170
+ "weight": 12
1171
+ },
1172
+ {
1173
+ "name": "stackexchange_TitleBody_Answer/unix.stackexchange.com.jsonl.gz",
1174
+ "lines": 155414,
1175
+ "weight": 13
1176
+ },
1177
+ {
1178
+ "name": "stackexchange_Title_Answer/unix.stackexchange.com.jsonl.gz",
1179
+ "lines": 155414,
1180
+ "weight": 13
1181
+ },
1182
+ {
1183
+ "name": "stackexchange_TitleBody_Answer/tex.stackexchange.com.jsonl.gz",
1184
+ "lines": 171628,
1185
+ "weight": 15
1186
+ },
1187
+ {
1188
+ "name": "stackexchange_Title_Answer/tex.stackexchange.com.jsonl.gz",
1189
+ "lines": 171628,
1190
+ "weight": 15
1191
+ },
1192
+ {
1193
+ "name": "stackexchange_title_body/physics.stackexchange.com.jsonl.gz",
1194
+ "lines": 173307,
1195
+ "weight": 15
1196
+ },
1197
+ {
1198
+ "name": "stackexchange_title_body/stats.stackexchange.com.jsonl.gz",
1199
+ "lines": 173466,
1200
+ "weight": 15
1201
+ },
1202
+ {
1203
+ "name": "stackexchange_title_body/unix.stackexchange.com.jsonl.gz",
1204
+ "lines": 185997,
1205
+ "weight": 16
1206
+ },
1207
+ {
1208
+ "name": "stackexchange_title_body/tex.stackexchange.com.jsonl.gz",
1209
+ "lines": 202954,
1210
+ "weight": 17
1211
+ },
1212
+ {
1213
+ "name": "TriviaQA_pairs.jsonl.gz",
1214
+ "lines": 73346,
1215
+ "weight": 19
1216
+ },
1217
+ {
1218
+ "name": "stackexchange_TitleBody_Answer/serverfault.com.jsonl.gz",
1219
+ "lines": 238507,
1220
+ "weight": 20
1221
+ },
1222
+ {
1223
+ "name": "stackexchange_Title_Answer/serverfault.com.jsonl.gz",
1224
+ "lines": 238507,
1225
+ "weight": 20
1226
+ },
1227
+ {
1228
+ "name": "stackexchange_duplicate_questions_title-body_title-body.jsonl.gz",
1229
+ "lines": 250460,
1230
+ "weight": 21
1231
+ },
1232
+ {
1233
+ "name": "stackexchange_duplicate_questions_body_body.jsonl.gz",
1234
+ "lines": 250519,
1235
+ "weight": 21
1236
+ },
1237
+ {
1238
+ "name": "squad_pairs.jsonl.gz",
1239
+ "lines": 87599,
1240
+ "weight": 22
1241
+ },
1242
+ {
1243
+ "name": "stackexchange_TitleBody_Answer/askubuntu.com.jsonl.gz",
1244
+ "lines": 267135,
1245
+ "weight": 22
1246
+ },
1247
+ {
1248
+ "name": "stackexchange_Title_Answer/askubuntu.com.jsonl.gz",
1249
+ "lines": 267135,
1250
+ "weight": 22
1251
+ },
1252
+ {
1253
+ "name": "stackexchange_title_body/serverfault.com.jsonl.gz",
1254
+ "lines": 270904,
1255
+ "weight": 23
1256
+ },
1257
+ {
1258
+ "name": "NQ-train_pairs.jsonl.gz",
1259
+ "lines": 100231,
1260
+ "weight": 25
1261
+ },
1262
+ {
1263
+ "name": "SimpleWiki.jsonl.gz",
1264
+ "lines": 102225,
1265
+ "weight": 26
1266
+ },
1267
+ {
1268
+ "name": "quora_duplicates_triplets.jsonl.gz",
1269
+ "lines": 103663,
1270
+ "weight": 26
1271
+ },
1272
+ {
1273
+ "name": "stackexchange_duplicate_questions_title_title.jsonl.gz",
1274
+ "lines": 304525,
1275
+ "weight": 26
1276
+ },
1277
+ {
1278
+ "name": "altlex.jsonl.gz",
1279
+ "lines": 112696,
1280
+ "weight": 28
1281
+ },
1282
+ {
1283
+ "name": "stackexchange_title_body/askubuntu.com.jsonl.gz",
1284
+ "lines": 347925,
1285
+ "weight": 29
1286
+ },
1287
+ {
1288
+ "name": "stackexchange_TitleBody_Answer/superuser.com.jsonl.gz",
1289
+ "lines": 352610,
1290
+ "weight": 30
1291
+ },
1292
+ {
1293
+ "name": "stackexchange_Title_Answer/superuser.com.jsonl.gz",
1294
+ "lines": 352610,
1295
+ "weight": 30
1296
+ },
1297
+ {
1298
+ "name": "wikihow.jsonl.gz",
1299
+ "lines": 128542,
1300
+ "weight": 32
1301
+ },
1302
+ {
1303
+ "name": "stackexchange_title_body/superuser.com.jsonl.gz",
1304
+ "lines": 435463,
1305
+ "weight": 36
1306
+ },
1307
+ {
1308
+ "name": "stackexchange_title_body/small_stackexchanges.jsonl.gz",
1309
+ "lines": 448146,
1310
+ "weight": 37
1311
+ },
1312
+ {
1313
+ "name": "stackexchange_TitleBody_Answer/small_stackexchanges.jsonl.gz",
1314
+ "lines": 460256,
1315
+ "weight": 38
1316
+ },
1317
+ {
1318
+ "name": "stackexchange_Title_Answer/small_stackexchanges.jsonl.gz",
1319
+ "lines": 460256,
1320
+ "weight": 38
1321
+ },
1322
+ {
1323
+ "name": "sentence-compression.jsonl.gz",
1324
+ "lines": 180000,
1325
+ "weight": 45
1326
+ },
1327
+ {
1328
+ "name": "AllNLI.jsonl.gz",
1329
+ "lines": 277230,
1330
+ "weight": 69
1331
+ },
1332
+ {
1333
+ "name": "eli5_question_answer.jsonl.gz",
1334
+ "lines": 325475,
1335
+ "weight": 81
1336
+ },
1337
+ {
1338
+ "name": "reddit/reddit_2015.jsonl.gz",
1339
+ "lines": 135108166,
1340
+ "weight": 82
1341
+ },
1342
+ {
1343
+ "name": "reddit/reddit_2016.jsonl.gz",
1344
+ "lines": 159164386,
1345
+ "weight": 82
1346
+ },
1347
+ {
1348
+ "name": "reddit/reddit_2017.jsonl.gz",
1349
+ "lines": 191485219,
1350
+ "weight": 82
1351
+ },
1352
+ {
1353
+ "name": "reddit/reddit_2018.jsonl.gz",
1354
+ "lines": 240726659,
1355
+ "weight": 82
1356
+ },
1357
+ {
1358
+ "name": "stackexchange_TitleBody_Answer/math.stackexchange.com.jsonl.gz",
1359
+ "lines": 1100953,
1360
+ "weight": 83
1361
+ },
1362
+ {
1363
+ "name": "stackexchange_Title_Answer/math.stackexchange.com.jsonl.gz",
1364
+ "lines": 1100953,
1365
+ "weight": 83
1366
+ },
1367
+ {
1368
+ "name": "stackexchange_title_body/math.stackexchange.com.jsonl.gz",
1369
+ "lines": 1338443,
1370
+ "weight": 83
1371
+ },
1372
+ {
1373
+ "name": "stackexchange_TitleBody_Answer/stackoverflow.com-Posts.jsonl.gz",
1374
+ "lines": 15768211,
1375
+ "weight": 83
1376
+ },
1377
+ {
1378
+ "name": "stackexchange_Title_Answer/stackoverflow.com-Posts.jsonl.gz",
1379
+ "lines": 15768211,
1380
+ "weight": 83
1381
+ },
1382
+ {
1383
+ "name": "stackexchange_title_body/stackoverflow.com-Posts.jsonl.gz",
1384
+ "lines": 18562443,
1385
+ "weight": 83
1386
+ },
1387
+ {
1388
+ "name": "specter_train_triples.jsonl.gz",
1389
+ "lines": 684100,
1390
+ "weight": 84
1391
+ },
1392
+ {
1393
+ "name": "S2ORC_title_abstract.jsonl.gz",
1394
+ "lines": 41769185,
1395
+ "weight": 123
1396
+ },
1397
+ {
1398
+ "name": "S2ORC_citation_pairs.jsonl.gz",
1399
+ "lines": 52603982,
1400
+ "weight": 123
1401
+ },
1402
+ {
1403
+ "name": "PAQ_pairs.jsonl.gz",
1404
+ "lines": 64371441,
1405
+ "weight": 123
1406
+ },
1407
+ {
1408
+ "name": "WikiAnswers_pairs.jsonl.gz",
1409
+ "lines": 77427422,
1410
+ "weight": 123
1411
+ },
1412
+ {
1413
+ "name": "S2ORC_citation_pairs_abstract.jsonl.gz",
1414
+ "lines": 116288806,
1415
+ "weight": 123
1416
+ },
1417
+ {
1418
+ "name": "searchQA_question_top5_snippets_merged.jsonl.gz",
1419
+ "lines": 582261,
1420
+ "weight": 144
1421
+ },
1422
+ {
1423
+ "name": "yahoo_answers_title_question.jsonl.gz",
1424
+ "lines": 659896,
1425
+ "weight": 163
1426
+ },
1427
+ {
1428
+ "name": "yahoo_answers_question_answer.jsonl.gz",
1429
+ "lines": 681164,
1430
+ "weight": 169
1431
+ },
1432
+ {
1433
+ "name": "yahoo_answers_title_answer.jsonl.gz",
1434
+ "lines": 1198260,
1435
+ "weight": 247
1436
+ },
1437
+ {
1438
+ "name": "amazon-qa-train-pairs.jsonl.gz",
1439
+ "lines": 2448839,
1440
+ "weight": 247
1441
+ },
1442
+ {
1443
+ "name": "gooaq_pairs.jsonl.gz",
1444
+ "lines": 3012496,
1445
+ "weight": 247
1446
+ },
1447
+ {
1448
+ "name": "msmarco-query_passage_negative.jsonl.gz",
1449
+ "lines": 9144553,
1450
+ "weight": 247
1451
+ }
1452
+ ]
examples/India_900498_S2Hand.tif ADDED

Git LFS Details

  • SHA256: ee898621b387a731503a01960397599f209b45c268d4e088689928c704dbe968
  • Pointer size: 132 Bytes
  • Size of remote file: 2.15 MB
examples/Spain_7370579_S2Hand.tif ADDED

Git LFS Details

  • SHA256: 16e997e6a7159fa11160faf00591da763eb37ac82faf806f7fe733991944a048
  • Pointer size: 132 Bytes
  • Size of remote file: 2.32 MB
examples/USA_430764_S2Hand.tif ADDED

Git LFS Details

  • SHA256: 385418e105dc1068a7d78585e4395bcdc134e7b0d092ba942867ef74393d8d12
  • Pointer size: 132 Bytes
  • Size of remote file: 2.2 MB
examples/subsetted_512x512_HLS.S30.T10SEH.2018190.v1.4_merged.tif ADDED

Git LFS Details

  • SHA256: 13bc592a5e569d837bd8bb3524bb0d2f28418830bcc7b0750e74033078f8b17e
  • Pointer size: 132 Bytes
  • Size of remote file: 6.3 MB
examples/subsetted_512x512_HLS.S30.T10SFF.2018190.v1.4_merged.tif ADDED

Git LFS Details

  • SHA256: b491445bcca5d23a534765ac9f8b24f4cb0c9a75a7254c969456d65a982207a5
  • Pointer size: 132 Bytes
  • Size of remote file: 6.3 MB
examples/subsetted_512x512_HLS.S30.T10SGF.2020217.v1.4_merged.tif ADDED

Git LFS Details

  • SHA256: ecaa478fdb21ed437ea03436da87ed5efbd3d980e4da23cbee05171212c40378
  • Pointer size: 132 Bytes
  • Size of remote file: 6.3 MB
inference.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import argparse
3
+ import os
4
+ from typing import List, Union
5
+ import re
6
+ import datetime
7
+ import numpy as np
8
+ import rasterio
9
+ import torch
10
+ import yaml
11
+ from einops import rearrange
12
+ from terratorch.cli_tools import LightningInferenceModel
13
+
14
# Pixel value treated as "no data" in the raw rasters; load_example compares input pixels against it.
NO_DATA = -9999
# Value substituted for NO_DATA pixels when load_example standardizes an image;
# process_channel_group treats pixels equal to it as invalid.
NO_DATA_FLOAT = 0.0001
# Lower bound (min_value) used by process_channel_group when rescaling bands.
OFFSET = 0
# Percentile of the valid pixels used by process_channel_group as the rescaling upper bound.
PERCENTILE = 99
18
+
19
+
20
def process_channel_group(orig_img, channels):
    """Select bands from an image and rescale them to [0, 1] for visualization.

    Args:
        orig_img: torch.Tensor representing original image (reference) with shape = (bands, H, W).
        channels: list of indices representing RGB channels.

    Returns:
        torch.Tensor with shape (num_channels, height, width) for original image,
        clamped to [0, 1], with no-data pixels set to 0.
    """

    orig_img = orig_img[channels, ...]
    # Pixels equal to NO_DATA_FLOAT are excluded from the statistics and zeroed at the end.
    valid_mask = orig_img != NO_DATA_FLOAT

    # Rescale (enhancing contrast)
    valid_pixels = orig_img[valid_mask]
    if valid_pixels.numel() > 0:
        max_value = max(3000, np.percentile(valid_pixels, PERCENTILE))
    else:
        # Nothing to take a percentile of (image is entirely no-data): use the floor value.
        max_value = 3000
    min_value = OFFSET

    orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0, 1)

    # No data as zeros
    orig_img[~valid_mask] = 0

    return orig_img
45
+
46
+
47
def read_geotiff(file_path: str):
    """Read all bands from *file_path* and return image, meta info and coordinates.

    Args:
        file_path: path to image file.

    Returns:
        np.ndarray with shape (bands, height, width)
        meta info dict
        value returned by ``src.lnglat()`` for the dataset, or None if it could not be read
    """

    with rasterio.open(file_path) as src:
        img = src.read()
        meta = src.meta
        try:
            coords = src.lnglat()
        except Exception:
            # Coordinates are optional: fall back to None, but do not swallow
            # KeyboardInterrupt/SystemExit the way a bare ``except:`` would.
            coords = None

    return img, meta, coords
68
+
69
+
70
def save_geotiff(image, output_path: str, meta: dict):
    """Write each band of *image* to a GeoTIFF file.

    Args:
        image: np.ndarray with shape (bands, height, width)
        output_path: path where to save the image
        meta: dict with meta info, forwarded as keyword arguments to ``rasterio.open``.
    """

    with rasterio.open(output_path, "w", **meta) as dest:
        # Band i of the array is written to band index i + 1 of the file.
        for band_number, band in enumerate(image, start=1):
            dest.write(band, band_number)
84
+
85
+
86
def _convert_np_uint8(float_image: torch.Tensor):
    """Convert a float tensor scaled to [0, 1] into a uint8 numpy array in [0, 255].

    Args:
        float_image: torch.Tensor whose values are expected to lie in [0, 1].

    Returns:
        np.ndarray of dtype uint8 with the same shape as *float_image*.
        NaN becomes 0 and values outside [0, 1] saturate at 0 / 255.
    """
    # detach/cpu so tensors that require grad or live on another device are accepted too.
    image = float_image.detach().cpu().numpy() * 255.0
    # Casting NaN or out-of-range floats to uint8 is undefined / wraps around,
    # so sanitize before the cast. The fractional part is still truncated.
    image = np.clip(np.nan_to_num(image, nan=0.0), 0, 255)
    image = image.astype(dtype=np.uint8)

    return image
91
+
92
+
93
# A date (YYYYDDD or YYYYMMDD) followed by "T" and a six-digit time, e.g. 2018190T123456.
_TIMESTAMP_PATTERN = re.compile(r'(\d{7,8}T\d{6})')


def _extract_temporal_coords(file_name):
    """Return [year, day_of_year] parsed from a timestamp in *file_name*, or None if there is none."""
    match = _TIMESTAMP_PATTERN.search(file_name)
    if match is None:
        return None

    date_part = match.group(1).split('T')[0]
    year = int(date_part[:4])
    if len(date_part) == 7:
        # YYYYDDD: the day of year is given directly.
        day_of_year = int(date_part[4:])
    else:
        # YYYYMMDD: parse together with the year so leap years (incl. Feb 29) are handled.
        day_of_year = datetime.datetime.strptime(date_part, '%Y%m%d').timetuple().tm_yday
    return [year, day_of_year]


def load_example(
    file_paths: List[str],
    mean: List[float] = None,
    std: List[float] = None,
    indices: Union[list[int], None] = None,
):
    """Build an input example by loading images in *file_paths*.

    Args:
        file_paths: list of file paths.
        mean: list containing mean values for each band in the images in *file_paths*.
        std: list containing std values for each band in the images in *file_paths*.
            Standardization is applied only when both *mean* and *std* are given.
        indices: optional band indices to select from each image (applied before
            standardization). All bands are kept when None.

    Returns:
        np.ndarray (float32) containing the created example, laid out as
            (1, C, num_frames, H, W)
        list of [year, day_of_year] for every file whose name contains a timestamp
        list of coordinates for every file where they could be read
        list of meta info for each image in *file_paths*
    """

    imgs = []
    metas = []
    temporal_coords = []
    location_coords = []

    for file in file_paths:
        img, meta, coords = read_geotiff(file)

        # Rescaling (don't normalize on nodata)
        img = np.moveaxis(img, 0, -1)  # channels last for rescaling
        if indices is not None:
            img = img[..., indices]
        if mean is not None and std is not None:
            img = np.where(img == NO_DATA, NO_DATA_FLOAT, (img - mean) / std)

        imgs.append(img)
        metas.append(meta)
        if coords is not None:
            location_coords.append(coords)

        # Timestamp extraction is best-effort: report a failure and carry on.
        try:
            file_temporal_coords = _extract_temporal_coords(file)
        except Exception as e:
            print(f'Could not extract timestamp for {file} ({e})')
        else:
            if file_temporal_coords is not None:
                temporal_coords.append(file_temporal_coords)

    imgs = np.stack(imgs, axis=0)  # num_frames, H, W, C
    imgs = np.moveaxis(imgs, -1, 0).astype("float32")  # C, num_frames, H, W
    imgs = np.expand_dims(imgs, axis=0)  # add batch dim

    return imgs, temporal_coords, location_coords, metas
149
+
150
+
151
def run_model(input_data, model, datamodule, img_size):
    """Run *model* over *input_data* window by window and stitch the predictions.

    The last two axes of *input_data* are reflect-padded up to a multiple of
    *img_size*, cut into non-overlapping img_size x img_size windows, each
    window is passed through ``datamodule.test_transform`` / ``datamodule.aug``
    and the model, and the per-window argmax maps are reassembled and cropped
    back to the original height and width.

    Args:
        input_data: np.ndarray with five axes; the code comments in this file
            describe it as (batch, C, num_frames, H, W).
        model: callable model exposing ``.device`` and returning an object with ``.output``.
        datamodule: object providing ``test_transform`` and ``aug``.
        img_size: side length of the square windows.

    Returns:
        torch.Tensor (float) of argmax predictions cropped to the original
        height and width, with the batch axis removed.
    """
    # Reflect pad if not divisible by img_size
    full_h, full_w = input_data.shape[-2:]
    extra_h = -full_h % img_size
    extra_w = -full_w % img_size
    pad_widths = ((0, 0), (0, 0), (0, 0), (0, extra_h), (0, extra_w))
    input_data = np.pad(input_data, pad_widths, mode="reflect")

    # Build sliding window
    batch_size = 1
    tiles = torch.tensor(input_data, device="cpu")
    tiles = tiles.unfold(3, img_size, img_size).unfold(4, img_size, img_size)
    h1, w1 = tiles.shape[3:5]
    tiles = rearrange(
        tiles, "b c t h1 w1 h w -> (b h1 w1) c t h w", h=img_size, w=img_size
    )

    # Split into batches if number of windows > batch_size
    num_batches = max(tiles.shape[0] // batch_size, 1)

    # Run model
    tile_predictions = []
    for tile in torch.tensor_split(tiles, num_batches, dim=0):
        # Apply standardization
        sample = datamodule.test_transform(image=tile.squeeze().numpy().transpose(1, 2, 0))
        sample['image'] = sample['image'].unsqueeze(0)
        model_input = datamodule.aug(sample)['image']

        with torch.no_grad():
            result = model(model_input.to(model.device))
        raw_output = result.output.detach().cpu()

        labels = raw_output.argmax(dim=1).unsqueeze(1).float()
        tile_predictions.append(
            torch.nn.functional.interpolate(labels, size=img_size, mode="nearest")
        )

    # Build images from patches
    stitched = rearrange(
        torch.concat(tile_predictions, dim=0),
        "(b h1 w1) c h w -> b c (h1 h) (w1 w)",
        h=img_size,
        w=img_size,
        b=1,
        c=1,
        h1=h1,
        w1=w1,
    )

    # Cut padded area back to original size, then drop the batch axis (batch size 1)
    return stitched[..., :full_h, :full_w][0]
214
+
215
+
216
def main(
    data_file: str,
    config: str,
    checkpoint: str,
    output_dir: str,
    rgb_outputs: bool,
    input_indices: list[int] = None,
):
    """Run inference on one GeoTIFF and write the prediction rasters to *output_dir*.

    Writes ``pred_<name>.tiff`` (the prediction), ``rgb_pred_<name>.tiff`` (the
    prediction blended over the rescaled RGB bands) and, when *rgb_outputs* is
    set, ``original_rgb_<name>.tiff`` (the rescaled RGB bands alone).

    Args:
        data_file: path to the input image.
        config: path to the yaml config; also read for ``data.init_args.rgb_indices``.
        checkpoint: path to the checkpoint passed to ``LightningInferenceModel.from_config``.
        output_dir: directory for the outputs; created if missing.
        rgb_outputs: whether to additionally save the RGB image on its own.
        input_indices: band indices to select from the input, or None for all bands.
    """
    os.makedirs(output_dir, exist_ok=True)

    with open(config, "r") as f:
        config_dict = yaml.safe_load(f)

    # Load model ---------------------------------------------------------------------------------

    lightning_model = LightningInferenceModel.from_config(config, checkpoint)
    img_size = 512  # Size of BurnScars

    # Loading data ---------------------------------------------------------------------------------

    input_data, _temporal_coords, _location_coords, metas = load_example(
        file_paths=[data_file], indices=input_indices,
    )
    meta_data = metas[0]  # only one image

    if input_data.mean() > 1:
        input_data = input_data / 10000  # Convert to range 0-1

    # Running model --------------------------------------------------------------------------------

    lightning_model.model.eval()

    channels = config_dict['data']['init_args']['rgb_indices']

    pred = run_model(input_data, lightning_model.model, lightning_model.datamodule, img_size)

    # All output files share the input file's base name (extension removed).
    stem = os.path.splitext(os.path.basename(data_file))[0]

    # Save pred
    meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
    pred_file = os.path.join(output_dir, f"pred_{stem}.tiff")
    save_geotiff(_convert_np_uint8(pred), pred_file, meta_data)

    # Save image + pred
    meta_data.update(count=3, dtype="uint8", compress="lzw", nodata=0)

    if input_data.mean() < 1:
        input_data = input_data * 10000  # Scale to 0-10000

    rgb_orig = process_channel_group(
        orig_img=torch.Tensor(input_data[0, :, 0, ...]),
        channels=channels,
    )

    # Zero predictions become NaN so the blend below leaves those pixels as plain RGB.
    pred[pred == 0.] = np.nan
    img_pred = rgb_orig * 0.7 + pred * 0.3
    nan_mask = img_pred.isnan()
    img_pred[nan_mask] = rgb_orig[nan_mask]

    img_pred_file = os.path.join(output_dir, f"rgb_pred_{stem}.tiff")
    save_geotiff(
        image=_convert_np_uint8(img_pred),
        output_path=img_pred_file,
        meta=meta_data,
    )

    # Save image rgb
    if rgb_outputs:
        rgb_file = os.path.join(output_dir, f"original_rgb_{stem}.tiff")
        save_geotiff(
            image=_convert_np_uint8(rgb_orig),
            output_path=rgb_file,
            meta=meta_data,
        )

    print("Done!")
290
+
291
+
292
if __name__ == "__main__":
    # Command-line entry point: parse the options and forward them to main() as keyword arguments.
    # Help is left enabled (the argparse default) so the help strings below are reachable via -h/--help.
    parser = argparse.ArgumentParser("run inference")

    parser.add_argument(
        "--data_file",
        type=str,
        default="examples/subsetted_512x512_HLS.S30.T10SEH.2018190.v1.4_merged.tif",
        help="Path to the file.",
    )
    parser.add_argument(
        "--config",
        "-c",
        type=str,
        default="burn_scars_config.yaml",
        help="Path to yaml file containing model parameters.",
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        default="Prithvi_EO_V2_300M_BurnScars.pt",
        help="Path to a checkpoint file to load from.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="output",
        help="Path to the directory where to save outputs.",
    )
    parser.add_argument(
        "--input_indices",
        default=[0,1,2,3,4,5],
        type=int,
        nargs="+",
        help="0-based indices of the six Prithvi channels to be selected from the input. By default selects [0,1,2,3,4,5] for filtered HLS data.",
    )
    parser.add_argument(
        "--rgb_outputs",
        action="store_true",
        help="If present, additionally save the rescaled RGB channels of the input "
        "as a separate GeoTIFF (original_rgb_<name>.tiff).",
    )
    args = parser.parse_args()

    main(**vars(args))
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
3
+ size 90868376
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.models.Normalize"
19
+ }
20
+ ]
onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fd5d72fe4589f189f8ebc006442dbb529bb7ce38f8082112682524616046452
3
+ size 90405214
onnx/model_O1.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1391c6fc20b5530250bc15cbe1f47578ffeca55ab0551d335cc668b6299a88ec
3
+ size 90360328
onnx/model_O2.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1de3905029190b398c7d300b530e320cf4b5e7d3dfb9af1429ebd73fd9a16faf
3
+ size 90326566
onnx/model_O3.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a44f671e364dddbac31f203f07b91be6b0a35e51936e5ebfab65b6d9538b83ff
3
+ size 90326497
onnx/model_O4.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1667d7f3ba669048b13a96ee3a44456d5e42c8f44588ae8b603430e16160c485
3
+ size 45212349
onnx/model_qint8_arm64.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
3
+ size 23026053
onnx/model_qint8_avx512.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
3
+ size 23026053
onnx/model_qint8_avx512_vnni.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
3
+ size 23026053
onnx/model_quint8_avx2.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b941bf19f1f1283680f449fa6a7336bb5600bdcd5f84d10ddc5cd72218a0fd21
3
+ size 23046789
openvino/openvino_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b86cab4722e2aefab310cf96d4d5a9eb3b187f7d9670a082afc55c7fa0d392a
3
+ size 90265744
openvino/openvino_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
openvino/openvino_model_qint8_quantized.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c92ea4af3c6bc7b4a0f3b3d61b147c850f4dbdd7c9e7beee0c0c70dc12da289b
3
+ size 22933664
openvino/openvino_model_qint8_quantized.xml ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3a85f238711653950f6a79ece63eb0ea93d76f6a6284be04019c53733baf256
3
+ size 90888945
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ torchvision
3
+ timm
4
+ einops
5
+ rasterio
6
+ terratorch==0.99.8
rust_model.ot ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d98d96d278348988f2744e6445b8bc16d921c3f6e17c667362f3cb353007aea
3
+ size 90887379
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 256,
3
+ "do_lower_case": false
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
splits/test.txt ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ T10SDH.2020248.v1
2
+ T10SEH.2018190.v1
3
+ T10SEH.2018245.v1
4
+ T10SEH.2018280.v1
5
+ T10SEH.2019305.v1
6
+ T10SEH.2020190.v1
7
+ T10SEH.2020285.v1
8
+ T10SEJ.2019185.v1
9
+ T10TFQ.2018183.v1
10
+ T10TFQ.2018245.v1
11
+ T10TFT.2018213.v1
12
+ T10TGN.2019305.v1
13
+ T10TGN.2020310.v1
14
+ T10TGR.2020275.v1
15
+ T10TGS.2018245.v1
16
+ T10TGS.2018285.v1
17
+ T10TGS.2019195.v1
18
+ T10TGS.2020215.v1
19
+ T10TGT.2018188.v1
20
+ T10TGT.2018213.v1
21
+ T10TGT.2018285.v1
22
+ T10TGT.2019213.v1
23
+ T10TGT.2020218.v1
24
+ T10UGU.2018213.v1
25
+ T10UGU.2020215.v1
26
+ T10UGU.2020280.v1
27
+ T10UGU.2021249.v1
28
+ T11SLB.2018197.v1
29
+ T11SQC.2020196.v1
30
+ T11TLE.2018247.v1
31
+ T11TLH.2019215.v1
32
+ T11TMG.2019217.v1
33
+ T11TNJ.2019217.v1
34
+ T11TPH.2018244.v1
35
+ T11TPH.2020174.v1
36
+ T11TPH.2021263.v1
37
+ T11TPL.2021223.v1
38
+ T11TQH.2018219.v1
39
+ T11TQH.2019244.v1
40
+ T11UQP.2018249.v1
41
+ T12RWV.2019075.v1
42
+ T12RWV.2019225.v1
43
+ T12SUC.2019223.v1
44
+ T12SUC.2020153.v1
45
+ T12SUC.2020248.v1
46
+ T12SUC.2020318.v1
47
+ T12SUJ.2019298.v1
48
+ T12SVC.2018215.v1
49
+ T12SVC.2019245.v1
50
+ T12SVC.2019280.v1
51
+ T12SVC.2020155.v1
52
+ T12SVC.2020190.v1
53
+ T12SVC.2020250.v1
54
+ T12SVC.2020285.v1
55
+ T12SVD.2019183.v1
56
+ T12SVD.2020218.v1
57
+ T12SVE.2019183.v1
58
+ T12SVE.2019228.v1
59
+ T12SWA.2018225.v1
60
+ T12SXA.2018157.v1
61
+ T12SXA.2020187.v1
62
+ T12SYG.2020220.v1
63
+ T12TUK.2020286.v1
64
+ T12TVP.2018221.v1
65
+ T12TXK.2018215.v1
66
+ T12TXT.2018293.v1
67
+ T12TXT.2020248.v1
68
+ T13REP.2018141.v1
69
+ T13REQ.2018156.v1
70
+ T13SBS.2020217.v1
71
+ T13SDV.2020269.v1
72
+ T13SEA.2018144.v1
73
+ T13SFC.2020184.v1
74
+ T13TCG.2020277.v1
75
+ T13TCH.2020280.v1
76
+ T13TCM.2020250.v1
77
+ T13TCN.2020278.v1
78
+ T13TDH.2018292.v1
79
+ T13TDL.2019150.v1
80
+ T13TDL.2020245.v1
81
+ T13TDL.2020280.v1
82
+ T13TDM.2020250.v1
83
+ T14SMC.2018213.v1
84
+ T14SME.2018138.v1
85
+ T14SMF.2018098.v1
86
+ T14SPB.2018035.v1
87
+ T14SPF.2019345.v1
88
+ T14SQE.2018075.v1
89
+ T14SQE.2020065.v1
90
+ T14SQF.2018125.v1
91
+ T15RVQ.2018094.v1
92
+ T15RVQ.2019099.v1
93
+ T15STV.2018102.v1
94
+ T15SXB.2018154.v1
95
+ T15SXB.2019134.v1
96
+ T15SXB.2020089.v1
97
+ T15SXB.2020099.v1
98
+ T15SXB.2021093.v1
99
+ T16SBA.2019206.v1
100
+ T16SBD.2020096.v1
101
+ T16SCF.2019111.v1
102
+ T16SGG.2021094.v1
103
+ T16TFS.2019153.v1
104
+ T17RML.2018064.v1
105
+ T17RML.2021103.v1
106
+ T17RNL.2019111.v1
107
+ T17SKT.2018107.v1
108
+ T17SKT.2018132.v1
109
+ T17SKV.2019100.v1
110
+ T17SKV.2021094.v1
111
+ T17SLV.2019112.v1
112
+ T17SMS.2019094.v1
113
+ T17SMS.2021103.v1
114
+ T17SMU.2018132.v1
115
+ T17SNS.2021128.v1
116
+ T17SPS.2018074.v1
117
+ T17SPS.2019009.v1
118
+ T17SPS.2019094.v1
119
+ T17SPS.2020039.v1
120
+ T18SUD.2020061.v1
splits/train.txt ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ T10SEJ.2018185.v1
2
+ T10SFE.2020267.v1
3
+ T10SFE.2021166.v1
4
+ T10SFF.2018155.v1
5
+ T10SFF.2018190.v1
6
+ T10SFF.2020215.v1
7
+ T10SFF.2020250.v1
8
+ T10SFF.2021189.v1
9
+ T10SFG.2020215.v1
10
+ T10SFH.2018185.v1
11
+ T10SFH.2020185.v1
12
+ T10SFH.2020245.v1
13
+ T10SGD.2018257.v1
14
+ T10SGD.2021306.v1
15
+ T10SGE.2018247.v1
16
+ T10SGE.2019187.v1
17
+ T10SGE.2020162.v1
18
+ T10SGE.2020187.v1
19
+ T10SGE.2020217.v1
20
+ T10SGE.2020247.v1
21
+ T10SGF.2020217.v1
22
+ T10SGG.2018187.v1
23
+ T10SGG.2019307.v1
24
+ T10SGG.2020247.v1
25
+ T10TDN.2019213.v1
26
+ T10TEK.2018183.v1
27
+ T10TEK.2018340.v1
28
+ T10TEK.2020275.v1
29
+ T10TEM.2018213.v1
30
+ T10TEN.2019168.v1
31
+ T10TFK.2020220.v1
32
+ T10TFL.2018215.v1
33
+ T10TFL.2018245.v1
34
+ T10TFL.2020215.v1
35
+ T10TFN.2018175.v1
36
+ T10TFN.2018245.v1
37
+ T10TFN.2020235.v1
38
+ T10TFP.2018285.v1
39
+ T10TFP.2019278.v1
40
+ T10TFP.2020248.v1
41
+ T10TFQ.2018173.v1
42
+ T10TFQ.2019245.v1
43
+ T10TFQ.2019305.v1
44
+ T10TFR.2018188.v1
45
+ T10TFR.2018213.v1
46
+ T10TFR.2020173.v1
47
+ T10TFS.2018193.v1
48
+ T10TFS.2018213.v1
49
+ T10TFS.2019213.v1
50
+ T10TGK.2019245.v1
51
+ T10TGK.2019280.v1
52
+ T10TGK.2020285.v1
53
+ T10TGL.2018215.v1
54
+ T10TGL.2019245.v1
55
+ T10TGL.2020265.v1
56
+ T10TGP.2018215.v1
57
+ T10TGQ.2018245.v1
58
+ T10TGQ.2020275.v1
59
+ T10TGR.2018215.v1
60
+ T10TGR.2018245.v1
61
+ T10TGR.2019215.v1
62
+ T10TGS.2018190.v1
63
+ T10TGS.2018215.v1
64
+ T10TGS.2019215.v1
65
+ T10TGS.2020245.v1
66
+ T10UGU.2018245.v1
67
+ T10UGV.2020218.v1
68
+ T11SKB.2018222.v1
69
+ T11SKB.2020222.v1
70
+ T11SKU.2019002.v1
71
+ T11SKV.2019152.v1
72
+ T11SKV.2019187.v1
73
+ T11SLB.2019237.v1
74
+ T11SLC.2020247.v1
75
+ T11SLT.2018349.v1
76
+ T11SLT.2019309.v1
77
+ T11SLT.2021163.v1
78
+ T11SLU.2018184.v1
79
+ T11SLU.2018274.v1
80
+ T11SLU.2020229.v1
81
+ T11SLU.2020249.v1
82
+ T11SLV.2018249.v1
83
+ T11SLV.2020222.v1
84
+ T11SLV.2020247.v1
85
+ T11SLV.2021186.v1
86
+ T11SLV.2021216.v1
87
+ T11SLV.2021251.v1
88
+ T11SLV.2021331.v1
89
+ T11SMS.2018259.v1
90
+ T11SMS.2020029.v1
91
+ T11SMT.2018154.v1
92
+ T11SMT.2018249.v1
93
+ T11SMT.2019294.v1
94
+ T11SMT.2019309.v1
95
+ T11SMT.2020194.v1
96
+ T11SMT.2020249.v1
97
+ T11SMT.2020289.v1
98
+ T11SMT.2020309.v1
99
+ T11SMT.2021248.v1
100
+ T11SMU.2020299.v1
101
+ T11SMV.2020234.v1
102
+ T11SMV.2020249.v1
103
+ T11SNS.2019246.v1
104
+ T11SNS.2020276.v1
105
+ T11SNS.2021155.v1
106
+ T11SNT.2018216.v1
107
+ T11SNT.2020221.v1
108
+ T11SNT.2020281.v1
109
+ T11SPA.2020196.v1
110
+ T11SPB.2019281.v1
111
+ T11SPB.2020196.v1
112
+ T11SPB.2020241.v1
113
+ T11SPV.2020186.v1
114
+ T11SQA.2019226.v1
115
+ T11SQB.2020241.v1
116
+ T11SQC.2020241.v1
117
+ T11SQD.2020196.v1
118
+ T11TKE.2018215.v1
119
+ T11TKF.2020265.v1
120
+ T11TKG.2020265.v1
121
+ T11TLE.2018182.v1
122
+ T11TLE.2019257.v1
123
+ T11TLG.2018247.v1
124
+ T11TLH.2018152.v1
125
+ T11TLH.2018217.v1
126
+ T11TLH.2020247.v1
127
+ T11TLJ.2019155.v1
128
+ T11TLJ.2019327.v1
129
+ T11TLM.2018245.v1
130
+ T11TLM.2019245.v1
131
+ T11TLM.2019305.v1
132
+ T11TLM.2020275.v1
133
+ T11TLN.2020280.v1
134
+ T11TMF.2019312.v1
135
+ T11TMF.2020217.v1
136
+ T11TMG.2018222.v1
137
+ T11TMH.2018182.v1
138
+ T11TMH.2019227.v1
139
+ T11TMH.2020247.v1
140
+ T11TMJ.2018217.v1
141
+ T11TMJ.2020247.v1
142
+ T11TMK.2018217.v1
143
+ T11TMK.2018292.v1
144
+ T11TMK.2020247.v1
145
+ T11TMM.2018285.v1
146
+ T11TMM.2020245.v1
147
+ T11TMM.2021224.v1
148
+ T11TMN.2018245.v1
149
+ T11TNE.2019224.v1
150
+ T11TNF.2018199.v1
151
+ T11TNF.2018219.v1
152
+ T11TNF.2018244.v1
153
+ T11TNF.2018289.v1
154
+ T11TNF.2019224.v1
155
+ T11TNF.2019314.v1
156
+ T11TNH.2018217.v1
157
+ T11TNH.2020217.v1
158
+ T11TNJ.2018217.v1
159
+ T11TNJ.2019244.v1
160
+ T11TNK.2018222.v1
161
+ T11TPE.2018244.v1
162
+ T11TPE.2019269.v1
163
+ T11TPE.2020219.v1
164
+ T11TPF.2018219.v1
165
+ T11TPF.2018289.v1
166
+ T11TPF.2021183.v1
167
+ T11TPH.2018219.v1
168
+ T11TPH.2019244.v1
169
+ T11TPH.2020184.v1
170
+ T11TPH.2020219.v1
171
+ T11TPH.2020249.v1
172
+ T11TPH.2021238.v1
173
+ T11TPJ.2020249.v1
174
+ T11TPL.2019244.v1
175
+ T11TPM.2021223.v1
176
+ T11TPN.2019214.v1
177
+ T11TPN.2020217.v1
178
+ T11TQG.2018221.v1
179
+ T11TQG.2018291.v1
180
+ T11TQG.2020306.v1
181
+ T11TQH.2019221.v1
182
+ T11TQH.2020216.v1
183
+ T11TQH.2020251.v1
184
+ T11TQJ.2018219.v1
185
+ T11TQK.2020249.v1
186
+ T11ULP.2018245.v1
187
+ T11ULP.2020215.v1
188
+ T12RVV.2018215.v1
189
+ T12RVV.2020320.v1
190
+ T12RXV.2018182.v1
191
+ T12RXV.2019062.v1
192
+ T12RYV.2018152.v1
193
+ T12STC.2020218.v1
194
+ T12STE.2020246.v1
195
+ T12STF.2018231.v1
196
+ T12STF.2018291.v1
197
+ T12STF.2021190.v1
198
+ T12STF.2021215.v1
199
+ T12STG.2018186.v1
200
+ T12STH.2020246.v1
201
+ T12SUC.2019158.v1
202
+ T12SUD.2018168.v1
203
+ T12SUD.2018218.v1
204
+ T12SUD.2019183.v1
205
+ T12SUD.2020218.v1
206
+ T12SUE.2020218.v1
207
+ T12SUF.2018253.v1
208
+ T12SUH.2018228.v1
209
+ T12SUH.2019298.v1
210
+ T12SUH.2020268.v1
211
+ T12SVA.2020310.v1
212
+ T12SVB.2020155.v1
213
+ T12SVB.2020185.v1
214
+ T12SVB.2020310.v1
215
+ T12SVC.2019190.v1
216
+ T12SVF.2018253.v1
217
+ T12SWA.2019260.v1
218
+ T12SWA.2020230.v1
219
+ T12SWB.2019155.v1
220
+ T12SWB.2020155.v1
221
+ T12SWB.2020250.v1
222
+ T12SWC.2019225.v1
223
+ T12SWC.2020190.v1
224
+ T12SWC.2020250.v1
225
+ T12SXB.2018217.v1
226
+ T12SXG.2019235.v1
227
+ T12SXJ.2020200.v1
228
+ T12SYG.2018225.v1
229
+ T12TTM.2018219.v1
230
+ T12TTM.2019244.v1
231
+ T12TTM.2020306.v1
232
+ T12TUK.2018231.v1
233
+ T12TUM.2019261.v1
234
+ T12TUM.2020191.v1
235
+ T12TUN.2018186.v1
236
+ T12TUN.2018216.v1
237
+ T12TUN.2018246.v1
238
+ T12TUN.2019276.v1
239
+ T12TUN.2020216.v1
240
+ T12TUN.2021150.v1
241
+ T12TUN.2021205.v1
242
+ T12TUN.2021215.v1
243
+ T12TVK.2020188.v1
244
+ T12TVK.2020228.v1
245
+ T12TVK.2020308.v1
246
+ T12TVM.2018246.v1
247
+ T12TVN.2018246.v1
248
+ T12TVN.2018291.v1
249
+ T12TVR.2018246.v1
250
+ T12TVS.2019216.v1
251
+ T12TVS.2020191.v1
252
+ T12TVT.2018196.v1
253
+ T12TWK.2020268.v1
254
+ T12TWT.2020276.v1
255
+ T12TXL.2018220.v1
256
+ T12TXL.2020250.v1
257
+ T12TXQ.2018248.v1
258
+ T12TXR.2020283.v1
259
+ T12TYK.2018220.v1
260
+ T12TYK.2020215.v1
261
+ T12TYL.2018290.v1
262
+ T12TYP.2018220.v1
263
+ T12TYP.2018245.v1
264
+ T12TYP.2020215.v1
265
+ T12TYQ.2018185.v1
266
+ T12TYT.2018153.v1
267
+ T12TYT.2018213.v1
268
+ T12TYT.2019153.v1
269
+ T12TYT.2019248.v1
270
+ T12UXU.2021250.v1
271
+ T13RGP.2020118.v1
272
+ T13SBT.2019202.v1
273
+ T13SBT.2019237.v1
274
+ T13SBV.2019247.v1
275
+ T13SCA.2019227.v1
276
+ T13SCR.2018214.v1
277
+ T13SDA.2018199.v1
278
+ T13SDS.2018214.v1
279
+ T13SDT.2018134.v1
280
+ T13SDT.2019184.v1
281
+ T13SDT.2020249.v1
282
+ T13SEA.2020214.v1
283
+ T13SEB.2018189.v1
284
+ T13SEB.2020249.v1
285
+ T13SEC.2018164.v1
286
+ T13SER.2018156.v1
287
+ T13SEV.2020154.v1
288
+ T13SFA.2018134.v1
289
+ T13SFA.2018159.v1
290
+ T13SFA.2018184.v1
291
+ T13SFA.2019244.v1
292
+ T13SFA.2020154.v1
293
+ T13SFB.2019109.v1
294
+ T13SFB.2019219.v1
295
+ T13SFB.2020164.v1
296
+ T13SFB.2020189.v1
297
+ T13SFC.2019154.v1
298
+ T13SFC.2020124.v1
299
+ T13SFR.2020216.v1
300
+ T13SFS.2018136.v1
301
+ T13SFS.2018196.v1
302
+ T13SFT.2018136.v1
303
+ T13SFT.2019096.v1
304
+ T13SFT.2019171.v1
305
+ T13SGB.2018136.v1
306
+ T13SGB.2019076.v1
307
+ T13TCG.2020247.v1
308
+ T13TCJ.2020200.v1
309
+ T13TCJ.2020230.v1
310
+ T13TCK.2020215.v1
311
+ T13TCK.2020245.v1
312
+ T13TCL.2020245.v1
313
+ T13TCL.2020280.v1
314
+ T13TCM.2020215.v1
315
+ T13TDF.2018222.v1
316
+ T13TDK.2020217.v1
317
+ T13TDK.2020280.v1
318
+ T13TDL.2020187.v1
319
+ T13TDL.2020307.v1
320
+ T13TEE.2020214.v1
321
+ T13TEE.2020249.v1
322
+ T13TEF.2019109.v1
323
+ T13TEG.2019157.v1
324
+ T13TEL.2020307.v1
325
+ T13TEN.2018245.v1
326
+ T13TFJ.2018319.v1
327
+ T13TFN.2018157.v1
328
+ T13TFN.2019152.v1
329
+ T13TGE.2018136.v1
330
+ T13TGH.2020304.v1
331
+ T13UCP.2020248.v1
332
+ T13UDP.2018290.v1
333
+ T13UDP.2020255.v1
334
+ T13UEP.2018135.v1
335
+ T14RKU.2020158.v1
336
+ T14RKV.2018143.v1
337
+ T14RKV.2018193.v1
338
+ T14RKV.2018213.v1
339
+ T14RKV.2020268.v1
340
+ T14RKV.2020278.v1
341
+ T14RLT.2020273.v1
342
+ T14RLU.2019293.v1
343
+ T14RLU.2020278.v1
344
+ T14RLV.2018043.v1
345
+ T14RLV.2019223.v1
346
+ T14RLV.2019348.v1
347
+ T14RMV.2020220.v1
348
+ T14RNQ.2021051.v1
349
+ T14RNU.2018215.v1
350
+ T14RNV.2018215.v1
351
+ T14RNV.2020220.v1
352
+ T14RPQ.2018107.v1
353
+ T14RPR.2020187.v1
354
+ T14RPS.2019257.v1
355
+ T14RQS.2019322.v1
356
+ T14RQS.2020032.v1
357
+ T14SKF.2019246.v1
358
+ T14SLA.2020223.v1
359
+ T14SLC.2018038.v1
360
+ T14SLD.2020218.v1
361
+ T14SLE.2018138.v1
362
+ T14SLE.2019098.v1
363
+ T14SLE.2019248.v1
364
+ T14SLE.2020323.v1
365
+ T14SLF.2020108.v1
366
+ T14SMA.2020185.v1
367
+ T14SMB.2019228.v1
368
+ T14SMB.2020268.v1
369
+ T14SMD.2018138.v1
370
+ T14SNA.2018215.v1
371
+ T14SNB.2018215.v1
372
+ T14SNB.2019280.v1
373
+ T14SNG.2020183.v1
374
+ T14SPD.2018100.v1
375
+ T14SQE.2018100.v1
376
+ T14SQF.2018075.v1
377
+ T14SQF.2018100.v1
378
+ T14SQG.2018125.v1
379
+ T14SQJ.2020340.v1
380
+ T14TKM.2019144.v1
381
+ T14TQT.2021112.v1
382
+ T14UPU.2021112.v1
383
+ T14UQU.2019163.v1
384
+ T15RTN.2020032.v1
385
+ T15RTQ.2021101.v1
386
+ T15RUQ.2018094.v1
387
+ T15RUQ.2018129.v1
388
+ T15RUQ.2019099.v1
389
+ T15RUQ.2021088.v1
390
+ T15RVP.2021063.v1
391
+ T15RVQ.2021063.v1
392
+ T15RWQ.2021063.v1
393
+ T15RWQ.2021098.v1
394
+ T15RXQ.2020106.v1
395
+ T15RXQ.2021095.v1
396
+ T15RYQ.2021095.v1
397
+ T15STA.2018125.v1
398
+ T15STC.2018125.v1
399
+ T15STD.2018125.v1
400
+ T15STU.2018237.v1
401
+ T15SUB.2018127.v1
402
+ T15SUC.2018127.v1
403
+ T15SUU.2018062.v1
404
+ T15SUU.2020107.v1
405
+ T15SUV.2018102.v1
406
+ T15SUV.2020107.v1
407
+ T15SVA.2018102.v1
408
+ T15SVU.2018114.v1
409
+ T15SVU.2019144.v1
410
+ T15SVU.2021063.v1
411
+ T15SVU.2021098.v1
412
+ T15SWA.2021093.v1
413
+ T15SWS.2021128.v1
414
+ T15SWV.2018099.v1
415
+ T15SWV.2019099.v1
416
+ T15SWV.2021093.v1
417
+ T15SXA.2018154.v1
418
+ T15SXA.2021093.v1
419
+ T15SXB.2019099.v1
420
+ T15SXC.2019099.v1
421
+ T15SYR.2021110.v1
422
+ T15TVN.2018135.v1
423
+ T15TVN.2019140.v1
424
+ T15TXM.2018187.v1
425
+ T15TXM.2019157.v1
426
+ T16RBU.2018133.v1
427
+ T16RBU.2019133.v1
428
+ T16RBU.2020123.v1
429
+ T16RBV.2018158.v1
430
+ T16RBV.2020033.v1
431
+ T16RBV.2021127.v1
432
+ T16RCU.2019133.v1
433
+ T16RCU.2020033.v1
434
+ T16RCU.2020118.v1
435
+ T16RCU.2021077.v1
436
+ T16RCV.2020118.v1
437
+ T16REV.2018095.v1
438
+ T16REV.2018130.v1
439
+ T16REV.2019080.v1
440
+ T16REV.2019100.v1
441
+ T16REV.2019135.v1
442
+ T16REV.2019165.v1
443
+ T16REV.2020060.v1
444
+ T16REV.2020100.v1
445
+ T16REV.2021129.v1
446
+ T16REV.2021314.v1
447
+ T16RFT.2019250.v1
448
+ T16RFU.2019105.v1
449
+ T16RFU.2019250.v1
450
+ T16RFU.2019305.v1
451
+ T16RFU.2021054.v1
452
+ T16RFU.2021069.v1
453
+ T16RFU.2021109.v1
454
+ T16RGU.2018107.v1
455
+ T16RGU.2019112.v1
456
+ T16RGU.2019127.v1
457
+ T16RGU.2019267.v1
458
+ T16RGU.2019277.v1
459
+ T16RGU.2020107.v1
460
+ T16RGU.2021066.v1
461
+ T16RGU.2021096.v1
462
+ T16RGU.2021206.v1
463
+ T16SCA.2018133.v1
464
+ T16SCG.2019111.v1
465
+ T16SDB.2018063.v1
466
+ T16SDB.2018098.v1
467
+ T16SDB.2018133.v1
468
+ T16SDB.2021062.v1
469
+ T16SDC.2018098.v1
470
+ T16SDC.2020118.v1
471
+ T16SDC.2021127.v1
472
+ T16SDD.2020093.v1
473
+ T16SEB.2018090.v1
474
+ T16SEB.2018095.v1
475
+ T16SEB.2018155.v1
476
+ T16SEB.2019100.v1
477
+ T16SEB.2019135.v1
478
+ T16SEB.2021069.v1
479
+ T16SEC.2020105.v1
480
+ T16SEC.2021094.v1
481
+ T16SFB.2019100.v1
482
+ T16SFB.2021094.v1
483
+ T16SFC.2018155.v1
484
+ T16SFC.2019100.v1
485
+ T16SFC.2019165.v1
486
+ T16SFC.2020045.v1
487
+ T16SFC.2020105.v1
488
+ T16SFC.2021094.v1
489
+ T16SFD.2019100.v1
490
+ T16SFF.2021094.v1
491
+ T16SGD.2018155.v1
492
+ T16SGD.2019100.v1
493
+ T16SGE.2019100.v1
494
+ T16TDS.2019159.v1
495
+ T16TGQ.2018133.v1
496
+ T16TGQ.2019158.v1
497
+ T16TGQ.2021167.v1
498
+ T17RKN.2018087.v1
499
+ T17RKN.2019112.v1
500
+ T17RMH.2021103.v1
501
+ T17RMJ.2018064.v1
502
+ T17RMJ.2021313.v1
503
+ T17RMN.2020034.v1
504
+ T17RMN.2020094.v1
505
+ T17RMN.2021063.v1
506
+ T17RNK.2019081.v1
507
+ T17RNK.2020126.v1
508
+ T17SKS.2021126.v1
509
+ T17SKU.2018107.v1
510
+ T17SKU.2021094.v1
511
+ T17SLB.2019287.v1
512
+ T17SLT.2019112.v1
513
+ T17SLT.2021096.v1
514
+ T17SMA.2019112.v1
515
+ T17SMA.2021096.v1
516
+ T17SMV.2021096.v1
517
+ T17SNA.2021348.v1
518
+ T17SNB.2019162.v1
519
+ T17SNB.2020102.v1
520
+ T17SNU.2018074.v1
521
+ T17SNU.2018129.v1
522
+ T17SQU.2018121.v1
523
+ T18SVJ.2020131.v1
524
+ T18TXQ.2018266.v1
splits/val.txt ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ T10SEJ.2018220.v1
2
+ T10SFG.2020185.v1
3
+ T10TEK.2019275.v1
4
+ T10TEK.2019350.v1
5
+ T10TFM.2018110.v1
6
+ T10TFM.2018155.v1
7
+ T10TFM.2019215.v1
8
+ T10TFM.2019280.v1
9
+ T10TFM.2020215.v1
10
+ T10TGM.2020215.v1
11
+ T10TGR.2019245.v1
12
+ T11SKD.2018192.v1
13
+ T11SKD.2020197.v1
14
+ T11SKD.2020217.v1
15
+ T11SLU.2021188.v1
16
+ T11SNB.2018224.v1
17
+ T11SNB.2020234.v1
18
+ T11SNB.2021153.v1
19
+ T11SNB.2021268.v1
20
+ T11SPD.2020184.v1
21
+ T11SPV.2020236.v1
22
+ T11SPV.2020246.v1
23
+ T11SPV.2021215.v1
24
+ T11SQA.2020286.v1
25
+ T11SQS.2019078.v1
26
+ T11TLL.2018190.v1
27
+ T11TLL.2018215.v1
28
+ T11TLL.2018245.v1
29
+ T11TLL.2020275.v1
30
+ T11TME.2019222.v1
31
+ T11TMF.2018222.v1
32
+ T11TMF.2019227.v1
33
+ T11TMF.2019257.v1
34
+ T11TNE.2018199.v1
35
+ T11TNG.2018219.v1
36
+ T11TNG.2018289.v1
37
+ T11TPG.2018219.v1
38
+ T11TPG.2019214.v1
39
+ T11TPG.2020249.v1
40
+ T11TPK.2021268.v1
41
+ T11TQG.2020186.v1
42
+ T11TQG.2020216.v1
43
+ T11TQL.2021223.v1
44
+ T11ULP.2019245.v1
45
+ T11ULP.2020280.v1
46
+ T11ULP.2021249.v1
47
+ T12RXV.2018217.v1
48
+ T12STD.2018168.v1
49
+ T12STD.2020248.v1
50
+ T12STF.2020156.v1
51
+ T12STG.2020241.v1
52
+ T12STG.2020291.v1
53
+ T12STJ.2020241.v1
54
+ T12SUE.2019183.v1
55
+ T12SUF.2019183.v1
56
+ T12SUF.2020183.v1
57
+ T12SUF.2020223.v1
58
+ T12SUH.2020188.v1
59
+ T12SUJ.2019308.v1
60
+ T12SUJ.2020186.v1
61
+ T12SVA.2020230.v1
62
+ T12SVD.2018168.v1
63
+ T12SVD.2018310.v1
64
+ T12SWD.2018220.v1
65
+ T12SWJ.2018238.v1
66
+ T12SXA.2020247.v1
67
+ T12TUM.2018196.v1
68
+ T12TUM.2018246.v1
69
+ T12TUM.2019231.v1
70
+ T12TUP.2019226.v1
71
+ T12TUP.2020241.v1
72
+ T12TVR.2020256.v1
73
+ T12TVR.2020281.v1
74
+ T12TVS.2020256.v1
75
+ T12TWQ.2020283.v1
76
+ T12TWR.2020283.v1
77
+ T12TXM.2018220.v1
78
+ T12TXS.2020278.v1
79
+ T12TXT.2018248.v1
80
+ T12TYS.2020248.v1
81
+ T12TYS.2020278.v1
82
+ T13REP.2019241.v1
83
+ T13SDU.2018219.v1
84
+ T13SDU.2018254.v1
85
+ T13SFR.2020251.v1
86
+ T13TBE.2018220.v1
87
+ T13TBF.2018190.v1
88
+ T13TCL.2019150.v1
89
+ T13TDE.2020247.v1
90
+ T13TFG.2020274.v1
91
+ T13TFH.2020204.v1
92
+ T13TFJ.2020264.v1
93
+ T13TGJ.2020309.v1
94
+ T14RLU.2020193.v1
95
+ T14RMV.2020280.v1
96
+ T14SKA.2019278.v1
97
+ T14SKA.2020223.v1
98
+ T14SKB.2019268.v1
99
+ T14SKD.2018156.v1
100
+ T14SKE.2018156.v1
101
+ T14SKE.2019111.v1
102
+ T14SKE.2020216.v1
103
+ T14SKE.2020281.v1
104
+ T14SLA.2020193.v1
105
+ T14SLC.2019258.v1
106
+ T14SLD.2018138.v1
107
+ T14SLD.2018163.v1
108
+ T14SLD.2018218.v1
109
+ T14SLE.2018098.v1
110
+ T14SMB.2018073.v1
111
+ T14SMB.2018138.v1
112
+ T14SMC.2019258.v1
113
+ T14SND.2018125.v1
114
+ T14SND.2018215.v1
115
+ T14SNF.2018095.v1
116
+ T14SNF.2020183.v1
117
+ T14SNG.2020118.v1
118
+ T14SPH.2018095.v1
119
+ T14SPH.2018125.v1
120
+ T14SPJ.2018125.v1
121
+ T14SPJ.2019345.v1
122
+ T14SQG.2020085.v1
123
+ T14SQH.2018125.v1
124
+ T14UPV.2018136.v1
125
+ T14UQU.2021112.v1
126
+ T15RTM.2020059.v1
127
+ T15RVQ.2020059.v1
128
+ T15RWQ.2019134.v1
129
+ T15SVA.2019092.v1
130
+ T15SVU.2020109.v1
131
+ T15SWA.2019099.v1
132
+ T15SWA.2020109.v1
133
+ T15SWR.2018129.v1
134
+ T15SWR.2021128.v1
135
+ T15TYJ.2018124.v1
136
+ T15TYL.2019159.v1
137
+ T16RCV.2021167.v1
138
+ T16SBA.2018106.v1
139
+ T16SBB.2019111.v1
140
+ T16SBB.2021145.v1
141
+ T16SEH.2021102.v1
142
+ T16SFD.2018100.v1
143
+ T16SGA.2018107.v1
144
+ T16SGF.2018155.v1
145
+ T17RLP.2018082.v1
146
+ T17RLP.2018102.v1
147
+ T17RLP.2018127.v1
148
+ T17RLP.2019112.v1
149
+ T17RLP.2021111.v1
150
+ T17RNM.2019349.v1
151
+ T17RNM.2021103.v1
152
+ T17SKU.2021166.v1
153
+ T17SLS.2020097.v1
154
+ T17SLT.2018107.v1
155
+ T17SLT.2020097.v1
156
+ T17SMR.2021128.v1
157
+ T17SNC.2019112.v1
158
+ T17SQD.2020094.v1
159
+ T18TWK.2018121.v1
160
+ T18TWK.2019093.v1
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24c06a7429b843d46e40c6b167122053921bf94dce2e5550ea5c07fabc597646
3
+ size 91005696
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer", "model_max_length": 512}
train_script.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train script for a single file
3
+
4
+ Need to set the TPU address first:
5
+ export XRT_TPU_CONFIG="localservice;0;localhost:51011"
6
+ """
7
+
8
+ import torch.multiprocessing as mp
9
+ import threading
10
+ import time
11
+ import random
12
+ import sys
13
+ import argparse
14
+ import gzip
15
+ import json
16
+ import logging
17
+ import tqdm
18
+ import torch
19
+ from torch import nn
20
+ from torch.utils.data import DataLoader
21
+ import torch
22
+ import torch_xla
23
+ import torch_xla.core
24
+ import torch_xla.core.functions
25
+ import torch_xla.core.xla_model as xm
26
+ import torch_xla.distributed.xla_multiprocessing as xmp
27
+ import torch_xla.distributed.parallel_loader as pl
28
+ import os
29
+ from shutil import copyfile
30
+
31
+
32
+ from transformers import (
33
+ AdamW,
34
+ AutoModel,
35
+ AutoTokenizer,
36
+ get_linear_schedule_with_warmup,
37
+ set_seed,
38
+ )
39
+
40
class AutoModelForSentenceEmbedding(nn.Module):
    """Wrap a pretrained transformer so that it returns one vector per input text.

    The token embeddings produced by the wrapped model are averaged over the
    non-padded positions (mean pooling) and, optionally, L2-normalized.

    Args:
        model_name: Identifier or path handed to ``AutoModel.from_pretrained``.
        tokenizer: Tokenizer stored alongside the model so that both can be
            written out together by ``save_pretrained``.
        normalize: When true, the pooled embeddings are L2-normalized along
            dimension 1.
    """

    def __init__(self, model_name, tokenizer, normalize=True):
        super().__init__()

        self.model = AutoModel.from_pretrained(model_name)
        self.normalize = normalize
        self.tokenizer = tokenizer

    def forward(self, **kwargs):
        """Run the wrapped model on ``kwargs`` and return pooled embeddings.

        ``kwargs`` must contain ``attention_mask``; every keyword argument is
        forwarded unchanged to the wrapped model.
        """
        model_output = self.model(**kwargs)
        embeddings = self.mean_pooling(model_output, kwargs['attention_mask'])
        if self.normalize:
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        return embeddings

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings over the positions kept by the mask."""
        # First element of model_output contains all token embeddings
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # The clamp avoids a division by zero for rows whose mask is all zeros.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def save_pretrained(self, output_path):
        """Write tokenizer, model config and model weights to ``output_path``.

        Tokenizer and config are written by the master ordinal only. The
        weights go through ``xm.save``, which is invoked by every process.
        """
        if xm.is_master_ordinal():
            self.tokenizer.save_pretrained(output_path)
            self.model.config.save_pretrained(output_path)

        xm.save(self.model.state_dict(), os.path.join(output_path, "pytorch_model.bin"))
67
+
68
+
69
+
70
+
71
def train_function(index, args, queue):
    """Training loop executed by each process started through ``xmp.spawn``.

    Args:
        index: Process index passed in by ``xmp.spawn``; not used here.
        args: Parsed command-line arguments. Reads ``model``, ``steps``,
            ``max_length``, ``scale``, ``save_steps`` and ``output``.
        queue: Queue from which one per-device batch is taken per step. A
            batch is a list of samples with either two texts
            (anchor, positive) or three texts (anchor, positive, negative).
    """
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = AutoModelForSentenceEmbedding(args.model, tokenizer)

    ### Train Loop
    device = xm.xla_device()
    model = model.to(device)

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=2e-5, correct_bias=True)

    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=500,
        num_training_steps=args.steps,
    )

    # Now we train the model
    cross_entropy_loss = nn.CrossEntropyLoss()
    max_grad_norm = 1

    def encode_column(batch, column):
        """Tokenize one text column of the batch and embed it on the device."""
        features = tokenizer(
            [sample[column] for sample in batch],
            return_tensors="pt",
            max_length=args.max_length,
            truncation=True,
            padding="max_length",
        )
        return model(**features.to(device))

    model.train()

    for global_step in tqdm.trange(args.steps, disable=not xm.is_master_ordinal()):
        #### Get the batch data
        batch = queue.get()

        if len(batch[0]) == 2:  # (anchor, positive)
            ### Compute embeddings
            embeddings_a = encode_column(batch, 0)
            embeddings_b = encode_column(batch, 1)

            ### Gather all embeddings
            embeddings_a = torch_xla.core.functions.all_gather(embeddings_a)
            embeddings_b = torch_xla.core.functions.all_gather(embeddings_b)

            ### Compute similarity scores between every anchor and every positive
            scores = torch.mm(embeddings_a, embeddings_b.transpose(0, 1)) * args.scale

            ### Compute cross-entropy loss
            # Example a[i] should match with b[i]
            labels = torch.arange(len(scores), dtype=torch.long, device=embeddings_a.device)

            ## Symmetric loss as in CLIP
            loss = (cross_entropy_loss(scores, labels) + cross_entropy_loss(scores.transpose(0, 1), labels)) / 2

        else:  # (anchor, positive, negative)
            embeddings_a = encode_column(batch, 0)
            embeddings_b1 = encode_column(batch, 1)
            embeddings_b2 = encode_column(batch, 2)

            embeddings_a = torch_xla.core.functions.all_gather(embeddings_a)
            embeddings_b1 = torch_xla.core.functions.all_gather(embeddings_b1)
            embeddings_b2 = torch_xla.core.functions.all_gather(embeddings_b2)

            # Positives first, then negatives, so a[i] still matches row i of b.
            embeddings_b = torch.cat([embeddings_b1, embeddings_b2])

            ### Compute similarity scores between every anchor and every candidate
            scores = torch.mm(embeddings_a, embeddings_b.transpose(0, 1)) * args.scale

            ### Compute cross-entropy loss
            # Example a[i] should match with b[i]
            labels = torch.arange(len(scores), dtype=torch.long, device=embeddings_a.device)

            ## One-way loss
            loss = cross_entropy_loss(scores, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        xm.optimizer_step(optimizer, barrier=True)
        lr_scheduler.step()

        # Save model
        if (global_step+1) % args.save_steps == 0:
            output_path = os.path.join(args.output, str(global_step+1))
            xm.master_print("save model: "+output_path)
            model.save_pretrained(output_path)

    output_path = os.path.join(args.output, "final")
    xm.master_print("save model final: "+ output_path)
    model.save_pretrained(output_path)
166
+
167
+
168
def produce_data(args, queue, filepaths, dataset_indices):
    """Endlessly fill ``queue`` with per-device batches drawn from several datasets.

    Args:
        args: Parsed command-line arguments. Reads ``batch_size``, ``nprocs``
            and ``datasets_per_batch``.
        queue: Queue that receives one list of ``batch_size`` samples per put.
        filepaths: Paths of the dataset files; files whose path contains
            ``"reddit_"`` are read with ``RedditDataset``, all others with
            ``Dataset``.
        dataset_indices: Indices into ``filepaths`` to sample from; an index
            that occurs more often is drawn more often.

    Raises:
        ValueError: If the configuration yields zero batches per selected
            dataset, in which case nothing would ever be put on the queue.
    """
    global_batch_size = args.batch_size * args.nprocs  # Global batch size
    size_per_dataset = global_batch_size // args.datasets_per_batch  # Samples per dataset
    num_same_dataset = size_per_dataset // args.batch_size
    print("producer", "global_batch_size", global_batch_size)
    print("producer", "size_per_dataset", size_per_dataset)
    print("producer", "num_same_dataset", num_same_dataset)

    if num_same_dataset < 1:
        # Without this check the loop below would spin forever without
        # producing a single batch, leaving the consumers blocked.
        raise ValueError(
            "datasets_per_batch ({}) must not exceed nprocs ({}): "
            "no batch would be produced".format(args.datasets_per_batch, args.nprocs)
        )

    datasets = []
    for filepath in filepaths:
        if "reddit_" in filepath:  # Special dataset class for Reddit files
            data_obj = RedditDataset(filepath)
        else:
            data_obj = Dataset(filepath)
        datasets.append(iter(data_obj))

    # Store if dataset is in a 2 col or 3 col format.
    # Note that this consumes the first sample of every dataset.
    num_cols = {idx: len(next(dataset)) for idx, dataset in enumerate(datasets)}

    while True:
        texts_in_batch = set()
        batch_format = None  # 2 vs 3 col format for this batch

        # Add data from several sub datasets
        for _ in range(args.datasets_per_batch):
            # Draw until we hit a dataset with the same 2/3 col format as the
            # first dataset chosen for this batch.
            valid_dataset = False
            while not valid_dataset:
                data_idx = random.choice(dataset_indices)
                if batch_format is None:
                    batch_format = num_cols[data_idx]
                    valid_dataset = True
                else:
                    valid_dataset = (batch_format == num_cols[data_idx])

            # Get data from this dataset
            dataset = datasets[data_idx]
            for _ in range(num_same_dataset):
                for _ in range(args.nprocs):
                    batch_device = []  # A batch for one device
                    while len(batch_device) < args.batch_size:
                        sample = next(dataset)
                        # Skip samples that share a text with one already queued
                        # in this round.
                        if not any(text in texts_in_batch for text in sample):
                            texts_in_batch.update(sample)
                            batch_device.append(sample)

                    queue.put(batch_device)
221
+
222
+
223
class RedditDataset:
    """
    A class that handles the reddit data files

    Iterating yields ``[response, context]`` pairs read from a gzipped
    JSON-lines file. The file is re-read from the start whenever its end is
    reached, so the iterator never finishes.
    """
    def __init__(self, filepath):
        self.filepath = filepath

    def __iter__(self):
        while True:
            found_any = False
            # The encoding is fixed so that reading does not depend on the locale.
            with gzip.open(self.filepath, "rt", encoding="utf-8") as fIn:
                for line in fIn:
                    data = json.loads(line)

                    # Records lacking either field are skipped.
                    if "response" in data and "context" in data:
                        found_any = True
                        yield [data["response"], data["context"]]

            if not found_any:
                # A full pass yielded nothing; looping again would never end.
                raise ValueError("No usable records in " + str(self.filepath))
238
+
239
class Dataset:
    """
    A class that handles one dataset

    Iterating yields the entries of a gzipped JSON-lines file without end.
    Each line is either a list of texts or a dict holding that list under
    ``'texts'``. Files with fewer than ``max_dataset_size`` entries are cached
    in memory after the first pass and reshuffled for every further pass;
    larger files are streamed from disk again and again in file order.
    """
    def __init__(self, filepath):
        self.filepath = filepath

    def __iter__(self):
        max_dataset_size = 10*1000*1000  # Cache small datasets in memory
        dataset = []  # Becomes None once the file turns out to be too large to cache
        data_format = None

        while dataset is None or len(dataset) == 0:
            # The encoding is fixed so that reading does not depend on the locale.
            with gzip.open(self.filepath, "rt", encoding="utf-8") as fIn:
                for line in fIn:
                    data = json.loads(line)
                    if isinstance(data, dict):
                        data = data['texts']

                    if data_format is None:
                        data_format = len(data)

                    # Ensure that all entries are of the same 2/3 col format
                    assert len(data) == data_format, "Inconsistent number of columns in " + str(self.filepath)

                    if dataset is not None:
                        dataset.append(data)
                        if len(dataset) >= max_dataset_size:
                            dataset = None

                    yield data

            if dataset is not None and len(dataset) == 0:
                # The file held no entries; another pass would loop forever.
                raise ValueError("No entries in " + str(self.filepath))

        # Data loaded. Now stream to the queue
        # Shuffle for each epoch
        while True:
            random.shuffle(dataset)
            for data in dataset:
                yield data
277
+
278
+
279
+
280
if __name__ == "__main__":
    # Entry point: parse arguments, snapshot the run configuration into the
    # output folder, start the data producer and launch the training processes.
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default='nreimers/MiniLM-L6-H384-uncased')
    parser.add_argument('--steps', type=int, default=2000)
    parser.add_argument('--save_steps', type=int, default=10000)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--max_length', type=int, default=128)
    parser.add_argument('--nprocs', type=int, default=8)
    parser.add_argument('--datasets_per_batch', type=int, default=2, help="Number of datasets per batch")
    parser.add_argument('--scale', type=float, default=20, help="Use 20 for cossim, and 1 when you work with unnormalized embeddings with dot product")
    parser.add_argument('--data_folder', default="/data", help="Folder with your dataset files")
    parser.add_argument('data_config', help="A data_config.json file")
    parser.add_argument('output')
    args = parser.parse_args()

    # Ensure global batch size is divisible by datasets_per_batch.
    # parser.error is used instead of assert so the check survives `python -O`.
    if (args.batch_size*args.nprocs) % args.datasets_per_batch != 0:
        parser.error("batch_size * nprocs must be divisible by datasets_per_batch")

    logging.info("Output: "+args.output)
    if os.path.exists(args.output):
        print("Output folder already exists.")
        input("Continue?")

    # Write train script to output path
    os.makedirs(args.output, exist_ok=True)

    data_config_path = os.path.join(args.output, 'data_config.json')
    copyfile(args.data_config, data_config_path)

    train_script_path = os.path.join(args.output, 'train_script.py')
    copyfile(__file__, train_script_path)
    with open(train_script_path, 'a', encoding="utf-8") as fOut:
        fOut.write("\n\n# Script was called via:\n#python " + " ".join(sys.argv))

    # Load data config
    with open(args.data_config, encoding="utf-8") as fIn:
        data_config = json.load(fIn)

    queue = mp.Queue(maxsize=100*args.nprocs)

    filepaths = []
    dataset_indices = []
    for idx, data in enumerate(data_config):
        filepaths.append(os.path.join(os.path.expanduser(args.data_folder), data['name']))
        # Repeating the index `weight` times makes random.choice pick this
        # dataset proportionally more often.
        dataset_indices.extend([idx]*data['weight'])

    # Start producer
    p = mp.Process(target=produce_data, args=(args, queue, filepaths, dataset_indices))
    p.start()

    # Run training
    print("Start processes:", args.nprocs)
    try:
        xmp.spawn(train_function, args=(args, queue), nprocs=args.nprocs, start_method='fork')
        print("Training done")
        print("It might be that not all processes exit automatically. In that case you must manually kill this process.")
        print("With 'pkill python' you can kill all remaining python processes")
    finally:
        # Stop the producer even when training fails, so it is not left running.
        p.kill()
    sys.exit()
340
+
341
+
342
+
343
+ # Script was called via:
344
+ #python train_many_data_files_v2.py --steps 1000000 --batch_size 128 --model nreimers/MiniLM-L6-H384-uncased train_data_configs/all_datasets_v4.json output/all_datasets_v4_MiniLM-L6-H384-uncased-batch128
vocab.txt ADDED
The diff for this file is too large to render. See raw diff