Adding `safetensors` variant of this model

#1
README.md CHANGED
@@ -80,7 +80,7 @@ from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
80
  import torch
81
 
82
  # Load the pretrained model and tokenizer
83
- model_name = "P0L3/sciclimatebert"
84
  tokenizer = AutoTokenizer.from_pretrained(model_name)
85
  model = AutoModelForMaskedLM.from_pretrained(model_name)
86
 
@@ -121,20 +121,11 @@ The increase in greenhouse gas ... affected the carbon balance of the Earth. —
121
  If you use this model, please cite:
122
 
123
  ```bibtex
124
- @Article{Poleksić2025,
125
- author={Poleksi{\'{c}}, Andrija
126
- and Martin{\v{c}}i{\'{c}}-Ip{\v{s}}i{\'{c}}, Sanda},
127
- title={Pretraining and evaluation of BERT models for climate research},
128
- journal={Discover Applied Sciences},
129
  year={2025},
130
- month={Oct},
131
- day={24},
132
- volume={7},
133
- number={11},
134
- pages={1278},
135
- issn={3004-9261},
136
- doi={10.1007/s42452-025-07740-5},
137
- url={https://doi.org/10.1007/s42452-025-07740-5}
138
  }
139
-
140
-
 
80
  import torch
81
 
82
  # Load the pretrained model and tokenizer
83
+ model_name = "P0L3/clirebert_clirevocab_uncased"
84
  tokenizer = AutoTokenizer.from_pretrained(model_name)
85
  model = AutoModelForMaskedLM.from_pretrained(model_name)
86
 
 
121
  If you use this model, please cite:
122
 
123
  ```bibtex
124
+ @article{poleksic_etal_2025,
125
+ title={Climate Research Domain BERTs: Pretraining, Adaptation, and Evaluation},
126
+ author={Poleksić, Andrija and
127
+ Martinčić-Ipšić, Sanda},
128
+ journal={PREPRINT (Version 1)},
129
  year={2025},
130
+ doi={10.21203/rs.3.rs-6644722/v1}
 
 
 
 
 
 
 
131
  }
 
 
added_tokens.json DELETED
@@ -1,237 +0,0 @@
1
- {
2
- "+/-": 50403,
3
- "2021": 50328,
4
- "2030": 50417,
5
- "2050": 50487,
6
- "CH4": 50354,
7
- "CO2": 50265,
8
- "Committee": 50410,
9
- "GHG": 50397,
10
- "N2O": 50382,
11
- "achieve": 50437,
12
- "across": 50277,
13
- "activities": 50317,
14
- "adaptation": 50377,
15
- "addition": 50303,
16
- "additional": 50444,
17
- "affect": 50438,
18
- "agreement": 50464,
19
- "agricultural": 50395,
20
- "already": 50435,
21
- "analyses": 50485,
22
- "annual": 50287,
23
- "applied": 50360,
24
- "approach": 50290,
25
- "areas": 50275,
26
- "assess": 50430,
27
- "assessment": 50345,
28
- "atmosphere": 50341,
29
- "atmospheric": 50310,
30
- "basis": 50372,
31
- "become": 50394,
32
- "benefits": 50402,
33
- "biomass": 50314,
34
- "capture": 50379,
35
- "caused": 50389,
36
- "challenges": 50404,
37
- "characteristics": 50420,
38
- "climatic": 50319,
39
- "coastal": 50393,
40
- "combined": 50451,
41
- "communities": 50318,
42
- "companies": 50312,
43
- "compared": 50282,
44
- "composition": 50418,
45
- "concentration": 50357,
46
- "concentrations": 50366,
47
- "conditions": 50272,
48
- "conducted": 50439,
49
- "conservation": 50458,
50
- "considered": 50339,
51
- "consistent": 50494,
52
- "construction": 50445,
53
- "consumption": 50307,
54
- "contribute": 50498,
55
- "contribution": 50468,
56
- "costs": 50326,
57
- "countries": 50283,
58
- "crisis": 50499,
59
- "customers": 50358,
60
- "decades": 50495,
61
- "decision": 50452,
62
- "decrease": 50367,
63
- "decreased": 50396,
64
- "degrees": 50276,
65
- "delta": 50433,
66
- "determine": 50480,
67
- "determined": 50455,
68
- "developing": 50412,
69
- "differences": 50375,
70
- "dioxide": 50346,
71
- "distribution": 50296,
72
- "diversity": 50431,
73
- "drought": 50336,
74
- "dynamics": 50390,
75
- "ecological": 50401,
76
- "economy": 50338,
77
- "ecosystem": 50333,
78
- "ecosystems": 50384,
79
- "efforts": 50392,
80
- "electricity": 50315,
81
- "emission": 50279,
82
- "emissions": 50266,
83
- "employees": 50419,
84
- "ensure": 50449,
85
- "environmental": 50269,
86
- "estimate": 50459,
87
- "estimated": 50362,
88
- "estimates": 50408,
89
- "factors": 50292,
90
- "findings": 50492,
91
- "flood": 50363,
92
- "flux": 50476,
93
- "fluxes": 50491,
94
- "following": 50369,
95
- "forests": 50406,
96
- "fossil": 50348,
97
- "fuels": 50424,
98
- "further": 50301,
99
- "gases": 50471,
100
- "greater": 50368,
101
- "greenhouse": 50289,
102
- "however": 50454,
103
- "hydrogen": 50376,
104
- "identify": 50460,
105
- "impacts": 50281,
106
- "importance": 50414,
107
- "included": 50434,
108
- "increase": 50271,
109
- "increased": 50274,
110
- "increases": 50361,
111
- "indicate": 50388,
112
- "industry": 50306,
113
- "influence": 50329,
114
- "infrastructure": 50425,
115
- "integrated": 50483,
116
- "investigated": 50385,
117
- "investment": 50321,
118
- "investments": 50466,
119
- "least": 50429,
120
- "losses": 50462,
121
- "mainly": 50399,
122
- "materials": 50398,
123
- "means": 50486,
124
- "measured": 50364,
125
- "measurements": 50422,
126
- "methane": 50359,
127
- "methods": 50378,
128
- "mitigation": 50457,
129
- "moisture": 50493,
130
- "monitoring": 50465,
131
- "nitrogen": 50405,
132
- "northern": 50446,
133
- "observations": 50461,
134
- "observed": 50293,
135
- "obtained": 50391,
136
- "ocean": 50347,
137
- "operating": 50440,
138
- "operations": 50374,
139
- "opportunities": 50469,
140
- "overall": 50426,
141
- "pandemic": 50490,
142
- "parameters": 50383,
143
- "particular": 50413,
144
- "patterns": 50349,
145
- "performed": 50497,
146
- "periods": 50432,
147
- "planning": 50473,
148
- "plans": 50423,
149
- "plants": 50313,
150
- "policies": 50355,
151
- "pollution": 50467,
152
- "populations": 50441,
153
- "possible": 50332,
154
- "potential": 50273,
155
- "practices": 50453,
156
- "precipitation": 50280,
157
- "presented": 50428,
158
- "previous": 50482,
159
- "processes": 50291,
160
- "productivity": 50463,
161
- "proposed": 50370,
162
- "provide": 50285,
163
- "provides": 50373,
164
- "rainfall": 50323,
165
- "ratio": 50416,
166
- "recovery": 50450,
167
- "reduce": 50288,
168
- "reduced": 50327,
169
- "reducing": 50381,
170
- "reduction": 50286,
171
- "regional": 50308,
172
- "regions": 50302,
173
- "relationship": 50400,
174
- "relatively": 50484,
175
- "renewable": 50294,
176
- "requirements": 50477,
177
- "respectively": 50316,
178
- "responses": 50427,
179
- "resulting": 50456,
180
- "risks": 50309,
181
- "scenario": 50421,
182
- "scenarios": 50334,
183
- "seasonal": 50411,
184
- "sediment": 50475,
185
- "several": 50342,
186
- "shares": 50474,
187
- "showed": 50304,
188
- "significantly": 50299,
189
- "simulations": 50470,
190
- "snow": 50496,
191
- "soil": 50270,
192
- "soils": 50448,
193
- "solar": 50320,
194
- "solutions": 50351,
195
- "sources": 50331,
196
- "southern": 50481,
197
- "spatial": 50322,
198
- "statements": 50472,
199
- "strategies": 50387,
200
- "strategy": 50353,
201
- "structure": 50337,
202
- "studied": 50443,
203
- "studies": 50297,
204
- "summer": 50335,
205
- "supply": 50311,
206
- "sustainability": 50325,
207
- "sustainable": 50284,
208
- "systems": 50278,
209
- "targets": 50436,
210
- "technologies": 50343,
211
- "temperature": 50268,
212
- "temperatures": 50295,
213
- "temporal": 50479,
214
- "thermal": 50365,
215
- "towards": 50409,
216
- "transition": 50344,
217
- "transport": 50352,
218
- "trees": 50478,
219
- "trend": 50407,
220
- "trends": 50371,
221
- "tropical": 50415,
222
- "uncertainty": 50489,
223
- "understanding": 50356,
224
- "variability": 50298,
225
- "variables": 50447,
226
- "variation": 50380,
227
- "variations": 50442,
228
- "various": 50340,
229
- "vegetation": 50330,
230
- "waste": 50350,
231
- "yield": 50386,
232
- "–": 50300,
233
- "’": 50267,
234
- "“": 50305,
235
- "”": 50488,
236
- "•": 50324
237
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef98d3f3c4011c6aa3eb946b4f91ab19564a19ea4371ea89481aae5630b10df4
3
- size 329416384
 
 
 
 
pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a9326d44605591895ce7c568209ad6d574c84798ca5b9b6d8234ec784f50eae
3
- size 329440370
 
 
 
 
special_tokens_map.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "bos_token": "<s>",
3
- "cls_token": "<s>",
4
- "eos_token": "</s>",
5
- "mask_token": {
6
- "content": "<mask>",
7
- "lstrip": true,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false
11
- },
12
- "pad_token": "<pad>",
13
- "sep_token": "</s>",
14
- "unk_token": "<unk>"
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json DELETED
@@ -1,65 +0,0 @@
1
- {
2
- "add_prefix_space": false,
3
- "bos_token": {
4
- "__type": "AddedToken",
5
- "content": "<s>",
6
- "lstrip": false,
7
- "normalized": true,
8
- "rstrip": false,
9
- "single_word": false
10
- },
11
- "cls_token": {
12
- "__type": "AddedToken",
13
- "content": "<s>",
14
- "lstrip": false,
15
- "normalized": true,
16
- "rstrip": false,
17
- "single_word": false
18
- },
19
- "eos_token": {
20
- "__type": "AddedToken",
21
- "content": "</s>",
22
- "lstrip": false,
23
- "normalized": true,
24
- "rstrip": false,
25
- "single_word": false
26
- },
27
- "errors": "replace",
28
- "mask_token": {
29
- "__type": "AddedToken",
30
- "content": "<mask>",
31
- "lstrip": true,
32
- "normalized": true,
33
- "rstrip": false,
34
- "single_word": false
35
- },
36
- "model_max_length": 512,
37
- "name_or_path": "language_model/model/ClimateBERT_21072022_acc_grad_roberta",
38
- "pad_token": {
39
- "__type": "AddedToken",
40
- "content": "<pad>",
41
- "lstrip": false,
42
- "normalized": true,
43
- "rstrip": false,
44
- "single_word": false
45
- },
46
- "sep_token": {
47
- "__type": "AddedToken",
48
- "content": "</s>",
49
- "lstrip": false,
50
- "normalized": true,
51
- "rstrip": false,
52
- "single_word": false
53
- },
54
- "special_tokens_map_file": "pre_model/21072022_roberta/special_tokens_map.json",
55
- "tokenizer_class": "RobertaTokenizer",
56
- "trim_offsets": true,
57
- "unk_token": {
58
- "__type": "AddedToken",
59
- "content": "<unk>",
60
- "lstrip": false,
61
- "normalized": true,
62
- "rstrip": false,
63
- "single_word": false
64
- }
65
- }