atharv-savarkar commited on
Commit
fc2701b
·
verified ·
1 Parent(s): f61e6f8

Upload folder using huggingface_hub

Browse files
label_to_id.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "assamese": 0,
3
+ "bengali": 1,
4
+ "bodo": 2,
5
+ "dogri": 3,
6
+ "english": 4,
7
+ "gujarati": 5,
8
+ "hindi": 6,
9
+ "kannada": 7,
10
+ "kashmiri": 8,
11
+ "kokani": 9,
12
+ "maithili": 10,
13
+ "malayalam": 11,
14
+ "manipuri": 12,
15
+ "marathi": 13,
16
+ "nepali": 14,
17
+ "oriya": 15,
18
+ "punjabi": 16,
19
+ "sanskrit": 17,
20
+ "santali": 18,
21
+ "sindhi": 19,
22
+ "tamil": 20,
23
+ "telugu": 21,
24
+ "urdu": 22
25
+ }
lc_infer.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import logging
5
+ from tqdm import tqdm
6
+ from typing import List, Dict
7
+
8
+ import torch
9
+ import torch.distributed as dist
10
+ from torch.utils.data import DataLoader
11
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
12
+
13
+
14
# ===========================
# PATH RESOLUTION (NO HARDCODE)
# ===========================

# Resolve everything relative to this script's directory so the bundle
# (script + model/ + label_to_id.json) is relocatable as a unit.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(SCRIPT_DIR, "model")  # HF model directory (config, weights, tokenizer)
LABEL_MAP_PATH = os.path.join(SCRIPT_DIR, "label_to_id.json")  # language name -> class id
21
+
22
+
23
+ # ===========================
24
+ # Logging
25
+ # ===========================
26
+
27
def setup_logging(output_dir):
    """Configure root logging to both stdout and a file inside *output_dir*.

    Creates *output_dir* if needed; the log file is named
    ``language_classifier.log``.
    """
    os.makedirs(output_dir, exist_ok=True)
    log_file = os.path.join(output_dir, "language_classifier.log")

    handlers = [
        logging.FileHandler(log_file),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        format="%(asctime)s | %(levelname)s | %(message)s",
        level=logging.INFO,
        handlers=handlers,
    )

    logging.info(f"Logging to: {log_file}")
41
+
42
+
43
+ # ===========================
44
+ # DDP SETUP
45
+ # ===========================
46
+
47
def setup_distributed():
    """Initialise the NCCL process group when launched under torchrun.

    Returns a tuple ``(distributed, rank, world_size, local_rank)``.
    Falls back to a single-process configuration ``(False, 0, 1, 0)``
    when the DDP environment variables are absent.
    """
    env = os.environ
    if "RANK" not in env or "WORLD_SIZE" not in env:
        return False, 0, 1, 0

    dist.init_process_group(backend="nccl")
    rank = int(env["RANK"])
    world_size = int(env["WORLD_SIZE"])
    local_rank = int(env.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)
    return True, rank, world_size, local_rank
56
+
57
+
58
def is_main_process():
    """Return True on rank 0, or whenever torch.distributed is unavailable
    or has not been initialised (single-process runs)."""
    if not (dist.is_available() and dist.is_initialized()):
        return True
    return dist.get_rank() == 0
64
+
65
+
66
+ # ===========================
67
+ # Input Discovery
68
+ # ===========================
69
+
70
def find_all_jsonl_files(path: str) -> List[str]:
    """Return the sorted list of ``.jsonl`` files designated by *path*.

    *path* may be a single ``.jsonl`` file or a directory, which is walked
    recursively.

    Raises:
        ValueError: if *path* is a file without the ``.jsonl`` suffix, or
            does not exist at all.
        RuntimeError: if a directory contains no ``.jsonl`` files.
    """
    if os.path.isfile(path):
        if path.endswith(".jsonl"):
            return [path]
        raise ValueError(f"Input file must be .jsonl: {path}")

    if not os.path.isdir(path):
        raise ValueError(f"Input path does not exist: {path}")

    found = sorted(
        os.path.join(root, name)
        for root, _, names in os.walk(path)
        for name in names
        if name.endswith(".jsonl")
    )

    if not found:
        raise RuntimeError(f"No .jsonl files found inside: {path}")

    return found
89
+
90
+
91
+ # ===========================
92
+ # Dataset (Streaming, DDP-safe)
93
+ # ===========================
94
+
95
class JsonlIterableDataset(torch.utils.data.IterableDataset):
    """Streams JSON records from .jsonl files, sharded for DDP + workers.

    Lines are dealt round-robin (the deal restarts at every file) so that
    each (rank, dataloader-worker) pair consumes a disjoint slice of the
    corpus. Malformed JSON lines and records whose text field is missing
    or blank are silently skipped (best-effort streaming).
    """

    def __init__(self, input_path: str, text_key: str, rank: int, world_size: int):
        self.files = find_all_jsonl_files(input_path)
        self.text_key = text_key
        self.rank = rank
        self.world_size = world_size

    def __iter__(self):
        info = torch.utils.data.get_worker_info()
        workers_per_rank = info.num_workers if info else 1
        my_slot = self.rank * workers_per_rank + (info.id if info else 0)
        total_slots = self.world_size * workers_per_rank

        key = self.text_key
        for path in self.files:
            with open(path, "r", encoding="utf-8", errors="ignore") as fh:
                # Round-robin sharding: this shard owns every line whose
                # per-file index is congruent to its global slot.
                for idx, line in enumerate(fh):
                    if idx % total_slots != my_slot:
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError:
                        continue  # skip malformed line, slot is still consumed
                    text = record.get(key)
                    if isinstance(text, str) and text.strip():
                        # Stash the extracted text so the collator does not
                        # need to know the configurable key name.
                        record["__lc_text"] = text
                        yield record
131
+
132
+
133
+ # ===========================
134
+ # Collator
135
+ # ===========================
136
+
137
class Collator:
    """Batch collator: tokenizes the pre-extracted texts of a batch.

    Returns ``{"enc": <tokenizer output>, "raw": <original records>}`` so
    downstream code can attach predictions back onto the raw records, or
    ``None`` for an empty batch.
    """

    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        if not batch:
            return None

        enc = self.tokenizer(
            [item["__lc_text"] for item in batch],
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {"enc": enc, "raw": batch}
157
+
158
+
159
+ # ===========================
160
+ # Main
161
+ # ===========================
162
+
163
def main():
    """CLI entry point: classify the language of every record in an input
    JSONL corpus and write one JSONL shard per (language, rank) pair under
    ``--output_path``.

    Fixes over the original:
      * ``persistent_workers``/``prefetch_factor`` were passed to DataLoader
        unconditionally, which raises ValueError when ``--num_workers 0`` —
        they are now only set for multiprocessing loading.
      * An unmapped prediction id (the model config declares 24 classes but
        label_to_id.json maps 23) raised KeyError and silently dropped the
        whole batch; unmapped ids are now logged and skipped per sample.
    """
    parser = argparse.ArgumentParser("Language Classifier Inference")

    parser.add_argument("--input_path", required=True)
    parser.add_argument("--output_path", required=True)
    parser.add_argument("--text_key", required=True)

    parser.add_argument("--batch_size", type=int, default=2048)
    parser.add_argument("--max_length", type=int, default=512)
    parser.add_argument("--num_workers", type=int, default=8)

    args = parser.parse_args()

    setup_logging(args.output_path)

    # --------------------
    # DDP
    # --------------------
    distributed, rank, world_size, local_rank = setup_distributed()
    device = f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu"

    logging.info(f"Distributed={distributed} | World size={world_size}")

    # --------------------
    # Load label map (language name -> class id, shipped next to the script)
    # --------------------
    if not os.path.isfile(LABEL_MAP_PATH):
        raise RuntimeError(f"Missing label map: {LABEL_MAP_PATH}")

    with open(LABEL_MAP_PATH, "r", encoding="utf-8") as f:
        label_map = json.load(f)

    id_to_label = {v: k for k, v in label_map.items()}

    # --------------------
    # Load model
    # --------------------
    if not os.path.isdir(MODEL_PATH):
        raise RuntimeError(f"Model directory not found: {MODEL_PATH}")

    logging.info(f"Loading model from {MODEL_PATH}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
    model.to(device)
    model.eval()

    # --------------------
    # Dataset & Loader
    # --------------------
    dataset = JsonlIterableDataset(
        args.input_path,
        args.text_key,
        rank=rank,
        world_size=world_size,
    )

    # persistent_workers / prefetch_factor are only legal with
    # multiprocessing loading; guard them so --num_workers 0 still works.
    loader_kwargs = {}
    if args.num_workers > 0:
        loader_kwargs.update(persistent_workers=True, prefetch_factor=4)

    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        collate_fn=Collator(tokenizer, args.max_length),
        pin_memory=True,
        **loader_kwargs,
    )

    # --------------------
    # Accumulators: per-class buffers, held fully in memory until the end.
    # NOTE(review): for very large corpora consider streaming writes instead.
    # --------------------
    outputs: Dict[int, List[dict]] = {k: [] for k in id_to_label.keys()}

    # --------------------
    # Inference (progress bar on the main process only)
    # --------------------
    iterator = tqdm(dataloader, desc="Classifying") if is_main_process() else dataloader

    with torch.no_grad():
        for batch in iterator:
            if batch is None:
                continue

            try:
                enc = {k: v.to(device) for k, v in batch["enc"].items()}
                raw = batch["raw"]

                logits = model(**enc).logits
                preds = torch.argmax(logits, dim=-1).cpu().tolist()

                for obj, pred in zip(raw, preds):
                    # The model head may emit class ids the label map does not
                    # cover; skip those samples instead of losing the batch.
                    if pred not in id_to_label:
                        logging.warning(
                            f"Prediction id {pred} not in label map; skipping sample"
                        )
                        continue
                    obj = dict(obj)  # copy so the worker's record is untouched
                    obj.pop("__lc_text", None)
                    obj["predicted_id"] = pred
                    obj["predicted_language"] = id_to_label[pred]
                    outputs[pred].append(obj)

            except Exception as e:
                # Best-effort pipeline: a failed batch is logged and dropped.
                logging.exception(f"Batch failed: {e}")

    # --------------------
    # Write outputs: one shard per class per rank (rank in the filename
    # keeps DDP processes from clobbering each other).
    # --------------------
    os.makedirs(args.output_path, exist_ok=True)

    for cls_id, cls_name in id_to_label.items():
        out_path = os.path.join(
            args.output_path,
            f"{cls_name}.rank{rank}.jsonl"
        )

        logging.info(f"Writing {len(outputs[cls_id])} samples to {out_path}")

        with open(out_path, "w", encoding="utf-8") as f:
            for obj in outputs[cls_id]:
                f.write(json.dumps(obj, ensure_ascii=False) + "\n")

    if distributed:
        dist.barrier()
        dist.destroy_process_group()

    logging.info("Language classification completed successfully.")
284
+
285
+
286
# Script entry point.
if __name__ == "__main__":
    main()
model/config.json ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "dtype": "float32",
8
+ "embedding_size": 768,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "LABEL_0",
14
+ "1": "LABEL_1",
15
+ "2": "LABEL_2",
16
+ "3": "LABEL_3",
17
+ "4": "LABEL_4",
18
+ "5": "LABEL_5",
19
+ "6": "LABEL_6",
20
+ "7": "LABEL_7",
21
+ "8": "LABEL_8",
22
+ "9": "LABEL_9",
23
+ "10": "LABEL_10",
24
+ "11": "LABEL_11",
25
+ "12": "LABEL_12",
26
+ "13": "LABEL_13",
27
+ "14": "LABEL_14",
28
+ "15": "LABEL_15",
29
+ "16": "LABEL_16",
30
+ "17": "LABEL_17",
31
+ "18": "LABEL_18",
32
+ "19": "LABEL_19",
33
+ "20": "LABEL_20",
34
+ "21": "LABEL_21",
35
+ "22": "LABEL_22",
36
+ "23": "LABEL_23"
37
+ },
38
+ "initializer_range": 0.02,
39
+ "intermediate_size": 3072,
40
+ "label2id": {
41
+ "LABEL_0": 0,
42
+ "LABEL_1": 1,
43
+ "LABEL_10": 10,
44
+ "LABEL_11": 11,
45
+ "LABEL_12": 12,
46
+ "LABEL_13": 13,
47
+ "LABEL_14": 14,
48
+ "LABEL_15": 15,
49
+ "LABEL_16": 16,
50
+ "LABEL_17": 17,
51
+ "LABEL_18": 18,
52
+ "LABEL_19": 19,
53
+ "LABEL_2": 2,
54
+ "LABEL_20": 20,
55
+ "LABEL_21": 21,
56
+ "LABEL_22": 22,
57
+ "LABEL_23": 23,
58
+ "LABEL_3": 3,
59
+ "LABEL_4": 4,
60
+ "LABEL_5": 5,
61
+ "LABEL_6": 6,
62
+ "LABEL_7": 7,
63
+ "LABEL_8": 8,
64
+ "LABEL_9": 9
65
+ },
66
+ "layer_norm_eps": 1e-12,
67
+ "max_position_embeddings": 512,
68
+ "model_type": "bert",
69
+ "num_attention_heads": 12,
70
+ "num_hidden_layers": 12,
71
+ "pad_token_id": 3,
72
+ "position_embedding_type": "absolute",
73
+ "problem_type": "single_label_classification",
74
+ "transformers_version": "4.56.2",
75
+ "type_vocab_size": 2,
76
+ "use_cache": true,
77
+ "vocab_size": 250000
78
+ }
model/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37cf7198a098b1fa6825ba86a1bb80c950ffc48d83e030abc72d1f939fe75180
3
+ size 1112262872
model/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/tokenizer_config.json ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "<as>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "<bd>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "7": {
60
+ "content": "<bn>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "8": {
68
+ "content": "<dg>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "9": {
76
+ "content": "<en>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "10": {
84
+ "content": "<gom>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "11": {
92
+ "content": "<gu>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "12": {
100
+ "content": "<hi>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "13": {
108
+ "content": "<kha>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "14": {
116
+ "content": "<kn>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "15": {
124
+ "content": "<ks>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "16": {
132
+ "content": "<mai>",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "17": {
140
+ "content": "<ml>",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "18": {
148
+ "content": "<mni>",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "19": {
156
+ "content": "<mr>",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "20": {
164
+ "content": "<ne>",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "21": {
172
+ "content": "<or>",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "22": {
180
+ "content": "<pa>",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "23": {
188
+ "content": "<sa>",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "24": {
196
+ "content": "<sd>",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "25": {
204
+ "content": "<sat>",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "26": {
212
+ "content": "<ta>",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "27": {
220
+ "content": "<te>",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "28": {
228
+ "content": "<ur>",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": true
234
+ }
235
+ },
236
+ "clean_up_tokenization_spaces": false,
237
+ "cls_token": "[CLS]",
238
+ "extra_special_tokens": {},
239
+ "mask_token": "[MASK]",
240
+ "model_max_length": 1000000000000000019884624838656,
241
+ "pad_token": "[PAD]",
242
+ "sep_token": "[SEP]",
243
+ "tokenizer_class": "PreTrainedTokenizerFast",
244
+ "unk_token": "[UNK]"
245
+ }
model/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
model/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60fb1107c5f308b237d9eccfa3a3329e204ba8a411f4504ef68e9fb6c6614542
3
+ size 5841