Ahmet Yildirim committed on
Commit
d6d6f4f
·
1 Parent(s): ce2411a

- Update lemmatisering

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.json_large filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -20,6 +20,7 @@ This specific version of the tagger is based on Norbert3-base.
20
  The aim of this model is to make Humit-Tagger available as a HuggingFace model including all functionality that the [original code](https://github.com/humit-oslo/humit-tagger) supports.
21
  In addition to the morphological tagging, this model supports Nynorsk/Bokmål language identification provided by this [repository](https://github.com/humit-oslo/humit-sprakidentifikator).
22
 
 
23
 
24
  This model adds four classification layers on top of the base model.
25
  These layers do language identification, morphologic classification, lemmatization classification, and sentence boundary detection.
@@ -75,6 +76,7 @@ These functions receive similar parameters.
75
  | lang\_per\_sentence| yes | no | True / False | False | identify the language per sentence and output the tags according to the language identified for that sentence. If this is not set, and lang is "au" then the whole input (or a file if input\_directory is used) is used to identify the language. |
76
  | write\_output\_to | yes | yes | a file path, a file handle, or "list" | sys.stdout | to specify where to write the output. If a file path is provided, the output will be written to that file. The file is overwritten. If a file handle is provided, then the output is written there. If "list" is given as the parameter, then the function returns a Python "list". |
77
  | output\_tsv | yes | yes | True/False | False | to specify the output format. The default is the json format. If multiple sentences exist, each line is a single valid json but not the whole output. This option cannot be used along with write\_output\_to="list" |
 
78
  | lang\_per\_item | no | yes | True/False | False | consider each item in the list given as separate input for language identification. |
79
  | fast\_mode | no | yes | True/False | False | identify languages of the files in the input directory in fast mode. This mode uses only the beginning of the files in identification. This method is much faster for many files but is not as accurate as when this parameter is set to False. |
80
 
 
20
  The aim of this model is to make Humit-Tagger available as a HuggingFace model including all functionality that the [original code](https://github.com/humit-oslo/humit-tagger) supports.
21
  In addition to the morphological tagging, this model supports Nynorsk/Bokmål language identification provided by this [repository](https://github.com/humit-oslo/humit-sprakidentifikator).
22
 
23
+ **If you do not need lemmatisation, we recommend giving the lemmatisation=False flag as parameter. This will save some cpu time and make the tagging faster.**
24
 
25
  This model adds four classification layers on top of the base model.
26
  These layers do language identification, morphologic classification, lemmatization classification, and sentence boundary detection.
 
76
  | lang\_per\_sentence| yes | no | True / False | False | identify the language per sentence and output the tags according to the language identified for that sentence. If this is not set, and lang is "au" then the whole input (or a file if input\_directory is used) is used to identify the language. |
77
  | write\_output\_to | yes | yes | a file path, a file handle, or "list" | sys.stdout | to specify where to write the output. If a file path is provided, the output will be written to that file. The file is overwritten. If a file handle is provided, then the output is written there. If "list" is given as the parameter, then the function returns a Python "list". |
78
  | output\_tsv | yes | yes | True/False | False | to specify the output format. The default is the json format. If multiple sentences exist, each line is a single valid json but not the whole output. This option cannot be used along with write\_output\_to="list" |
79
+ | lemmatisation | yes | no | True / False | True | to specify whether lemmatisation will be applied. Disabling lemmatisation by giving this parameter as False makes the tagger faster. |
80
  | lang\_per\_item | no | yes | True/False | False | consider each item in the list given as separate input for language identification. |
81
  | fast\_mode | no | yes | True/False | False | identify languages of the files in the input directory in fast mode. This mode uses only the beginning of the files in identification. This method is much faster for many files but is not as accurate as when this parameter is set to False. |
82
 
fullform_list.json_large ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a32e9d7c36ed2ba9ec7f080e118760c444277fb6f213172246d24711b0493433
3
+ size 240703613
modeling_humit_tagger.py CHANGED
@@ -32,7 +32,7 @@ class HumitTaggerModel(torch.nn.Module):
32
  kwargs["this_model_config"]=json.load(js)
33
 
34
 
35
- # Download this model's config:
36
  lemma_rules_path = hf_hub_download(repo_id=repo_name, filename=kwargs["config"].lemma_rules_py_file)
37
 
38
  # load lemma rules class
@@ -46,6 +46,7 @@ class HumitTaggerModel(torch.nn.Module):
46
  base_config_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_file"])
47
  base_model_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_model_file"])
48
  base_model_config_json_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_json_file"])
 
49
 
50
  # Copy base model's configuration python file into our working directory
51
  config_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)) , os.path.basename(base_config_file))
@@ -81,12 +82,13 @@ class HumitTaggerModel(torch.nn.Module):
81
 
82
  kwargs["model_weights_path"] = model_weights_path
83
  kwargs["repo_name"] = repo_name
 
84
  return HumitTaggerModel(**kwargs)
85
 
86
  def __init__(self, **kwargs ):
87
  super(HumitTaggerModel, self).__init__()
88
  json_cfg = kwargs["base_model_json_cfg"]
89
- self.config=kwargs["this_model_config"]
90
  self.LemmaHandling = sys.modules["lemma_rules"].LemmaHandling
91
  self.LemmaHandling.load_lemma_rules_from_obj(self.config["lemma_rules"])
92
  cfg=sys.modules["base_config"].NorbertConfig(**json_cfg)
@@ -117,6 +119,32 @@ class HumitTaggerModel(torch.nn.Module):
117
  self.REPLACE_PATTERN = '|'.join(sorted(re.escape(k) for k in self.REPLACE_DICT))
118
  self.MAX_LENGTH = self.bert.config.max_position_embeddings
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  def forward(self, input_ids=None, attention_mask=None ):
121
  outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True )
122
  sequence_output = self.dropout(outputs.last_hidden_state)
@@ -171,19 +199,24 @@ class HumitTaggerModel(torch.nn.Module):
171
  }
172
  batched_sentences.append(to_append)
173
 
174
- torch.cuda.empty_cache()
 
175
 
176
  return batched_sentences
177
 
178
  def _split_sentences(self, inp):
179
 
 
 
 
180
  # Here we get the whole text tokenized.
181
  encodings = self.tokenizer(inp,add_special_tokens=False, return_tensors="pt").to(self.device)
182
 
183
  # Save a copy of the tokenization
184
  original_encodings=copy.deepcopy(encodings)
185
  original_encodings=original_encodings.to("cpu")
186
- torch.cuda.empty_cache()
 
187
 
188
  # Pad to the complete size (model max_size -1 (-1 to add CLS))
189
  old_size=encodings["input_ids"][0].size()[0]
@@ -225,13 +258,15 @@ class HumitTaggerModel(torch.nn.Module):
225
  # First get them back to CPU to open space on GPU
226
  input_ids_batched=[i.to("cpu") for i in input_ids_batched]
227
  attention_mask_batched=[i.to("cpu") for i in attention_mask_batched]
228
- torch.cuda.empty_cache()
 
229
 
230
  for input_ids, attention_masks in zip(input_ids_batched, attention_mask_batched):
231
  current_batch={"input_ids":input_ids.to(self.device).long(), "attention_mask":attention_masks.to(self.device).long()}
232
  outputs = self(**current_batch)
233
  del current_batch
234
- torch.cuda.empty_cache()
 
235
 
236
  label_data=outputs["logits1"].argmax(-1)
237
  labels_output.extend(label_data)
@@ -240,7 +275,8 @@ class HumitTaggerModel(torch.nn.Module):
240
  labels_output=torch.stack(labels_output ,dim=0)
241
  labels_output=labels_output[:, range(1,self.MAX_LENGTH)]
242
  labels_output=torch.reshape(labels_output,(1,row_count * self.MAX_LENGTH_WITHOUT_CLS))
243
- torch.cuda.empty_cache()
 
244
 
245
  # Now the data is split into sentences
246
  # So, now create sentence data as list so that this could be used
@@ -265,7 +301,9 @@ class HumitTaggerModel(torch.nn.Module):
265
  del old_size
266
  del inp
267
  del outputs
268
- torch.cuda.empty_cache()
 
 
269
 
270
  return sentence_list
271
 
@@ -279,6 +317,85 @@ class HumitTaggerModel(torch.nn.Module):
279
  sentences.extend(self._split_sentences(i.strip()))
280
  return sentences
281
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  def tag_sentence_list(self, lst, **tag_config):
283
 
284
  # If the sentences are not tokenized, tokenize while batching:
@@ -296,62 +413,268 @@ class HumitTaggerModel(torch.nn.Module):
296
  else:
297
  tokenized_batches = self._batchify(lst)
298
 
299
- # If language will be identified per sentence
300
- if tag_config["lang_per_sentence"]:
301
- id_to_lang = self.config["id_to_lang"]
302
- # If the output will be to a python list
303
- if tag_config["write_output_to"]==None:
304
- all_tagged_sentences = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  for batch in tokenized_batches:
306
  all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
307
  batch_tags = torch.argmax(all_out["logits2"], dim=-1)
308
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
 
309
  batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
 
310
  batch["input_ids"].to("cpu")
311
  batch["attention_mask"].to("cpu")
 
 
 
312
 
313
- for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
314
- batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
 
 
 
 
 
 
 
315
  this_sentence=[]
316
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
317
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
318
  break
319
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
320
  if len(this_sentence)>0:
321
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
322
  else:
323
  this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
324
  else:
325
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
326
- all_tagged_sentences.append({"lang":id_to_lang[lang], "sent": [ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]})
327
 
328
- return all_tagged_sentences
329
-
330
- # If the output is in TSV format to a pipe (stdout or a file handle)
331
- elif tag_config["output_tsv"]:
332
- for batch in tokenized_batches:
333
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
334
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
335
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
336
- batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
337
- batch["input_ids"].to("cpu")
338
- batch["attention_mask"].to("cpu")
339
 
340
- for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
341
- batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
 
342
  this_sentence=[]
343
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
344
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
345
  break
346
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
347
  if len(this_sentence)>0:
348
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
349
  else:
350
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
351
  else:
352
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
353
- this_sentence=[ {"w":i["w"], "t":self.tags_str[lang][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]
354
- tag_config["write_output_to"].write(id_to_lang[lang])
 
 
355
  for lin in this_sentence:
356
  tag_config["write_output_to"].write("\t")
357
  tag_config["write_output_to"].write(lin["w"])
@@ -362,49 +685,235 @@ class HumitTaggerModel(torch.nn.Module):
362
  tag_config["write_output_to"].write("\n")
363
  tag_config["write_output_to"].write("\n")
364
 
365
- # If output format will be json to a pipe (stdout or a file handle)
366
- else:
367
- for batch in tokenized_batches:
368
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
369
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
370
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
371
- batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
372
- batch["input_ids"].to("cpu")
373
- batch["attention_mask"].to("cpu")
374
-
375
- for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
376
- batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
377
  this_sentence=[]
378
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
379
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
380
  break
381
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
382
  if len(this_sentence)>0:
383
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
384
  else:
385
  this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
386
  else:
387
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
388
-
389
- json.dump({"lang":id_to_lang[lang], "sent":[ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]}, tag_config["write_output_to"])
 
390
  tag_config["write_output_to"].write("\n")
391
 
392
- # If the language is set as parameter
393
- elif tag_config["lang"] != -1:
394
- LANG = tag_config["lang"]
395
- LANG_STR = self.config["id_to_lang"][LANG]
396
- # If the output will be to a python list
397
- if tag_config["write_output_to"]==None:
398
- all_tagged_sentences = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  for batch in tokenized_batches:
400
  all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
401
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
402
  batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
 
 
403
  batch["input_ids"].to("cpu")
404
  batch["attention_mask"].to("cpu")
405
-
406
- for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
407
- batch_lemmas.tolist()):
 
 
 
 
 
 
 
 
 
 
408
  this_sentence=[]
409
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
410
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
@@ -413,24 +922,15 @@ class HumitTaggerModel(torch.nn.Module):
413
  if len(this_sentence)>0:
414
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
415
  else:
416
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
417
  else:
418
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
419
- all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]})
420
-
421
- return all_tagged_sentences
422
-
423
- # If the output is in TSV format to a pipe (stdout or a file handle)
424
- elif tag_config["output_tsv"]:
425
- for batch in tokenized_batches:
426
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
427
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
428
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
429
- batch["input_ids"].to("cpu")
430
- batch["attention_mask"].to("cpu")
431
 
432
- for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
433
- batch_lemmas.tolist()):
 
434
  this_sentence=[]
435
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
436
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
@@ -439,32 +939,22 @@ class HumitTaggerModel(torch.nn.Module):
439
  if len(this_sentence)>0:
440
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
441
  else:
442
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
443
  else:
444
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
445
- this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]
446
  tag_config["write_output_to"].write(LANG_STR)
447
  for lin in this_sentence:
448
  tag_config["write_output_to"].write("\t")
449
  tag_config["write_output_to"].write(lin["w"])
450
  tag_config["write_output_to"].write("\t")
451
- tag_config["write_output_to"].write(lin["l"])
452
- tag_config["write_output_to"].write("\t")
453
  tag_config["write_output_to"].write(lin["t"])
454
  tag_config["write_output_to"].write("\n")
455
  tag_config["write_output_to"].write("\n")
456
 
457
- # If output format will be json to a pipe (stdout or a file handle)
458
- else:
459
- for batch in tokenized_batches:
460
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
461
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
462
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
463
- batch["input_ids"].to("cpu")
464
- batch["attention_mask"].to("cpu")
465
-
466
- for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
467
- batch_lemmas.tolist()):
468
  this_sentence=[]
469
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
470
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
@@ -473,98 +963,13 @@ class HumitTaggerModel(torch.nn.Module):
473
  if len(this_sentence)>0:
474
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
475
  else:
476
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
477
- else:
478
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
479
-
480
- json.dump({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]}, tag_config["write_output_to"])
481
- tag_config["write_output_to"].write("\n")
482
-
483
- # If language will be identified according to the majority of all sentences:
484
- else:
485
- all_tags=[]
486
- all_lemmas=[]
487
- all_langs=[]
488
- all_input_ids=[]
489
- # Go over all batches and each sentence in each batch
490
- for batch in tokenized_batches:
491
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
492
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
493
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
494
- batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
495
- all_input_ids.extend(batch["input_ids"].tolist())
496
- batch["input_ids"].to("cpu")
497
- batch["attention_mask"].to("cpu")
498
- all_langs.extend(batch_langs[:, 0].tolist())
499
- all_tags.extend(batch_tags.tolist())
500
- all_lemmas.extend(batch_lemmas.tolist())
501
-
502
- # Identify the language
503
- tag_config["lang"] = 1 if sum(all_langs)/len(all_langs)>=0.5 else 0
504
- LANG = tag_config["lang"]
505
- LANG_STR = self.config["id_to_lang"][LANG]
506
-
507
- # If the output will be returned as python list:
508
- if tag_config["write_output_to"]==None:
509
- all_tagged_sentences = []
510
- for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
511
- this_sentence=[]
512
- for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
513
- if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
514
- break
515
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
516
- if len(this_sentence)>0:
517
- this_sentence[-1]["w"] += self.tokenizer.decode(inps)
518
  else:
519
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
520
- else:
521
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
522
- all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence] })
523
- return all_tagged_sentences
524
-
525
- # If the output is in TSV format
526
- elif tag_config["output_tsv"]:
527
- for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
528
- this_sentence=[]
529
- for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
530
- if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
531
- break
532
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
533
- if len(this_sentence)>0:
534
- this_sentence[-1]["w"] += self.tokenizer.decode(inps)
535
- else:
536
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
537
- else:
538
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
539
- this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]
540
- tag_config["write_output_to"].write(LANG_STR)
541
- for lin in this_sentence:
542
- tag_config["write_output_to"].write("\t")
543
- tag_config["write_output_to"].write(lin["w"])
544
- tag_config["write_output_to"].write("\t")
545
- tag_config["write_output_to"].write(lin["l"])
546
- tag_config["write_output_to"].write("\t")
547
- tag_config["write_output_to"].write(lin["t"])
548
  tag_config["write_output_to"].write("\n")
549
- tag_config["write_output_to"].write("\n")
550
 
551
- # If output format will be json
552
- else:
553
- for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
554
- this_sentence=[]
555
- for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
556
- if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
557
- break
558
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
559
- if len(this_sentence)>0:
560
- this_sentence[-1]["w"] += self.tokenizer.decode(inps)
561
- else:
562
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
563
- else:
564
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
565
-
566
- json.dump({"lang":LANG_STR, "sent":[ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]}, tag_config["write_output_to"])
567
- tag_config["write_output_to"].write("\n")
568
 
569
  def _check_if_text_file_and_return_content(self, filepath):
570
  try:
@@ -575,7 +980,21 @@ class HumitTaggerModel(torch.nn.Module):
575
 
576
  @torch.no_grad()
577
  def tag(self, inp=None, **tag_config):
 
578
  self.eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
579
  if "one_sentence_per_line" not in tag_config:
580
  tag_config["one_sentence_per_line"]=False
581
 
@@ -620,7 +1039,7 @@ class HumitTaggerModel(torch.nn.Module):
620
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
621
  if tag_config["one_sentence_per_line"]:
622
  inp = [i for i in file_content.split("\n") if i!=""]
623
- inp = [i for i in inp if i!=""]
624
  with open(out_path, "w") as opened_file:
625
  tag_config["write_output_to"] = opened_file
626
  self.tag_sentence_list(inp, **tag_config)
@@ -631,8 +1050,8 @@ class HumitTaggerModel(torch.nn.Module):
631
  self.tag_sentence_list(inp, **tag_config)
632
  else:
633
  print (f"Could not properly open and read {input_path}.")
634
-
635
- write_to.close()
636
  return
637
 
638
  else:
@@ -650,7 +1069,7 @@ class HumitTaggerModel(torch.nn.Module):
650
  # Tag one sentence per line in a string
651
  if tag_config["one_sentence_per_line"]:
652
  inp = [i for i in inp.split("\n") if i!=""]
653
- inp = [self._preprocess_text(i) for i in inp if i!=""]
654
  return self.tag_sentence_list(inp, **tag_config)
655
 
656
  # identify sentences
@@ -660,7 +1079,7 @@ class HumitTaggerModel(torch.nn.Module):
660
  # Tag one sentence per list item
661
  elif type(inp) == list:
662
  inp=[i.strip() for i in inp]
663
- inp=[self._preprocess_text(i) for i in inp if i!=""]
664
  return self.tag_sentence_list(inp, **tag_config)
665
 
666
  def identify_language_sentence_list(self, lst, **tag_config):
@@ -703,9 +1122,12 @@ class HumitTaggerModel(torch.nn.Module):
703
 
704
  @torch.no_grad()
705
  def identify_language(self, inp=None, **tag_config):
 
706
  self.eval()
 
707
  if "one_sentence_per_line" not in tag_config:
708
  tag_config["one_sentence_per_line"]=False
 
709
  if "lang" in tag_config:
710
  del tag_config["lang"]
711
 
@@ -715,7 +1137,7 @@ class HumitTaggerModel(torch.nn.Module):
715
  if "lang_per_sentence" not in tag_config:
716
  tag_config["lang_per_sentence"] = False
717
 
718
- elif tag_config["lang_per_sentence"]:
719
  tag_config["lang_per_sentence"] = True
720
 
721
  if "input_directory" in tag_config and "output_directory" in tag_config and "write_output_to" in tag_config and tag_config["write_output_to"]!=None:
@@ -771,7 +1193,7 @@ class HumitTaggerModel(torch.nn.Module):
771
  torch.cuda.empty_cache()
772
 
773
  if tag_config["write_output_to"]==None:
774
- general_output.extend([{"f":i[0], "l":self.config["id_to_lang"][i[1]]} for i in zip(file_names, langs)])
775
  elif tag_config["output_tsv"]:
776
  for fil,lan in zip(file_names, langs):
777
  tag_config["write_output_to"].write(fil)
@@ -780,7 +1202,7 @@ class HumitTaggerModel(torch.nn.Module):
780
  tag_config["write_output_to"].write("\n")
781
  else:
782
  for fil,lan in zip(file_names, langs):
783
- json.dump({"f":fil, "l":self.config["id_to_lang"][lan]})
784
  file_names=[]
785
  contents=[]
786
  else:
@@ -801,7 +1223,7 @@ class HumitTaggerModel(torch.nn.Module):
801
  torch.cuda.empty_cache()
802
 
803
  if tag_config["write_output_to"]==None:
804
- general_output.extend([{"f":i[0], "l":self.config["id_to_lang"][i[1]]} for i in zip(file_names, langs)])
805
  elif tag_config["output_tsv"]:
806
  for fil,lan in zip(file_names, langs):
807
  tag_config["write_output_to"].write(fil)
@@ -810,7 +1232,7 @@ class HumitTaggerModel(torch.nn.Module):
810
  tag_config["write_output_to"].write("\n")
811
  else:
812
  for fil,lan in zip(file_names, langs):
813
- json.dump({"f":fil, "l":self.config["id_to_lang"][lan]})
814
 
815
  return general_output if len(general_output)>0 else None
816
 
@@ -852,17 +1274,17 @@ class HumitTaggerModel(torch.nn.Module):
852
  opened_file.write(lan)
853
  opened_file.write("\n")
854
  else:
855
- json.dump([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ] , opened_file)
856
  else:
857
  if tag_config["output_tsv"]:
858
  opened_file.write(out[0])
859
  else:
860
- json.dump({"l":out[0]} , opened_file)
861
  else:
862
  if tag_config["lang_per_sentence"]:
863
- general_output.extend([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ])
864
  else:
865
- general_output.append({"f":input_path, "l":out[0]})
866
 
867
  # If there is an opened pipe already
868
  else:
@@ -875,7 +1297,7 @@ class HumitTaggerModel(torch.nn.Module):
875
  tag_config["write_output_to"].write("\n")
876
  tag_config["write_output_to"].write("\n")
877
  else:
878
- json.dump([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
879
  tag_config["write_output_to"].write("\n")
880
  else:
881
  if tag_config["output_tsv"]:
@@ -884,7 +1306,7 @@ class HumitTaggerModel(torch.nn.Module):
884
  tag_config["write_output_to"].write(out[0])
885
  tag_config["write_output_to"].write("\n")
886
  else:
887
- json.dump({"f":input_path, "l":out[0]} , tag_config["write_output_to"])
888
  tag_config["write_output_to"].write("\n")
889
 
890
  else:
@@ -894,10 +1316,10 @@ class HumitTaggerModel(torch.nn.Module):
894
  tag_config["write_output_to"].write("err")
895
  tag_config["write_output_to"].write("\n")
896
  else:
897
- json.dump({"f":input_path, "l":"err"} , tag_config["write_output_to"])
898
  tag_config["write_output_to"].write("\n")
899
 
900
- if tag_config["write_output_to"] and tag_config["write_output_to"]!=sys.stdout and tag_config["write_output_to"]!=sys.stderr:
901
  tag_config["write_output_to"].close()
902
 
903
  return general_output if len(general_output)>0 else None
@@ -933,7 +1355,7 @@ class HumitTaggerModel(torch.nn.Module):
933
 
934
  # If return as list
935
  if tag_config["write_output_to"]==None:
936
- return [{"s":i[0], "l": i[1]} for i in zip(inp, out)]
937
 
938
  if tag_config["output_tsv"]:
939
  for sen,lan in zip(inp, out):
@@ -942,7 +1364,7 @@ class HumitTaggerModel(torch.nn.Module):
942
  tag_config["write_output_to"].write(out)
943
  tag_config["write_output_to"].write("\n")
944
  else:
945
- json.dump([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
946
 
947
  return
948
 
@@ -954,7 +1376,7 @@ class HumitTaggerModel(torch.nn.Module):
954
 
955
  # If return as list
956
  if tag_config["write_output_to"]==None:
957
- return [{"s":i[0], "l": i[1]} for i in zip(inp, out)]
958
 
959
  if tag_config["output_tsv"]:
960
  for sen,lan in zip(inp, out):
@@ -963,7 +1385,7 @@ class HumitTaggerModel(torch.nn.Module):
963
  tag_config["write_output_to"].write(lan)
964
  tag_config["write_output_to"].write("\n")
965
  else:
966
- json.dump([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
967
 
968
  return
969
 
 
32
  kwargs["this_model_config"]=json.load(js)
33
 
34
 
35
+ # Download this model's lemma rules pickle file:
36
  lemma_rules_path = hf_hub_download(repo_id=repo_name, filename=kwargs["config"].lemma_rules_py_file)
37
 
38
  # load lemma rules class
 
46
  base_config_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_file"])
47
  base_model_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_model_file"])
48
  base_model_config_json_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_json_file"])
49
+ fullformlist_file = hf_hub_download(repo_id=repo_name, filename=kwargs["this_model_config"]["fullformlist_file"])
50
 
51
  # Copy base model's configuration python file into our working directory
52
  config_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)) , os.path.basename(base_config_file))
 
82
 
83
  kwargs["model_weights_path"] = model_weights_path
84
  kwargs["repo_name"] = repo_name
85
+ kwargs["fullformlist_file"] = fullformlist_file
86
  return HumitTaggerModel(**kwargs)
87
 
88
  def __init__(self, **kwargs ):
89
  super(HumitTaggerModel, self).__init__()
90
  json_cfg = kwargs["base_model_json_cfg"]
91
+ self.config = kwargs["this_model_config"]
92
  self.LemmaHandling = sys.modules["lemma_rules"].LemmaHandling
93
  self.LemmaHandling.load_lemma_rules_from_obj(self.config["lemma_rules"])
94
  cfg=sys.modules["base_config"].NorbertConfig(**json_cfg)
 
119
  self.REPLACE_PATTERN = '|'.join(sorted(re.escape(k) for k in self.REPLACE_DICT))
120
  self.MAX_LENGTH = self.bert.config.max_position_embeddings
121
 
122
+ # Note the classes that represents gen and prop tags
123
+ self.gen_tag_classes = set()
124
+ self.prop_tag_classes = set()
125
+ self.t_2_tag_classes = set()
126
+
127
+ for i, lst in enumerate(self.config["tags"][0]):
128
+ if "gen" in lst:
129
+ self.gen_tag_classes.add(i)
130
+ if "prop" in lst:
131
+ self.prop_tag_classes.add(i)
132
+ if "2" in lst:
133
+ self.t_2_tag_classes.add(i)
134
+
135
+
136
+ # Load the fullform list
137
+ self.fullform_list=[{},{}]
138
+ try:
139
+ with open(kwargs["fullformlist_file"], 'r') as f:
140
+ self.fullform_list = json.load(f)
141
+ for k in range(2):
142
+ for i in self.fullform_list[k]:
143
+ for j in self.fullform_list[k][i][j]:
144
+ self.fullform_list[k][i][j]=set(self.fullform_list[k][i][j])
145
+ except:
146
+ pass
147
+
148
  def forward(self, input_ids=None, attention_mask=None ):
149
  outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True )
150
  sequence_output = self.dropout(outputs.last_hidden_state)
 
199
  }
200
  batched_sentences.append(to_append)
201
 
202
+ if torch.cuda.is_available():
203
+ torch.cuda.empty_cache()
204
 
205
  return batched_sentences
206
 
207
  def _split_sentences(self, inp):
208
 
209
+ # Remove double spaces
210
+ inp=" ".join(inp.split())
211
+
212
  # Here we get the whole text tokenized.
213
  encodings = self.tokenizer(inp,add_special_tokens=False, return_tensors="pt").to(self.device)
214
 
215
  # Save a copy of the tokenization
216
  original_encodings=copy.deepcopy(encodings)
217
  original_encodings=original_encodings.to("cpu")
218
+ if torch.cuda.is_available():
219
+ torch.cuda.empty_cache()
220
 
221
  # Pad to the complete size (model max_size -1 (-1 to add CLS))
222
  old_size=encodings["input_ids"][0].size()[0]
 
258
  # First get them back to CPU to open space on GPU
259
  input_ids_batched=[i.to("cpu") for i in input_ids_batched]
260
  attention_mask_batched=[i.to("cpu") for i in attention_mask_batched]
261
+ if torch.cuda.is_available():
262
+ torch.cuda.empty_cache()
263
 
264
  for input_ids, attention_masks in zip(input_ids_batched, attention_mask_batched):
265
  current_batch={"input_ids":input_ids.to(self.device).long(), "attention_mask":attention_masks.to(self.device).long()}
266
  outputs = self(**current_batch)
267
  del current_batch
268
+ if torch.cuda.is_available():
269
+ torch.cuda.empty_cache()
270
 
271
  label_data=outputs["logits1"].argmax(-1)
272
  labels_output.extend(label_data)
 
275
  labels_output=torch.stack(labels_output ,dim=0)
276
  labels_output=labels_output[:, range(1,self.MAX_LENGTH)]
277
  labels_output=torch.reshape(labels_output,(1,row_count * self.MAX_LENGTH_WITHOUT_CLS))
278
+ if torch.cuda.is_available():
279
+ torch.cuda.empty_cache()
280
 
281
  # Now the data is split into sentences
282
  # So, now create sentence data as list so that this could be used
 
301
  del old_size
302
  del inp
303
  del outputs
304
+
305
+ if torch.cuda.is_available():
306
+ torch.cuda.empty_cache()
307
 
308
  return sentence_list
309
 
 
317
  sentences.extend(self._split_sentences(i.strip()))
318
  return sentences
319
 
320
+ def _lemmatize(self, tag, LANG):
321
+
322
+ # Here, a "tag" is a list of words in one sentence, their tags and an ordering of lemma classes according the lemmatization model for each word.
323
+ # We go over all words, and apply our algorithm for lemmatization
324
+ # 1. If the "pron" tag is found in the tags
325
+ # then, we check if the "gen" tag also exists
326
+ # if there is the "gen" tag in tags and if there is "s" at the end of the word, we remove that s
327
+ # and return the rest of the word as lemma
328
+ # 2. OR, we continue with "høflig" processing
329
+ # if the word is "De" and if it has the tag "høflig" then we set the lemma as "De", otherwise "de"
330
+ # 3. OR, we continue with checking the word and its word class (subst, verb, adj, etc.) towards the fullform lists.
331
+ # if the word and its word class exists in the fullformlist (of the language bokmål or nynorsk according the the language parameter)
332
+ # then we set the lemma from the fullform list.
333
+ # if there are multiple lemmas in the fullform list, then we check each lemma suggested by the model
334
+ # we pick the lemma amon the lemmas suggested by the fullformlist that comes the first among the lemmas suggested by model
335
+ # 4. OR, we set the first lemma suggested by the model
336
+ # 5. OR, just in case, one way or another if we cannot set a lemma, we set the word as the lemma
337
+
338
+ # Go over all words in the sentence
339
+ for i in range(len(tag)):
340
+
341
+ # If there is prop in tags
342
+ if tag[i]["t"] in self.prop_tag_classes:
343
+
344
+ # set the lemma as the word
345
+ tag[i]["l"]=tag[i]["w"]
346
+
347
+ # if there is gen in tags then remove the last Ss
348
+ if tag[i]["t"] in self.gen_tag_classes:
349
+ if tag[i]["l"].endswith("'s") or tag[i]["l"].endswith("'S"):
350
+ tag[i]["l"]=tag[i]["l"][:-2]
351
+ elif tag[i]["l"].endswith("s") or tag[i]["l"].endswith("S") or tag[i]["l"].endswith("'"):
352
+ tag[i]["l"]=tag[i]["l"][:-1]
353
+ continue
354
+
355
+ # if høflig
356
+ if tag[i]["w"]=="De":
357
+ if tag[i]["t"] in self.t_2_tag_classes:
358
+ tag[i]["l"]="De"
359
+ continue
360
+ else:
361
+ tag[i]["l"]="de"
362
+ continue
363
+
364
+ # for the rest of the cases of the word, lowercase the word and check against the fullform list
365
+ word=tag[i]["w"].lower()
366
+ word_class = self.tags[0][tag[i]["t"]][0]
367
+
368
+ # get the lemma from the fullform list
369
+ fullform_list_lemma = self.fullform_list[LANG].get(word, {}).get(word_class)
370
+
371
+ # if there is not a lemma in the fullformlist
372
+ # use the first lemma from the model
373
+ if fullform_list_lemma==None:
374
+ tag[i]["l"] = self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(tag[i]["w"], tag[i]["l"][0] )
375
+
376
+ # if there is only one fullformlist-lemma:
377
+ elif len(fullform_list_lemma) == 1:
378
+ tag[i]["l"] = next(iter(fullform_list_lemma))
379
+
380
+ # if there are multiple lemmas in the fullformlist
381
+ # here we disambugate among these lemmas using the alternatives from the model
382
+ elif len(fullform_list_lemma) > 1:
383
+ tag[i]["l"] = next((selected_lemma for x in tag[i]["l"] if (selected_lemma := self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(tag[i]["w"], x )) in fullform_list_lemma), self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(tag[i]["w"], tag[i]["l"][0] ) )
384
+
385
+ # This branch will probably not be called but kept just in case
386
+ # If none of the cases above, use the first lemma suggested by the model
387
+ else:
388
+ tag[i]["l"] = self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(tag[i]["w"], tag[i]["l"][0] )
389
+
390
+ # This if will probable not be true either but kept just in case
391
+ # If a lemma could not be assigned after all these operations
392
+ # then asign the word itself
393
+ # Check by if the lemma field is still a list or if the field-type is string the legth is 0
394
+ if type(tag[i]["l"]) == list or len(tag[i]["l"]) == 0:
395
+ tag[i]["l"] = tag[i]["w"]
396
+
397
+ return tag
398
+
399
  def tag_sentence_list(self, lst, **tag_config):
400
 
401
  # If the sentences are not tokenized, tokenize while batching:
 
413
  else:
414
  tokenized_batches = self._batchify(lst)
415
 
416
+ # If lemmatization will be applied
417
+ if tag_config["lemmatize"]:
418
+
419
+ # If language will be identified per sentence
420
+ if tag_config["lang_per_sentence"]:
421
+ id_to_lang = self.config["id_to_lang"]
422
+ # If the output will be to a python list
423
+ if tag_config["write_output_to"]==None:
424
+ all_tagged_sentences = []
425
+ for batch in tokenized_batches:
426
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
427
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
428
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
429
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
430
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
431
+ batch["input_ids"].to("cpu")
432
+ batch["attention_mask"].to("cpu")
433
+
434
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
435
+ batch_lemma_indices.indices.tolist(), batch_langs[:, 0].tolist()):
436
+ this_sentence=[]
437
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
438
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
439
+ break
440
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
441
+ if len(this_sentence)>0:
442
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
443
+ else:
444
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
445
+ else:
446
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
447
+ this_sentence = self._lemmatize(this_sentence, lang)
448
+ all_tagged_sentences.append({"lang":id_to_lang[lang], "sent": [ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":i["l"]} for i in this_sentence]})
449
+
450
+ return all_tagged_sentences
451
+
452
+ # If the output is in TSV format to a pipe (stdout or a file handle)
453
+ elif tag_config["output_tsv"]:
454
+ for batch in tokenized_batches:
455
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
456
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
457
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
458
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
459
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
460
+ batch["input_ids"].to("cpu")
461
+ batch["attention_mask"].to("cpu")
462
+
463
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
464
+ batch_lemma_indices.indices.tolist(), batch_langs[:, 0].tolist()):
465
+ this_sentence=[]
466
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
467
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
468
+ break
469
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
470
+ if len(this_sentence)>0:
471
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
472
+ else:
473
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
474
+ else:
475
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
476
+ this_sentence = self._lemmatize(this_sentence, lang)
477
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[lang][i["t"]], "l":i["l"]} for i in this_sentence]
478
+ tag_config["write_output_to"].write(id_to_lang[lang])
479
+ for lin in this_sentence:
480
+ tag_config["write_output_to"].write("\t")
481
+ tag_config["write_output_to"].write(lin["w"])
482
+ tag_config["write_output_to"].write("\t")
483
+ tag_config["write_output_to"].write(lin["l"])
484
+ tag_config["write_output_to"].write("\t")
485
+ tag_config["write_output_to"].write(lin["t"])
486
+ tag_config["write_output_to"].write("\n")
487
+ tag_config["write_output_to"].write("\n")
488
+
489
+ # If output format will be json to a pipe (stdout or a file handle)
490
+ else:
491
+ for batch in tokenized_batches:
492
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
493
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
494
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
495
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
496
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
497
+ batch["input_ids"].to("cpu")
498
+ batch["attention_mask"].to("cpu")
499
+
500
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
501
+ batch_lemma_indices.indices.tolist(), batch_langs[:, 0].tolist()):
502
+ this_sentence=[]
503
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
504
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
505
+ break
506
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
507
+ if len(this_sentence)>0:
508
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
509
+ else:
510
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
511
+ else:
512
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
513
+ this_sentence = self._lemmatize(this_sentence, lang)
514
+ json.dump({"lang":id_to_lang[lang], "sent":[ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":i["l"]} for i in this_sentence]}, tag_config["write_output_to"])
515
+ tag_config["write_output_to"].write("\n")
516
+
517
+ # If the language is set as parameter
518
+ elif tag_config["lang"] != -1:
519
+ LANG = tag_config["lang"]
520
+ LANG_STR = self.config["id_to_lang"][LANG]
521
+ # If the output will be to a python list
522
+ if tag_config["write_output_to"]==None:
523
+ all_tagged_sentences = []
524
+ for batch in tokenized_batches:
525
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
526
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
527
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
528
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
529
+ batch["input_ids"].to("cpu")
530
+ batch["attention_mask"].to("cpu")
531
+ for input_ids, tags, lemma_indices in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
532
+ batch_lemma_indices.indices.tolist()): #batch_lemmas.tolist(),
533
+ this_sentence=[]
534
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemma_indices[1:]):
535
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
536
+ break
537
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
538
+ if len(this_sentence)>0:
539
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
540
+ else:
541
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
542
+ else:
543
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
544
+
545
+ this_sentence = self._lemmatize(this_sentence, LANG)
546
+ all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence]})
547
+
548
+ return all_tagged_sentences
549
+
550
+ # If the output is in TSV format to a pipe (stdout or a file handle)
551
+ elif tag_config["output_tsv"]:
552
+ for batch in tokenized_batches:
553
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
554
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
555
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
556
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
557
+ batch["input_ids"].to("cpu")
558
+ batch["attention_mask"].to("cpu")
559
+
560
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
561
+ batch_lemma_indices.indices.tolist()):
562
+ this_sentence=[]
563
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
564
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
565
+ break
566
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
567
+ if len(this_sentence)>0:
568
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
569
+ else:
570
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
571
+ else:
572
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
573
+
574
+ this_sentence = self._lemmatize(this_sentence, LANG)
575
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":i["l"]} for i in this_sentence]
576
+ tag_config["write_output_to"].write(LANG_STR)
577
+ for lin in this_sentence:
578
+ tag_config["write_output_to"].write("\t")
579
+ tag_config["write_output_to"].write(lin["w"])
580
+ tag_config["write_output_to"].write("\t")
581
+ tag_config["write_output_to"].write(lin["l"])
582
+ tag_config["write_output_to"].write("\t")
583
+ tag_config["write_output_to"].write(lin["t"])
584
+ tag_config["write_output_to"].write("\n")
585
+ tag_config["write_output_to"].write("\n")
586
+
587
+ # If output format will be json to a pipe (stdout or a file handle)
588
+ else:
589
+ for batch in tokenized_batches:
590
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
591
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
592
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
593
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
594
+ batch["input_ids"].to("cpu")
595
+ batch["attention_mask"].to("cpu")
596
+
597
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
598
+ batch_lemma_indices.indices.tolist()):
599
+ this_sentence=[]
600
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
601
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
602
+ break
603
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
604
+ if len(this_sentence)>0:
605
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
606
+ else:
607
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
608
+ else:
609
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
610
+
611
+ this_sentence = self._lemmatize(this_sentence, LANG)
612
+ json.dump({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence]}, tag_config["write_output_to"])
613
+ tag_config["write_output_to"].write("\n")
614
+
615
+ # If language will be identified according to the majority of all sentences:
616
+ else:
617
+ all_tags=[]
618
+ all_lemmas=[]
619
+ all_langs=[]
620
+ all_input_ids=[]
621
+ # Go over all batches and each sentence in each batch
622
  for batch in tokenized_batches:
623
  all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
624
  batch_tags = torch.argmax(all_out["logits2"], dim=-1)
625
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
626
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
627
  batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
628
+ all_input_ids.extend(batch["input_ids"].tolist())
629
  batch["input_ids"].to("cpu")
630
  batch["attention_mask"].to("cpu")
631
+ all_langs.extend(batch_langs[:, 0].tolist())
632
+ all_tags.extend(batch_tags.tolist())
633
+ all_lemmas.extend(batch_lemma_indices.indices.tolist())
634
 
635
+ # Identify the language
636
+ tag_config["lang"] = 1 if sum(all_langs)/len(all_langs)>=0.5 else 0
637
+ LANG = tag_config["lang"]
638
+ LANG_STR = self.config["id_to_lang"][LANG]
639
+
640
+ # If the output will be returned as python list:
641
+ if tag_config["write_output_to"]==None:
642
+ all_tagged_sentences = []
643
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
644
  this_sentence=[]
645
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
646
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
647
  break
648
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
649
  if len(this_sentence)>0:
650
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
651
  else:
652
  this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
653
  else:
654
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
 
655
 
656
+ this_sentence = self._lemmatize(this_sentence, LANG)
657
+ all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence] })
658
+ return all_tagged_sentences
 
 
 
 
 
 
 
 
659
 
660
+ # If the output is in TSV format
661
+ elif tag_config["output_tsv"]:
662
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
663
  this_sentence=[]
664
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
665
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
666
  break
667
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
668
  if len(this_sentence)>0:
669
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
670
  else:
671
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
672
  else:
673
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
674
+
675
+ this_sentence = self._lemmatize(this_sentence, LANG)
676
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":i["l"]} for i in this_sentence]
677
+ tag_config["write_output_to"].write(LANG_STR)
678
  for lin in this_sentence:
679
  tag_config["write_output_to"].write("\t")
680
  tag_config["write_output_to"].write(lin["w"])
 
685
  tag_config["write_output_to"].write("\n")
686
  tag_config["write_output_to"].write("\n")
687
 
688
+ # If output format will be json
689
+ else:
690
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
 
 
 
 
 
 
 
 
 
691
  this_sentence=[]
692
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
693
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
694
  break
695
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
696
  if len(this_sentence)>0:
697
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
698
  else:
699
  this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
700
  else:
701
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
702
+
703
+ this_sentence = self._lemmatize(this_sentence, LANG)
704
+ json.dump({"lang":LANG_STR, "sent":[ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence]}, tag_config["write_output_to"])
705
  tag_config["write_output_to"].write("\n")
706
 
707
+ # If lemmatization will not be applied:
708
+ else:
709
+ # If language will be identified per sentence
710
+ if tag_config["lang_per_sentence"]:
711
+ id_to_lang = self.config["id_to_lang"]
712
+ # If the output will be to a python list
713
+ if tag_config["write_output_to"]==None:
714
+ all_tagged_sentences = []
715
+ for batch in tokenized_batches:
716
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
717
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
718
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
719
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
720
+ batch["input_ids"].to("cpu")
721
+ batch["attention_mask"].to("cpu")
722
+
723
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
724
+ batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
725
+ this_sentence=[]
726
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
727
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
728
+ break
729
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
730
+ if len(this_sentence)>0:
731
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
732
+ else:
733
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
734
+ else:
735
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
736
+ all_tagged_sentences.append({"lang":id_to_lang[lang], "sent": [ {"w":i["w"], "t":self.tags[lang][i["t"]]} for i in this_sentence]})
737
+
738
+ return all_tagged_sentences
739
+
740
+ # If the output is in TSV format to a pipe (stdout or a file handle)
741
+ elif tag_config["output_tsv"]:
742
+ for batch in tokenized_batches:
743
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
744
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
745
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
746
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
747
+ batch["input_ids"].to("cpu")
748
+ batch["attention_mask"].to("cpu")
749
+
750
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
751
+ batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
752
+ this_sentence=[]
753
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
754
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
755
+ break
756
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
757
+ if len(this_sentence)>0:
758
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
759
+ else:
760
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
761
+ else:
762
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
763
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[lang][i["t"]] } for i in this_sentence]
764
+ tag_config["write_output_to"].write(id_to_lang[lang])
765
+ for lin in this_sentence:
766
+ tag_config["write_output_to"].write("\t")
767
+ tag_config["write_output_to"].write(lin["w"])
768
+ tag_config["write_output_to"].write("\t")
769
+ tag_config["write_output_to"].write(lin["t"])
770
+ tag_config["write_output_to"].write("\n")
771
+ tag_config["write_output_to"].write("\n")
772
+
773
+ # If output format will be json to a pipe (stdout or a file handle)
774
+ else:
775
+ for batch in tokenized_batches:
776
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
777
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
778
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
779
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
780
+ batch["input_ids"].to("cpu")
781
+ batch["attention_mask"].to("cpu")
782
+
783
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
784
+ batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
785
+ this_sentence=[]
786
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
787
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
788
+ break
789
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
790
+ if len(this_sentence)>0:
791
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
792
+ else:
793
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
794
+ else:
795
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
796
+
797
+ json.dump({"lang":id_to_lang[lang], "sent":[ {"w":i["w"], "t":self.tags[lang][i["t"]]} for i in this_sentence]}, tag_config["write_output_to"])
798
+ tag_config["write_output_to"].write("\n")
799
+
800
+ # If the language is set as parameter
801
+ elif tag_config["lang"] != -1:
802
+ LANG = tag_config["lang"]
803
+ LANG_STR = self.config["id_to_lang"][LANG]
804
+ # If the output will be to a python list
805
+ if tag_config["write_output_to"]==None:
806
+ all_tagged_sentences = []
807
+ for batch in tokenized_batches:
808
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
809
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
810
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
811
+ batch["input_ids"].to("cpu")
812
+ batch["attention_mask"].to("cpu")
813
+
814
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
815
+ batch_lemmas.tolist()):
816
+ this_sentence=[]
817
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
818
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
819
+ break
820
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
821
+ if len(this_sentence)>0:
822
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
823
+ else:
824
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
825
+ else:
826
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
827
+ all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence]})
828
+
829
+ return all_tagged_sentences
830
+
831
+ # If the output is in TSV format to a pipe (stdout or a file handle)
832
+ elif tag_config["output_tsv"]:
833
+ for batch in tokenized_batches:
834
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
835
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
836
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
837
+ batch["input_ids"].to("cpu")
838
+ batch["attention_mask"].to("cpu")
839
+
840
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
841
+ batch_lemmas.tolist()):
842
+ this_sentence=[]
843
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
844
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
845
+ break
846
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
847
+ if len(this_sentence)>0:
848
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
849
+ else:
850
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
851
+ else:
852
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
853
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]]} for i in this_sentence]
854
+ tag_config["write_output_to"].write(LANG_STR)
855
+ for lin in this_sentence:
856
+ tag_config["write_output_to"].write("\t")
857
+ tag_config["write_output_to"].write(lin["w"])
858
+ tag_config["write_output_to"].write("\t")
859
+ tag_config["write_output_to"].write(lin["t"])
860
+ tag_config["write_output_to"].write("\n")
861
+ tag_config["write_output_to"].write("\n")
862
+
863
+ # If output format will be json to a pipe (stdout or a file handle)
864
+ else:
865
+ for batch in tokenized_batches:
866
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
867
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
868
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
869
+ batch["input_ids"].to("cpu")
870
+ batch["attention_mask"].to("cpu")
871
+
872
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
873
+ batch_lemmas.tolist()):
874
+ this_sentence=[]
875
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
876
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
877
+ break
878
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
879
+ if len(this_sentence)>0:
880
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
881
+ else:
882
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
883
+ else:
884
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
885
+
886
+ json.dump({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence]}, tag_config["write_output_to"])
887
+ tag_config["write_output_to"].write("\n")
888
+
889
+ # If language will be identified according to the majority of all sentences:
890
+ else:
891
+ all_tags=[]
892
+ all_lemmas=[]
893
+ all_langs=[]
894
+ all_input_ids=[]
895
+ # Go over all batches and each sentence in each batch
896
  for batch in tokenized_batches:
897
  all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
898
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
899
  batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
900
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
901
+ all_input_ids.extend(batch["input_ids"].tolist())
902
  batch["input_ids"].to("cpu")
903
  batch["attention_mask"].to("cpu")
904
+ all_langs.extend(batch_langs[:, 0].tolist())
905
+ all_tags.extend(batch_tags.tolist())
906
+ all_lemmas.extend(batch_lemmas.tolist())
907
+
908
+ # Identify the language
909
+ tag_config["lang"] = 1 if sum(all_langs)/len(all_langs)>=0.5 else 0
910
+ LANG = tag_config["lang"]
911
+ LANG_STR = self.config["id_to_lang"][LANG]
912
+
913
+ # If the output will be returned as python list:
914
+ if tag_config["write_output_to"]==None:
915
+ all_tagged_sentences = []
916
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
917
  this_sentence=[]
918
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
919
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
 
922
  if len(this_sentence)>0:
923
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
924
  else:
925
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
926
  else:
927
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
928
+ all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence] })
929
+ return all_tagged_sentences
 
 
 
 
 
 
 
 
 
 
930
 
931
+ # If the output is in TSV format
932
+ elif tag_config["output_tsv"]:
933
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
934
  this_sentence=[]
935
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
936
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
 
939
  if len(this_sentence)>0:
940
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
941
  else:
942
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
943
  else:
944
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
945
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]]} for i in this_sentence]
946
  tag_config["write_output_to"].write(LANG_STR)
947
  for lin in this_sentence:
948
  tag_config["write_output_to"].write("\t")
949
  tag_config["write_output_to"].write(lin["w"])
950
  tag_config["write_output_to"].write("\t")
 
 
951
  tag_config["write_output_to"].write(lin["t"])
952
  tag_config["write_output_to"].write("\n")
953
  tag_config["write_output_to"].write("\n")
954
 
955
+ # If output format will be json
956
+ else:
957
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
 
 
 
 
 
 
 
 
958
  this_sentence=[]
959
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
960
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
 
963
  if len(this_sentence)>0:
964
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
965
  else:
966
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
967
  else:
968
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
969
+
970
+ json.dump({"lang":LANG_STR, "sent":[ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence]}, tag_config["write_output_to"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
971
  tag_config["write_output_to"].write("\n")
 
972
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
973
 
974
  def _check_if_text_file_and_return_content(self, filepath):
975
  try:
 
980
 
981
  @torch.no_grad()
982
  def tag(self, inp=None, **tag_config):
983
+
984
  self.eval()
985
+
986
+ if "lemmatise" in tag_config and tag_config["lemmatise"]==False:
987
+ tag_config["lemmatize"] = False
988
+ if "lemmatise" in tag_config:
989
+ del tag_config["lemmatise"]
990
+ else:
991
+ tag_config["lemmatize"] = True
992
+ if "lemmatise" in tag_config:
993
+ del tag_config["lemmatise"]
994
+
995
+ if "lemmatize" in tag_config and tag_config["lemmatize"]==False:
996
+ tag_config["lemmatize"] = False
997
+
998
  if "one_sentence_per_line" not in tag_config:
999
  tag_config["one_sentence_per_line"]=False
1000
 
 
1039
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
1040
  if tag_config["one_sentence_per_line"]:
1041
  inp = [i for i in file_content.split("\n") if i!=""]
1042
+ inp = [" ".join(i.split()) for i in inp if i!=""]
1043
  with open(out_path, "w") as opened_file:
1044
  tag_config["write_output_to"] = opened_file
1045
  self.tag_sentence_list(inp, **tag_config)
 
1050
  self.tag_sentence_list(inp, **tag_config)
1051
  else:
1052
  print (f"Could not properly open and read {input_path}.")
1053
+ if write_to is not sys.stdout and write_to is not sys.stderr:
1054
+ write_to.close()
1055
  return
1056
 
1057
  else:
 
1069
  # Tag one sentence per line in a string
1070
  if tag_config["one_sentence_per_line"]:
1071
  inp = [i for i in inp.split("\n") if i!=""]
1072
+ inp = [" ".join(self._preprocess_text(i).split()) for i in inp if i!=""]
1073
  return self.tag_sentence_list(inp, **tag_config)
1074
 
1075
  # identify sentences
 
1079
  # Tag one sentence per list item
1080
  elif type(inp) == list:
1081
  inp=[i.strip() for i in inp]
1082
+ inp=[" ".join(self._preprocess_text(i).split()) for i in inp if i!=""]
1083
  return self.tag_sentence_list(inp, **tag_config)
1084
 
1085
  def identify_language_sentence_list(self, lst, **tag_config):
 
1122
 
1123
  @torch.no_grad()
1124
  def identify_language(self, inp=None, **tag_config):
1125
+
1126
  self.eval()
1127
+
1128
  if "one_sentence_per_line" not in tag_config:
1129
  tag_config["one_sentence_per_line"]=False
1130
+
1131
  if "lang" in tag_config:
1132
  del tag_config["lang"]
1133
 
 
1137
  if "lang_per_sentence" not in tag_config:
1138
  tag_config["lang_per_sentence"] = False
1139
 
1140
+ elif type(tag_config["lang_per_sentence"])==bool and tag_config["lang_per_sentence"]:
1141
  tag_config["lang_per_sentence"] = True
1142
 
1143
  if "input_directory" in tag_config and "output_directory" in tag_config and "write_output_to" in tag_config and tag_config["write_output_to"]!=None:
 
1193
  torch.cuda.empty_cache()
1194
 
1195
  if tag_config["write_output_to"]==None:
1196
+ general_output.extend([{"f":i[0], "lang":self.config["id_to_lang"][i[1]]} for i in zip(file_names, langs)])
1197
  elif tag_config["output_tsv"]:
1198
  for fil,lan in zip(file_names, langs):
1199
  tag_config["write_output_to"].write(fil)
 
1202
  tag_config["write_output_to"].write("\n")
1203
  else:
1204
  for fil,lan in zip(file_names, langs):
1205
+ json.dump({"f":fil, "lang":self.config["id_to_lang"][lan]})
1206
  file_names=[]
1207
  contents=[]
1208
  else:
 
1223
  torch.cuda.empty_cache()
1224
 
1225
  if tag_config["write_output_to"]==None:
1226
+ general_output.extend([{"f":i[0], "lang":self.config["id_to_lang"][i[1]]} for i in zip(file_names, langs)])
1227
  elif tag_config["output_tsv"]:
1228
  for fil,lan in zip(file_names, langs):
1229
  tag_config["write_output_to"].write(fil)
 
1232
  tag_config["write_output_to"].write("\n")
1233
  else:
1234
  for fil,lan in zip(file_names, langs):
1235
+ json.dump({"f":fil, "lang":self.config["id_to_lang"][lan]})
1236
 
1237
  return general_output if len(general_output)>0 else None
1238
 
 
1274
  opened_file.write(lan)
1275
  opened_file.write("\n")
1276
  else:
1277
+ json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , opened_file)
1278
  else:
1279
  if tag_config["output_tsv"]:
1280
  opened_file.write(out[0])
1281
  else:
1282
+ json.dump({"lang":out[0]} , opened_file)
1283
  else:
1284
  if tag_config["lang_per_sentence"]:
1285
+ general_output.extend([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ])
1286
  else:
1287
+ general_output.append({"f":input_path, "lang":out[0]})
1288
 
1289
  # If there is an opened pipe already
1290
  else:
 
1297
  tag_config["write_output_to"].write("\n")
1298
  tag_config["write_output_to"].write("\n")
1299
  else:
1300
+ json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
1301
  tag_config["write_output_to"].write("\n")
1302
  else:
1303
  if tag_config["output_tsv"]:
 
1306
  tag_config["write_output_to"].write(out[0])
1307
  tag_config["write_output_to"].write("\n")
1308
  else:
1309
+ json.dump({"f":input_path, "lang":out[0]} , tag_config["write_output_to"])
1310
  tag_config["write_output_to"].write("\n")
1311
 
1312
  else:
 
1316
  tag_config["write_output_to"].write("err")
1317
  tag_config["write_output_to"].write("\n")
1318
  else:
1319
+ json.dump({"f":input_path, "lang":"err"} , tag_config["write_output_to"])
1320
  tag_config["write_output_to"].write("\n")
1321
 
1322
+ if tag_config["write_output_to"] and tag_config["write_output_to"] is not sys.stdout and tag_config["write_output_to"] is not sys.stderr:
1323
  tag_config["write_output_to"].close()
1324
 
1325
  return general_output if len(general_output)>0 else None
 
1355
 
1356
  # If return as list
1357
  if tag_config["write_output_to"]==None:
1358
+ return [{"s":i[0], "lang": i[1]} for i in zip(inp, out)]
1359
 
1360
  if tag_config["output_tsv"]:
1361
  for sen,lan in zip(inp, out):
 
1364
  tag_config["write_output_to"].write(out)
1365
  tag_config["write_output_to"].write("\n")
1366
  else:
1367
+ json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
1368
 
1369
  return
1370
 
 
1376
 
1377
  # If return as list
1378
  if tag_config["write_output_to"]==None:
1379
+ return [{"s":i[0], "lang": i[1]} for i in zip(inp, out)]
1380
 
1381
  if tag_config["output_tsv"]:
1382
  for sen,lan in zip(inp, out):
 
1385
  tag_config["write_output_to"].write(lan)
1386
  tag_config["write_output_to"].write("\n")
1387
  else:
1388
+ json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
1389
 
1390
  return
1391
 
tagger_config.json CHANGED
The diff for this file is too large to render. See raw diff