Ahmet Yildirim committed on
Commit
10375c1
·
1 Parent(s): 6db1114

- Update lemmatisering

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.json_large filter=lfs diff=lfs merge=lfs -text
fullform_list.json_large ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a32e9d7c36ed2ba9ec7f080e118760c444277fb6f213172246d24711b0493433
3
+ size 240703613
modeling_humit_tagger.py CHANGED
@@ -32,7 +32,7 @@ class HumitTaggerModel(torch.nn.Module):
32
  kwargs["this_model_config"]=json.load(js)
33
 
34
 
35
- # Download this model's config:
36
  lemma_rules_path = hf_hub_download(repo_id=repo_name, filename=kwargs["config"].lemma_rules_py_file)
37
 
38
  # load lemma rules class
@@ -46,6 +46,7 @@ class HumitTaggerModel(torch.nn.Module):
46
  base_config_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_file"])
47
  base_model_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_model_file"])
48
  base_model_config_json_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_json_file"])
 
49
 
50
  # Copy base model's configuration python file into our working directory
51
  config_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)) , os.path.basename(base_config_file))
@@ -81,12 +82,13 @@ class HumitTaggerModel(torch.nn.Module):
81
 
82
  kwargs["model_weights_path"] = model_weights_path
83
  kwargs["repo_name"] = repo_name
 
84
  return HumitTaggerModel(**kwargs)
85
 
86
  def __init__(self, **kwargs ):
87
  super(HumitTaggerModel, self).__init__()
88
  json_cfg = kwargs["base_model_json_cfg"]
89
- self.config=kwargs["this_model_config"]
90
  self.LemmaHandling = sys.modules["lemma_rules"].LemmaHandling
91
  self.LemmaHandling.load_lemma_rules_from_obj(self.config["lemma_rules"])
92
  cfg=sys.modules["base_config"].NorbertConfig(**json_cfg)
@@ -117,6 +119,32 @@ class HumitTaggerModel(torch.nn.Module):
117
  self.REPLACE_PATTERN = '|'.join(sorted(re.escape(k) for k in self.REPLACE_DICT))
118
  self.MAX_LENGTH = self.bert.config.max_position_embeddings
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  def forward(self, input_ids=None, attention_mask=None ):
121
  outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True )
122
  sequence_output = self.dropout(outputs.last_hidden_state)
@@ -171,19 +199,24 @@ class HumitTaggerModel(torch.nn.Module):
171
  }
172
  batched_sentences.append(to_append)
173
 
174
- torch.cuda.empty_cache()
 
175
 
176
  return batched_sentences
177
 
178
  def _split_sentences(self, inp):
179
 
 
 
 
180
  # Here we get the whole text tokenized.
181
  encodings = self.tokenizer(inp,add_special_tokens=False, return_tensors="pt").to(self.device)
182
 
183
  # Save a copy of the tokenization
184
  original_encodings=copy.deepcopy(encodings)
185
  original_encodings=original_encodings.to("cpu")
186
- torch.cuda.empty_cache()
 
187
 
188
  # Pad to the complete size (model max_size -1 (-1 to add CLS))
189
  old_size=encodings["input_ids"][0].size()[0]
@@ -225,13 +258,15 @@ class HumitTaggerModel(torch.nn.Module):
225
  # First get them back to CPU to open space on GPU
226
  input_ids_batched=[i.to("cpu") for i in input_ids_batched]
227
  attention_mask_batched=[i.to("cpu") for i in attention_mask_batched]
228
- torch.cuda.empty_cache()
 
229
 
230
  for input_ids, attention_masks in zip(input_ids_batched, attention_mask_batched):
231
  current_batch={"input_ids":input_ids.to(self.device).long(), "attention_mask":attention_masks.to(self.device).long()}
232
  outputs = self(**current_batch)
233
  del current_batch
234
- torch.cuda.empty_cache()
 
235
 
236
  label_data=outputs["logits1"].argmax(-1)
237
  labels_output.extend(label_data)
@@ -240,7 +275,8 @@ class HumitTaggerModel(torch.nn.Module):
240
  labels_output=torch.stack(labels_output ,dim=0)
241
  labels_output=labels_output[:, range(1,self.MAX_LENGTH)]
242
  labels_output=torch.reshape(labels_output,(1,row_count * self.MAX_LENGTH_WITHOUT_CLS))
243
- torch.cuda.empty_cache()
 
244
 
245
  # Now the data is split into sentences
246
  # So, now create sentence data as list so that this could be used
@@ -265,7 +301,9 @@ class HumitTaggerModel(torch.nn.Module):
265
  del old_size
266
  del inp
267
  del outputs
268
- torch.cuda.empty_cache()
 
 
269
 
270
  return sentence_list
271
 
@@ -279,6 +317,85 @@ class HumitTaggerModel(torch.nn.Module):
279
  sentences.extend(self._split_sentences(i.strip()))
280
  return sentences
281
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  def tag_sentence_list(self, lst, **tag_config):
283
 
284
  # If the sentences are not tokenized, tokenize while batching:
@@ -296,62 +413,268 @@ class HumitTaggerModel(torch.nn.Module):
296
  else:
297
  tokenized_batches = self._batchify(lst)
298
 
299
- # If language will be identified per sentence
300
- if tag_config["lang_per_sentence"]:
301
- id_to_lang = self.config["id_to_lang"]
302
- # If the output will be to a python list
303
- if tag_config["write_output_to"]==None:
304
- all_tagged_sentences = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  for batch in tokenized_batches:
306
  all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
307
  batch_tags = torch.argmax(all_out["logits2"], dim=-1)
308
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
 
309
  batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
 
310
  batch["input_ids"].to("cpu")
311
  batch["attention_mask"].to("cpu")
 
 
 
312
 
313
- for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
314
- batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
 
 
 
 
 
 
 
315
  this_sentence=[]
316
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
317
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
318
  break
319
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
320
  if len(this_sentence)>0:
321
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
322
  else:
323
  this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
324
  else:
325
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
326
- all_tagged_sentences.append({"lang":id_to_lang[lang], "sent": [ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]})
327
 
328
- return all_tagged_sentences
329
-
330
- # If the output is in TSV format to a pipe (stdout or a file handle)
331
- elif tag_config["output_tsv"]:
332
- for batch in tokenized_batches:
333
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
334
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
335
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
336
- batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
337
- batch["input_ids"].to("cpu")
338
- batch["attention_mask"].to("cpu")
339
 
340
- for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
341
- batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
 
342
  this_sentence=[]
343
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
344
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
345
  break
346
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
347
  if len(this_sentence)>0:
348
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
349
  else:
350
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
351
  else:
352
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
353
- this_sentence=[ {"w":i["w"], "t":self.tags_str[lang][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]
354
- tag_config["write_output_to"].write(id_to_lang[lang])
 
 
355
  for lin in this_sentence:
356
  tag_config["write_output_to"].write("\t")
357
  tag_config["write_output_to"].write(lin["w"])
@@ -362,49 +685,235 @@ class HumitTaggerModel(torch.nn.Module):
362
  tag_config["write_output_to"].write("\n")
363
  tag_config["write_output_to"].write("\n")
364
 
365
- # If output format will be json to a pipe (stdout or a file handle)
366
- else:
367
- for batch in tokenized_batches:
368
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
369
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
370
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
371
- batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
372
- batch["input_ids"].to("cpu")
373
- batch["attention_mask"].to("cpu")
374
-
375
- for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
376
- batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
377
  this_sentence=[]
378
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
379
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
380
  break
381
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
382
  if len(this_sentence)>0:
383
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
384
  else:
385
  this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
386
  else:
387
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
388
-
389
- json.dump({"lang":id_to_lang[lang], "sent":[ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]}, tag_config["write_output_to"])
 
390
  tag_config["write_output_to"].write("\n")
391
 
392
- # If the language is set as parameter
393
- elif tag_config["lang"] != -1:
394
- LANG = tag_config["lang"]
395
- LANG_STR = self.config["id_to_lang"][LANG]
396
- # If the output will be to a python list
397
- if tag_config["write_output_to"]==None:
398
- all_tagged_sentences = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  for batch in tokenized_batches:
400
  all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
401
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
402
  batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
 
 
403
  batch["input_ids"].to("cpu")
404
  batch["attention_mask"].to("cpu")
405
-
406
- for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
407
- batch_lemmas.tolist()):
 
 
 
 
 
 
 
 
 
 
408
  this_sentence=[]
409
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
410
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
@@ -413,24 +922,15 @@ class HumitTaggerModel(torch.nn.Module):
413
  if len(this_sentence)>0:
414
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
415
  else:
416
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
417
  else:
418
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
419
- all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]})
420
-
421
- return all_tagged_sentences
422
-
423
- # If the output is in TSV format to a pipe (stdout or a file handle)
424
- elif tag_config["output_tsv"]:
425
- for batch in tokenized_batches:
426
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
427
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
428
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
429
- batch["input_ids"].to("cpu")
430
- batch["attention_mask"].to("cpu")
431
 
432
- for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
433
- batch_lemmas.tolist()):
 
434
  this_sentence=[]
435
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
436
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
@@ -439,32 +939,22 @@ class HumitTaggerModel(torch.nn.Module):
439
  if len(this_sentence)>0:
440
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
441
  else:
442
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
443
  else:
444
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
445
- this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]
446
  tag_config["write_output_to"].write(LANG_STR)
447
  for lin in this_sentence:
448
  tag_config["write_output_to"].write("\t")
449
  tag_config["write_output_to"].write(lin["w"])
450
  tag_config["write_output_to"].write("\t")
451
- tag_config["write_output_to"].write(lin["l"])
452
- tag_config["write_output_to"].write("\t")
453
  tag_config["write_output_to"].write(lin["t"])
454
  tag_config["write_output_to"].write("\n")
455
  tag_config["write_output_to"].write("\n")
456
 
457
- # If output format will be json to a pipe (stdout or a file handle)
458
- else:
459
- for batch in tokenized_batches:
460
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
461
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
462
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
463
- batch["input_ids"].to("cpu")
464
- batch["attention_mask"].to("cpu")
465
-
466
- for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
467
- batch_lemmas.tolist()):
468
  this_sentence=[]
469
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
470
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
@@ -473,98 +963,13 @@ class HumitTaggerModel(torch.nn.Module):
473
  if len(this_sentence)>0:
474
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
475
  else:
476
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
477
- else:
478
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
479
-
480
- json.dump({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]}, tag_config["write_output_to"])
481
- tag_config["write_output_to"].write("\n")
482
-
483
- # If language will be identified according to the majority of all sentences:
484
- else:
485
- all_tags=[]
486
- all_lemmas=[]
487
- all_langs=[]
488
- all_input_ids=[]
489
- # Go over all batches and each sentence in each batch
490
- for batch in tokenized_batches:
491
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
492
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
493
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
494
- batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
495
- all_input_ids.extend(batch["input_ids"].tolist())
496
- batch["input_ids"].to("cpu")
497
- batch["attention_mask"].to("cpu")
498
- all_langs.extend(batch_langs[:, 0].tolist())
499
- all_tags.extend(batch_tags.tolist())
500
- all_lemmas.extend(batch_lemmas.tolist())
501
-
502
- # Identify the language
503
- tag_config["lang"] = 1 if sum(all_langs)/len(all_langs)>=0.5 else 0
504
- LANG = tag_config["lang"]
505
- LANG_STR = self.config["id_to_lang"][LANG]
506
-
507
- # If the output will be returned as python list:
508
- if tag_config["write_output_to"]==None:
509
- all_tagged_sentences = []
510
- for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
511
- this_sentence=[]
512
- for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
513
- if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
514
- break
515
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
516
- if len(this_sentence)>0:
517
- this_sentence[-1]["w"] += self.tokenizer.decode(inps)
518
  else:
519
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
520
- else:
521
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
522
- all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence] })
523
- return all_tagged_sentences
524
-
525
- # If the output is in TSV format
526
- elif tag_config["output_tsv"]:
527
- for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
528
- this_sentence=[]
529
- for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
530
- if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
531
- break
532
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
533
- if len(this_sentence)>0:
534
- this_sentence[-1]["w"] += self.tokenizer.decode(inps)
535
- else:
536
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
537
- else:
538
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
539
- this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]
540
- tag_config["write_output_to"].write(LANG_STR)
541
- for lin in this_sentence:
542
- tag_config["write_output_to"].write("\t")
543
- tag_config["write_output_to"].write(lin["w"])
544
- tag_config["write_output_to"].write("\t")
545
- tag_config["write_output_to"].write(lin["l"])
546
- tag_config["write_output_to"].write("\t")
547
- tag_config["write_output_to"].write(lin["t"])
548
  tag_config["write_output_to"].write("\n")
549
- tag_config["write_output_to"].write("\n")
550
 
551
- # If output format will be json
552
- else:
553
- for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
554
- this_sentence=[]
555
- for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
556
- if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
557
- break
558
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
559
- if len(this_sentence)>0:
560
- this_sentence[-1]["w"] += self.tokenizer.decode(inps)
561
- else:
562
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
563
- else:
564
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
565
-
566
- json.dump({"lang":LANG_STR, "sent":[ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]}, tag_config["write_output_to"])
567
- tag_config["write_output_to"].write("\n")
568
 
569
  def _check_if_text_file_and_return_content(self, filepath):
570
  try:
@@ -575,7 +980,21 @@ class HumitTaggerModel(torch.nn.Module):
575
 
576
  @torch.no_grad()
577
  def tag(self, inp=None, **tag_config):
 
578
  self.eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
579
  if "one_sentence_per_line" not in tag_config:
580
  tag_config["one_sentence_per_line"]=False
581
 
@@ -620,7 +1039,7 @@ class HumitTaggerModel(torch.nn.Module):
620
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
621
  if tag_config["one_sentence_per_line"]:
622
  inp = [i for i in file_content.split("\n") if i!=""]
623
- inp = [i for i in inp if i!=""]
624
  with open(out_path, "w") as opened_file:
625
  tag_config["write_output_to"] = opened_file
626
  self.tag_sentence_list(inp, **tag_config)
@@ -631,8 +1050,8 @@ class HumitTaggerModel(torch.nn.Module):
631
  self.tag_sentence_list(inp, **tag_config)
632
  else:
633
  print (f"Could not properly open and read {input_path}.")
634
-
635
- write_to.close()
636
  return
637
 
638
  else:
@@ -650,7 +1069,7 @@ class HumitTaggerModel(torch.nn.Module):
650
  # Tag one sentence per line in a string
651
  if tag_config["one_sentence_per_line"]:
652
  inp = [i for i in inp.split("\n") if i!=""]
653
- inp = [self._preprocess_text(i) for i in inp if i!=""]
654
  return self.tag_sentence_list(inp, **tag_config)
655
 
656
  # identify sentences
@@ -660,7 +1079,7 @@ class HumitTaggerModel(torch.nn.Module):
660
  # Tag one sentence per list item
661
  elif type(inp) == list:
662
  inp=[i.strip() for i in inp]
663
- inp=[self._preprocess_text(i) for i in inp if i!=""]
664
  return self.tag_sentence_list(inp, **tag_config)
665
 
666
  def identify_language_sentence_list(self, lst, **tag_config):
@@ -703,9 +1122,12 @@ class HumitTaggerModel(torch.nn.Module):
703
 
704
  @torch.no_grad()
705
  def identify_language(self, inp=None, **tag_config):
 
706
  self.eval()
 
707
  if "one_sentence_per_line" not in tag_config:
708
  tag_config["one_sentence_per_line"]=False
 
709
  if "lang" in tag_config:
710
  del tag_config["lang"]
711
 
@@ -715,7 +1137,7 @@ class HumitTaggerModel(torch.nn.Module):
715
  if "lang_per_sentence" not in tag_config:
716
  tag_config["lang_per_sentence"] = False
717
 
718
- elif tag_config["lang_per_sentence"]:
719
  tag_config["lang_per_sentence"] = True
720
 
721
  if "input_directory" in tag_config and "output_directory" in tag_config and "write_output_to" in tag_config and tag_config["write_output_to"]!=None:
@@ -771,7 +1193,7 @@ class HumitTaggerModel(torch.nn.Module):
771
  torch.cuda.empty_cache()
772
 
773
  if tag_config["write_output_to"]==None:
774
- general_output.extend([{"f":i[0], "l":self.config["id_to_lang"][i[1]]} for i in zip(file_names, langs)])
775
  elif tag_config["output_tsv"]:
776
  for fil,lan in zip(file_names, langs):
777
  tag_config["write_output_to"].write(fil)
@@ -780,7 +1202,7 @@ class HumitTaggerModel(torch.nn.Module):
780
  tag_config["write_output_to"].write("\n")
781
  else:
782
  for fil,lan in zip(file_names, langs):
783
- json.dump({"f":fil, "l":self.config["id_to_lang"][lan]})
784
  file_names=[]
785
  contents=[]
786
  else:
@@ -801,7 +1223,7 @@ class HumitTaggerModel(torch.nn.Module):
801
  torch.cuda.empty_cache()
802
 
803
  if tag_config["write_output_to"]==None:
804
- general_output.extend([{"f":i[0], "l":self.config["id_to_lang"][i[1]]} for i in zip(file_names, langs)])
805
  elif tag_config["output_tsv"]:
806
  for fil,lan in zip(file_names, langs):
807
  tag_config["write_output_to"].write(fil)
@@ -810,7 +1232,7 @@ class HumitTaggerModel(torch.nn.Module):
810
  tag_config["write_output_to"].write("\n")
811
  else:
812
  for fil,lan in zip(file_names, langs):
813
- json.dump({"f":fil, "l":self.config["id_to_lang"][lan]})
814
 
815
  return general_output if len(general_output)>0 else None
816
 
@@ -852,17 +1274,17 @@ class HumitTaggerModel(torch.nn.Module):
852
  opened_file.write(lan)
853
  opened_file.write("\n")
854
  else:
855
- json.dump([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ] , opened_file)
856
  else:
857
  if tag_config["output_tsv"]:
858
  opened_file.write(out[0])
859
  else:
860
- json.dump({"l":out[0]} , opened_file)
861
  else:
862
  if tag_config["lang_per_sentence"]:
863
- general_output.extend([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ])
864
  else:
865
- general_output.append({"f":input_path, "l":out[0]})
866
 
867
  # If there is an opened pipe already
868
  else:
@@ -875,7 +1297,7 @@ class HumitTaggerModel(torch.nn.Module):
875
  tag_config["write_output_to"].write("\n")
876
  tag_config["write_output_to"].write("\n")
877
  else:
878
- json.dump([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
879
  tag_config["write_output_to"].write("\n")
880
  else:
881
  if tag_config["output_tsv"]:
@@ -884,7 +1306,7 @@ class HumitTaggerModel(torch.nn.Module):
884
  tag_config["write_output_to"].write(out[0])
885
  tag_config["write_output_to"].write("\n")
886
  else:
887
- json.dump({"f":input_path, "l":out[0]} , tag_config["write_output_to"])
888
  tag_config["write_output_to"].write("\n")
889
 
890
  else:
@@ -894,10 +1316,10 @@ class HumitTaggerModel(torch.nn.Module):
894
  tag_config["write_output_to"].write("err")
895
  tag_config["write_output_to"].write("\n")
896
  else:
897
- json.dump({"f":input_path, "l":"err"} , tag_config["write_output_to"])
898
  tag_config["write_output_to"].write("\n")
899
 
900
- if tag_config["write_output_to"] and tag_config["write_output_to"]!=sys.stdout and tag_config["write_output_to"]!=sys.stderr:
901
  tag_config["write_output_to"].close()
902
 
903
  return general_output if len(general_output)>0 else None
@@ -933,7 +1355,7 @@ class HumitTaggerModel(torch.nn.Module):
933
 
934
  # If return as list
935
  if tag_config["write_output_to"]==None:
936
- return [{"s":i[0], "l": i[1]} for i in zip(inp, out)]
937
 
938
  if tag_config["output_tsv"]:
939
  for sen,lan in zip(inp, out):
@@ -942,7 +1364,7 @@ class HumitTaggerModel(torch.nn.Module):
942
  tag_config["write_output_to"].write(out)
943
  tag_config["write_output_to"].write("\n")
944
  else:
945
- json.dump([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
946
 
947
  return
948
 
@@ -954,7 +1376,7 @@ class HumitTaggerModel(torch.nn.Module):
954
 
955
  # If return as list
956
  if tag_config["write_output_to"]==None:
957
- return [{"s":i[0], "l": i[1]} for i in zip(inp, out)]
958
 
959
  if tag_config["output_tsv"]:
960
  for sen,lan in zip(inp, out):
@@ -963,7 +1385,7 @@ class HumitTaggerModel(torch.nn.Module):
963
  tag_config["write_output_to"].write(lan)
964
  tag_config["write_output_to"].write("\n")
965
  else:
966
- json.dump([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
967
 
968
  return
969
 
 
32
  kwargs["this_model_config"]=json.load(js)
33
 
34
 
35
+ # Download this model's lemma rules pickle file:
36
  lemma_rules_path = hf_hub_download(repo_id=repo_name, filename=kwargs["config"].lemma_rules_py_file)
37
 
38
  # load lemma rules class
 
46
  base_config_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_file"])
47
  base_model_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_model_file"])
48
  base_model_config_json_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_json_file"])
49
+ fullformlist_file = hf_hub_download(repo_id=repo_name, filename=kwargs["this_model_config"]["fullformlist_file"])
50
 
51
  # Copy base model's configuration python file into our working directory
52
  config_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)) , os.path.basename(base_config_file))
 
82
 
83
  kwargs["model_weights_path"] = model_weights_path
84
  kwargs["repo_name"] = repo_name
85
+ kwargs["fullformlist_file"] = fullformlist_file
86
  return HumitTaggerModel(**kwargs)
87
 
88
  def __init__(self, **kwargs ):
89
  super(HumitTaggerModel, self).__init__()
90
  json_cfg = kwargs["base_model_json_cfg"]
91
+ self.config = kwargs["this_model_config"]
92
  self.LemmaHandling = sys.modules["lemma_rules"].LemmaHandling
93
  self.LemmaHandling.load_lemma_rules_from_obj(self.config["lemma_rules"])
94
  cfg=sys.modules["base_config"].NorbertConfig(**json_cfg)
 
119
  self.REPLACE_PATTERN = '|'.join(sorted(re.escape(k) for k in self.REPLACE_DICT))
120
  self.MAX_LENGTH = self.bert.config.max_position_embeddings
121
 
122
+ # Note the classes that represents gen and prop tags
123
+ self.gen_tag_classes = set()
124
+ self.prop_tag_classes = set()
125
+ self.t_2_tag_classes = set()
126
+
127
+ for i, lst in enumerate(self.config["tags"][0]):
128
+ if "gen" in lst:
129
+ self.gen_tag_classes.add(i)
130
+ if "prop" in lst:
131
+ self.prop_tag_classes.add(i)
132
+ if "2" in lst:
133
+ self.t_2_tag_classes.add(i)
134
+
135
+
136
+ # Load the fullform list
137
+ self.fullform_list=[{},{}]
138
+ try:
139
+ with open(kwargs["fullformlist_file"], 'r') as f:
140
+ self.fullform_list = json.load(f)
141
+ for k in range(2):
142
+ for i in self.fullform_list[k]:
143
+ for j in self.fullform_list[k][i][j]:
144
+ self.fullform_list[k][i][j]=set(self.fullform_list[k][i][j])
145
+ except:
146
+ pass
147
+
148
  def forward(self, input_ids=None, attention_mask=None ):
149
  outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True )
150
  sequence_output = self.dropout(outputs.last_hidden_state)
 
199
  }
200
  batched_sentences.append(to_append)
201
 
202
+ if torch.cuda.is_available():
203
+ torch.cuda.empty_cache()
204
 
205
  return batched_sentences
206
 
207
  def _split_sentences(self, inp):
208
 
209
+ # Remove double spaces
210
+ inp=" ".join(inp.split())
211
+
212
  # Here we get the whole text tokenized.
213
  encodings = self.tokenizer(inp,add_special_tokens=False, return_tensors="pt").to(self.device)
214
 
215
  # Save a copy of the tokenization
216
  original_encodings=copy.deepcopy(encodings)
217
  original_encodings=original_encodings.to("cpu")
218
+ if torch.cuda.is_available():
219
+ torch.cuda.empty_cache()
220
 
221
  # Pad to the complete size (model max_size -1 (-1 to add CLS))
222
  old_size=encodings["input_ids"][0].size()[0]
 
258
  # First get them back to CPU to open space on GPU
259
  input_ids_batched=[i.to("cpu") for i in input_ids_batched]
260
  attention_mask_batched=[i.to("cpu") for i in attention_mask_batched]
261
+ if torch.cuda.is_available():
262
+ torch.cuda.empty_cache()
263
 
264
  for input_ids, attention_masks in zip(input_ids_batched, attention_mask_batched):
265
  current_batch={"input_ids":input_ids.to(self.device).long(), "attention_mask":attention_masks.to(self.device).long()}
266
  outputs = self(**current_batch)
267
  del current_batch
268
+ if torch.cuda.is_available():
269
+ torch.cuda.empty_cache()
270
 
271
  label_data=outputs["logits1"].argmax(-1)
272
  labels_output.extend(label_data)
 
275
  labels_output=torch.stack(labels_output ,dim=0)
276
  labels_output=labels_output[:, range(1,self.MAX_LENGTH)]
277
  labels_output=torch.reshape(labels_output,(1,row_count * self.MAX_LENGTH_WITHOUT_CLS))
278
+ if torch.cuda.is_available():
279
+ torch.cuda.empty_cache()
280
 
281
  # Now the data is split into sentences
282
  # So, now create sentence data as list so that this could be used
 
301
  del old_size
302
  del inp
303
  del outputs
304
+
305
+ if torch.cuda.is_available():
306
+ torch.cuda.empty_cache()
307
 
308
  return sentence_list
309
 
 
317
  sentences.extend(self._split_sentences(i.strip()))
318
  return sentences
319
 
320
+ def _lemmatize(self, tag, LANG):
321
+
322
+ # Here, a "tag" is a list of words in one sentence, their tags and an ordering of lemma classes according the lemmatization model for each word.
323
+ # We go over all words, and apply our algorithm for lemmatization
324
+ # 1. If the "pron" tag is found in the tags
325
+ # then, we check if the "gen" tag also exists
326
+ # if there is the "gen" tag in tags and if there is "s" at the end of the word, we remove that s
327
+ # and return the rest of the word as lemma
328
+ # 2. OR, we continue with "høflig" processing
329
+ # if the word is "De" and if it has the tag "høflig" then we set the lemma as "De", otherwise "de"
330
+ # 3. OR, we continue with checking the word and its word class (subst, verb, adj, etc.) towards the fullform lists.
331
+ # if the word and its word class exists in the fullformlist (of the language bokmål or nynorsk according the the language parameter)
332
+ # then we set the lemma from the fullform list.
333
+ # if there are multiple lemmas in the fullform list, then we check each lemma suggested by the model
334
+ # we pick the lemma amon the lemmas suggested by the fullformlist that comes the first among the lemmas suggested by model
335
+ # 4. OR, we set the first lemma suggested by the model
336
+ # 5. OR, just in case, one way or another if we cannot set a lemma, we set the word as the lemma
337
+
338
+ # Go over all words in the sentence
339
+ for i in range(len(tag)):
340
+
341
+ # If there is prop in tags
342
+ if tag[i]["t"] in self.prop_tag_classes:
343
+
344
+ # set the lemma as the word
345
+ tag[i]["l"]=tag[i]["w"]
346
+
347
+ # if there is gen in tags then remove the last Ss
348
+ if tag[i]["t"] in self.gen_tag_classes:
349
+ if tag[i]["l"].endswith("'s") or tag[i]["l"].endswith("'S"):
350
+ tag[i]["l"]=tag[i]["l"][:-2]
351
+ elif tag[i]["l"].endswith("s") or tag[i]["l"].endswith("S") or tag[i]["l"].endswith("'"):
352
+ tag[i]["l"]=tag[i]["l"][:-1]
353
+ continue
354
+
355
+ # if høflig
356
+ if tag[i]["w"]=="De":
357
+ if tag[i]["t"] in self.t_2_tag_classes:
358
+ tag[i]["l"]="De"
359
+ continue
360
+ else:
361
+ tag[i]["l"]="de"
362
+ continue
363
+
364
+ # for the rest of the cases of the word, lowercase the word and check against the fullform list
365
+ word=tag[i]["w"].lower()
366
+ word_class = self.tags[0][tag[i]["t"]][0]
367
+
368
+ # get the lemma from the fullform list
369
+ fullform_list_lemma = self.fullform_list[LANG].get(word, {}).get(word_class)
370
+
371
+ # if there is not a lemma in the fullformlist
372
+ # use the first lemma from the model
373
+ if fullform_list_lemma==None:
374
+ tag[i]["l"] = self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(tag[i]["w"], tag[i]["l"][0] )
375
+
376
+ # if there is only one fullformlist-lemma:
377
+ elif len(fullform_list_lemma) == 1:
378
+ tag[i]["l"] = next(iter(fullform_list_lemma))
379
+
380
+ # if there are multiple lemmas in the fullformlist
381
+ # here we disambugate among these lemmas using the alternatives from the model
382
+ elif len(fullform_list_lemma) > 1:
383
+ tag[i]["l"] = next((selected_lemma for x in tag[i]["l"] if (selected_lemma := self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(tag[i]["w"], x )) in fullform_list_lemma), self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(tag[i]["w"], tag[i]["l"][0] ) )
384
+
385
+ # This branch will probably not be called but kept just in case
386
+ # If none of the cases above, use the first lemma suggested by the model
387
+ else:
388
+ tag[i]["l"] = self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(tag[i]["w"], tag[i]["l"][0] )
389
+
390
+ # This if will probable not be true either but kept just in case
391
+ # If a lemma could not be assigned after all these operations
392
+ # then asign the word itself
393
+ # Check by if the lemma field is still a list or if the field-type is string the legth is 0
394
+ if type(tag[i]["l"]) == list or len(tag[i]["l"]) == 0:
395
+ tag[i]["l"] = tag[i]["w"]
396
+
397
+ return tag
398
+
399
  def tag_sentence_list(self, lst, **tag_config):
400
 
401
  # If the sentences are not tokenized, tokenize while batching:
 
413
  else:
414
  tokenized_batches = self._batchify(lst)
415
 
416
+ # If lemmatization will be applied
417
+ if tag_config["lemmatize"]:
418
+
419
+ # If language will be identified per sentence
420
+ if tag_config["lang_per_sentence"]:
421
+ id_to_lang = self.config["id_to_lang"]
422
+ # If the output will be to a python list
423
+ if tag_config["write_output_to"]==None:
424
+ all_tagged_sentences = []
425
+ for batch in tokenized_batches:
426
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
427
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
428
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
429
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
430
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
431
+ batch["input_ids"].to("cpu")
432
+ batch["attention_mask"].to("cpu")
433
+
434
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
435
+ batch_lemma_indices.indices.tolist(), batch_langs[:, 0].tolist()):
436
+ this_sentence=[]
437
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
438
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
439
+ break
440
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
441
+ if len(this_sentence)>0:
442
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
443
+ else:
444
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
445
+ else:
446
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
447
+ this_sentence = self._lemmatize(this_sentence, lang)
448
+ all_tagged_sentences.append({"lang":id_to_lang[lang], "sent": [ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":i["l"]} for i in this_sentence]})
449
+
450
+ return all_tagged_sentences
451
+
452
+ # If the output is in TSV format to a pipe (stdout or a file handle)
453
+ elif tag_config["output_tsv"]:
454
+ for batch in tokenized_batches:
455
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
456
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
457
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
458
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
459
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
460
+ batch["input_ids"].to("cpu")
461
+ batch["attention_mask"].to("cpu")
462
+
463
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
464
+ batch_lemma_indices.indices.tolist(), batch_langs[:, 0].tolist()):
465
+ this_sentence=[]
466
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
467
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
468
+ break
469
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
470
+ if len(this_sentence)>0:
471
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
472
+ else:
473
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
474
+ else:
475
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
476
+ this_sentence = self._lemmatize(this_sentence, lang)
477
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[lang][i["t"]], "l":i["l"]} for i in this_sentence]
478
+ tag_config["write_output_to"].write(id_to_lang[lang])
479
+ for lin in this_sentence:
480
+ tag_config["write_output_to"].write("\t")
481
+ tag_config["write_output_to"].write(lin["w"])
482
+ tag_config["write_output_to"].write("\t")
483
+ tag_config["write_output_to"].write(lin["l"])
484
+ tag_config["write_output_to"].write("\t")
485
+ tag_config["write_output_to"].write(lin["t"])
486
+ tag_config["write_output_to"].write("\n")
487
+ tag_config["write_output_to"].write("\n")
488
+
489
+ # If output format will be json to a pipe (stdout or a file handle)
490
+ else:
491
+ for batch in tokenized_batches:
492
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
493
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
494
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
495
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
496
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
497
+ batch["input_ids"].to("cpu")
498
+ batch["attention_mask"].to("cpu")
499
+
500
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
501
+ batch_lemma_indices.indices.tolist(), batch_langs[:, 0].tolist()):
502
+ this_sentence=[]
503
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
504
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
505
+ break
506
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
507
+ if len(this_sentence)>0:
508
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
509
+ else:
510
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
511
+ else:
512
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
513
+ this_sentence = self._lemmatize(this_sentence, lang)
514
+ json.dump({"lang":id_to_lang[lang], "sent":[ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":i["l"]} for i in this_sentence]}, tag_config["write_output_to"])
515
+ tag_config["write_output_to"].write("\n")
516
+
517
+ # If the language is set as parameter
518
+ elif tag_config["lang"] != -1:
519
+ LANG = tag_config["lang"]
520
+ LANG_STR = self.config["id_to_lang"][LANG]
521
+ # If the output will be to a python list
522
+ if tag_config["write_output_to"]==None:
523
+ all_tagged_sentences = []
524
+ for batch in tokenized_batches:
525
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
526
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
527
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
528
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
529
+ batch["input_ids"].to("cpu")
530
+ batch["attention_mask"].to("cpu")
531
+ for input_ids, tags, lemma_indices in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
532
+ batch_lemma_indices.indices.tolist()): #batch_lemmas.tolist(),
533
+ this_sentence=[]
534
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemma_indices[1:]):
535
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
536
+ break
537
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
538
+ if len(this_sentence)>0:
539
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
540
+ else:
541
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
542
+ else:
543
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
544
+
545
+ this_sentence = self._lemmatize(this_sentence, LANG)
546
+ all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence]})
547
+
548
+ return all_tagged_sentences
549
+
550
+ # If the output is in TSV format to a pipe (stdout or a file handle)
551
+ elif tag_config["output_tsv"]:
552
+ for batch in tokenized_batches:
553
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
554
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
555
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
556
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
557
+ batch["input_ids"].to("cpu")
558
+ batch["attention_mask"].to("cpu")
559
+
560
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
561
+ batch_lemma_indices.indices.tolist()):
562
+ this_sentence=[]
563
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
564
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
565
+ break
566
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
567
+ if len(this_sentence)>0:
568
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
569
+ else:
570
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
571
+ else:
572
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
573
+
574
+ this_sentence = self._lemmatize(this_sentence, LANG)
575
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":i["l"]} for i in this_sentence]
576
+ tag_config["write_output_to"].write(LANG_STR)
577
+ for lin in this_sentence:
578
+ tag_config["write_output_to"].write("\t")
579
+ tag_config["write_output_to"].write(lin["w"])
580
+ tag_config["write_output_to"].write("\t")
581
+ tag_config["write_output_to"].write(lin["l"])
582
+ tag_config["write_output_to"].write("\t")
583
+ tag_config["write_output_to"].write(lin["t"])
584
+ tag_config["write_output_to"].write("\n")
585
+ tag_config["write_output_to"].write("\n")
586
+
587
+ # If output format will be json to a pipe (stdout or a file handle)
588
+ else:
589
+ for batch in tokenized_batches:
590
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
591
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
592
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
593
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
594
+ batch["input_ids"].to("cpu")
595
+ batch["attention_mask"].to("cpu")
596
+
597
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
598
+ batch_lemma_indices.indices.tolist()):
599
+ this_sentence=[]
600
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
601
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
602
+ break
603
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
604
+ if len(this_sentence)>0:
605
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
606
+ else:
607
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
608
+ else:
609
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
610
+
611
+ this_sentence = self._lemmatize(this_sentence, LANG)
612
+ json.dump({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence]}, tag_config["write_output_to"])
613
+ tag_config["write_output_to"].write("\n")
614
+
615
+ # If language will be identified according to the majority of all sentences:
616
+ else:
617
+ all_tags=[]
618
+ all_lemmas=[]
619
+ all_langs=[]
620
+ all_input_ids=[]
621
+ # Go over all batches and each sentence in each batch
622
  for batch in tokenized_batches:
623
  all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
624
  batch_tags = torch.argmax(all_out["logits2"], dim=-1)
625
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
626
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
627
  batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
628
+ all_input_ids.extend(batch["input_ids"].tolist())
629
  batch["input_ids"].to("cpu")
630
  batch["attention_mask"].to("cpu")
631
+ all_langs.extend(batch_langs[:, 0].tolist())
632
+ all_tags.extend(batch_tags.tolist())
633
+ all_lemmas.extend(batch_lemma_indices.indices.tolist())
634
 
635
+ # Identify the language
636
+ tag_config["lang"] = 1 if sum(all_langs)/len(all_langs)>=0.5 else 0
637
+ LANG = tag_config["lang"]
638
+ LANG_STR = self.config["id_to_lang"][LANG]
639
+
640
+ # If the output will be returned as python list:
641
+ if tag_config["write_output_to"]==None:
642
+ all_tagged_sentences = []
643
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
644
  this_sentence=[]
645
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
646
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
647
  break
648
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
649
  if len(this_sentence)>0:
650
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
651
  else:
652
  this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
653
  else:
654
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
 
655
 
656
+ this_sentence = self._lemmatize(this_sentence, LANG)
657
+ all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence] })
658
+ return all_tagged_sentences
 
 
 
 
 
 
 
 
659
 
660
+ # If the output is in TSV format
661
+ elif tag_config["output_tsv"]:
662
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
663
  this_sentence=[]
664
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
665
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
666
  break
667
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
668
  if len(this_sentence)>0:
669
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
670
  else:
671
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
672
  else:
673
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
674
+
675
+ this_sentence = self._lemmatize(this_sentence, LANG)
676
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":i["l"]} for i in this_sentence]
677
+ tag_config["write_output_to"].write(LANG_STR)
678
  for lin in this_sentence:
679
  tag_config["write_output_to"].write("\t")
680
  tag_config["write_output_to"].write(lin["w"])
 
685
  tag_config["write_output_to"].write("\n")
686
  tag_config["write_output_to"].write("\n")
687
 
688
+ # If output format will be json
689
+ else:
690
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
 
 
 
 
 
 
 
 
 
691
  this_sentence=[]
692
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
693
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
694
  break
695
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
696
  if len(this_sentence)>0:
697
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
698
  else:
699
  this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
700
  else:
701
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
702
+
703
+ this_sentence = self._lemmatize(this_sentence, LANG)
704
+ json.dump({"lang":LANG_STR, "sent":[ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence]}, tag_config["write_output_to"])
705
  tag_config["write_output_to"].write("\n")
706
 
707
+ # If lemmatization will not be applied:
708
+ else:
709
+ # If language will be identified per sentence
710
+ if tag_config["lang_per_sentence"]:
711
+ id_to_lang = self.config["id_to_lang"]
712
+ # If the output will be to a python list
713
+ if tag_config["write_output_to"]==None:
714
+ all_tagged_sentences = []
715
+ for batch in tokenized_batches:
716
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
717
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
718
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
719
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
720
+ batch["input_ids"].to("cpu")
721
+ batch["attention_mask"].to("cpu")
722
+
723
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
724
+ batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
725
+ this_sentence=[]
726
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
727
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
728
+ break
729
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
730
+ if len(this_sentence)>0:
731
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
732
+ else:
733
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
734
+ else:
735
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
736
+ all_tagged_sentences.append({"lang":id_to_lang[lang], "sent": [ {"w":i["w"], "t":self.tags[lang][i["t"]]} for i in this_sentence]})
737
+
738
+ return all_tagged_sentences
739
+
740
+ # If the output is in TSV format to a pipe (stdout or a file handle)
741
+ elif tag_config["output_tsv"]:
742
+ for batch in tokenized_batches:
743
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
744
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
745
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
746
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
747
+ batch["input_ids"].to("cpu")
748
+ batch["attention_mask"].to("cpu")
749
+
750
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
751
+ batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
752
+ this_sentence=[]
753
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
754
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
755
+ break
756
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
757
+ if len(this_sentence)>0:
758
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
759
+ else:
760
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
761
+ else:
762
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
763
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[lang][i["t"]] } for i in this_sentence]
764
+ tag_config["write_output_to"].write(id_to_lang[lang])
765
+ for lin in this_sentence:
766
+ tag_config["write_output_to"].write("\t")
767
+ tag_config["write_output_to"].write(lin["w"])
768
+ tag_config["write_output_to"].write("\t")
769
+ tag_config["write_output_to"].write(lin["t"])
770
+ tag_config["write_output_to"].write("\n")
771
+ tag_config["write_output_to"].write("\n")
772
+
773
+ # If output format will be json to a pipe (stdout or a file handle)
774
+ else:
775
+ for batch in tokenized_batches:
776
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
777
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
778
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
779
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
780
+ batch["input_ids"].to("cpu")
781
+ batch["attention_mask"].to("cpu")
782
+
783
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
784
+ batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
785
+ this_sentence=[]
786
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
787
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
788
+ break
789
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
790
+ if len(this_sentence)>0:
791
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
792
+ else:
793
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
794
+ else:
795
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
796
+
797
+ json.dump({"lang":id_to_lang[lang], "sent":[ {"w":i["w"], "t":self.tags[lang][i["t"]]} for i in this_sentence]}, tag_config["write_output_to"])
798
+ tag_config["write_output_to"].write("\n")
799
+
800
+ # If the language is set as parameter
801
+ elif tag_config["lang"] != -1:
802
+ LANG = tag_config["lang"]
803
+ LANG_STR = self.config["id_to_lang"][LANG]
804
+ # If the output will be to a python list
805
+ if tag_config["write_output_to"]==None:
806
+ all_tagged_sentences = []
807
+ for batch in tokenized_batches:
808
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
809
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
810
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
811
+ batch["input_ids"].to("cpu")
812
+ batch["attention_mask"].to("cpu")
813
+
814
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
815
+ batch_lemmas.tolist()):
816
+ this_sentence=[]
817
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
818
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
819
+ break
820
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
821
+ if len(this_sentence)>0:
822
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
823
+ else:
824
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
825
+ else:
826
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
827
+ all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence]})
828
+
829
+ return all_tagged_sentences
830
+
831
+ # If the output is in TSV format to a pipe (stdout or a file handle)
832
+ elif tag_config["output_tsv"]:
833
+ for batch in tokenized_batches:
834
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
835
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
836
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
837
+ batch["input_ids"].to("cpu")
838
+ batch["attention_mask"].to("cpu")
839
+
840
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
841
+ batch_lemmas.tolist()):
842
+ this_sentence=[]
843
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
844
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
845
+ break
846
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
847
+ if len(this_sentence)>0:
848
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
849
+ else:
850
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
851
+ else:
852
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
853
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]]} for i in this_sentence]
854
+ tag_config["write_output_to"].write(LANG_STR)
855
+ for lin in this_sentence:
856
+ tag_config["write_output_to"].write("\t")
857
+ tag_config["write_output_to"].write(lin["w"])
858
+ tag_config["write_output_to"].write("\t")
859
+ tag_config["write_output_to"].write(lin["t"])
860
+ tag_config["write_output_to"].write("\n")
861
+ tag_config["write_output_to"].write("\n")
862
+
863
+ # If output format will be json to a pipe (stdout or a file handle)
864
+ else:
865
+ for batch in tokenized_batches:
866
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
867
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
868
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
869
+ batch["input_ids"].to("cpu")
870
+ batch["attention_mask"].to("cpu")
871
+
872
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
873
+ batch_lemmas.tolist()):
874
+ this_sentence=[]
875
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
876
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
877
+ break
878
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
879
+ if len(this_sentence)>0:
880
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
881
+ else:
882
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
883
+ else:
884
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
885
+
886
+ json.dump({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence]}, tag_config["write_output_to"])
887
+ tag_config["write_output_to"].write("\n")
888
+
889
+ # If language will be identified according to the majority of all sentences:
890
+ else:
891
+ all_tags=[]
892
+ all_lemmas=[]
893
+ all_langs=[]
894
+ all_input_ids=[]
895
+ # Go over all batches and each sentence in each batch
896
  for batch in tokenized_batches:
897
  all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
898
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
899
  batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
900
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
901
+ all_input_ids.extend(batch["input_ids"].tolist())
902
  batch["input_ids"].to("cpu")
903
  batch["attention_mask"].to("cpu")
904
+ all_langs.extend(batch_langs[:, 0].tolist())
905
+ all_tags.extend(batch_tags.tolist())
906
+ all_lemmas.extend(batch_lemmas.tolist())
907
+
908
+ # Identify the language
909
+ tag_config["lang"] = 1 if sum(all_langs)/len(all_langs)>=0.5 else 0
910
+ LANG = tag_config["lang"]
911
+ LANG_STR = self.config["id_to_lang"][LANG]
912
+
913
+ # If the output will be returned as python list:
914
+ if tag_config["write_output_to"]==None:
915
+ all_tagged_sentences = []
916
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
917
  this_sentence=[]
918
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
919
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
 
922
  if len(this_sentence)>0:
923
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
924
  else:
925
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
926
  else:
927
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
928
+ all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence] })
929
+ return all_tagged_sentences
 
 
 
 
 
 
 
 
 
 
930
 
931
+ # If the output is in TSV format
932
+ elif tag_config["output_tsv"]:
933
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
934
  this_sentence=[]
935
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
936
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
 
939
  if len(this_sentence)>0:
940
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
941
  else:
942
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
943
  else:
944
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
945
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]]} for i in this_sentence]
946
  tag_config["write_output_to"].write(LANG_STR)
947
  for lin in this_sentence:
948
  tag_config["write_output_to"].write("\t")
949
  tag_config["write_output_to"].write(lin["w"])
950
  tag_config["write_output_to"].write("\t")
 
 
951
  tag_config["write_output_to"].write(lin["t"])
952
  tag_config["write_output_to"].write("\n")
953
  tag_config["write_output_to"].write("\n")
954
 
955
+ # If output format will be json
956
+ else:
957
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
 
 
 
 
 
 
 
 
958
  this_sentence=[]
959
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
960
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
 
963
  if len(this_sentence)>0:
964
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
965
  else:
966
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
967
  else:
968
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
969
+
970
+ json.dump({"lang":LANG_STR, "sent":[ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence]}, tag_config["write_output_to"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
971
  tag_config["write_output_to"].write("\n")
 
972
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
973
 
974
  def _check_if_text_file_and_return_content(self, filepath):
975
  try:
 
980
 
981
  @torch.no_grad()
982
  def tag(self, inp=None, **tag_config):
983
+
984
  self.eval()
985
+
986
+ if "lemmatise" in tag_config and tag_config["lemmatise"]==False:
987
+ tag_config["lemmatize"] = False
988
+ if "lemmatise" in tag_config:
989
+ del tag_config["lemmatise"]
990
+ else:
991
+ tag_config["lemmatize"] = True
992
+ if "lemmatise" in tag_config:
993
+ del tag_config["lemmatise"]
994
+
995
+ if "lemmatize" in tag_config and tag_config["lemmatize"]==False:
996
+ tag_config["lemmatize"] = False
997
+
998
  if "one_sentence_per_line" not in tag_config:
999
  tag_config["one_sentence_per_line"]=False
1000
 
 
1039
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
1040
  if tag_config["one_sentence_per_line"]:
1041
  inp = [i for i in file_content.split("\n") if i!=""]
1042
+ inp = [" ".join(i.split()) for i in inp if i!=""]
1043
  with open(out_path, "w") as opened_file:
1044
  tag_config["write_output_to"] = opened_file
1045
  self.tag_sentence_list(inp, **tag_config)
 
1050
  self.tag_sentence_list(inp, **tag_config)
1051
  else:
1052
  print (f"Could not properly open and read {input_path}.")
1053
+ if write_to is not sys.stdout and write_to is not sys.stderr:
1054
+ write_to.close()
1055
  return
1056
 
1057
  else:
 
1069
  # Tag one sentence per line in a string
1070
  if tag_config["one_sentence_per_line"]:
1071
  inp = [i for i in inp.split("\n") if i!=""]
1072
+ inp = [" ".join(self._preprocess_text(i).split()) for i in inp if i!=""]
1073
  return self.tag_sentence_list(inp, **tag_config)
1074
 
1075
  # identify sentences
 
1079
  # Tag one sentence per list item
1080
  elif type(inp) == list:
1081
  inp=[i.strip() for i in inp]
1082
+ inp=[" ".join(self._preprocess_text(i).split()) for i in inp if i!=""]
1083
  return self.tag_sentence_list(inp, **tag_config)
1084
 
1085
  def identify_language_sentence_list(self, lst, **tag_config):
 
1122
 
1123
  @torch.no_grad()
1124
  def identify_language(self, inp=None, **tag_config):
1125
+
1126
  self.eval()
1127
+
1128
  if "one_sentence_per_line" not in tag_config:
1129
  tag_config["one_sentence_per_line"]=False
1130
+
1131
  if "lang" in tag_config:
1132
  del tag_config["lang"]
1133
 
 
1137
  if "lang_per_sentence" not in tag_config:
1138
  tag_config["lang_per_sentence"] = False
1139
 
1140
+ elif type(tag_config["lang_per_sentence"])==bool and tag_config["lang_per_sentence"]:
1141
  tag_config["lang_per_sentence"] = True
1142
 
1143
  if "input_directory" in tag_config and "output_directory" in tag_config and "write_output_to" in tag_config and tag_config["write_output_to"]!=None:
 
1193
  torch.cuda.empty_cache()
1194
 
1195
  if tag_config["write_output_to"]==None:
1196
+ general_output.extend([{"f":i[0], "lang":self.config["id_to_lang"][i[1]]} for i in zip(file_names, langs)])
1197
  elif tag_config["output_tsv"]:
1198
  for fil,lan in zip(file_names, langs):
1199
  tag_config["write_output_to"].write(fil)
 
1202
  tag_config["write_output_to"].write("\n")
1203
  else:
1204
  for fil,lan in zip(file_names, langs):
1205
+ json.dump({"f":fil, "lang":self.config["id_to_lang"][lan]})
1206
  file_names=[]
1207
  contents=[]
1208
  else:
 
1223
  torch.cuda.empty_cache()
1224
 
1225
  if tag_config["write_output_to"]==None:
1226
+ general_output.extend([{"f":i[0], "lang":self.config["id_to_lang"][i[1]]} for i in zip(file_names, langs)])
1227
  elif tag_config["output_tsv"]:
1228
  for fil,lan in zip(file_names, langs):
1229
  tag_config["write_output_to"].write(fil)
 
1232
  tag_config["write_output_to"].write("\n")
1233
  else:
1234
  for fil,lan in zip(file_names, langs):
1235
+ json.dump({"f":fil, "lang":self.config["id_to_lang"][lan]})
1236
 
1237
  return general_output if len(general_output)>0 else None
1238
 
 
1274
  opened_file.write(lan)
1275
  opened_file.write("\n")
1276
  else:
1277
+ json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , opened_file)
1278
  else:
1279
  if tag_config["output_tsv"]:
1280
  opened_file.write(out[0])
1281
  else:
1282
+ json.dump({"lang":out[0]} , opened_file)
1283
  else:
1284
  if tag_config["lang_per_sentence"]:
1285
+ general_output.extend([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ])
1286
  else:
1287
+ general_output.append({"f":input_path, "lang":out[0]})
1288
 
1289
  # If there is an opened pipe already
1290
  else:
 
1297
  tag_config["write_output_to"].write("\n")
1298
  tag_config["write_output_to"].write("\n")
1299
  else:
1300
+ json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
1301
  tag_config["write_output_to"].write("\n")
1302
  else:
1303
  if tag_config["output_tsv"]:
 
1306
  tag_config["write_output_to"].write(out[0])
1307
  tag_config["write_output_to"].write("\n")
1308
  else:
1309
+ json.dump({"f":input_path, "lang":out[0]} , tag_config["write_output_to"])
1310
  tag_config["write_output_to"].write("\n")
1311
 
1312
  else:
 
1316
  tag_config["write_output_to"].write("err")
1317
  tag_config["write_output_to"].write("\n")
1318
  else:
1319
+ json.dump({"f":input_path, "lang":"err"} , tag_config["write_output_to"])
1320
  tag_config["write_output_to"].write("\n")
1321
 
1322
+ if tag_config["write_output_to"] and tag_config["write_output_to"] is not sys.stdout and tag_config["write_output_to"] is not sys.stderr:
1323
  tag_config["write_output_to"].close()
1324
 
1325
  return general_output if len(general_output)>0 else None
 
1355
 
1356
  # If return as list
1357
  if tag_config["write_output_to"]==None:
1358
+ return [{"s":i[0], "lang": i[1]} for i in zip(inp, out)]
1359
 
1360
  if tag_config["output_tsv"]:
1361
  for sen,lan in zip(inp, out):
 
1364
  tag_config["write_output_to"].write(out)
1365
  tag_config["write_output_to"].write("\n")
1366
  else:
1367
+ json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
1368
 
1369
  return
1370
 
 
1376
 
1377
  # If return as list
1378
  if tag_config["write_output_to"]==None:
1379
+ return [{"s":i[0], "lang": i[1]} for i in zip(inp, out)]
1380
 
1381
  if tag_config["output_tsv"]:
1382
  for sen,lan in zip(inp, out):
 
1385
  tag_config["write_output_to"].write(lan)
1386
  tag_config["write_output_to"].write("\n")
1387
  else:
1388
+ json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
1389
 
1390
  return
1391
 
tagger_config.json CHANGED
The diff for this file is too large to render. See raw diff