Ahmet Yildirim committed on
Commit
10375c1
·
1 Parent(s): 6db1114

- Update lemmatisering

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.json_large filter=lfs diff=lfs merge=lfs -text
fullform_list.json_large ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a32e9d7c36ed2ba9ec7f080e118760c444277fb6f213172246d24711b0493433
3
+ size 240703613
modeling_humit_tagger.py CHANGED
@@ -32,7 +32,7 @@ class HumitTaggerModel(torch.nn.Module):
32
  kwargs["this_model_config"]=json.load(js)
33
 
34
 
35
- # Download this model's config:
36
  lemma_rules_path = hf_hub_download(repo_id=repo_name, filename=kwargs["config"].lemma_rules_py_file)
37
 
38
  # load lemma rules class
@@ -46,6 +46,7 @@ class HumitTaggerModel(torch.nn.Module):
46
  base_config_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_file"])
47
  base_model_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_model_file"])
48
  base_model_config_json_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_json_file"])
 
49
 
50
  # Copy base model's configuration python file into our working directory
51
  config_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)) , os.path.basename(base_config_file))
@@ -81,12 +82,13 @@ class HumitTaggerModel(torch.nn.Module):
81
 
82
  kwargs["model_weights_path"] = model_weights_path
83
  kwargs["repo_name"] = repo_name
 
84
  return HumitTaggerModel(**kwargs)
85
 
86
  def __init__(self, **kwargs ):
87
  super(HumitTaggerModel, self).__init__()
88
  json_cfg = kwargs["base_model_json_cfg"]
89
- self.config=kwargs["this_model_config"]
90
  self.LemmaHandling = sys.modules["lemma_rules"].LemmaHandling
91
  self.LemmaHandling.load_lemma_rules_from_obj(self.config["lemma_rules"])
92
  cfg=sys.modules["base_config"].NorbertConfig(**json_cfg)
@@ -117,6 +119,32 @@ class HumitTaggerModel(torch.nn.Module):
117
  self.REPLACE_PATTERN = '|'.join(sorted(re.escape(k) for k in self.REPLACE_DICT))
118
  self.MAX_LENGTH = self.bert.config.max_position_embeddings
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  def forward(self, input_ids=None, attention_mask=None ):
121
  outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True )
122
  sequence_output = self.dropout(outputs.last_hidden_state)
@@ -171,19 +199,24 @@ class HumitTaggerModel(torch.nn.Module):
171
  }
172
  batched_sentences.append(to_append)
173
 
174
- torch.cuda.empty_cache()
 
175
 
176
  return batched_sentences
177
 
178
  def _split_sentences(self, inp):
179
 
 
 
 
180
  # Here we get the whole text tokenized.
181
  encodings = self.tokenizer(inp,add_special_tokens=False, return_tensors="pt").to(self.device)
182
 
183
  # Save a copy of the tokenization
184
  original_encodings=copy.deepcopy(encodings)
185
  original_encodings=original_encodings.to("cpu")
186
- torch.cuda.empty_cache()
 
187
 
188
  # Pad to the complete size (model max_size -1 (-1 to add CLS))
189
  old_size=encodings["input_ids"][0].size()[0]
@@ -225,13 +258,15 @@ class HumitTaggerModel(torch.nn.Module):
225
  # First get them back to CPU to open space on GPU
226
  input_ids_batched=[i.to("cpu") for i in input_ids_batched]
227
  attention_mask_batched=[i.to("cpu") for i in attention_mask_batched]
228
- torch.cuda.empty_cache()
 
229
 
230
  for input_ids, attention_masks in zip(input_ids_batched, attention_mask_batched):
231
  current_batch={"input_ids":input_ids.to(self.device).long(), "attention_mask":attention_masks.to(self.device).long()}
232
  outputs = self(**current_batch)
233
  del current_batch
234
- torch.cuda.empty_cache()
 
235
 
236
  label_data=outputs["logits1"].argmax(-1)
237
  labels_output.extend(label_data)
@@ -240,7 +275,8 @@ class HumitTaggerModel(torch.nn.Module):
240
  labels_output=torch.stack(labels_output ,dim=0)
241
  labels_output=labels_output[:, range(1,self.MAX_LENGTH)]
242
  labels_output=torch.reshape(labels_output,(1,row_count * self.MAX_LENGTH_WITHOUT_CLS))
243
- torch.cuda.empty_cache()
 
244
 
245
  # Now the data is split into sentences
246
  # So, now create sentence data as list so that this could be used
@@ -265,7 +301,9 @@ class HumitTaggerModel(torch.nn.Module):
265
  del old_size
266
  del inp
267
  del outputs
268
- torch.cuda.empty_cache()
 
 
269
 
270
  return sentence_list
271
 
@@ -279,6 +317,85 @@ class HumitTaggerModel(torch.nn.Module):
279
  sentences.extend(self._split_sentences(i.strip()))
280
  return sentences
281
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  def tag_sentence_list(self, lst, **tag_config):
283
 
284
  # If the sentences are not tokenized, tokenize while batching:
@@ -296,62 +413,268 @@ class HumitTaggerModel(torch.nn.Module):
296
  else:
297
  tokenized_batches = self._batchify(lst)
298
 
299
- # If language will be identified per sentence
300
- if tag_config["lang_per_sentence"]:
301
- id_to_lang = self.config["id_to_lang"]
302
- # If the output will be to a python list
303
- if tag_config["write_output_to"]==None:
304
- all_tagged_sentences = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  for batch in tokenized_batches:
306
  all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
307
  batch_tags = torch.argmax(all_out["logits2"], dim=-1)
308
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
 
309
  batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
 
310
  batch["input_ids"].to("cpu")
311
  batch["attention_mask"].to("cpu")
 
 
 
312
 
313
- for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
314
- batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
 
 
 
 
 
 
 
315
  this_sentence=[]
316
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
317
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
318
  break
319
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
320
  if len(this_sentence)>0:
321
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
322
  else:
323
  this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
324
  else:
325
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
326
- all_tagged_sentences.append({"lang":id_to_lang[lang], "sent": [ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]})
327
 
328
- return all_tagged_sentences
329
-
330
- # If the output is in TSV format to a pipe (stdout or a file handle)
331
- elif tag_config["output_tsv"]:
332
- for batch in tokenized_batches:
333
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
334
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
335
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
336
- batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
337
- batch["input_ids"].to("cpu")
338
- batch["attention_mask"].to("cpu")
339
 
340
- for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
341
- batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
 
342
  this_sentence=[]
343
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
344
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
345
  break
346
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
347
  if len(this_sentence)>0:
348
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
349
  else:
350
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
351
  else:
352
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
353
- this_sentence=[ {"w":i["w"], "t":self.tags_str[lang][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]
354
- tag_config["write_output_to"].write(id_to_lang[lang])
 
 
355
  for lin in this_sentence:
356
  tag_config["write_output_to"].write("\t")
357
  tag_config["write_output_to"].write(lin["w"])
@@ -362,49 +685,235 @@ class HumitTaggerModel(torch.nn.Module):
362
  tag_config["write_output_to"].write("\n")
363
  tag_config["write_output_to"].write("\n")
364
 
365
- # If output format will be json to a pipe (stdout or a file handle)
366
- else:
367
- for batch in tokenized_batches:
368
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
369
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
370
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
371
- batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
372
- batch["input_ids"].to("cpu")
373
- batch["attention_mask"].to("cpu")
374
-
375
- for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
376
- batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
377
  this_sentence=[]
378
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
379
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
380
  break
381
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
382
  if len(this_sentence)>0:
383
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
384
  else:
385
  this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
386
  else:
387
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
388
-
389
- json.dump({"lang":id_to_lang[lang], "sent":[ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]}, tag_config["write_output_to"])
 
390
  tag_config["write_output_to"].write("\n")
391
 
392
- # If the language is set as parameter
393
- elif tag_config["lang"] != -1:
394
- LANG = tag_config["lang"]
395
- LANG_STR = self.config["id_to_lang"][LANG]
396
- # If the output will be to a python list
397
- if tag_config["write_output_to"]==None:
398
- all_tagged_sentences = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  for batch in tokenized_batches:
400
  all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
401
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
402
  batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
 
 
403
  batch["input_ids"].to("cpu")
404
  batch["attention_mask"].to("cpu")
405
-
406
- for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
407
- batch_lemmas.tolist()):
 
 
 
 
 
 
 
 
 
 
408
  this_sentence=[]
409
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
410
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
@@ -413,24 +922,15 @@ class HumitTaggerModel(torch.nn.Module):
413
  if len(this_sentence)>0:
414
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
415
  else:
416
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
417
  else:
418
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
419
- all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]})
420
-
421
- return all_tagged_sentences
422
-
423
- # If the output is in TSV format to a pipe (stdout or a file handle)
424
- elif tag_config["output_tsv"]:
425
- for batch in tokenized_batches:
426
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
427
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
428
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
429
- batch["input_ids"].to("cpu")
430
- batch["attention_mask"].to("cpu")
431
 
432
- for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
433
- batch_lemmas.tolist()):
 
434
  this_sentence=[]
435
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
436
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
@@ -439,32 +939,22 @@ class HumitTaggerModel(torch.nn.Module):
439
  if len(this_sentence)>0:
440
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
441
  else:
442
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
443
  else:
444
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
445
- this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]
446
  tag_config["write_output_to"].write(LANG_STR)
447
  for lin in this_sentence:
448
  tag_config["write_output_to"].write("\t")
449
  tag_config["write_output_to"].write(lin["w"])
450
  tag_config["write_output_to"].write("\t")
451
- tag_config["write_output_to"].write(lin["l"])
452
- tag_config["write_output_to"].write("\t")
453
  tag_config["write_output_to"].write(lin["t"])
454
  tag_config["write_output_to"].write("\n")
455
  tag_config["write_output_to"].write("\n")
456
 
457
- # If output format will be json to a pipe (stdout or a file handle)
458
- else:
459
- for batch in tokenized_batches:
460
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
461
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
462
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
463
- batch["input_ids"].to("cpu")
464
- batch["attention_mask"].to("cpu")
465
-
466
- for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
467
- batch_lemmas.tolist()):
468
  this_sentence=[]
469
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
470
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
@@ -473,98 +963,13 @@ class HumitTaggerModel(torch.nn.Module):
473
  if len(this_sentence)>0:
474
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
475
  else:
476
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
477
- else:
478
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
479
-
480
- json.dump({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]}, tag_config["write_output_to"])
481
- tag_config["write_output_to"].write("\n")
482
-
483
- # If language will be identified according to the majority of all sentences:
484
- else:
485
- all_tags=[]
486
- all_lemmas=[]
487
- all_langs=[]
488
- all_input_ids=[]
489
- # Go over all batches and each sentence in each batch
490
- for batch in tokenized_batches:
491
- all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
492
- batch_tags = torch.argmax(all_out["logits2"], dim=-1)
493
- batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
494
- batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
495
- all_input_ids.extend(batch["input_ids"].tolist())
496
- batch["input_ids"].to("cpu")
497
- batch["attention_mask"].to("cpu")
498
- all_langs.extend(batch_langs[:, 0].tolist())
499
- all_tags.extend(batch_tags.tolist())
500
- all_lemmas.extend(batch_lemmas.tolist())
501
-
502
- # Identify the language
503
- tag_config["lang"] = 1 if sum(all_langs)/len(all_langs)>=0.5 else 0
504
- LANG = tag_config["lang"]
505
- LANG_STR = self.config["id_to_lang"][LANG]
506
-
507
- # If the output will be returned as python list:
508
- if tag_config["write_output_to"]==None:
509
- all_tagged_sentences = []
510
- for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
511
- this_sentence=[]
512
- for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
513
- if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
514
- break
515
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
516
- if len(this_sentence)>0:
517
- this_sentence[-1]["w"] += self.tokenizer.decode(inps)
518
  else:
519
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
520
- else:
521
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
522
- all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence] })
523
- return all_tagged_sentences
524
-
525
- # If the output is in TSV format
526
- elif tag_config["output_tsv"]:
527
- for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
528
- this_sentence=[]
529
- for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
530
- if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
531
- break
532
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
533
- if len(this_sentence)>0:
534
- this_sentence[-1]["w"] += self.tokenizer.decode(inps)
535
- else:
536
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
537
- else:
538
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
539
- this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]
540
- tag_config["write_output_to"].write(LANG_STR)
541
- for lin in this_sentence:
542
- tag_config["write_output_to"].write("\t")
543
- tag_config["write_output_to"].write(lin["w"])
544
- tag_config["write_output_to"].write("\t")
545
- tag_config["write_output_to"].write(lin["l"])
546
- tag_config["write_output_to"].write("\t")
547
- tag_config["write_output_to"].write(lin["t"])
548
  tag_config["write_output_to"].write("\n")
549
- tag_config["write_output_to"].write("\n")
550
 
551
- # If output format will be json
552
- else:
553
- for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
554
- this_sentence=[]
555
- for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
556
- if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
557
- break
558
- if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
559
- if len(this_sentence)>0:
560
- this_sentence[-1]["w"] += self.tokenizer.decode(inps)
561
- else:
562
- this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
563
- else:
564
- this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
565
-
566
- json.dump({"lang":LANG_STR, "sent":[ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]}, tag_config["write_output_to"])
567
- tag_config["write_output_to"].write("\n")
568
 
569
  def _check_if_text_file_and_return_content(self, filepath):
570
  try:
@@ -575,7 +980,21 @@ class HumitTaggerModel(torch.nn.Module):
575
 
576
  @torch.no_grad()
577
  def tag(self, inp=None, **tag_config):
 
578
  self.eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
579
  if "one_sentence_per_line" not in tag_config:
580
  tag_config["one_sentence_per_line"]=False
581
 
@@ -620,7 +1039,7 @@ class HumitTaggerModel(torch.nn.Module):
620
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
621
  if tag_config["one_sentence_per_line"]:
622
  inp = [i for i in file_content.split("\n") if i!=""]
623
- inp = [i for i in inp if i!=""]
624
  with open(out_path, "w") as opened_file:
625
  tag_config["write_output_to"] = opened_file
626
  self.tag_sentence_list(inp, **tag_config)
@@ -631,8 +1050,8 @@ class HumitTaggerModel(torch.nn.Module):
631
  self.tag_sentence_list(inp, **tag_config)
632
  else:
633
  print (f"Could not properly open and read {input_path}.")
634
-
635
- write_to.close()
636
  return
637
 
638
  else:
@@ -650,7 +1069,7 @@ class HumitTaggerModel(torch.nn.Module):
650
  # Tag one sentence per line in a string
651
  if tag_config["one_sentence_per_line"]:
652
  inp = [i for i in inp.split("\n") if i!=""]
653
- inp = [self._preprocess_text(i) for i in inp if i!=""]
654
  return self.tag_sentence_list(inp, **tag_config)
655
 
656
  # identify sentences
@@ -660,7 +1079,7 @@ class HumitTaggerModel(torch.nn.Module):
660
  # Tag one sentence per list item
661
  elif type(inp) == list:
662
  inp=[i.strip() for i in inp]
663
- inp=[self._preprocess_text(i) for i in inp if i!=""]
664
  return self.tag_sentence_list(inp, **tag_config)
665
 
666
  def identify_language_sentence_list(self, lst, **tag_config):
@@ -703,9 +1122,12 @@ class HumitTaggerModel(torch.nn.Module):
703
 
704
  @torch.no_grad()
705
  def identify_language(self, inp=None, **tag_config):
 
706
  self.eval()
 
707
  if "one_sentence_per_line" not in tag_config:
708
  tag_config["one_sentence_per_line"]=False
 
709
  if "lang" in tag_config:
710
  del tag_config["lang"]
711
 
@@ -715,7 +1137,7 @@ class HumitTaggerModel(torch.nn.Module):
715
  if "lang_per_sentence" not in tag_config:
716
  tag_config["lang_per_sentence"] = False
717
 
718
- elif tag_config["lang_per_sentence"]:
719
  tag_config["lang_per_sentence"] = True
720
 
721
  if "input_directory" in tag_config and "output_directory" in tag_config and "write_output_to" in tag_config and tag_config["write_output_to"]!=None:
@@ -771,7 +1193,7 @@ class HumitTaggerModel(torch.nn.Module):
771
  torch.cuda.empty_cache()
772
 
773
  if tag_config["write_output_to"]==None:
774
- general_output.extend([{"f":i[0], "l":self.config["id_to_lang"][i[1]]} for i in zip(file_names, langs)])
775
  elif tag_config["output_tsv"]:
776
  for fil,lan in zip(file_names, langs):
777
  tag_config["write_output_to"].write(fil)
@@ -780,7 +1202,7 @@ class HumitTaggerModel(torch.nn.Module):
780
  tag_config["write_output_to"].write("\n")
781
  else:
782
  for fil,lan in zip(file_names, langs):
783
- json.dump({"f":fil, "l":self.config["id_to_lang"][lan]})
784
  file_names=[]
785
  contents=[]
786
  else:
@@ -801,7 +1223,7 @@ class HumitTaggerModel(torch.nn.Module):
801
  torch.cuda.empty_cache()
802
 
803
  if tag_config["write_output_to"]==None:
804
- general_output.extend([{"f":i[0], "l":self.config["id_to_lang"][i[1]]} for i in zip(file_names, langs)])
805
  elif tag_config["output_tsv"]:
806
  for fil,lan in zip(file_names, langs):
807
  tag_config["write_output_to"].write(fil)
@@ -810,7 +1232,7 @@ class HumitTaggerModel(torch.nn.Module):
810
  tag_config["write_output_to"].write("\n")
811
  else:
812
  for fil,lan in zip(file_names, langs):
813
- json.dump({"f":fil, "l":self.config["id_to_lang"][lan]})
814
 
815
  return general_output if len(general_output)>0 else None
816
 
@@ -852,17 +1274,17 @@ class HumitTaggerModel(torch.nn.Module):
852
  opened_file.write(lan)
853
  opened_file.write("\n")
854
  else:
855
- json.dump([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ] , opened_file)
856
  else:
857
  if tag_config["output_tsv"]:
858
  opened_file.write(out[0])
859
  else:
860
- json.dump({"l":out[0]} , opened_file)
861
  else:
862
  if tag_config["lang_per_sentence"]:
863
- general_output.extend([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ])
864
  else:
865
- general_output.append({"f":input_path, "l":out[0]})
866
 
867
  # If there is an opened pipe already
868
  else:
@@ -875,7 +1297,7 @@ class HumitTaggerModel(torch.nn.Module):
875
  tag_config["write_output_to"].write("\n")
876
  tag_config["write_output_to"].write("\n")
877
  else:
878
- json.dump([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
879
  tag_config["write_output_to"].write("\n")
880
  else:
881
  if tag_config["output_tsv"]:
@@ -884,7 +1306,7 @@ class HumitTaggerModel(torch.nn.Module):
884
  tag_config["write_output_to"].write(out[0])
885
  tag_config["write_output_to"].write("\n")
886
  else:
887
- json.dump({"f":input_path, "l":out[0]} , tag_config["write_output_to"])
888
  tag_config["write_output_to"].write("\n")
889
 
890
  else:
@@ -894,10 +1316,10 @@ class HumitTaggerModel(torch.nn.Module):
894
  tag_config["write_output_to"].write("err")
895
  tag_config["write_output_to"].write("\n")
896
  else:
897
- json.dump({"f":input_path, "l":"err"} , tag_config["write_output_to"])
898
  tag_config["write_output_to"].write("\n")
899
 
900
- if tag_config["write_output_to"] and tag_config["write_output_to"]!=sys.stdout and tag_config["write_output_to"]!=sys.stderr:
901
  tag_config["write_output_to"].close()
902
 
903
  return general_output if len(general_output)>0 else None
@@ -933,7 +1355,7 @@ class HumitTaggerModel(torch.nn.Module):
933
 
934
  # If return as list
935
  if tag_config["write_output_to"]==None:
936
- return [{"s":i[0], "l": i[1]} for i in zip(inp, out)]
937
 
938
  if tag_config["output_tsv"]:
939
  for sen,lan in zip(inp, out):
@@ -942,7 +1364,7 @@ class HumitTaggerModel(torch.nn.Module):
942
  tag_config["write_output_to"].write(out)
943
  tag_config["write_output_to"].write("\n")
944
  else:
945
- json.dump([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
946
 
947
  return
948
 
@@ -954,7 +1376,7 @@ class HumitTaggerModel(torch.nn.Module):
954
 
955
  # If return as list
956
  if tag_config["write_output_to"]==None:
957
- return [{"s":i[0], "l": i[1]} for i in zip(inp, out)]
958
 
959
  if tag_config["output_tsv"]:
960
  for sen,lan in zip(inp, out):
@@ -963,7 +1385,7 @@ class HumitTaggerModel(torch.nn.Module):
963
  tag_config["write_output_to"].write(lan)
964
  tag_config["write_output_to"].write("\n")
965
  else:
966
- json.dump([{"s":sen, "l":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
967
 
968
  return
969
 
 
32
  kwargs["this_model_config"]=json.load(js)
33
 
34
 
35
+ # Download this model's lemma rules pickle file:
36
  lemma_rules_path = hf_hub_download(repo_id=repo_name, filename=kwargs["config"].lemma_rules_py_file)
37
 
38
  # load lemma rules class
 
46
  base_config_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_file"])
47
  base_model_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_model_file"])
48
  base_model_config_json_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_json_file"])
49
+ fullformlist_file = hf_hub_download(repo_id=repo_name, filename=kwargs["this_model_config"]["fullformlist_file"])
50
 
51
  # Copy base model's configuration python file into our working directory
52
  config_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)) , os.path.basename(base_config_file))
 
82
 
83
  kwargs["model_weights_path"] = model_weights_path
84
  kwargs["repo_name"] = repo_name
85
+ kwargs["fullformlist_file"] = fullformlist_file
86
  return HumitTaggerModel(**kwargs)
87
 
88
  def __init__(self, **kwargs ):
89
  super(HumitTaggerModel, self).__init__()
90
  json_cfg = kwargs["base_model_json_cfg"]
91
+ self.config = kwargs["this_model_config"]
92
  self.LemmaHandling = sys.modules["lemma_rules"].LemmaHandling
93
  self.LemmaHandling.load_lemma_rules_from_obj(self.config["lemma_rules"])
94
  cfg=sys.modules["base_config"].NorbertConfig(**json_cfg)
 
119
  self.REPLACE_PATTERN = '|'.join(sorted(re.escape(k) for k in self.REPLACE_DICT))
120
  self.MAX_LENGTH = self.bert.config.max_position_embeddings
121
 
122
+ # Note the classes that represents gen and prop tags
123
+ self.gen_tag_classes = set()
124
+ self.prop_tag_classes = set()
125
+ self.t_2_tag_classes = set()
126
+
127
+ for i, lst in enumerate(self.config["tags"][0]):
128
+ if "gen" in lst:
129
+ self.gen_tag_classes.add(i)
130
+ if "prop" in lst:
131
+ self.prop_tag_classes.add(i)
132
+ if "2" in lst:
133
+ self.t_2_tag_classes.add(i)
134
+
135
+
136
+ # Load the fullform list
137
+ self.fullform_list=[{},{}]
138
+ try:
139
+ with open(kwargs["fullformlist_file"], 'r') as f:
140
+ self.fullform_list = json.load(f)
141
+ for k in range(2):
142
+ for i in self.fullform_list[k]:
143
+ for j in self.fullform_list[k][i][j]:
144
+ self.fullform_list[k][i][j]=set(self.fullform_list[k][i][j])
145
+ except:
146
+ pass
147
+
148
  def forward(self, input_ids=None, attention_mask=None ):
149
  outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True )
150
  sequence_output = self.dropout(outputs.last_hidden_state)
 
199
  }
200
  batched_sentences.append(to_append)
201
 
202
+ if torch.cuda.is_available():
203
+ torch.cuda.empty_cache()
204
 
205
  return batched_sentences
206
 
207
  def _split_sentences(self, inp):
208
 
209
+ # Remove double spaces
210
+ inp=" ".join(inp.split())
211
+
212
  # Here we get the whole text tokenized.
213
  encodings = self.tokenizer(inp,add_special_tokens=False, return_tensors="pt").to(self.device)
214
 
215
  # Save a copy of the tokenization
216
  original_encodings=copy.deepcopy(encodings)
217
  original_encodings=original_encodings.to("cpu")
218
+ if torch.cuda.is_available():
219
+ torch.cuda.empty_cache()
220
 
221
  # Pad to the complete size (model max_size -1 (-1 to add CLS))
222
  old_size=encodings["input_ids"][0].size()[0]
 
258
  # First get them back to CPU to open space on GPU
259
  input_ids_batched=[i.to("cpu") for i in input_ids_batched]
260
  attention_mask_batched=[i.to("cpu") for i in attention_mask_batched]
261
+ if torch.cuda.is_available():
262
+ torch.cuda.empty_cache()
263
 
264
  for input_ids, attention_masks in zip(input_ids_batched, attention_mask_batched):
265
  current_batch={"input_ids":input_ids.to(self.device).long(), "attention_mask":attention_masks.to(self.device).long()}
266
  outputs = self(**current_batch)
267
  del current_batch
268
+ if torch.cuda.is_available():
269
+ torch.cuda.empty_cache()
270
 
271
  label_data=outputs["logits1"].argmax(-1)
272
  labels_output.extend(label_data)
 
275
  labels_output=torch.stack(labels_output ,dim=0)
276
  labels_output=labels_output[:, range(1,self.MAX_LENGTH)]
277
  labels_output=torch.reshape(labels_output,(1,row_count * self.MAX_LENGTH_WITHOUT_CLS))
278
+ if torch.cuda.is_available():
279
+ torch.cuda.empty_cache()
280
 
281
  # Now the data is split into sentences
282
  # So, now create sentence data as list so that this could be used
 
301
  del old_size
302
  del inp
303
  del outputs
304
+
305
+ if torch.cuda.is_available():
306
+ torch.cuda.empty_cache()
307
 
308
  return sentence_list
309
 
 
317
  sentences.extend(self._split_sentences(i.strip()))
318
  return sentences
319
 
320
+ def _lemmatize(self, tag, LANG):
321
+
322
+ # Here, a "tag" is a list of words in one sentence, their tags and an ordering of lemma classes according the lemmatization model for each word.
323
+ # We go over all words, and apply our algorithm for lemmatization
324
+ # 1. If the "pron" tag is found in the tags
325
+ # then, we check if the "gen" tag also exists
326
+ # if there is the "gen" tag in tags and if there is "s" at the end of the word, we remove that s
327
+ # and return the rest of the word as lemma
328
+ # 2. OR, we continue with "høflig" processing
329
+ # if the word is "De" and if it has the tag "høflig" then we set the lemma as "De", otherwise "de"
330
+ # 3. OR, we continue with checking the word and its word class (subst, verb, adj, etc.) towards the fullform lists.
331
+ # if the word and its word class exists in the fullformlist (of the language bokmål or nynorsk according the the language parameter)
332
+ # then we set the lemma from the fullform list.
333
+ # if there are multiple lemmas in the fullform list, then we check each lemma suggested by the model
334
+ # we pick the lemma amon the lemmas suggested by the fullformlist that comes the first among the lemmas suggested by model
335
+ # 4. OR, we set the first lemma suggested by the model
336
+ # 5. OR, just in case, one way or another if we cannot set a lemma, we set the word as the lemma
337
+
338
+ # Go over all words in the sentence
339
+ for i in range(len(tag)):
340
+
341
+ # If there is prop in tags
342
+ if tag[i]["t"] in self.prop_tag_classes:
343
+
344
+ # set the lemma as the word
345
+ tag[i]["l"]=tag[i]["w"]
346
+
347
+ # if there is gen in tags then remove the last Ss
348
+ if tag[i]["t"] in self.gen_tag_classes:
349
+ if tag[i]["l"].endswith("'s") or tag[i]["l"].endswith("'S"):
350
+ tag[i]["l"]=tag[i]["l"][:-2]
351
+ elif tag[i]["l"].endswith("s") or tag[i]["l"].endswith("S") or tag[i]["l"].endswith("'"):
352
+ tag[i]["l"]=tag[i]["l"][:-1]
353
+ continue
354
+
355
+ # if høflig
356
+ if tag[i]["w"]=="De":
357
+ if tag[i]["t"] in self.t_2_tag_classes:
358
+ tag[i]["l"]="De"
359
+ continue
360
+ else:
361
+ tag[i]["l"]="de"
362
+ continue
363
+
364
+ # for the rest of the cases of the word, lowercase the word and check against the fullform list
365
+ word=tag[i]["w"].lower()
366
+ word_class = self.tags[0][tag[i]["t"]][0]
367
+
368
+ # get the lemma from the fullform list
369
+ fullform_list_lemma = self.fullform_list[LANG].get(word, {}).get(word_class)
370
+
371
+ # if there is not a lemma in the fullformlist
372
+ # use the first lemma from the model
373
+ if fullform_list_lemma==None:
374
+ tag[i]["l"] = self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(tag[i]["w"], tag[i]["l"][0] )
375
+
376
+ # if there is only one fullformlist-lemma:
377
+ elif len(fullform_list_lemma) == 1:
378
+ tag[i]["l"] = next(iter(fullform_list_lemma))
379
+
380
+ # if there are multiple lemmas in the fullformlist
381
+ # here we disambugate among these lemmas using the alternatives from the model
382
+ elif len(fullform_list_lemma) > 1:
383
+ tag[i]["l"] = next((selected_lemma for x in tag[i]["l"] if (selected_lemma := self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(tag[i]["w"], x )) in fullform_list_lemma), self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(tag[i]["w"], tag[i]["l"][0] ) )
384
+
385
+ # This branch will probably not be called but kept just in case
386
+ # If none of the cases above, use the first lemma suggested by the model
387
+ else:
388
+ tag[i]["l"] = self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(tag[i]["w"], tag[i]["l"][0] )
389
+
390
+ # This if will probable not be true either but kept just in case
391
+ # If a lemma could not be assigned after all these operations
392
+ # then asign the word itself
393
+ # Check by if the lemma field is still a list or if the field-type is string the legth is 0
394
+ if type(tag[i]["l"]) == list or len(tag[i]["l"]) == 0:
395
+ tag[i]["l"] = tag[i]["w"]
396
+
397
+ return tag
398
+
399
  def tag_sentence_list(self, lst, **tag_config):
400
 
401
  # If the sentences are not tokenized, tokenize while batching:
 
413
  else:
414
  tokenized_batches = self._batchify(lst)
415
 
416
+ # If lemmatization will be applied
417
+ if tag_config["lemmatize"]:
418
+
419
+ # If language will be identified per sentence
420
+ if tag_config["lang_per_sentence"]:
421
+ id_to_lang = self.config["id_to_lang"]
422
+ # If the output will be to a python list
423
+ if tag_config["write_output_to"]==None:
424
+ all_tagged_sentences = []
425
+ for batch in tokenized_batches:
426
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
427
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
428
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
429
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
430
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
431
+ batch["input_ids"].to("cpu")
432
+ batch["attention_mask"].to("cpu")
433
+
434
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
435
+ batch_lemma_indices.indices.tolist(), batch_langs[:, 0].tolist()):
436
+ this_sentence=[]
437
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
438
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
439
+ break
440
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
441
+ if len(this_sentence)>0:
442
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
443
+ else:
444
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
445
+ else:
446
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
447
+ this_sentence = self._lemmatize(this_sentence, lang)
448
+ all_tagged_sentences.append({"lang":id_to_lang[lang], "sent": [ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":i["l"]} for i in this_sentence]})
449
+
450
+ return all_tagged_sentences
451
+
452
+ # If the output is in TSV format to a pipe (stdout or a file handle)
453
+ elif tag_config["output_tsv"]:
454
+ for batch in tokenized_batches:
455
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
456
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
457
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
458
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
459
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
460
+ batch["input_ids"].to("cpu")
461
+ batch["attention_mask"].to("cpu")
462
+
463
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
464
+ batch_lemma_indices.indices.tolist(), batch_langs[:, 0].tolist()):
465
+ this_sentence=[]
466
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
467
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
468
+ break
469
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
470
+ if len(this_sentence)>0:
471
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
472
+ else:
473
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
474
+ else:
475
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
476
+ this_sentence = self._lemmatize(this_sentence, lang)
477
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[lang][i["t"]], "l":i["l"]} for i in this_sentence]
478
+ tag_config["write_output_to"].write(id_to_lang[lang])
479
+ for lin in this_sentence:
480
+ tag_config["write_output_to"].write("\t")
481
+ tag_config["write_output_to"].write(lin["w"])
482
+ tag_config["write_output_to"].write("\t")
483
+ tag_config["write_output_to"].write(lin["l"])
484
+ tag_config["write_output_to"].write("\t")
485
+ tag_config["write_output_to"].write(lin["t"])
486
+ tag_config["write_output_to"].write("\n")
487
+ tag_config["write_output_to"].write("\n")
488
+
489
+ # If output format will be json to a pipe (stdout or a file handle)
490
+ else:
491
+ for batch in tokenized_batches:
492
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
493
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
494
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
495
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
496
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
497
+ batch["input_ids"].to("cpu")
498
+ batch["attention_mask"].to("cpu")
499
+
500
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
501
+ batch_lemma_indices.indices.tolist(), batch_langs[:, 0].tolist()):
502
+ this_sentence=[]
503
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
504
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
505
+ break
506
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
507
+ if len(this_sentence)>0:
508
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
509
+ else:
510
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
511
+ else:
512
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
513
+ this_sentence = self._lemmatize(this_sentence, lang)
514
+ json.dump({"lang":id_to_lang[lang], "sent":[ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":i["l"]} for i in this_sentence]}, tag_config["write_output_to"])
515
+ tag_config["write_output_to"].write("\n")
516
+
517
+ # If the language is set as parameter
518
+ elif tag_config["lang"] != -1:
519
+ LANG = tag_config["lang"]
520
+ LANG_STR = self.config["id_to_lang"][LANG]
521
+ # If the output will be to a python list
522
+ if tag_config["write_output_to"]==None:
523
+ all_tagged_sentences = []
524
+ for batch in tokenized_batches:
525
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
526
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
527
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
528
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
529
+ batch["input_ids"].to("cpu")
530
+ batch["attention_mask"].to("cpu")
531
+ for input_ids, tags, lemma_indices in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
532
+ batch_lemma_indices.indices.tolist()): #batch_lemmas.tolist(),
533
+ this_sentence=[]
534
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemma_indices[1:]):
535
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
536
+ break
537
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
538
+ if len(this_sentence)>0:
539
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
540
+ else:
541
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
542
+ else:
543
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
544
+
545
+ this_sentence = self._lemmatize(this_sentence, LANG)
546
+ all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence]})
547
+
548
+ return all_tagged_sentences
549
+
550
+ # If the output is in TSV format to a pipe (stdout or a file handle)
551
+ elif tag_config["output_tsv"]:
552
+ for batch in tokenized_batches:
553
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
554
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
555
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
556
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
557
+ batch["input_ids"].to("cpu")
558
+ batch["attention_mask"].to("cpu")
559
+
560
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
561
+ batch_lemma_indices.indices.tolist()):
562
+ this_sentence=[]
563
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
564
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
565
+ break
566
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
567
+ if len(this_sentence)>0:
568
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
569
+ else:
570
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
571
+ else:
572
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
573
+
574
+ this_sentence = self._lemmatize(this_sentence, LANG)
575
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":i["l"]} for i in this_sentence]
576
+ tag_config["write_output_to"].write(LANG_STR)
577
+ for lin in this_sentence:
578
+ tag_config["write_output_to"].write("\t")
579
+ tag_config["write_output_to"].write(lin["w"])
580
+ tag_config["write_output_to"].write("\t")
581
+ tag_config["write_output_to"].write(lin["l"])
582
+ tag_config["write_output_to"].write("\t")
583
+ tag_config["write_output_to"].write(lin["t"])
584
+ tag_config["write_output_to"].write("\n")
585
+ tag_config["write_output_to"].write("\n")
586
+
587
+ # If output format will be json to a pipe (stdout or a file handle)
588
+ else:
589
+ for batch in tokenized_batches:
590
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
591
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
592
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
593
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
594
+ batch["input_ids"].to("cpu")
595
+ batch["attention_mask"].to("cpu")
596
+
597
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
598
+ batch_lemma_indices.indices.tolist()):
599
+ this_sentence=[]
600
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
601
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
602
+ break
603
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
604
+ if len(this_sentence)>0:
605
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
606
+ else:
607
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
608
+ else:
609
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
610
+
611
+ this_sentence = self._lemmatize(this_sentence, LANG)
612
+ json.dump({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence]}, tag_config["write_output_to"])
613
+ tag_config["write_output_to"].write("\n")
614
+
615
+ # If language will be identified according to the majority of all sentences:
616
+ else:
617
+ all_tags=[]
618
+ all_lemmas=[]
619
+ all_langs=[]
620
+ all_input_ids=[]
621
+ # Go over all batches and each sentence in each batch
622
  for batch in tokenized_batches:
623
  all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
624
  batch_tags = torch.argmax(all_out["logits2"], dim=-1)
625
+ batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
626
+ #batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
627
  batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
628
+ all_input_ids.extend(batch["input_ids"].tolist())
629
  batch["input_ids"].to("cpu")
630
  batch["attention_mask"].to("cpu")
631
+ all_langs.extend(batch_langs[:, 0].tolist())
632
+ all_tags.extend(batch_tags.tolist())
633
+ all_lemmas.extend(batch_lemma_indices.indices.tolist())
634
 
635
+ # Identify the language
636
+ tag_config["lang"] = 1 if sum(all_langs)/len(all_langs)>=0.5 else 0
637
+ LANG = tag_config["lang"]
638
+ LANG_STR = self.config["id_to_lang"][LANG]
639
+
640
+ # If the output will be returned as python list:
641
+ if tag_config["write_output_to"]==None:
642
+ all_tagged_sentences = []
643
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
644
  this_sentence=[]
645
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
646
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
647
  break
648
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
649
  if len(this_sentence)>0:
650
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
651
  else:
652
  this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
653
  else:
654
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
 
655
 
656
+ this_sentence = self._lemmatize(this_sentence, LANG)
657
+ all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence] })
658
+ return all_tagged_sentences
 
 
 
 
 
 
 
 
659
 
660
+ # If the output is in TSV format
661
+ elif tag_config["output_tsv"]:
662
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
663
  this_sentence=[]
664
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
665
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
666
  break
667
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
668
  if len(this_sentence)>0:
669
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
670
  else:
671
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
672
  else:
673
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
674
+
675
+ this_sentence = self._lemmatize(this_sentence, LANG)
676
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":i["l"]} for i in this_sentence]
677
+ tag_config["write_output_to"].write(LANG_STR)
678
  for lin in this_sentence:
679
  tag_config["write_output_to"].write("\t")
680
  tag_config["write_output_to"].write(lin["w"])
 
685
  tag_config["write_output_to"].write("\n")
686
  tag_config["write_output_to"].write("\n")
687
 
688
+ # If output format will be json
689
+ else:
690
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
 
 
 
 
 
 
 
 
 
691
  this_sentence=[]
692
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
693
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
694
  break
695
+ if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
696
  if len(this_sentence)>0:
697
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
698
  else:
699
  this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
700
  else:
701
  this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
702
+
703
+ this_sentence = self._lemmatize(this_sentence, LANG)
704
+ json.dump({"lang":LANG_STR, "sent":[ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence]}, tag_config["write_output_to"])
705
  tag_config["write_output_to"].write("\n")
706
 
707
+ # If lemmatization will not be applied:
708
+ else:
709
+ # If language will be identified per sentence
710
+ if tag_config["lang_per_sentence"]:
711
+ id_to_lang = self.config["id_to_lang"]
712
+ # If the output will be to a python list
713
+ if tag_config["write_output_to"]==None:
714
+ all_tagged_sentences = []
715
+ for batch in tokenized_batches:
716
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
717
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
718
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
719
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
720
+ batch["input_ids"].to("cpu")
721
+ batch["attention_mask"].to("cpu")
722
+
723
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
724
+ batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
725
+ this_sentence=[]
726
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
727
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
728
+ break
729
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
730
+ if len(this_sentence)>0:
731
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
732
+ else:
733
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
734
+ else:
735
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
736
+ all_tagged_sentences.append({"lang":id_to_lang[lang], "sent": [ {"w":i["w"], "t":self.tags[lang][i["t"]]} for i in this_sentence]})
737
+
738
+ return all_tagged_sentences
739
+
740
+ # If the output is in TSV format to a pipe (stdout or a file handle)
741
+ elif tag_config["output_tsv"]:
742
+ for batch in tokenized_batches:
743
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
744
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
745
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
746
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
747
+ batch["input_ids"].to("cpu")
748
+ batch["attention_mask"].to("cpu")
749
+
750
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
751
+ batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
752
+ this_sentence=[]
753
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
754
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
755
+ break
756
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
757
+ if len(this_sentence)>0:
758
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
759
+ else:
760
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
761
+ else:
762
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
763
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[lang][i["t"]] } for i in this_sentence]
764
+ tag_config["write_output_to"].write(id_to_lang[lang])
765
+ for lin in this_sentence:
766
+ tag_config["write_output_to"].write("\t")
767
+ tag_config["write_output_to"].write(lin["w"])
768
+ tag_config["write_output_to"].write("\t")
769
+ tag_config["write_output_to"].write(lin["t"])
770
+ tag_config["write_output_to"].write("\n")
771
+ tag_config["write_output_to"].write("\n")
772
+
773
+ # If output format will be json to a pipe (stdout or a file handle)
774
+ else:
775
+ for batch in tokenized_batches:
776
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
777
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
778
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
779
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
780
+ batch["input_ids"].to("cpu")
781
+ batch["attention_mask"].to("cpu")
782
+
783
+ for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
784
+ batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
785
+ this_sentence=[]
786
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
787
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
788
+ break
789
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
790
+ if len(this_sentence)>0:
791
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
792
+ else:
793
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
794
+ else:
795
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
796
+
797
+ json.dump({"lang":id_to_lang[lang], "sent":[ {"w":i["w"], "t":self.tags[lang][i["t"]]} for i in this_sentence]}, tag_config["write_output_to"])
798
+ tag_config["write_output_to"].write("\n")
799
+
800
+ # If the language is set as parameter
801
+ elif tag_config["lang"] != -1:
802
+ LANG = tag_config["lang"]
803
+ LANG_STR = self.config["id_to_lang"][LANG]
804
+ # If the output will be to a python list
805
+ if tag_config["write_output_to"]==None:
806
+ all_tagged_sentences = []
807
+ for batch in tokenized_batches:
808
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
809
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
810
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
811
+ batch["input_ids"].to("cpu")
812
+ batch["attention_mask"].to("cpu")
813
+
814
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
815
+ batch_lemmas.tolist()):
816
+ this_sentence=[]
817
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
818
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
819
+ break
820
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
821
+ if len(this_sentence)>0:
822
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
823
+ else:
824
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
825
+ else:
826
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
827
+ all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence]})
828
+
829
+ return all_tagged_sentences
830
+
831
+ # If the output is in TSV format to a pipe (stdout or a file handle)
832
+ elif tag_config["output_tsv"]:
833
+ for batch in tokenized_batches:
834
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
835
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
836
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
837
+ batch["input_ids"].to("cpu")
838
+ batch["attention_mask"].to("cpu")
839
+
840
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
841
+ batch_lemmas.tolist()):
842
+ this_sentence=[]
843
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
844
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
845
+ break
846
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
847
+ if len(this_sentence)>0:
848
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
849
+ else:
850
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
851
+ else:
852
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
853
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]]} for i in this_sentence]
854
+ tag_config["write_output_to"].write(LANG_STR)
855
+ for lin in this_sentence:
856
+ tag_config["write_output_to"].write("\t")
857
+ tag_config["write_output_to"].write(lin["w"])
858
+ tag_config["write_output_to"].write("\t")
859
+ tag_config["write_output_to"].write(lin["t"])
860
+ tag_config["write_output_to"].write("\n")
861
+ tag_config["write_output_to"].write("\n")
862
+
863
+ # If output format will be json to a pipe (stdout or a file handle)
864
+ else:
865
+ for batch in tokenized_batches:
866
+ all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
867
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
868
+ batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
869
+ batch["input_ids"].to("cpu")
870
+ batch["attention_mask"].to("cpu")
871
+
872
+ for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
873
+ batch_lemmas.tolist()):
874
+ this_sentence=[]
875
+ for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
876
+ if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
877
+ break
878
+ if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
879
+ if len(this_sentence)>0:
880
+ this_sentence[-1]["w"] += self.tokenizer.decode(inps)
881
+ else:
882
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
883
+ else:
884
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
885
+
886
+ json.dump({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence]}, tag_config["write_output_to"])
887
+ tag_config["write_output_to"].write("\n")
888
+
889
+ # If language will be identified according to the majority of all sentences:
890
+ else:
891
+ all_tags=[]
892
+ all_lemmas=[]
893
+ all_langs=[]
894
+ all_input_ids=[]
895
+ # Go over all batches and each sentence in each batch
896
  for batch in tokenized_batches:
897
  all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
898
+ batch_tags = torch.argmax(all_out["logits2"], dim=-1)
899
  batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
900
+ batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
901
+ all_input_ids.extend(batch["input_ids"].tolist())
902
  batch["input_ids"].to("cpu")
903
  batch["attention_mask"].to("cpu")
904
+ all_langs.extend(batch_langs[:, 0].tolist())
905
+ all_tags.extend(batch_tags.tolist())
906
+ all_lemmas.extend(batch_lemmas.tolist())
907
+
908
+ # Identify the language
909
+ tag_config["lang"] = 1 if sum(all_langs)/len(all_langs)>=0.5 else 0
910
+ LANG = tag_config["lang"]
911
+ LANG_STR = self.config["id_to_lang"][LANG]
912
+
913
+ # If the output will be returned as python list:
914
+ if tag_config["write_output_to"]==None:
915
+ all_tagged_sentences = []
916
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
917
  this_sentence=[]
918
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
919
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
 
922
  if len(this_sentence)>0:
923
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
924
  else:
925
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
926
  else:
927
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
928
+ all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence] })
929
+ return all_tagged_sentences
 
 
 
 
 
 
 
 
 
 
930
 
931
+ # If the output is in TSV format
932
+ elif tag_config["output_tsv"]:
933
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
934
  this_sentence=[]
935
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
936
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
 
939
  if len(this_sentence)>0:
940
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
941
  else:
942
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
943
  else:
944
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
945
+ this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]]} for i in this_sentence]
946
  tag_config["write_output_to"].write(LANG_STR)
947
  for lin in this_sentence:
948
  tag_config["write_output_to"].write("\t")
949
  tag_config["write_output_to"].write(lin["w"])
950
  tag_config["write_output_to"].write("\t")
 
 
951
  tag_config["write_output_to"].write(lin["t"])
952
  tag_config["write_output_to"].write("\n")
953
  tag_config["write_output_to"].write("\n")
954
 
955
+ # If output format will be json
956
+ else:
957
+ for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
 
 
 
 
 
 
 
 
958
  this_sentence=[]
959
  for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
960
  if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
 
963
  if len(this_sentence)>0:
964
  this_sentence[-1]["w"] += self.tokenizer.decode(inps)
965
  else:
966
+ this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
967
  else:
968
+ this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
969
+
970
+ json.dump({"lang":LANG_STR, "sent":[ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence]}, tag_config["write_output_to"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
971
  tag_config["write_output_to"].write("\n")
 
972
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
973
 
974
  def _check_if_text_file_and_return_content(self, filepath):
975
  try:
 
980
 
981
  @torch.no_grad()
982
  def tag(self, inp=None, **tag_config):
983
+
984
  self.eval()
985
+
986
+ if "lemmatise" in tag_config and tag_config["lemmatise"]==False:
987
+ tag_config["lemmatize"] = False
988
+ if "lemmatise" in tag_config:
989
+ del tag_config["lemmatise"]
990
+ else:
991
+ tag_config["lemmatize"] = True
992
+ if "lemmatise" in tag_config:
993
+ del tag_config["lemmatise"]
994
+
995
+ if "lemmatize" in tag_config and tag_config["lemmatize"]==False:
996
+ tag_config["lemmatize"] = False
997
+
998
  if "one_sentence_per_line" not in tag_config:
999
  tag_config["one_sentence_per_line"]=False
1000
 
 
1039
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
1040
  if tag_config["one_sentence_per_line"]:
1041
  inp = [i for i in file_content.split("\n") if i!=""]
1042
+ inp = [" ".join(i.split()) for i in inp if i!=""]
1043
  with open(out_path, "w") as opened_file:
1044
  tag_config["write_output_to"] = opened_file
1045
  self.tag_sentence_list(inp, **tag_config)
 
1050
  self.tag_sentence_list(inp, **tag_config)
1051
  else:
1052
  print (f"Could not properly open and read {input_path}.")
1053
+ if write_to is not sys.stdout and write_to is not sys.stderr:
1054
+ write_to.close()
1055
  return
1056
 
1057
  else:
 
1069
  # Tag one sentence per line in a string
1070
  if tag_config["one_sentence_per_line"]:
1071
  inp = [i for i in inp.split("\n") if i!=""]
1072
+ inp = [" ".join(self._preprocess_text(i).split()) for i in inp if i!=""]
1073
  return self.tag_sentence_list(inp, **tag_config)
1074
 
1075
  # identify sentences
 
1079
  # Tag one sentence per list item
1080
  elif type(inp) == list:
1081
  inp=[i.strip() for i in inp]
1082
+ inp=[" ".join(self._preprocess_text(i).split()) for i in inp if i!=""]
1083
  return self.tag_sentence_list(inp, **tag_config)
1084
 
1085
  def identify_language_sentence_list(self, lst, **tag_config):
 
1122
 
1123
  @torch.no_grad()
1124
  def identify_language(self, inp=None, **tag_config):
1125
+
1126
  self.eval()
1127
+
1128
  if "one_sentence_per_line" not in tag_config:
1129
  tag_config["one_sentence_per_line"]=False
1130
+
1131
  if "lang" in tag_config:
1132
  del tag_config["lang"]
1133
 
 
1137
  if "lang_per_sentence" not in tag_config:
1138
  tag_config["lang_per_sentence"] = False
1139
 
1140
+ elif type(tag_config["lang_per_sentence"])==bool and tag_config["lang_per_sentence"]:
1141
  tag_config["lang_per_sentence"] = True
1142
 
1143
  if "input_directory" in tag_config and "output_directory" in tag_config and "write_output_to" in tag_config and tag_config["write_output_to"]!=None:
 
1193
  torch.cuda.empty_cache()
1194
 
1195
  if tag_config["write_output_to"]==None:
1196
+ general_output.extend([{"f":i[0], "lang":self.config["id_to_lang"][i[1]]} for i in zip(file_names, langs)])
1197
  elif tag_config["output_tsv"]:
1198
  for fil,lan in zip(file_names, langs):
1199
  tag_config["write_output_to"].write(fil)
 
1202
  tag_config["write_output_to"].write("\n")
1203
  else:
1204
  for fil,lan in zip(file_names, langs):
1205
+ json.dump({"f":fil, "lang":self.config["id_to_lang"][lan]})
1206
  file_names=[]
1207
  contents=[]
1208
  else:
 
1223
  torch.cuda.empty_cache()
1224
 
1225
  if tag_config["write_output_to"]==None:
1226
+ general_output.extend([{"f":i[0], "lang":self.config["id_to_lang"][i[1]]} for i in zip(file_names, langs)])
1227
  elif tag_config["output_tsv"]:
1228
  for fil,lan in zip(file_names, langs):
1229
  tag_config["write_output_to"].write(fil)
 
1232
  tag_config["write_output_to"].write("\n")
1233
  else:
1234
  for fil,lan in zip(file_names, langs):
1235
+ json.dump({"f":fil, "lang":self.config["id_to_lang"][lan]})
1236
 
1237
  return general_output if len(general_output)>0 else None
1238
 
 
1274
  opened_file.write(lan)
1275
  opened_file.write("\n")
1276
  else:
1277
+ json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , opened_file)
1278
  else:
1279
  if tag_config["output_tsv"]:
1280
  opened_file.write(out[0])
1281
  else:
1282
+ json.dump({"lang":out[0]} , opened_file)
1283
  else:
1284
  if tag_config["lang_per_sentence"]:
1285
+ general_output.extend([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ])
1286
  else:
1287
+ general_output.append({"f":input_path, "lang":out[0]})
1288
 
1289
  # If there is an opened pipe already
1290
  else:
 
1297
  tag_config["write_output_to"].write("\n")
1298
  tag_config["write_output_to"].write("\n")
1299
  else:
1300
+ json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
1301
  tag_config["write_output_to"].write("\n")
1302
  else:
1303
  if tag_config["output_tsv"]:
 
1306
  tag_config["write_output_to"].write(out[0])
1307
  tag_config["write_output_to"].write("\n")
1308
  else:
1309
+ json.dump({"f":input_path, "lang":out[0]} , tag_config["write_output_to"])
1310
  tag_config["write_output_to"].write("\n")
1311
 
1312
  else:
 
1316
  tag_config["write_output_to"].write("err")
1317
  tag_config["write_output_to"].write("\n")
1318
  else:
1319
+ json.dump({"f":input_path, "lang":"err"} , tag_config["write_output_to"])
1320
  tag_config["write_output_to"].write("\n")
1321
 
1322
+ if tag_config["write_output_to"] and tag_config["write_output_to"] is not sys.stdout and tag_config["write_output_to"] is not sys.stderr:
1323
  tag_config["write_output_to"].close()
1324
 
1325
  return general_output if len(general_output)>0 else None
 
1355
 
1356
  # If return as list
1357
  if tag_config["write_output_to"]==None:
1358
+ return [{"s":i[0], "lang": i[1]} for i in zip(inp, out)]
1359
 
1360
  if tag_config["output_tsv"]:
1361
  for sen,lan in zip(inp, out):
 
1364
  tag_config["write_output_to"].write(out)
1365
  tag_config["write_output_to"].write("\n")
1366
  else:
1367
+ json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
1368
 
1369
  return
1370
 
 
1376
 
1377
  # If return as list
1378
  if tag_config["write_output_to"]==None:
1379
+ return [{"s":i[0], "lang": i[1]} for i in zip(inp, out)]
1380
 
1381
  if tag_config["output_tsv"]:
1382
  for sen,lan in zip(inp, out):
 
1385
  tag_config["write_output_to"].write(lan)
1386
  tag_config["write_output_to"].write("\n")
1387
  else:
1388
+ json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
1389
 
1390
  return
1391
 
tagger_config.json CHANGED
The diff for this file is too large to render. See raw diff