Ahmet Yildirim committed on
Commit ·
d6d6f4f
1
Parent(s): ce2411a
- Update lemmatisering
Browse files- .gitattributes +1 -0
- README.md +2 -0
- fullform_list.json_large +3 -0
- modeling_humit_tagger.py +627 -205
- tagger_config.json +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.json_large filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -20,6 +20,7 @@ This specific version of the tagger is based on Norbert3-base.
|
|
| 20 |
The aim of this model is to make Humit-Tagger available as a HuggingFace model including all functionality that the [original code](https://github.com/humit-oslo/humit-tagger) supports.
|
| 21 |
In addition to the morphological tagging, this model supports Nynorsk/Bokmål language identification provided by this [repository](https://github.com/humit-oslo/humit-sprakidentifikator).
|
| 22 |
|
|
|
|
| 23 |
|
| 24 |
This model adds four classification layers on top of the base model.
|
| 25 |
These layers do language identification, morphologic classification, lemmatization classification, and sentence boundary detection.
|
|
@@ -75,6 +76,7 @@ These functions receive similar parameters.
|
|
| 75 |
| lang\_per\_sentence| yes | no | True / False | False | identify the language per sentence and output the tags according to the language identified for that sentence. If this is not set, and lang is "au" then the whole input (or a file if input\_directory is used) is used to identify the language. |
|
| 76 |
| write\_output\_to | yes | yes | a file path, a file handle, or "list" | sys.stdout | to specify where to write the output. If a file path is provided, the output will be written to that file. The file is overwritten. If a file handle is provided, then the output is written there. If "list" is given as the parameter, then the function returns a Python list. |
|
| 77 |
| output\_tsv | yes | yes | True/False | False | to specify the output format. The default is the json format. If multiple sentences exist, each line is a single valid json but not the whole output. This option cannot be used along with write\_output\_to="list" |
|
|
|
|
| 78 |
| lang\_per\_item | no | yes | True/False | False | consider each item in the list given as separate input for language identification. |
|
| 79 |
| fast\_mode | no | yes | True/False | False | identify languages of the files in the input directory in fast mode. This mode uses only the beginning of the files in identification. This method is much faster for many files but is not as accurate as when this parameter is set to False. |
|
| 80 |
|
|
|
|
| 20 |
The aim of this model is to make Humit-Tagger available as a HuggingFace model including all functionality that the [original code](https://github.com/humit-oslo/humit-tagger) supports.
|
| 21 |
In addition to the morphological tagging, this model supports Nynorsk/Bokmål language identification provided by this [repository](https://github.com/humit-oslo/humit-sprakidentifikator).
|
| 22 |
|
| 23 |
+
**If you do not need lemmatisation, we recommend giving the lemmatisation=False flag as parameter. This will save some cpu time and make the tagging faster.**
|
| 24 |
|
| 25 |
This model adds four classification layers on top of the base model.
|
| 26 |
These layers do language identification, morphologic classification, lemmatization classification, and sentence boundary detection.
|
|
|
|
| 76 |
| lang\_per\_sentence| yes | no | True / False | False | identify the language per sentence and output the tags according to the language identified for that sentence. If this is not set, and lang is "au" then the whole input (or a file if input\_directory is used) is used to identify the language. |
|
| 77 |
| write\_output\_to | yes | yes | a file path, a file handle, or "list" | sys.stdout | to specify where to write the output. If a file path is provided, the output will be written to that file. The file is overwritten. If a file handle is provided, then the output is written there. If "list" is given as the parameter, then the function returns a Python list. |
|
| 78 |
| output\_tsv | yes | yes | True/False | False | to specify the output format. The default is the json format. If multiple sentences exist, each line is a single valid json but not the whole output. This option cannot be used along with write\_output\_to="list" |
|
| 79 |
+
| lemmatisation | yes | no | True / False | True | to specify whether lemmatisation will be applied. Disabling lemmatisation by giving this parameter as False makes the tagger faster. |
|
| 80 |
| lang\_per\_item | no | yes | True/False | False | consider each item in the list given as separate input for language identification. |
|
| 81 |
| fast\_mode | no | yes | True/False | False | identify languages of the files in the input directory in fast mode. This mode uses only the beginning of the files in identification. This method is much faster for many files but is not as accurate as when this parameter is set to False. |
|
| 82 |
|
fullform_list.json_large
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a32e9d7c36ed2ba9ec7f080e118760c444277fb6f213172246d24711b0493433
|
| 3 |
+
size 240703613
|
modeling_humit_tagger.py
CHANGED
|
@@ -32,7 +32,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 32 |
kwargs["this_model_config"]=json.load(js)
|
| 33 |
|
| 34 |
|
| 35 |
-
# Download this model's
|
| 36 |
lemma_rules_path = hf_hub_download(repo_id=repo_name, filename=kwargs["config"].lemma_rules_py_file)
|
| 37 |
|
| 38 |
# load lemma rules class
|
|
@@ -46,6 +46,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 46 |
base_config_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_file"])
|
| 47 |
base_model_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_model_file"])
|
| 48 |
base_model_config_json_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_json_file"])
|
|
|
|
| 49 |
|
| 50 |
# Copy base model's configuration python file into our working directory
|
| 51 |
config_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)) , os.path.basename(base_config_file))
|
|
@@ -81,12 +82,13 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 81 |
|
| 82 |
kwargs["model_weights_path"] = model_weights_path
|
| 83 |
kwargs["repo_name"] = repo_name
|
|
|
|
| 84 |
return HumitTaggerModel(**kwargs)
|
| 85 |
|
| 86 |
def __init__(self, **kwargs ):
|
| 87 |
super(HumitTaggerModel, self).__init__()
|
| 88 |
json_cfg = kwargs["base_model_json_cfg"]
|
| 89 |
-
self.config=kwargs["this_model_config"]
|
| 90 |
self.LemmaHandling = sys.modules["lemma_rules"].LemmaHandling
|
| 91 |
self.LemmaHandling.load_lemma_rules_from_obj(self.config["lemma_rules"])
|
| 92 |
cfg=sys.modules["base_config"].NorbertConfig(**json_cfg)
|
|
@@ -117,6 +119,32 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 117 |
self.REPLACE_PATTERN = '|'.join(sorted(re.escape(k) for k in self.REPLACE_DICT))
|
| 118 |
self.MAX_LENGTH = self.bert.config.max_position_embeddings
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
def forward(self, input_ids=None, attention_mask=None ):
|
| 121 |
outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True )
|
| 122 |
sequence_output = self.dropout(outputs.last_hidden_state)
|
|
@@ -171,19 +199,24 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 171 |
}
|
| 172 |
batched_sentences.append(to_append)
|
| 173 |
|
| 174 |
-
torch.cuda.
|
|
|
|
| 175 |
|
| 176 |
return batched_sentences
|
| 177 |
|
| 178 |
def _split_sentences(self, inp):
|
| 179 |
|
|
|
|
|
|
|
|
|
|
| 180 |
# Here we get the whole text tokenized.
|
| 181 |
encodings = self.tokenizer(inp,add_special_tokens=False, return_tensors="pt").to(self.device)
|
| 182 |
|
| 183 |
# Save a copy of the tokenization
|
| 184 |
original_encodings=copy.deepcopy(encodings)
|
| 185 |
original_encodings=original_encodings.to("cpu")
|
| 186 |
-
torch.cuda.
|
|
|
|
| 187 |
|
| 188 |
# Pad to the complete size (model max_size -1 (-1 to add CLS))
|
| 189 |
old_size=encodings["input_ids"][0].size()[0]
|
|
@@ -225,13 +258,15 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 225 |
# First get them back to CPU to open space on GPU
|
| 226 |
input_ids_batched=[i.to("cpu") for i in input_ids_batched]
|
| 227 |
attention_mask_batched=[i.to("cpu") for i in attention_mask_batched]
|
| 228 |
-
torch.cuda.
|
|
|
|
| 229 |
|
| 230 |
for input_ids, attention_masks in zip(input_ids_batched, attention_mask_batched):
|
| 231 |
current_batch={"input_ids":input_ids.to(self.device).long(), "attention_mask":attention_masks.to(self.device).long()}
|
| 232 |
outputs = self(**current_batch)
|
| 233 |
del current_batch
|
| 234 |
-
torch.cuda.
|
|
|
|
| 235 |
|
| 236 |
label_data=outputs["logits1"].argmax(-1)
|
| 237 |
labels_output.extend(label_data)
|
|
@@ -240,7 +275,8 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 240 |
labels_output=torch.stack(labels_output ,dim=0)
|
| 241 |
labels_output=labels_output[:, range(1,self.MAX_LENGTH)]
|
| 242 |
labels_output=torch.reshape(labels_output,(1,row_count * self.MAX_LENGTH_WITHOUT_CLS))
|
| 243 |
-
torch.cuda.
|
|
|
|
| 244 |
|
| 245 |
# Now the data is split into sentences
|
| 246 |
# So, now create sentence data as list so that this could be used
|
|
@@ -265,7 +301,9 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 265 |
del old_size
|
| 266 |
del inp
|
| 267 |
del outputs
|
| 268 |
-
|
|
|
|
|
|
|
| 269 |
|
| 270 |
return sentence_list
|
| 271 |
|
|
@@ -279,6 +317,85 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 279 |
sentences.extend(self._split_sentences(i.strip()))
|
| 280 |
return sentences
|
| 281 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
def tag_sentence_list(self, lst, **tag_config):
|
| 283 |
|
| 284 |
# If the sentences are not tokenized, tokenize while batching:
|
|
@@ -296,62 +413,268 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 296 |
else:
|
| 297 |
tokenized_batches = self._batchify(lst)
|
| 298 |
|
| 299 |
-
# If
|
| 300 |
-
if tag_config["
|
| 301 |
-
|
| 302 |
-
# If
|
| 303 |
-
if tag_config["
|
| 304 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
for batch in tokenized_batches:
|
| 306 |
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 307 |
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 308 |
-
|
|
|
|
| 309 |
batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
|
|
|
|
| 310 |
batch["input_ids"].to("cpu")
|
| 311 |
batch["attention_mask"].to("cpu")
|
|
|
|
|
|
|
|
|
|
| 312 |
|
| 313 |
-
|
| 314 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
this_sentence=[]
|
| 316 |
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 317 |
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 318 |
break
|
| 319 |
-
if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 320 |
if len(this_sentence)>0:
|
| 321 |
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 322 |
else:
|
| 323 |
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
|
| 324 |
else:
|
| 325 |
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
| 326 |
-
all_tagged_sentences.append({"lang":id_to_lang[lang], "sent": [ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]})
|
| 327 |
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
elif tag_config["output_tsv"]:
|
| 332 |
-
for batch in tokenized_batches:
|
| 333 |
-
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 334 |
-
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 335 |
-
batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 336 |
-
batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
|
| 337 |
-
batch["input_ids"].to("cpu")
|
| 338 |
-
batch["attention_mask"].to("cpu")
|
| 339 |
|
| 340 |
-
|
| 341 |
-
|
|
|
|
| 342 |
this_sentence=[]
|
| 343 |
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 344 |
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 345 |
break
|
| 346 |
-
if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 347 |
if len(this_sentence)>0:
|
| 348 |
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 349 |
else:
|
| 350 |
-
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
|
| 351 |
else:
|
| 352 |
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
| 353 |
-
|
| 354 |
-
|
|
|
|
|
|
|
| 355 |
for lin in this_sentence:
|
| 356 |
tag_config["write_output_to"].write("\t")
|
| 357 |
tag_config["write_output_to"].write(lin["w"])
|
|
@@ -362,49 +685,235 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 362 |
tag_config["write_output_to"].write("\n")
|
| 363 |
tag_config["write_output_to"].write("\n")
|
| 364 |
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 369 |
-
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 370 |
-
batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 371 |
-
batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
|
| 372 |
-
batch["input_ids"].to("cpu")
|
| 373 |
-
batch["attention_mask"].to("cpu")
|
| 374 |
-
|
| 375 |
-
for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
|
| 376 |
-
batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
|
| 377 |
this_sentence=[]
|
| 378 |
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 379 |
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 380 |
break
|
| 381 |
-
if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 382 |
if len(this_sentence)>0:
|
| 383 |
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 384 |
else:
|
| 385 |
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
|
| 386 |
else:
|
| 387 |
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
| 388 |
-
|
| 389 |
-
|
|
|
|
| 390 |
tag_config["write_output_to"].write("\n")
|
| 391 |
|
| 392 |
-
# If
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 399 |
for batch in tokenized_batches:
|
| 400 |
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 401 |
-
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 402 |
batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
|
|
|
|
|
|
| 403 |
batch["input_ids"].to("cpu")
|
| 404 |
batch["attention_mask"].to("cpu")
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
this_sentence=[]
|
| 409 |
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 410 |
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
|
@@ -413,24 +922,15 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 413 |
if len(this_sentence)>0:
|
| 414 |
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 415 |
else:
|
| 416 |
-
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag
|
| 417 |
else:
|
| 418 |
-
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag
|
| 419 |
-
all_tagged_sentences.append({"lang":LANG_STR, "sent":
|
| 420 |
-
|
| 421 |
-
return all_tagged_sentences
|
| 422 |
-
|
| 423 |
-
# If the output is in TSV format to a pipe (stdout or a file handle)
|
| 424 |
-
elif tag_config["output_tsv"]:
|
| 425 |
-
for batch in tokenized_batches:
|
| 426 |
-
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 427 |
-
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 428 |
-
batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 429 |
-
batch["input_ids"].to("cpu")
|
| 430 |
-
batch["attention_mask"].to("cpu")
|
| 431 |
|
| 432 |
-
|
| 433 |
-
|
|
|
|
| 434 |
this_sentence=[]
|
| 435 |
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 436 |
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
|
@@ -439,32 +939,22 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 439 |
if len(this_sentence)>0:
|
| 440 |
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 441 |
else:
|
| 442 |
-
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag
|
| 443 |
else:
|
| 444 |
-
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag
|
| 445 |
-
this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]]
|
| 446 |
tag_config["write_output_to"].write(LANG_STR)
|
| 447 |
for lin in this_sentence:
|
| 448 |
tag_config["write_output_to"].write("\t")
|
| 449 |
tag_config["write_output_to"].write(lin["w"])
|
| 450 |
tag_config["write_output_to"].write("\t")
|
| 451 |
-
tag_config["write_output_to"].write(lin["l"])
|
| 452 |
-
tag_config["write_output_to"].write("\t")
|
| 453 |
tag_config["write_output_to"].write(lin["t"])
|
| 454 |
tag_config["write_output_to"].write("\n")
|
| 455 |
tag_config["write_output_to"].write("\n")
|
| 456 |
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 461 |
-
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 462 |
-
batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 463 |
-
batch["input_ids"].to("cpu")
|
| 464 |
-
batch["attention_mask"].to("cpu")
|
| 465 |
-
|
| 466 |
-
for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
|
| 467 |
-
batch_lemmas.tolist()):
|
| 468 |
this_sentence=[]
|
| 469 |
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 470 |
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
|
@@ -473,98 +963,13 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 473 |
if len(this_sentence)>0:
|
| 474 |
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 475 |
else:
|
| 476 |
-
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag
|
| 477 |
-
else:
|
| 478 |
-
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
| 479 |
-
|
| 480 |
-
json.dump({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]}, tag_config["write_output_to"])
|
| 481 |
-
tag_config["write_output_to"].write("\n")
|
| 482 |
-
|
| 483 |
-
# If language will be identified according to the majority of all sentences:
|
| 484 |
-
else:
|
| 485 |
-
all_tags=[]
|
| 486 |
-
all_lemmas=[]
|
| 487 |
-
all_langs=[]
|
| 488 |
-
all_input_ids=[]
|
| 489 |
-
# Go over all batches and each sentence in each batch
|
| 490 |
-
for batch in tokenized_batches:
|
| 491 |
-
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 492 |
-
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 493 |
-
batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 494 |
-
batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
|
| 495 |
-
all_input_ids.extend(batch["input_ids"].tolist())
|
| 496 |
-
batch["input_ids"].to("cpu")
|
| 497 |
-
batch["attention_mask"].to("cpu")
|
| 498 |
-
all_langs.extend(batch_langs[:, 0].tolist())
|
| 499 |
-
all_tags.extend(batch_tags.tolist())
|
| 500 |
-
all_lemmas.extend(batch_lemmas.tolist())
|
| 501 |
-
|
| 502 |
-
# Identify the language
|
| 503 |
-
tag_config["lang"] = 1 if sum(all_langs)/len(all_langs)>=0.5 else 0
|
| 504 |
-
LANG = tag_config["lang"]
|
| 505 |
-
LANG_STR = self.config["id_to_lang"][LANG]
|
| 506 |
-
|
| 507 |
-
# If the output will be returned as python list:
|
| 508 |
-
if tag_config["write_output_to"]==None:
|
| 509 |
-
all_tagged_sentences = []
|
| 510 |
-
for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
|
| 511 |
-
this_sentence=[]
|
| 512 |
-
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 513 |
-
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 514 |
-
break
|
| 515 |
-
if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 516 |
-
if len(this_sentence)>0:
|
| 517 |
-
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 518 |
else:
|
| 519 |
-
this_sentence.append({"w":
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence] })
|
| 523 |
-
return all_tagged_sentences
|
| 524 |
-
|
| 525 |
-
# If the output is in TSV format
|
| 526 |
-
elif tag_config["output_tsv"]:
|
| 527 |
-
for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
|
| 528 |
-
this_sentence=[]
|
| 529 |
-
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 530 |
-
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 531 |
-
break
|
| 532 |
-
if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 533 |
-
if len(this_sentence)>0:
|
| 534 |
-
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 535 |
-
else:
|
| 536 |
-
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
|
| 537 |
-
else:
|
| 538 |
-
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
| 539 |
-
this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]
|
| 540 |
-
tag_config["write_output_to"].write(LANG_STR)
|
| 541 |
-
for lin in this_sentence:
|
| 542 |
-
tag_config["write_output_to"].write("\t")
|
| 543 |
-
tag_config["write_output_to"].write(lin["w"])
|
| 544 |
-
tag_config["write_output_to"].write("\t")
|
| 545 |
-
tag_config["write_output_to"].write(lin["l"])
|
| 546 |
-
tag_config["write_output_to"].write("\t")
|
| 547 |
-
tag_config["write_output_to"].write(lin["t"])
|
| 548 |
tag_config["write_output_to"].write("\n")
|
| 549 |
-
tag_config["write_output_to"].write("\n")
|
| 550 |
|
| 551 |
-
# If output format will be json
|
| 552 |
-
else:
|
| 553 |
-
for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
|
| 554 |
-
this_sentence=[]
|
| 555 |
-
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 556 |
-
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 557 |
-
break
|
| 558 |
-
if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 559 |
-
if len(this_sentence)>0:
|
| 560 |
-
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 561 |
-
else:
|
| 562 |
-
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
|
| 563 |
-
else:
|
| 564 |
-
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
| 565 |
-
|
| 566 |
-
json.dump({"lang":LANG_STR, "sent":[ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":self.LemmaHandling.get_lemma_given_word_and_lemma_list_index(i["w"],i["l"])} for i in this_sentence]}, tag_config["write_output_to"])
|
| 567 |
-
tag_config["write_output_to"].write("\n")
|
| 568 |
|
| 569 |
def _check_if_text_file_and_return_content(self, filepath):
|
| 570 |
try:
|
|
@@ -575,7 +980,21 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 575 |
|
| 576 |
@torch.no_grad()
|
| 577 |
def tag(self, inp=None, **tag_config):
|
|
|
|
| 578 |
self.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 579 |
if "one_sentence_per_line" not in tag_config:
|
| 580 |
tag_config["one_sentence_per_line"]=False
|
| 581 |
|
|
@@ -620,7 +1039,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 620 |
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
| 621 |
if tag_config["one_sentence_per_line"]:
|
| 622 |
inp = [i for i in file_content.split("\n") if i!=""]
|
| 623 |
-
inp = [i for i in inp if i!=""]
|
| 624 |
with open(out_path, "w") as opened_file:
|
| 625 |
tag_config["write_output_to"] = opened_file
|
| 626 |
self.tag_sentence_list(inp, **tag_config)
|
|
@@ -631,8 +1050,8 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 631 |
self.tag_sentence_list(inp, **tag_config)
|
| 632 |
else:
|
| 633 |
print (f"Could not properly open and read {input_path}.")
|
| 634 |
-
|
| 635 |
-
|
| 636 |
return
|
| 637 |
|
| 638 |
else:
|
|
@@ -650,7 +1069,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 650 |
# Tag one sentence per line in a string
|
| 651 |
if tag_config["one_sentence_per_line"]:
|
| 652 |
inp = [i for i in inp.split("\n") if i!=""]
|
| 653 |
-
inp = [self._preprocess_text(i) for i in inp if i!=""]
|
| 654 |
return self.tag_sentence_list(inp, **tag_config)
|
| 655 |
|
| 656 |
# identify sentences
|
|
@@ -660,7 +1079,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 660 |
# Tag one sentence per list item
|
| 661 |
elif type(inp) == list:
|
| 662 |
inp=[i.strip() for i in inp]
|
| 663 |
-
inp=[self._preprocess_text(i) for i in inp if i!=""]
|
| 664 |
return self.tag_sentence_list(inp, **tag_config)
|
| 665 |
|
| 666 |
def identify_language_sentence_list(self, lst, **tag_config):
|
|
@@ -703,9 +1122,12 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 703 |
|
| 704 |
@torch.no_grad()
|
| 705 |
def identify_language(self, inp=None, **tag_config):
|
|
|
|
| 706 |
self.eval()
|
|
|
|
| 707 |
if "one_sentence_per_line" not in tag_config:
|
| 708 |
tag_config["one_sentence_per_line"]=False
|
|
|
|
| 709 |
if "lang" in tag_config:
|
| 710 |
del tag_config["lang"]
|
| 711 |
|
|
@@ -715,7 +1137,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 715 |
if "lang_per_sentence" not in tag_config:
|
| 716 |
tag_config["lang_per_sentence"] = False
|
| 717 |
|
| 718 |
-
elif tag_config["lang_per_sentence"]:
|
| 719 |
tag_config["lang_per_sentence"] = True
|
| 720 |
|
| 721 |
if "input_directory" in tag_config and "output_directory" in tag_config and "write_output_to" in tag_config and tag_config["write_output_to"]!=None:
|
|
@@ -771,7 +1193,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 771 |
torch.cuda.empty_cache()
|
| 772 |
|
| 773 |
if tag_config["write_output_to"]==None:
|
| 774 |
-
general_output.extend([{"f":i[0], "
|
| 775 |
elif tag_config["output_tsv"]:
|
| 776 |
for fil,lan in zip(file_names, langs):
|
| 777 |
tag_config["write_output_to"].write(fil)
|
|
@@ -780,7 +1202,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 780 |
tag_config["write_output_to"].write("\n")
|
| 781 |
else:
|
| 782 |
for fil,lan in zip(file_names, langs):
|
| 783 |
-
json.dump({"f":fil, "
|
| 784 |
file_names=[]
|
| 785 |
contents=[]
|
| 786 |
else:
|
|
@@ -801,7 +1223,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 801 |
torch.cuda.empty_cache()
|
| 802 |
|
| 803 |
if tag_config["write_output_to"]==None:
|
| 804 |
-
general_output.extend([{"f":i[0], "
|
| 805 |
elif tag_config["output_tsv"]:
|
| 806 |
for fil,lan in zip(file_names, langs):
|
| 807 |
tag_config["write_output_to"].write(fil)
|
|
@@ -810,7 +1232,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 810 |
tag_config["write_output_to"].write("\n")
|
| 811 |
else:
|
| 812 |
for fil,lan in zip(file_names, langs):
|
| 813 |
-
json.dump({"f":fil, "
|
| 814 |
|
| 815 |
return general_output if len(general_output)>0 else None
|
| 816 |
|
|
@@ -852,17 +1274,17 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 852 |
opened_file.write(lan)
|
| 853 |
opened_file.write("\n")
|
| 854 |
else:
|
| 855 |
-
json.dump([{"s":sen, "
|
| 856 |
else:
|
| 857 |
if tag_config["output_tsv"]:
|
| 858 |
opened_file.write(out[0])
|
| 859 |
else:
|
| 860 |
-
json.dump({"
|
| 861 |
else:
|
| 862 |
if tag_config["lang_per_sentence"]:
|
| 863 |
-
general_output.extend([{"s":sen, "
|
| 864 |
else:
|
| 865 |
-
general_output.append({"f":input_path, "
|
| 866 |
|
| 867 |
# If there is an opened pipe already
|
| 868 |
else:
|
|
@@ -875,7 +1297,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 875 |
tag_config["write_output_to"].write("\n")
|
| 876 |
tag_config["write_output_to"].write("\n")
|
| 877 |
else:
|
| 878 |
-
json.dump([{"s":sen, "
|
| 879 |
tag_config["write_output_to"].write("\n")
|
| 880 |
else:
|
| 881 |
if tag_config["output_tsv"]:
|
|
@@ -884,7 +1306,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 884 |
tag_config["write_output_to"].write(out[0])
|
| 885 |
tag_config["write_output_to"].write("\n")
|
| 886 |
else:
|
| 887 |
-
json.dump({"f":input_path, "
|
| 888 |
tag_config["write_output_to"].write("\n")
|
| 889 |
|
| 890 |
else:
|
|
@@ -894,10 +1316,10 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 894 |
tag_config["write_output_to"].write("err")
|
| 895 |
tag_config["write_output_to"].write("\n")
|
| 896 |
else:
|
| 897 |
-
json.dump({"f":input_path, "
|
| 898 |
tag_config["write_output_to"].write("\n")
|
| 899 |
|
| 900 |
-
if tag_config["write_output_to"] and tag_config["write_output_to"]
|
| 901 |
tag_config["write_output_to"].close()
|
| 902 |
|
| 903 |
return general_output if len(general_output)>0 else None
|
|
@@ -933,7 +1355,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 933 |
|
| 934 |
# If return as list
|
| 935 |
if tag_config["write_output_to"]==None:
|
| 936 |
-
return [{"s":i[0], "
|
| 937 |
|
| 938 |
if tag_config["output_tsv"]:
|
| 939 |
for sen,lan in zip(inp, out):
|
|
@@ -942,7 +1364,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 942 |
tag_config["write_output_to"].write(out)
|
| 943 |
tag_config["write_output_to"].write("\n")
|
| 944 |
else:
|
| 945 |
-
json.dump([{"s":sen, "
|
| 946 |
|
| 947 |
return
|
| 948 |
|
|
@@ -954,7 +1376,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 954 |
|
| 955 |
# If return as list
|
| 956 |
if tag_config["write_output_to"]==None:
|
| 957 |
-
return [{"s":i[0], "
|
| 958 |
|
| 959 |
if tag_config["output_tsv"]:
|
| 960 |
for sen,lan in zip(inp, out):
|
|
@@ -963,7 +1385,7 @@ class HumitTaggerModel(torch.nn.Module):
|
|
| 963 |
tag_config["write_output_to"].write(lan)
|
| 964 |
tag_config["write_output_to"].write("\n")
|
| 965 |
else:
|
| 966 |
-
json.dump([{"s":sen, "
|
| 967 |
|
| 968 |
return
|
| 969 |
|
|
|
|
| 32 |
kwargs["this_model_config"]=json.load(js)
|
| 33 |
|
| 34 |
|
| 35 |
+
# Download this model's lemma rules pickle file:
|
| 36 |
lemma_rules_path = hf_hub_download(repo_id=repo_name, filename=kwargs["config"].lemma_rules_py_file)
|
| 37 |
|
| 38 |
# load lemma rules class
|
|
|
|
| 46 |
base_config_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_file"])
|
| 47 |
base_model_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_model_file"])
|
| 48 |
base_model_config_json_file = hf_hub_download(repo_id=kwargs["this_model_config"]["base_model"], filename=kwargs["this_model_config"]["base_model_config_json_file"])
|
| 49 |
+
fullformlist_file = hf_hub_download(repo_id=repo_name, filename=kwargs["this_model_config"]["fullformlist_file"])
|
| 50 |
|
| 51 |
# Copy base model's configuration python file into our working directory
|
| 52 |
config_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)) , os.path.basename(base_config_file))
|
|
|
|
| 82 |
|
| 83 |
kwargs["model_weights_path"] = model_weights_path
|
| 84 |
kwargs["repo_name"] = repo_name
|
| 85 |
+
kwargs["fullformlist_file"] = fullformlist_file
|
| 86 |
return HumitTaggerModel(**kwargs)
|
| 87 |
|
| 88 |
def __init__(self, **kwargs ):
|
| 89 |
super(HumitTaggerModel, self).__init__()
|
| 90 |
json_cfg = kwargs["base_model_json_cfg"]
|
| 91 |
+
self.config = kwargs["this_model_config"]
|
| 92 |
self.LemmaHandling = sys.modules["lemma_rules"].LemmaHandling
|
| 93 |
self.LemmaHandling.load_lemma_rules_from_obj(self.config["lemma_rules"])
|
| 94 |
cfg=sys.modules["base_config"].NorbertConfig(**json_cfg)
|
|
|
|
| 119 |
self.REPLACE_PATTERN = '|'.join(sorted(re.escape(k) for k in self.REPLACE_DICT))
|
| 120 |
self.MAX_LENGTH = self.bert.config.max_position_embeddings
|
| 121 |
|
| 122 |
+
# Note the classes that represents gen and prop tags
|
| 123 |
+
self.gen_tag_classes = set()
|
| 124 |
+
self.prop_tag_classes = set()
|
| 125 |
+
self.t_2_tag_classes = set()
|
| 126 |
+
|
| 127 |
+
for i, lst in enumerate(self.config["tags"][0]):
|
| 128 |
+
if "gen" in lst:
|
| 129 |
+
self.gen_tag_classes.add(i)
|
| 130 |
+
if "prop" in lst:
|
| 131 |
+
self.prop_tag_classes.add(i)
|
| 132 |
+
if "2" in lst:
|
| 133 |
+
self.t_2_tag_classes.add(i)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
# Load the fullform list
|
| 137 |
+
self.fullform_list=[{},{}]
|
| 138 |
+
try:
|
| 139 |
+
with open(kwargs["fullformlist_file"], 'r') as f:
|
| 140 |
+
self.fullform_list = json.load(f)
|
| 141 |
+
for k in range(2):
|
| 142 |
+
for i in self.fullform_list[k]:
|
| 143 |
+
for j in self.fullform_list[k][i][j]:
|
| 144 |
+
self.fullform_list[k][i][j]=set(self.fullform_list[k][i][j])
|
| 145 |
+
except:
|
| 146 |
+
pass
|
| 147 |
+
|
| 148 |
def forward(self, input_ids=None, attention_mask=None ):
|
| 149 |
outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True )
|
| 150 |
sequence_output = self.dropout(outputs.last_hidden_state)
|
|
|
|
| 199 |
}
|
| 200 |
batched_sentences.append(to_append)
|
| 201 |
|
| 202 |
+
if torch.cuda.is_available():
|
| 203 |
+
torch.cuda.empty_cache()
|
| 204 |
|
| 205 |
return batched_sentences
|
| 206 |
|
| 207 |
def _split_sentences(self, inp):
|
| 208 |
|
| 209 |
+
# Remove double spaces
|
| 210 |
+
inp=" ".join(inp.split())
|
| 211 |
+
|
| 212 |
# Here we get the whole text tokenized.
|
| 213 |
encodings = self.tokenizer(inp,add_special_tokens=False, return_tensors="pt").to(self.device)
|
| 214 |
|
| 215 |
# Save a copy of the tokenization
|
| 216 |
original_encodings=copy.deepcopy(encodings)
|
| 217 |
original_encodings=original_encodings.to("cpu")
|
| 218 |
+
if torch.cuda.is_available():
|
| 219 |
+
torch.cuda.empty_cache()
|
| 220 |
|
| 221 |
# Pad to the complete size (model max_size -1 (-1 to add CLS))
|
| 222 |
old_size=encodings["input_ids"][0].size()[0]
|
|
|
|
| 258 |
# First get them back to CPU to open space on GPU
|
| 259 |
input_ids_batched=[i.to("cpu") for i in input_ids_batched]
|
| 260 |
attention_mask_batched=[i.to("cpu") for i in attention_mask_batched]
|
| 261 |
+
if torch.cuda.is_available():
|
| 262 |
+
torch.cuda.empty_cache()
|
| 263 |
|
| 264 |
for input_ids, attention_masks in zip(input_ids_batched, attention_mask_batched):
|
| 265 |
current_batch={"input_ids":input_ids.to(self.device).long(), "attention_mask":attention_masks.to(self.device).long()}
|
| 266 |
outputs = self(**current_batch)
|
| 267 |
del current_batch
|
| 268 |
+
if torch.cuda.is_available():
|
| 269 |
+
torch.cuda.empty_cache()
|
| 270 |
|
| 271 |
label_data=outputs["logits1"].argmax(-1)
|
| 272 |
labels_output.extend(label_data)
|
|
|
|
| 275 |
labels_output=torch.stack(labels_output ,dim=0)
|
| 276 |
labels_output=labels_output[:, range(1,self.MAX_LENGTH)]
|
| 277 |
labels_output=torch.reshape(labels_output,(1,row_count * self.MAX_LENGTH_WITHOUT_CLS))
|
| 278 |
+
if torch.cuda.is_available():
|
| 279 |
+
torch.cuda.empty_cache()
|
| 280 |
|
| 281 |
# Now the data is split into sentences
|
| 282 |
# So, now create sentence data as list so that this could be used
|
|
|
|
| 301 |
del old_size
|
| 302 |
del inp
|
| 303 |
del outputs
|
| 304 |
+
|
| 305 |
+
if torch.cuda.is_available():
|
| 306 |
+
torch.cuda.empty_cache()
|
| 307 |
|
| 308 |
return sentence_list
|
| 309 |
|
|
|
|
| 317 |
sentences.extend(self._split_sentences(i.strip()))
|
| 318 |
return sentences
|
| 319 |
|
| 320 |
+
def _lemmatize(self, tag, LANG):
    """Resolve the final lemma of every word in one tagged sentence.

    ``tag`` is a list of dicts, one per word, with the keys ``"w"`` (the
    word form), ``"t"`` (the index of the predicted tag class) and ``"l"``
    (the model's ordered list of lemma-rule indices).  Each dict is updated
    in place so that ``"l"`` ends up holding the lemma string, and the same
    list is returned.

    ``LANG`` selects which of the fullform dictionaries in
    ``self.fullform_list`` is consulted.

    Resolution order for each word:

    1. Tag class is in ``self.prop_tag_classes``: the lemma is the word
       itself.  If the class is also in ``self.gen_tag_classes`` a
       trailing ``'s``/``'S`` or ``s``/``S``/``'`` is removed.
    2. The word is exactly ``"De"``: the lemma is ``"De"`` when the tag
       class is in ``self.t_2_tag_classes``, otherwise ``"de"``.
    3. The lower-cased word and its word class are looked up in the
       fullform list.  A single entry is used directly; with several
       entries, the first model candidate whose lemma is among them wins.
    4. Otherwise the model's first candidate is used.
    5. If the result is still a list or is empty, the word itself is used.
    """
    get_lemma = self.LemmaHandling.get_lemma_given_word_and_lemma_list_index

    for entry in tag:
        word = entry["w"]
        tag_class = entry["t"]

        # 1. Proper nouns keep their surface form (minus a genitive ending).
        if tag_class in self.prop_tag_classes:
            lemma = word
            if tag_class in self.gen_tag_classes:
                if lemma.endswith(("'s", "'S")):
                    lemma = lemma[:-2]
                elif lemma.endswith(("s", "S", "'")):
                    lemma = lemma[:-1]
            # Stripping the ending from a word that consists only of that
            # ending would leave an empty lemma; keep the word instead.
            entry["l"] = lemma if lemma else word
            continue

        # 2. "De" is only kept capitalised for tag classes containing "2".
        if word == "De":
            entry["l"] = "De" if tag_class in self.t_2_tag_classes else "de"
            continue

        # 3. Look the lower-cased word up under its word class, which is the
        #    first element of the tag list for this class.
        candidates = entry["l"]
        word_class = self.tags[0][tag_class][0]
        known_lemmas = self.fullform_list[LANG].get(word.lower(), {}).get(word_class)

        lemma = None
        if known_lemmas is not None:
            if len(known_lemmas) == 1:
                lemma = next(iter(known_lemmas))
            elif len(known_lemmas) > 1:
                # Disambiguate: take the first model candidate that the
                # fullform list also knows.
                for rule_index in candidates:
                    candidate = get_lemma(word, rule_index)
                    if candidate in known_lemmas:
                        lemma = candidate
                        break

        # 4. No usable fullform entry: trust the model's best candidate.
        if lemma is None:
            lemma = get_lemma(word, candidates[0])

        # 5. Safety net: never emit a list or an empty lemma.
        if isinstance(lemma, list) or not lemma:
            lemma = word

        entry["l"] = lemma

    return tag
|
| 398 |
+
|
| 399 |
def tag_sentence_list(self, lst, **tag_config):
|
| 400 |
|
| 401 |
# If the sentences are not tokenized, tokenize while batching:
|
|
|
|
| 413 |
else:
|
| 414 |
tokenized_batches = self._batchify(lst)
|
| 415 |
|
| 416 |
+
# If lemmatization will be applied
|
| 417 |
+
if tag_config["lemmatize"]:
|
| 418 |
+
|
| 419 |
+
# If language will be identified per sentence
|
| 420 |
+
if tag_config["lang_per_sentence"]:
|
| 421 |
+
id_to_lang = self.config["id_to_lang"]
|
| 422 |
+
# If the output will be to a python list
|
| 423 |
+
if tag_config["write_output_to"]==None:
|
| 424 |
+
all_tagged_sentences = []
|
| 425 |
+
for batch in tokenized_batches:
|
| 426 |
+
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 427 |
+
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 428 |
+
batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
|
| 429 |
+
#batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 430 |
+
batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
|
| 431 |
+
batch["input_ids"].to("cpu")
|
| 432 |
+
batch["attention_mask"].to("cpu")
|
| 433 |
+
|
| 434 |
+
for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
|
| 435 |
+
batch_lemma_indices.indices.tolist(), batch_langs[:, 0].tolist()):
|
| 436 |
+
this_sentence=[]
|
| 437 |
+
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 438 |
+
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 439 |
+
break
|
| 440 |
+
if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 441 |
+
if len(this_sentence)>0:
|
| 442 |
+
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 443 |
+
else:
|
| 444 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
|
| 445 |
+
else:
|
| 446 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
| 447 |
+
this_sentence = self._lemmatize(this_sentence, lang)
|
| 448 |
+
all_tagged_sentences.append({"lang":id_to_lang[lang], "sent": [ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":i["l"]} for i in this_sentence]})
|
| 449 |
+
|
| 450 |
+
return all_tagged_sentences
|
| 451 |
+
|
| 452 |
+
# If the output is in TSV format to a pipe (stdout or a file handle)
|
| 453 |
+
elif tag_config["output_tsv"]:
|
| 454 |
+
for batch in tokenized_batches:
|
| 455 |
+
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 456 |
+
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 457 |
+
batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
|
| 458 |
+
#batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 459 |
+
batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
|
| 460 |
+
batch["input_ids"].to("cpu")
|
| 461 |
+
batch["attention_mask"].to("cpu")
|
| 462 |
+
|
| 463 |
+
for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
|
| 464 |
+
batch_lemma_indices.indices.tolist(), batch_langs[:, 0].tolist()):
|
| 465 |
+
this_sentence=[]
|
| 466 |
+
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 467 |
+
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 468 |
+
break
|
| 469 |
+
if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 470 |
+
if len(this_sentence)>0:
|
| 471 |
+
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 472 |
+
else:
|
| 473 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
|
| 474 |
+
else:
|
| 475 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
| 476 |
+
this_sentence = self._lemmatize(this_sentence, lang)
|
| 477 |
+
this_sentence=[ {"w":i["w"], "t":self.tags_str[lang][i["t"]], "l":i["l"]} for i in this_sentence]
|
| 478 |
+
tag_config["write_output_to"].write(id_to_lang[lang])
|
| 479 |
+
for lin in this_sentence:
|
| 480 |
+
tag_config["write_output_to"].write("\t")
|
| 481 |
+
tag_config["write_output_to"].write(lin["w"])
|
| 482 |
+
tag_config["write_output_to"].write("\t")
|
| 483 |
+
tag_config["write_output_to"].write(lin["l"])
|
| 484 |
+
tag_config["write_output_to"].write("\t")
|
| 485 |
+
tag_config["write_output_to"].write(lin["t"])
|
| 486 |
+
tag_config["write_output_to"].write("\n")
|
| 487 |
+
tag_config["write_output_to"].write("\n")
|
| 488 |
+
|
| 489 |
+
# If output format will be json to a pipe (stdout or a file handle)
|
| 490 |
+
else:
|
| 491 |
+
for batch in tokenized_batches:
|
| 492 |
+
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 493 |
+
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 494 |
+
batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
|
| 495 |
+
#batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 496 |
+
batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
|
| 497 |
+
batch["input_ids"].to("cpu")
|
| 498 |
+
batch["attention_mask"].to("cpu")
|
| 499 |
+
|
| 500 |
+
for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
|
| 501 |
+
batch_lemma_indices.indices.tolist(), batch_langs[:, 0].tolist()):
|
| 502 |
+
this_sentence=[]
|
| 503 |
+
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 504 |
+
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 505 |
+
break
|
| 506 |
+
if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 507 |
+
if len(this_sentence)>0:
|
| 508 |
+
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 509 |
+
else:
|
| 510 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
|
| 511 |
+
else:
|
| 512 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
| 513 |
+
this_sentence = self._lemmatize(this_sentence, lang)
|
| 514 |
+
json.dump({"lang":id_to_lang[lang], "sent":[ {"w":i["w"], "t":self.tags[lang][i["t"]], "l":i["l"]} for i in this_sentence]}, tag_config["write_output_to"])
|
| 515 |
+
tag_config["write_output_to"].write("\n")
|
| 516 |
+
|
| 517 |
+
# If the language is set as parameter
|
| 518 |
+
elif tag_config["lang"] != -1:
|
| 519 |
+
LANG = tag_config["lang"]
|
| 520 |
+
LANG_STR = self.config["id_to_lang"][LANG]
|
| 521 |
+
# If the output will be to a python list
|
| 522 |
+
if tag_config["write_output_to"]==None:
|
| 523 |
+
all_tagged_sentences = []
|
| 524 |
+
for batch in tokenized_batches:
|
| 525 |
+
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 526 |
+
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 527 |
+
batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
|
| 528 |
+
#batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 529 |
+
batch["input_ids"].to("cpu")
|
| 530 |
+
batch["attention_mask"].to("cpu")
|
| 531 |
+
for input_ids, tags, lemma_indices in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
|
| 532 |
+
batch_lemma_indices.indices.tolist()): #batch_lemmas.tolist(),
|
| 533 |
+
this_sentence=[]
|
| 534 |
+
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemma_indices[1:]):
|
| 535 |
+
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 536 |
+
break
|
| 537 |
+
if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 538 |
+
if len(this_sentence)>0:
|
| 539 |
+
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 540 |
+
else:
|
| 541 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
|
| 542 |
+
else:
|
| 543 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
| 544 |
+
|
| 545 |
+
this_sentence = self._lemmatize(this_sentence, LANG)
|
| 546 |
+
all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence]})
|
| 547 |
+
|
| 548 |
+
return all_tagged_sentences
|
| 549 |
+
|
| 550 |
+
# If the output is in TSV format to a pipe (stdout or a file handle)
|
| 551 |
+
elif tag_config["output_tsv"]:
|
| 552 |
+
for batch in tokenized_batches:
|
| 553 |
+
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 554 |
+
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 555 |
+
batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
|
| 556 |
+
#batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 557 |
+
batch["input_ids"].to("cpu")
|
| 558 |
+
batch["attention_mask"].to("cpu")
|
| 559 |
+
|
| 560 |
+
for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
|
| 561 |
+
batch_lemma_indices.indices.tolist()):
|
| 562 |
+
this_sentence=[]
|
| 563 |
+
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 564 |
+
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 565 |
+
break
|
| 566 |
+
if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 567 |
+
if len(this_sentence)>0:
|
| 568 |
+
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 569 |
+
else:
|
| 570 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
|
| 571 |
+
else:
|
| 572 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
| 573 |
+
|
| 574 |
+
this_sentence = self._lemmatize(this_sentence, LANG)
|
| 575 |
+
this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":i["l"]} for i in this_sentence]
|
| 576 |
+
tag_config["write_output_to"].write(LANG_STR)
|
| 577 |
+
for lin in this_sentence:
|
| 578 |
+
tag_config["write_output_to"].write("\t")
|
| 579 |
+
tag_config["write_output_to"].write(lin["w"])
|
| 580 |
+
tag_config["write_output_to"].write("\t")
|
| 581 |
+
tag_config["write_output_to"].write(lin["l"])
|
| 582 |
+
tag_config["write_output_to"].write("\t")
|
| 583 |
+
tag_config["write_output_to"].write(lin["t"])
|
| 584 |
+
tag_config["write_output_to"].write("\n")
|
| 585 |
+
tag_config["write_output_to"].write("\n")
|
| 586 |
+
|
| 587 |
+
# If output format will be json to a pipe (stdout or a file handle)
|
| 588 |
+
else:
|
| 589 |
+
for batch in tokenized_batches:
|
| 590 |
+
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 591 |
+
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 592 |
+
batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
|
| 593 |
+
#batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 594 |
+
batch["input_ids"].to("cpu")
|
| 595 |
+
batch["attention_mask"].to("cpu")
|
| 596 |
+
|
| 597 |
+
for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
|
| 598 |
+
batch_lemma_indices.indices.tolist()):
|
| 599 |
+
this_sentence=[]
|
| 600 |
+
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 601 |
+
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 602 |
+
break
|
| 603 |
+
if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 604 |
+
if len(this_sentence)>0:
|
| 605 |
+
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 606 |
+
else:
|
| 607 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
|
| 608 |
+
else:
|
| 609 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
| 610 |
+
|
| 611 |
+
this_sentence = self._lemmatize(this_sentence, LANG)
|
| 612 |
+
json.dump({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence]}, tag_config["write_output_to"])
|
| 613 |
+
tag_config["write_output_to"].write("\n")
|
| 614 |
+
|
| 615 |
+
# If language will be identified according to the majority of all sentences:
|
| 616 |
+
else:
|
| 617 |
+
all_tags=[]
|
| 618 |
+
all_lemmas=[]
|
| 619 |
+
all_langs=[]
|
| 620 |
+
all_input_ids=[]
|
| 621 |
+
# Go over all batches and each sentence in each batch
|
| 622 |
for batch in tokenized_batches:
|
| 623 |
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 624 |
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 625 |
+
batch_lemma_indices = torch.topk(all_out["logits3"].flatten(start_dim=2, end_dim=2), len(self.LemmaHandling.lemma_list))
|
| 626 |
+
#batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 627 |
batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
|
| 628 |
+
all_input_ids.extend(batch["input_ids"].tolist())
|
| 629 |
batch["input_ids"].to("cpu")
|
| 630 |
batch["attention_mask"].to("cpu")
|
| 631 |
+
all_langs.extend(batch_langs[:, 0].tolist())
|
| 632 |
+
all_tags.extend(batch_tags.tolist())
|
| 633 |
+
all_lemmas.extend(batch_lemma_indices.indices.tolist())
|
| 634 |
|
| 635 |
+
# Identify the language
|
| 636 |
+
tag_config["lang"] = 1 if sum(all_langs)/len(all_langs)>=0.5 else 0
|
| 637 |
+
LANG = tag_config["lang"]
|
| 638 |
+
LANG_STR = self.config["id_to_lang"][LANG]
|
| 639 |
+
|
| 640 |
+
# If the output will be returned as python list:
|
| 641 |
+
if tag_config["write_output_to"]==None:
|
| 642 |
+
all_tagged_sentences = []
|
| 643 |
+
for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
|
| 644 |
this_sentence=[]
|
| 645 |
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 646 |
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 647 |
break
|
| 648 |
+
if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 649 |
if len(this_sentence)>0:
|
| 650 |
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 651 |
else:
|
| 652 |
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
|
| 653 |
else:
|
| 654 |
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
|
|
|
| 655 |
|
| 656 |
+
this_sentence = self._lemmatize(this_sentence, LANG)
|
| 657 |
+
all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence] })
|
| 658 |
+
return all_tagged_sentences
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 659 |
|
| 660 |
+
# If the output is in TSV format
|
| 661 |
+
elif tag_config["output_tsv"]:
|
| 662 |
+
for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
|
| 663 |
this_sentence=[]
|
| 664 |
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 665 |
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 666 |
break
|
| 667 |
+
if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 668 |
if len(this_sentence)>0:
|
| 669 |
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 670 |
else:
|
| 671 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
|
| 672 |
else:
|
| 673 |
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
| 674 |
+
|
| 675 |
+
this_sentence = self._lemmatize(this_sentence, LANG)
|
| 676 |
+
this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]], "l":i["l"]} for i in this_sentence]
|
| 677 |
+
tag_config["write_output_to"].write(LANG_STR)
|
| 678 |
for lin in this_sentence:
|
| 679 |
tag_config["write_output_to"].write("\t")
|
| 680 |
tag_config["write_output_to"].write(lin["w"])
|
|
|
|
| 685 |
tag_config["write_output_to"].write("\n")
|
| 686 |
tag_config["write_output_to"].write("\n")
|
| 687 |
|
| 688 |
+
# If output format will be json
|
| 689 |
+
else:
|
| 690 |
+
for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 691 |
this_sentence=[]
|
| 692 |
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 693 |
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 694 |
break
|
| 695 |
+
if lemma[0] == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 696 |
if len(this_sentence)>0:
|
| 697 |
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 698 |
else:
|
| 699 |
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag, "l":lemma})
|
| 700 |
else:
|
| 701 |
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag, "l":lemma})
|
| 702 |
+
|
| 703 |
+
this_sentence = self._lemmatize(this_sentence, LANG)
|
| 704 |
+
json.dump({"lang":LANG_STR, "sent":[ {"w":i["w"], "t":self.tags[LANG][i["t"]], "l":i["l"]} for i in this_sentence]}, tag_config["write_output_to"])
|
| 705 |
tag_config["write_output_to"].write("\n")
|
| 706 |
|
| 707 |
+
# If lemmatization will not be applied:
|
| 708 |
+
else:
|
| 709 |
+
# If language will be identified per sentence
|
| 710 |
+
if tag_config["lang_per_sentence"]:
|
| 711 |
+
id_to_lang = self.config["id_to_lang"]
|
| 712 |
+
# If the output will be to a python list
|
| 713 |
+
if tag_config["write_output_to"]==None:
|
| 714 |
+
all_tagged_sentences = []
|
| 715 |
+
for batch in tokenized_batches:
|
| 716 |
+
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 717 |
+
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 718 |
+
batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 719 |
+
batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
|
| 720 |
+
batch["input_ids"].to("cpu")
|
| 721 |
+
batch["attention_mask"].to("cpu")
|
| 722 |
+
|
| 723 |
+
for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
|
| 724 |
+
batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
|
| 725 |
+
this_sentence=[]
|
| 726 |
+
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 727 |
+
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 728 |
+
break
|
| 729 |
+
if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 730 |
+
if len(this_sentence)>0:
|
| 731 |
+
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 732 |
+
else:
|
| 733 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
|
| 734 |
+
else:
|
| 735 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
|
| 736 |
+
all_tagged_sentences.append({"lang":id_to_lang[lang], "sent": [ {"w":i["w"], "t":self.tags[lang][i["t"]]} for i in this_sentence]})
|
| 737 |
+
|
| 738 |
+
return all_tagged_sentences
|
| 739 |
+
|
| 740 |
+
# If the output is in TSV format to a pipe (stdout or a file handle)
|
| 741 |
+
elif tag_config["output_tsv"]:
|
| 742 |
+
for batch in tokenized_batches:
|
| 743 |
+
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 744 |
+
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 745 |
+
batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 746 |
+
batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
|
| 747 |
+
batch["input_ids"].to("cpu")
|
| 748 |
+
batch["attention_mask"].to("cpu")
|
| 749 |
+
|
| 750 |
+
for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
|
| 751 |
+
batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
|
| 752 |
+
this_sentence=[]
|
| 753 |
+
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 754 |
+
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 755 |
+
break
|
| 756 |
+
if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 757 |
+
if len(this_sentence)>0:
|
| 758 |
+
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 759 |
+
else:
|
| 760 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
|
| 761 |
+
else:
|
| 762 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
|
| 763 |
+
this_sentence=[ {"w":i["w"], "t":self.tags_str[lang][i["t"]] } for i in this_sentence]
|
| 764 |
+
tag_config["write_output_to"].write(id_to_lang[lang])
|
| 765 |
+
for lin in this_sentence:
|
| 766 |
+
tag_config["write_output_to"].write("\t")
|
| 767 |
+
tag_config["write_output_to"].write(lin["w"])
|
| 768 |
+
tag_config["write_output_to"].write("\t")
|
| 769 |
+
tag_config["write_output_to"].write(lin["t"])
|
| 770 |
+
tag_config["write_output_to"].write("\n")
|
| 771 |
+
tag_config["write_output_to"].write("\n")
|
| 772 |
+
|
| 773 |
+
# If output format will be json to a pipe (stdout or a file handle)
|
| 774 |
+
else:
|
| 775 |
+
for batch in tokenized_batches:
|
| 776 |
+
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 777 |
+
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 778 |
+
batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 779 |
+
batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
|
| 780 |
+
batch["input_ids"].to("cpu")
|
| 781 |
+
batch["attention_mask"].to("cpu")
|
| 782 |
+
|
| 783 |
+
for input_ids, tags, lemmas, lang in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
|
| 784 |
+
batch_lemmas.tolist(), batch_langs[:, 0].tolist()):
|
| 785 |
+
this_sentence=[]
|
| 786 |
+
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 787 |
+
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 788 |
+
break
|
| 789 |
+
if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 790 |
+
if len(this_sentence)>0:
|
| 791 |
+
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 792 |
+
else:
|
| 793 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
|
| 794 |
+
else:
|
| 795 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
|
| 796 |
+
|
| 797 |
+
json.dump({"lang":id_to_lang[lang], "sent":[ {"w":i["w"], "t":self.tags[lang][i["t"]]} for i in this_sentence]}, tag_config["write_output_to"])
|
| 798 |
+
tag_config["write_output_to"].write("\n")
|
| 799 |
+
|
| 800 |
+
# If the language is set as parameter
|
| 801 |
+
elif tag_config["lang"] != -1:
|
| 802 |
+
LANG = tag_config["lang"]
|
| 803 |
+
LANG_STR = self.config["id_to_lang"][LANG]
|
| 804 |
+
# If the output will be to a python list
|
| 805 |
+
if tag_config["write_output_to"]==None:
|
| 806 |
+
all_tagged_sentences = []
|
| 807 |
+
for batch in tokenized_batches:
|
| 808 |
+
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 809 |
+
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 810 |
+
batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 811 |
+
batch["input_ids"].to("cpu")
|
| 812 |
+
batch["attention_mask"].to("cpu")
|
| 813 |
+
|
| 814 |
+
for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
|
| 815 |
+
batch_lemmas.tolist()):
|
| 816 |
+
this_sentence=[]
|
| 817 |
+
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 818 |
+
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 819 |
+
break
|
| 820 |
+
if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 821 |
+
if len(this_sentence)>0:
|
| 822 |
+
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 823 |
+
else:
|
| 824 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
|
| 825 |
+
else:
|
| 826 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
|
| 827 |
+
all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence]})
|
| 828 |
+
|
| 829 |
+
return all_tagged_sentences
|
| 830 |
+
|
| 831 |
+
# If the output is in TSV format to a pipe (stdout or a file handle)
|
| 832 |
+
elif tag_config["output_tsv"]:
|
| 833 |
+
for batch in tokenized_batches:
|
| 834 |
+
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 835 |
+
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 836 |
+
batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 837 |
+
batch["input_ids"].to("cpu")
|
| 838 |
+
batch["attention_mask"].to("cpu")
|
| 839 |
+
|
| 840 |
+
for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
|
| 841 |
+
batch_lemmas.tolist()):
|
| 842 |
+
this_sentence=[]
|
| 843 |
+
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 844 |
+
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 845 |
+
break
|
| 846 |
+
if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 847 |
+
if len(this_sentence)>0:
|
| 848 |
+
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 849 |
+
else:
|
| 850 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
|
| 851 |
+
else:
|
| 852 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
|
| 853 |
+
this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]]} for i in this_sentence]
|
| 854 |
+
tag_config["write_output_to"].write(LANG_STR)
|
| 855 |
+
for lin in this_sentence:
|
| 856 |
+
tag_config["write_output_to"].write("\t")
|
| 857 |
+
tag_config["write_output_to"].write(lin["w"])
|
| 858 |
+
tag_config["write_output_to"].write("\t")
|
| 859 |
+
tag_config["write_output_to"].write(lin["t"])
|
| 860 |
+
tag_config["write_output_to"].write("\n")
|
| 861 |
+
tag_config["write_output_to"].write("\n")
|
| 862 |
+
|
| 863 |
+
# If output format will be json to a pipe (stdout or a file handle)
|
| 864 |
+
else:
|
| 865 |
+
for batch in tokenized_batches:
|
| 866 |
+
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 867 |
+
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 868 |
+
batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 869 |
+
batch["input_ids"].to("cpu")
|
| 870 |
+
batch["attention_mask"].to("cpu")
|
| 871 |
+
|
| 872 |
+
for input_ids, tags, lemmas in zip(batch["input_ids"].tolist(), batch_tags.tolist(),
|
| 873 |
+
batch_lemmas.tolist()):
|
| 874 |
+
this_sentence=[]
|
| 875 |
+
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 876 |
+
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
| 877 |
+
break
|
| 878 |
+
if lemma == 0: # If there is no lemma here, that means we haven't reached the end of the word
|
| 879 |
+
if len(this_sentence)>0:
|
| 880 |
+
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 881 |
+
else:
|
| 882 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
|
| 883 |
+
else:
|
| 884 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
|
| 885 |
+
|
| 886 |
+
json.dump({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence]}, tag_config["write_output_to"])
|
| 887 |
+
tag_config["write_output_to"].write("\n")
|
| 888 |
+
|
| 889 |
+
# If language will be identified according to the majority of all sentences:
|
| 890 |
+
else:
|
| 891 |
+
all_tags=[]
|
| 892 |
+
all_lemmas=[]
|
| 893 |
+
all_langs=[]
|
| 894 |
+
all_input_ids=[]
|
| 895 |
+
# Go over all batches and each sentence in each batch
|
| 896 |
for batch in tokenized_batches:
|
| 897 |
all_out = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
|
| 898 |
+
batch_tags = torch.argmax(all_out["logits2"], dim=-1)
|
| 899 |
batch_lemmas = torch.argmax(all_out["logits3"], dim=-1)
|
| 900 |
+
batch_langs = torch.argmax(all_out["seq_logits"], dim=-1)
|
| 901 |
+
all_input_ids.extend(batch["input_ids"].tolist())
|
| 902 |
batch["input_ids"].to("cpu")
|
| 903 |
batch["attention_mask"].to("cpu")
|
| 904 |
+
all_langs.extend(batch_langs[:, 0].tolist())
|
| 905 |
+
all_tags.extend(batch_tags.tolist())
|
| 906 |
+
all_lemmas.extend(batch_lemmas.tolist())
|
| 907 |
+
|
| 908 |
+
# Identify the language
|
| 909 |
+
tag_config["lang"] = 1 if sum(all_langs)/len(all_langs)>=0.5 else 0
|
| 910 |
+
LANG = tag_config["lang"]
|
| 911 |
+
LANG_STR = self.config["id_to_lang"][LANG]
|
| 912 |
+
|
| 913 |
+
# If the output will be returned as python list:
|
| 914 |
+
if tag_config["write_output_to"]==None:
|
| 915 |
+
all_tagged_sentences = []
|
| 916 |
+
for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
|
| 917 |
this_sentence=[]
|
| 918 |
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 919 |
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
|
|
|
| 922 |
if len(this_sentence)>0:
|
| 923 |
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 924 |
else:
|
| 925 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
|
| 926 |
else:
|
| 927 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
|
| 928 |
+
all_tagged_sentences.append({"lang":LANG_STR, "sent": [ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence] })
|
| 929 |
+
return all_tagged_sentences
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 930 |
|
| 931 |
+
# If the output is in TSV format
|
| 932 |
+
elif tag_config["output_tsv"]:
|
| 933 |
+
for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
|
| 934 |
this_sentence=[]
|
| 935 |
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 936 |
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
|
|
|
| 939 |
if len(this_sentence)>0:
|
| 940 |
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 941 |
else:
|
| 942 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
|
| 943 |
else:
|
| 944 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
|
| 945 |
+
this_sentence=[ {"w":i["w"], "t":self.tags_str[LANG][i["t"]]} for i in this_sentence]
|
| 946 |
tag_config["write_output_to"].write(LANG_STR)
|
| 947 |
for lin in this_sentence:
|
| 948 |
tag_config["write_output_to"].write("\t")
|
| 949 |
tag_config["write_output_to"].write(lin["w"])
|
| 950 |
tag_config["write_output_to"].write("\t")
|
|
|
|
|
|
|
| 951 |
tag_config["write_output_to"].write(lin["t"])
|
| 952 |
tag_config["write_output_to"].write("\n")
|
| 953 |
tag_config["write_output_to"].write("\n")
|
| 954 |
|
| 955 |
+
# If output format will be json
|
| 956 |
+
else:
|
| 957 |
+
for input_ids, tags, lemmas in zip(all_input_ids, all_tags, all_lemmas):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 958 |
this_sentence=[]
|
| 959 |
for inps, tag, lemma in zip(input_ids[1:], tags[1:], lemmas[1:]):
|
| 960 |
if inps == self.tokenizer.sep_token_id or inps == self.tokenizer.pad_token_id:
|
|
|
|
| 963 |
if len(this_sentence)>0:
|
| 964 |
this_sentence[-1]["w"] += self.tokenizer.decode(inps)
|
| 965 |
else:
|
| 966 |
+
this_sentence.append({"w": self.tokenizer.decode(inps), "t": tag})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 967 |
else:
|
| 968 |
+
this_sentence.append({"w":self.tokenizer.decode(inps).strip(), "t":tag})
|
| 969 |
+
|
| 970 |
+
json.dump({"lang":LANG_STR, "sent":[ {"w":i["w"], "t":self.tags[LANG][i["t"]]} for i in this_sentence]}, tag_config["write_output_to"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 971 |
tag_config["write_output_to"].write("\n")
|
|
|
|
| 972 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 973 |
|
| 974 |
def _check_if_text_file_and_return_content(self, filepath):
|
| 975 |
try:
|
|
|
|
| 980 |
|
| 981 |
@torch.no_grad()
|
| 982 |
def tag(self, inp=None, **tag_config):
|
| 983 |
+
|
| 984 |
self.eval()
|
| 985 |
+
|
| 986 |
+
if "lemmatise" in tag_config and tag_config["lemmatise"]==False:
|
| 987 |
+
tag_config["lemmatize"] = False
|
| 988 |
+
if "lemmatise" in tag_config:
|
| 989 |
+
del tag_config["lemmatise"]
|
| 990 |
+
else:
|
| 991 |
+
tag_config["lemmatize"] = True
|
| 992 |
+
if "lemmatise" in tag_config:
|
| 993 |
+
del tag_config["lemmatise"]
|
| 994 |
+
|
| 995 |
+
if "lemmatize" in tag_config and tag_config["lemmatize"]==False:
|
| 996 |
+
tag_config["lemmatize"] = False
|
| 997 |
+
|
| 998 |
if "one_sentence_per_line" not in tag_config:
|
| 999 |
tag_config["one_sentence_per_line"]=False
|
| 1000 |
|
|
|
|
| 1039 |
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
| 1040 |
if tag_config["one_sentence_per_line"]:
|
| 1041 |
inp = [i for i in file_content.split("\n") if i!=""]
|
| 1042 |
+
inp = [" ".join(i.split()) for i in inp if i!=""]
|
| 1043 |
with open(out_path, "w") as opened_file:
|
| 1044 |
tag_config["write_output_to"] = opened_file
|
| 1045 |
self.tag_sentence_list(inp, **tag_config)
|
|
|
|
| 1050 |
self.tag_sentence_list(inp, **tag_config)
|
| 1051 |
else:
|
| 1052 |
print (f"Could not properly open and read {input_path}.")
|
| 1053 |
+
if write_to is not sys.stdout and write_to is not sys.stderr:
|
| 1054 |
+
write_to.close()
|
| 1055 |
return
|
| 1056 |
|
| 1057 |
else:
|
|
|
|
| 1069 |
# Tag one sentence per line in a string
|
| 1070 |
if tag_config["one_sentence_per_line"]:
|
| 1071 |
inp = [i for i in inp.split("\n") if i!=""]
|
| 1072 |
+
inp = [" ".join(self._preprocess_text(i).split()) for i in inp if i!=""]
|
| 1073 |
return self.tag_sentence_list(inp, **tag_config)
|
| 1074 |
|
| 1075 |
# identify sentences
|
|
|
|
| 1079 |
# Tag one sentence per list item
|
| 1080 |
elif type(inp) == list:
|
| 1081 |
inp=[i.strip() for i in inp]
|
| 1082 |
+
inp=[" ".join(self._preprocess_text(i).split()) for i in inp if i!=""]
|
| 1083 |
return self.tag_sentence_list(inp, **tag_config)
|
| 1084 |
|
| 1085 |
def identify_language_sentence_list(self, lst, **tag_config):
|
|
|
|
| 1122 |
|
| 1123 |
@torch.no_grad()
|
| 1124 |
def identify_language(self, inp=None, **tag_config):
|
| 1125 |
+
|
| 1126 |
self.eval()
|
| 1127 |
+
|
| 1128 |
if "one_sentence_per_line" not in tag_config:
|
| 1129 |
tag_config["one_sentence_per_line"]=False
|
| 1130 |
+
|
| 1131 |
if "lang" in tag_config:
|
| 1132 |
del tag_config["lang"]
|
| 1133 |
|
|
|
|
| 1137 |
if "lang_per_sentence" not in tag_config:
|
| 1138 |
tag_config["lang_per_sentence"] = False
|
| 1139 |
|
| 1140 |
+
elif type(tag_config["lang_per_sentence"])==bool and tag_config["lang_per_sentence"]:
|
| 1141 |
tag_config["lang_per_sentence"] = True
|
| 1142 |
|
| 1143 |
if "input_directory" in tag_config and "output_directory" in tag_config and "write_output_to" in tag_config and tag_config["write_output_to"]!=None:
|
|
|
|
| 1193 |
torch.cuda.empty_cache()
|
| 1194 |
|
| 1195 |
if tag_config["write_output_to"]==None:
|
| 1196 |
+
general_output.extend([{"f":i[0], "lang":self.config["id_to_lang"][i[1]]} for i in zip(file_names, langs)])
|
| 1197 |
elif tag_config["output_tsv"]:
|
| 1198 |
for fil,lan in zip(file_names, langs):
|
| 1199 |
tag_config["write_output_to"].write(fil)
|
|
|
|
| 1202 |
tag_config["write_output_to"].write("\n")
|
| 1203 |
else:
|
| 1204 |
for fil,lan in zip(file_names, langs):
|
| 1205 |
+
json.dump({"f":fil, "lang":self.config["id_to_lang"][lan]})
|
| 1206 |
file_names=[]
|
| 1207 |
contents=[]
|
| 1208 |
else:
|
|
|
|
| 1223 |
torch.cuda.empty_cache()
|
| 1224 |
|
| 1225 |
if tag_config["write_output_to"]==None:
|
| 1226 |
+
general_output.extend([{"f":i[0], "lang":self.config["id_to_lang"][i[1]]} for i in zip(file_names, langs)])
|
| 1227 |
elif tag_config["output_tsv"]:
|
| 1228 |
for fil,lan in zip(file_names, langs):
|
| 1229 |
tag_config["write_output_to"].write(fil)
|
|
|
|
| 1232 |
tag_config["write_output_to"].write("\n")
|
| 1233 |
else:
|
| 1234 |
for fil,lan in zip(file_names, langs):
|
| 1235 |
+
json.dump({"f":fil, "lang":self.config["id_to_lang"][lan]})
|
| 1236 |
|
| 1237 |
return general_output if len(general_output)>0 else None
|
| 1238 |
|
|
|
|
| 1274 |
opened_file.write(lan)
|
| 1275 |
opened_file.write("\n")
|
| 1276 |
else:
|
| 1277 |
+
json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , opened_file)
|
| 1278 |
else:
|
| 1279 |
if tag_config["output_tsv"]:
|
| 1280 |
opened_file.write(out[0])
|
| 1281 |
else:
|
| 1282 |
+
json.dump({"lang":out[0]} , opened_file)
|
| 1283 |
else:
|
| 1284 |
if tag_config["lang_per_sentence"]:
|
| 1285 |
+
general_output.extend([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ])
|
| 1286 |
else:
|
| 1287 |
+
general_output.append({"f":input_path, "lang":out[0]})
|
| 1288 |
|
| 1289 |
# If there is an opened pipe already
|
| 1290 |
else:
|
|
|
|
| 1297 |
tag_config["write_output_to"].write("\n")
|
| 1298 |
tag_config["write_output_to"].write("\n")
|
| 1299 |
else:
|
| 1300 |
+
json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
|
| 1301 |
tag_config["write_output_to"].write("\n")
|
| 1302 |
else:
|
| 1303 |
if tag_config["output_tsv"]:
|
|
|
|
| 1306 |
tag_config["write_output_to"].write(out[0])
|
| 1307 |
tag_config["write_output_to"].write("\n")
|
| 1308 |
else:
|
| 1309 |
+
json.dump({"f":input_path, "lang":out[0]} , tag_config["write_output_to"])
|
| 1310 |
tag_config["write_output_to"].write("\n")
|
| 1311 |
|
| 1312 |
else:
|
|
|
|
| 1316 |
tag_config["write_output_to"].write("err")
|
| 1317 |
tag_config["write_output_to"].write("\n")
|
| 1318 |
else:
|
| 1319 |
+
json.dump({"f":input_path, "lang":"err"} , tag_config["write_output_to"])
|
| 1320 |
tag_config["write_output_to"].write("\n")
|
| 1321 |
|
| 1322 |
+
if tag_config["write_output_to"] and tag_config["write_output_to"] is not sys.stdout and tag_config["write_output_to"] is not sys.stderr:
|
| 1323 |
tag_config["write_output_to"].close()
|
| 1324 |
|
| 1325 |
return general_output if len(general_output)>0 else None
|
|
|
|
| 1355 |
|
| 1356 |
# If return as list
|
| 1357 |
if tag_config["write_output_to"]==None:
|
| 1358 |
+
return [{"s":i[0], "lang": i[1]} for i in zip(inp, out)]
|
| 1359 |
|
| 1360 |
if tag_config["output_tsv"]:
|
| 1361 |
for sen,lan in zip(inp, out):
|
|
|
|
| 1364 |
tag_config["write_output_to"].write(out)
|
| 1365 |
tag_config["write_output_to"].write("\n")
|
| 1366 |
else:
|
| 1367 |
+
json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
|
| 1368 |
|
| 1369 |
return
|
| 1370 |
|
|
|
|
| 1376 |
|
| 1377 |
# If return as list
|
| 1378 |
if tag_config["write_output_to"]==None:
|
| 1379 |
+
return [{"s":i[0], "lang": i[1]} for i in zip(inp, out)]
|
| 1380 |
|
| 1381 |
if tag_config["output_tsv"]:
|
| 1382 |
for sen,lan in zip(inp, out):
|
|
|
|
| 1385 |
tag_config["write_output_to"].write(lan)
|
| 1386 |
tag_config["write_output_to"].write("\n")
|
| 1387 |
else:
|
| 1388 |
+
json.dump([{"s":sen, "lang":lan} for sen,lan in zip(inp, out) ] , tag_config["write_output_to"])
|
| 1389 |
|
| 1390 |
return
|
| 1391 |
|
tagger_config.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|