added model together with usage example and the README

Browse files

Files changed (7) hide show

README.md +61 -0
config.json +26 -0
model.safetensors +3 -0
model_usage_example.py +87 -0
special_tokens_map.json +7 -0
tokenizer_config.json +58 -0
vocab.txt +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,61 @@

+---
+license: cc-by-sa-4.0
+datasets:
+- cjvt/cc_gigafida
+- cjvt/solar3
+- cjvt/sloleks
+language:
+- hr
+tags:
+- word spelling error annotator
+---
+# BERTic-Incorrect-Spelling-Annotator
+This BERTic model is designed to annotate incorrectly spelled words in text. It utilizes the following labels:
+- 0: Word is written correctly,
+- 1: Word is written incorrectly.
+## Model Output Example
+Imagine we have the following Croatian text:
+_Model u tekstu prepoznije riječi u kojima se nalazaju pogreške ._
+If we convert the input text into the format expected by the BERTic model (lowercased, with a `[MASK]` token after every word):
+_[CLS] model [MASK] u [MASK] tekstu [MASK] prepo ##znije [MASK] riječi [MASK] u [MASK] kojima [MASK] se [MASK] nalaza ##ju [MASK] pogreške [MASK] . [MASK] [SEP]_
+The model might return the following predictions (note: predictions chosen for demonstration/explanation, not reproducibility!):
+_Model 0 u 0 tekstu 0 prepoznije 1 riječi 0 u 0 kojima 0 se 0 nalazaju 1 pogreške 0 . 0_
+We can observe that in the input sentence the words `prepoznije` and `nalazaju` are spelled incorrectly, so the model marks them with the label 1.
+## More details
+Testing the model with **generated** test sets provides the following results:
+Precision: 0.9954
+Recall: 0.8764
+F1 Score: 0.9321
+F0.5 Score: 0.9691
+Testing the model with test sets constructed using the **Croatian corpus of non-professional written language by typical speakers and speakers with language disorders RAPUT 1.0** dataset provides the following results:
+Precision: 0.8213
+Recall: 0.3921
+F1 Score: 0.5308
+F0.5 Score: 0.6738
+## Authors
+Thanks to Martin Božič, Marko Robnik-Šikonja and Špela Arhar Holdt for developing this model.

config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "_name_or_path": "./BERTic",
+  "architectures": [
+    "BertForTokenClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "embedding_size": 768,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.37.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 32000
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:296ea1e18f2c377a85f7ace0867ccf264318fe0caf2f04c4d773ef0774f57fbb
+size 440136504

model_usage_example.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import torch
+import torch.nn.functional as F
+from transformers import BertTokenizer, BertForTokenClassification
+import re
+import string
def preprocess_input_text(text):
    """
    Prepare raw text for the spelling-annotator model.

    A space is inserted before every ASCII punctuation character so that each
    punctuation mark becomes the start of its own whitespace-separated item,
    the text is split on whitespace, and a ``[MASK]`` token is placed after
    every item of the lowercased text.

    Args:
        text: The raw input text.

    Returns:
        A ``(words, preprocessed_text)`` tuple. ``words`` is the list of
        items in their original casing; ``preprocessed_text`` is the
        lowercased items, each followed by ``[MASK]``, joined by single
        spaces. Empty or whitespace-only input yields ``([], "")``.
    """
    # re.escape is required: inside a character class the raw punctuation
    # string contains "\]", which would be read as an escaped "]" and leave
    # the backslash itself unmatched.
    punctuation_class = '([' + re.escape(string.punctuation) + '])'
    spaced = re.sub(punctuation_class, r' \1', text)

    # str.split() without arguments collapses runs of any whitespace and
    # drops leading/trailing whitespace, so no empty "words" (and no stray
    # [MASK] tokens for them) are produced.
    words = spaced.split()
    lowered_words = spaced.lower().split()

    output = []
    for word in lowered_words:
        output.append(word)
        output.append("[MASK]")
    return words, " ".join(output)
def predict_using_trained_model_old(input_text, model_dir, device):
    """
    Label every word of ``input_text`` as correctly (0) or incorrectly (1) spelled.

    The tokenizer and the token-classification model are loaded from
    ``model_dir`` on every call. The text is preprocessed with
    ``preprocess_input_text`` (a ``[MASK]`` after each word), and the class
    predicted at each ``[MASK]`` position is used as the label of the word
    that precedes it.

    Args:
        input_text: The raw text to annotate.
        model_dir: Directory (or identifier) passed to ``from_pretrained``
            for both the tokenizer and the model.
        device: The ``torch.device`` the model and inputs are moved to.

    Returns:
        A string in which each word (original casing) is followed by its
        predicted label, separated by single spaces. The input is truncated
        to 128 tokens; words whose ``[MASK]`` falls outside that window get
        no prediction and are omitted from the output.
    """
    words, input_text = preprocess_input_text(input_text)

    tokenizer = BertTokenizer.from_pretrained(model_dir)
    model = BertForTokenClassification.from_pretrained(model_dir, num_labels=2)
    model.to(device)
    model.eval()

    tokenized_inputs = tokenizer(input_text, max_length=128, padding='max_length', truncation=True, return_tensors="pt")
    input_ids = tokenized_inputs["input_ids"].to(device)
    attention_mask = tokenized_inputs["attention_mask"].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    # A single sequence is processed, so take row 0 of the batch explicitly.
    predictions = torch.argmax(logits, dim=-1)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    model_output = []
    word_index = 0
    for token, prediction in zip(tokens, predictions):
        # Only [MASK] positions carry a word-level label. Word-piece tokens
        # are skipped so that a word split into several pieces is emitted
        # exactly once instead of once per piece.
        if token != "[MASK]":
            continue
        if word_index >= len(words):
            # More masks than words should not happen; stop rather than
            # index past the end of the word list.
            break
        model_output.append(words[word_index])
        model_output.append(str(prediction))
        word_index += 1
    return " ".join(model_output)
if __name__ == '__main__':
    # Demo: annotate one Croatian sentence using the model stored in the
    # current directory.
    input_text = "Model u tekstu prepoznije riječi u kojima se nalazaju pogreške."
    model_dir = "."

    # Prefer CUDA, then Apple's MPS backend, then fall back to the CPU.
    # torch.backends.mps is absent on PyTorch builds that predate the MPS
    # backend, so look it up defensively instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif mps_backend is not None and mps_backend.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    print(f"Using device: {device}")

    model_output_text = predict_using_trained_model_old(input_text, model_dir, device)
    print(f"Model output: {model_output_text}")

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": false,
+  "mask_token": "[MASK]",
+  "max_len": 512,
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": false,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff