Meyssa committed on
Commit
ff9abe8
·
verified ·
1 Parent(s): 7fbb7e4

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ onnx/model.onnx.data filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: apache-2.0
4
+ library_name: transformers.js
5
+ pipeline_tag: token-classification
6
+ tags:
7
+ - grammatical-error-correction
8
+ - gector
9
+ - onnx
10
+ - transformers.js
11
+ ---
12
+
13
+ # GECToR Base 2020 (ONNX)
14
+
15
+ ONNX quantized version of the original GECToR model from Grammarly for browser-based grammatical error correction with [Transformers.js](https://huggingface.co/docs/transformers.js).
16
+
17
+ ## Original Model
18
+
19
+ - **Source**: [Grammarly GECToR](https://github.com/grammarly/gector)
20
+ - **Paper**: [GECToR – Grammatical Error Correction: Tag, Not Rewrite](https://arxiv.org/abs/2005.12592) (BEA Workshop 2020)
21
+ - **Architecture**: RoBERTa-Base + token classification head
22
+ - **Parameters**: ~125M
23
+
24
+ ## Conversion Details
25
+
26
+ - **Format**: ONNX
27
+ - **Quantization**: INT8 (dynamic quantization)
28
+ - **Size**: ~125MB
29
+ - **Converted by**: Manual export from PyTorch (AllenNLP format)
30
+
31
+ ## How It Works
32
+
33
+ GECToR uses a token classification approach - instead of generating corrected text, it predicts edit operations for each token:
34
+
35
+ - `$KEEP` - Keep token unchanged
36
+ - `$DELETE` - Remove token
37
+ - `$REPLACE_word` - Replace with specific word
38
+ - `$APPEND_word` - Append word after token
39
+ - `$TRANSFORM_*` - Apply transformation (case, verb form, etc.)
40
+
41
+ The model runs iteratively (typically 2-3 passes) until no more edits are predicted.
42
+
43
+ ## Usage with Transformers.js
44
+
45
+ ```javascript
46
+ import { pipeline } from '@huggingface/transformers';
47
+
48
+ const classifier = await pipeline(
49
+ 'token-classification',
50
+ 'YOUR_USERNAME/gector-base-2020',
51
+ { dtype: 'q8' }
52
+ );
53
+
54
+ const result = await classifier('He go to school yesterday.');
55
+ // Returns token predictions with edit tags
56
+ ```
57
+
58
+ ## Performance
59
+
60
+ Faster than the 2024 version with slightly lower accuracy. Good balance of speed and quality.
61
+
62
+ ## License
63
+
64
+ Apache 2.0 (following original model license)
config.json ADDED
The diff for this file is too large to render. See raw diff
 
id2label.json ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a5dbe89d72d01afc610e69dd0060028f1106b3cf320193fe391cc1357db0371
3
+ size 1568349
onnx/model.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ca83094f3558a0f7e25c2c8d0d1f07a802e589c11b039d949d7ed02130ff9ca
3
+ size 511719424
onnx/model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3ced3d80b1e06d610ff83104b2f4b8a3919f21b8e70120556526de314658426
3
+ size 129688821
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 512,
53
+ "pad_token": "<pad>",
54
+ "sep_token": "</s>",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": "<unk>"
58
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff