Meyssa committed on
Commit
ff9abe8
·
verified ·
1 Parent(s): 7fbb7e4

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ onnx/model.onnx.data filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: apache-2.0
4
+ library_name: transformers.js
5
+ pipeline_tag: token-classification
6
+ tags:
7
+ - grammatical-error-correction
8
+ - gector
9
+ - onnx
10
+ - transformers.js
11
+ ---
12
+
13
+ # GECToR Base 2020 (ONNX)
14
+
15
+ ONNX quantized version of the original GECToR model from Grammarly for browser-based grammatical error correction with [Transformers.js](https://huggingface.co/docs/transformers.js).
16
+
17
+ ## Original Model
18
+
19
+ - **Source**: [Grammarly GECToR](https://github.com/grammarly/gector)
20
+ - **Paper**: [GECToR – Grammatical Error Correction: Tag, Not Rewrite](https://arxiv.org/abs/2005.12592) (BEA Workshop 2020)
21
+ - **Architecture**: RoBERTa-Base + token classification head
22
+ - **Parameters**: ~125M
23
+
24
+ ## Conversion Details
25
+
26
+ - **Format**: ONNX
27
+ - **Quantization**: INT8 (dynamic quantization)
28
+ - **Size**: ~125MB
29
+ - **Converted by**: Manual export from PyTorch (AllenNLP format)
30
+
31
+ ## How It Works
32
+
33
+ GECToR uses a token classification approach - instead of generating corrected text, it predicts edit operations for each token:
34
+
35
+ - `$KEEP` - Keep token unchanged
36
+ - `$DELETE` - Remove token
37
+ - `$REPLACE_word` - Replace with specific word
38
+ - `$APPEND_word` - Append word after token
39
+ - `$TRANSFORM_*` - Apply transformation (case, verb form, etc.)
40
+
41
+ The model runs iteratively (typically 2-3 passes) until no more edits are predicted.
42
+
43
+ ## Usage with Transformers.js
44
+
45
+ ```javascript
46
+ import { pipeline } from '@huggingface/transformers';
47
+
48
+ const classifier = await pipeline(
49
+ 'token-classification',
50
+ 'YOUR_USERNAME/gector-base-2020',
51
+ { dtype: 'q8' }
52
+ );
53
+
54
+ const result = await classifier('He go to school yesterday.');
55
+ // Returns token predictions with edit tags
56
+ ```
57
+
58
+ ## Performance
59
+
60
+ Faster than the 2024 version with slightly lower accuracy. Good balance of speed and quality.
61
+
62
+ ## License
63
+
64
+ Apache 2.0 (following original model license)
config.json ADDED
The diff for this file is too large to render. See raw diff
 
id2label.json ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a5dbe89d72d01afc610e69dd0060028f1106b3cf320193fe391cc1357db0371
3
+ size 1568349
onnx/model.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ca83094f3558a0f7e25c2c8d0d1f07a802e589c11b039d949d7ed02130ff9ca
3
+ size 511719424
onnx/model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3ced3d80b1e06d610ff83104b2f4b8a3919f21b8e70120556526de314658426
3
+ size 129688821
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 512,
53
+ "pad_token": "<pad>",
54
+ "sep_token": "</s>",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": "<unk>"
58
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff