Text Classification
Transformers.js
ONNX
multilingual
bert
autofill
field-classification
tinybert
browser
Instructions to use vazish/tinybert-address-autofill with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers.js
How to use vazish/tinybert-address-autofill with Transformers.js:
// npm i @huggingface/transformers
import { pipeline } from '@huggingface/transformers';

// Allocate pipeline
const pipe = await pipeline('text-classification', 'vazish/tinybert-address-autofill');
first
Browse files
- README.md +199 -3
- config.json +165 -0
- onnx/model.onnx +3 -0
- onnx/model_bnb4.onnx +3 -0
- onnx/model_fp16.onnx +3 -0
- onnx/model_int8.onnx +3 -0
- onnx/model_q4.onnx +3 -0
- onnx/model_q4f16.onnx +3 -0
- onnx/model_quantized.onnx +3 -0
- onnx/model_uint8.onnx +3 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +62 -0
- vocab.txt +0 -0
README.md
CHANGED
|
@@ -1,3 +1,199 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: apache-2.0
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
language: multilingual
|
| 4 |
+
library_name: transformers.js
|
| 5 |
+
pipeline_tag: text-classification
|
| 6 |
+
base_model: huawei-noah/TinyBERT_General_4L_312D
|
| 7 |
+
tags:
|
| 8 |
+
- autofill
|
| 9 |
+
- field-classification
|
| 10 |
+
- bert
|
| 11 |
+
- tinybert
|
| 12 |
+
- onnx
|
| 13 |
+
- transformers.js
|
| 14 |
+
- browser
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
# TinyBERT Address Autofill
|
| 18 |
+
|
| 19 |
+
A compact field-type classifier for HTML form autofill. Given a string
|
| 20 |
+
describing a single form field's attributes, it predicts one of 65 autofill
|
| 21 |
+
field types (`given-name`, `family-name`, `email`, `postal-code`,
|
| 22 |
+
`address-line1`, `cc-number`, etc.) or `other` when the field should not be
|
| 23 |
+
filled.
|
| 24 |
+
|
| 25 |
+
The model is fine-tuned from `huawei-noah/TinyBERT_General_4L_312D` on a
|
| 26 |
+
corpus of manually annotated shopping and address forms collected by Mozilla, and is
|
| 27 |
+
intended to run client-side inside Firefox (or any Transformers.js host) as
|
| 28 |
+
a replacement or augmentation for the existing regex-based heuristic field
|
| 29 |
+
detector.
|
| 30 |
+
|
| 31 |
+
## ONNX variants
|
| 32 |
+
|
| 33 |
+
All variants live under `onnx/` and are loadable through Transformers.js by
|
| 34 |
+
passing the corresponding `dtype` argument.
|
| 35 |
+
|
| 36 |
+
| File | Precision | Size | Transformers.js `dtype` |
|
| 37 |
+
| --- | --- | ---: | --- |
|
| 38 |
+
| `onnx/model.onnx` | fp32 | 57.6 MB | `fp32` |
|
| 39 |
+
| `onnx/model_fp16.onnx` | fp16 | 28.9 MB | `fp16` |
|
| 40 |
+
| `onnx/model_quantized.onnx` | int8 dynamic (default) | 14.6 MB | `q8` |
|
| 41 |
+
| `onnx/model_int8.onnx` | int8 dynamic | 14.6 MB | `int8` |
|
| 42 |
+
| `onnx/model_uint8.onnx` | uint8 dynamic | 14.6 MB | `uint8` |
|
| 43 |
+
| `onnx/model_q4.onnx` | 4-bit weight-only on MatMul | 42.3 MB | `q4` |
|
| 44 |
+
| `onnx/model_q4f16.onnx` | 4-bit on top of fp16 | 22.4 MB | `q4f16` |
|
| 45 |
+
| `onnx/model_bnb4.onnx` | bitsandbytes NF4 | 41.9 MB | `bnb4` |
|
| 46 |
+
|
| 47 |
+
## How to use
|
| 48 |
+
|
| 49 |
+
### Transformers.js (browser)
|
| 50 |
+
|
| 51 |
+
```js
|
| 52 |
+
import { pipeline } from "@huggingface/transformers";
|
| 53 |
+
|
| 54 |
+
const classifier = await pipeline(
|
| 55 |
+
"text-classification",
|
| 56 |
+
"vazish/tinybert-address-autofill",
|
| 57 |
+
{ dtype: "q8" } // q8 is the smallest file (14.6 MB); try "fp16" for highest fidelity or "q4f16" for the smallest 4-bit variant
|
| 58 |
+
);
|
| 59 |
+
|
| 60 |
+
const out = await classifier(
|
| 61 |
+
"a-c-postal-code billing zip code dwfrm billing address fields postal code"
|
| 62 |
+
);
|
| 63 |
+
// → [{ label: "postal-code", score: 0.99 }]
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
### Python (Optimum + ONNX Runtime)
|
| 67 |
+
|
| 68 |
+
```python
|
| 69 |
+
from optimum.onnxruntime import ORTModelForSequenceClassification
|
| 70 |
+
from transformers import AutoTokenizer, pipeline
|
| 71 |
+
|
| 72 |
+
model = ORTModelForSequenceClassification.from_pretrained(
|
| 73 |
+
"vazish/tinybert-address-autofill",
|
| 74 |
+
file_name="onnx/model.onnx", # or onnx/model_quantized.onnx, etc.
|
| 75 |
+
)
|
| 76 |
+
tokenizer = AutoTokenizer.from_pretrained("vazish/tinybert-address-autofill")
|
| 77 |
+
clf = pipeline("text-classification", model=model, tokenizer=tokenizer)
|
| 78 |
+
|
| 79 |
+
clf("email email mail **email")
|
| 80 |
+
# → [{"label": "email", "score": 0.99}]
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
## Input format
|
| 84 |
+
|
| 85 |
+
The model expects a single string per field, built by concatenating that
|
| 86 |
+
field's HTML attributes after light normalisation:
|
| 87 |
+
|
| 88 |
+
1. Concatenate (in order): `type` + `autocomplete` + `id` + `name` +
|
| 89 |
+
`placeholder` + the field's computed `<label>` text.
|
| 90 |
+
2. Split camelCase boundaries to whitespace (`firstName` → `first name`).
|
| 91 |
+
3. Lowercase the whole thing.
|
| 92 |
+
4. If the field declares an `autocomplete` attribute, prepend an
|
| 93 |
+
`a-c-<value>` token (e.g. `a-c-postal-code`).
|
| 94 |
+
5. Optionally include adjacent-field context — `bb`-prefixed tokens for
|
| 95 |
+
the previous field on the same form and `aa`-prefixed tokens for the
|
| 96 |
+
next. Including adjacent context improves accuracy by roughly 8 percentage
|
| 97 |
+
points relative to the same model trained on isolated fields.
|
| 98 |
+
|
| 99 |
+
Example input for a "first name" field followed by a "last name" field:
|
| 100 |
+
|
| 101 |
+
```
|
| 102 |
+
first name first name enter first name aaa-c-family-name aalast aaname
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
## Training
|
| 106 |
+
|
| 107 |
+
| | |
|
| 108 |
+
| --- | --- |
|
| 109 |
+
| Base model | `huawei-noah/TinyBERT_General_4L_312D` (4 layers, hidden 312, intermediate 1200, 12 heads, ~14M params, max sequence length 512) |
|
| 110 |
+
| Head | `BertForSequenceClassification`, 66 output classes |
|
| 111 |
+
| Training set | ~360 real shopping / checkout / address forms, 6,691 labelled fields |
|
| 112 |
+
| Validation / test | ~246 forms, 4,300 fields, split into validation and test |
|
| 113 |
+
| Regions covered | US, CA, GB, FR, DE, BR, ES, JP, AT, IN, IT, PL, AU, CH (supported); some additional regions also represented for evaluation |
|
| 114 |
+
| Optimizer / schedule | Hugging Face `Trainer` defaults, 50 epochs |
|
| 115 |
+
| Hardware | Apple M1 MacBook Pro, ~75 minutes wall time |
|
| 116 |
+
|
| 117 |
+
Each form field is annotated with `data-mozautofill-type="<type>"` set to
|
| 118 |
+
the expected autofill class; fields that should not be filled receive no
|
| 119 |
+
attribute and are mapped to `other`.
|
| 120 |
+
|
| 121 |
+
## Evaluation
|
| 122 |
+
|
| 123 |
+
Evaluated on the project's held-out test set (2,168 labelled fields drawn
|
| 124 |
+
from real address / shopping forms) using ONNX Runtime on CPU.
|
| 125 |
+
|
| 126 |
+
- **Total** — strict exact-match accuracy.
|
| 127 |
+
- **Close** — counts predictions on closely related labels as correct
|
| 128 |
+
(e.g. `street-address` predicted when ground truth is `address-line1`,
|
| 129 |
+
`tel` predicted when ground truth is `tel-national`).
|
| 130 |
+
- **Blank** — false-fill rate. Fraction of `other`-labelled fields the
|
| 131 |
+
model predicted as a real autofill type. Lower is better; this metric
|
| 132 |
+
matters most for user experience because high false-fill means filling
|
| 133 |
+
search boxes, comments, and gift-card fields with personal data.
|
| 134 |
+
|
| 135 |
+
| Variant | Total | Close | Blank | Throughput (CPU) |
|
| 136 |
+
| --- | ---: | ---: | ---: | ---: |
|
| 137 |
+
| fp32 | **89.62%** | 91.51% | 2.40% | ~218/s |
|
| 138 |
+
| fp16 | **89.71%** | 91.61% | 2.31% | ~132/s |
|
| 139 |
+
| bnb4 | 88.42% | 90.64% | 2.77% | ~214/s |
|
| 140 |
+
| q4 | 88.01% | 90.54% | 2.58% | ~209/s |
|
| 141 |
+
| q4f16 | 88.01% | 90.54% | 2.58% | ~95/s |
|
| 142 |
+
| uint8 | 87.27% | 89.53% | 3.27% | ~163/s |
|
| 143 |
+
| int8 / quantized | 84.82% | 87.73% | **1.94%** | ~257/s |
|
| 144 |
+
|
| 145 |
+
For reference, the existing Firefox regex-based heuristic detector reaches
|
| 146 |
+
roughly 85% total accuracy on comparable test sets.
|
| 147 |
+
|
| 148 |
+
Highlights:
|
| 149 |
+
|
| 150 |
+
- **fp16** is statistically indistinguishable from fp32 across all accuracy metrics
|
| 151 |
+
while halving the file size. It is the recommended high-fidelity
|
| 152 |
+
variant. Latency on CPU is ~1.7× fp32 because most CPUs lack native fp16
|
| 153 |
+
ops, but the gap closes on hardware with fp16 support and on
|
| 154 |
+
WebGPU.
|
| 155 |
+
- **int8 / quantized** has the lowest exact accuracy but **the lowest
|
| 156 |
+
false-fill rate of any variant** (1.94%, below the fp32 baseline). It
|
| 157 |
+
errs toward `other` when uncertain — the safer failure mode for an
|
| 158 |
+
autofill UI. This is the recommended size-constrained default.
|
| 159 |
+
- 4-bit variants (`q4`, `q4f16`, `bnb4`) cluster around 88% total accuracy
|
| 160 |
+
with `q4f16` being the smallest at 22 MB.
|
| 161 |
+
|
| 162 |
+
## Limitations
|
| 163 |
+
|
| 164 |
+
- Trained primarily on the supported-region list above. Accuracy on
|
| 165 |
+
unsupported regions (those without training data) drops ~5–10 percentage points;
|
| 166 |
+
adding region-specific samples to the training set typically recovers
|
| 167 |
+
most of that gap.
|
| 168 |
+
- Underrepresented field types (`address-line3`, `additional-name`,
|
| 169 |
+
`phonetic-*`, `tel-local-prefix`, etc.) have very few training examples
|
| 170 |
+
and are sometimes confidently misclassified.
|
| 171 |
+
- Quantized variants disagree with fp32 on roughly 0.1% (`fp16`) to ~5%
|
| 172 |
+
(`int8`) of inputs. The resulting accuracy differences are shown in the
|
| 173 |
+
evaluation table above.
|
| 174 |
+
- The model assumes the team's preprocessing format (camelCase-split,
|
| 175 |
+
lowercased, with optional `a-c-`/`bb`/`aa` markers). Feeding raw HTML
|
| 176 |
+
attribute strings without this normalisation will degrade accuracy.
|
| 177 |
+
|
| 178 |
+
## Citation
|
| 179 |
+
|
| 180 |
+
This model is built on TinyBERT:
|
| 181 |
+
|
| 182 |
+
```bibtex
|
| 183 |
+
@inproceedings{jiao-etal-2020-tinybert,
|
| 184 |
+
title = {{TinyBERT}: Distilling {BERT} for Natural Language Understanding},
|
| 185 |
+
author = {Jiao, Xiaoqi and Yin, Yichun and Shang, Lifeng and Jiang, Xin
|
| 186 |
+
and Chen, Xiao and Li, Linlin and Wang, Fang and Liu, Qun},
|
| 187 |
+
booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2020},
|
| 188 |
+
year = {2020},
|
| 189 |
+
pages = {4163--4174},
|
| 190 |
+
url = {https://aclanthology.org/2020.findings-emnlp.372}
|
| 191 |
+
}
|
| 192 |
+
```
|
| 193 |
+
|
| 194 |
+
If you use this checkpoint, please also cite the Mozilla autofill ML
|
| 195 |
+
investigation that produced it (citation forthcoming).
|
| 196 |
+
|
| 197 |
+
## License
|
| 198 |
+
|
| 199 |
+
Apache 2.0.
|
config.json
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertForSequenceClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"cell": {},
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"dtype": "float32",
|
| 9 |
+
"emb_size": 312,
|
| 10 |
+
"hidden_act": "gelu",
|
| 11 |
+
"hidden_dropout_prob": 0.1,
|
| 12 |
+
"hidden_size": 312,
|
| 13 |
+
"id2label": {
|
| 14 |
+
"1": "other",
|
| 15 |
+
"2": "given-name",
|
| 16 |
+
"3": "family-name",
|
| 17 |
+
"4": "name",
|
| 18 |
+
"5": "additional-name",
|
| 19 |
+
"6": "phonetic-given-name",
|
| 20 |
+
"7": "phonetic-family-name",
|
| 21 |
+
"8": "phonetic-name",
|
| 22 |
+
"9": "honorific-prefix",
|
| 23 |
+
"10": "honorific-suffix",
|
| 24 |
+
"11": "nickname",
|
| 25 |
+
"12": "street-address",
|
| 26 |
+
"13": "address-lookup",
|
| 27 |
+
"14": "address-line1",
|
| 28 |
+
"15": "address-line2",
|
| 29 |
+
"16": "address-line3",
|
| 30 |
+
"17": "address-level1",
|
| 31 |
+
"18": "address-level2",
|
| 32 |
+
"19": "address-level3",
|
| 33 |
+
"20": "address-level4",
|
| 34 |
+
"21": "street",
|
| 35 |
+
"22": "address-streetname",
|
| 36 |
+
"23": "address-housenumber",
|
| 37 |
+
"24": "address-extra-housesuffix",
|
| 38 |
+
"25": "postal-code",
|
| 39 |
+
"26": "postal-code-lookup",
|
| 40 |
+
"27": "postal-code-and-city",
|
| 41 |
+
"28": "postal-code-or-suburb",
|
| 42 |
+
"29": "country",
|
| 43 |
+
"30": "country-name",
|
| 44 |
+
"31": "tel",
|
| 45 |
+
"32": "tel-country-code",
|
| 46 |
+
"33": "tel-national",
|
| 47 |
+
"34": "tel-area-code",
|
| 48 |
+
"35": "tel-local",
|
| 49 |
+
"36": "tel-local-prefix",
|
| 50 |
+
"37": "tel-local-suffix",
|
| 51 |
+
"38": "tel-extension",
|
| 52 |
+
"39": "organization",
|
| 53 |
+
"40": "organization-title",
|
| 54 |
+
"41": "bday",
|
| 55 |
+
"42": "bday-day",
|
| 56 |
+
"43": "bday-month",
|
| 57 |
+
"44": "bday-year",
|
| 58 |
+
"45": "email",
|
| 59 |
+
"46": "apartment",
|
| 60 |
+
"47": "floor",
|
| 61 |
+
"48": "stair",
|
| 62 |
+
"49": "building",
|
| 63 |
+
"50": "block",
|
| 64 |
+
"51": "address-extra",
|
| 65 |
+
"52": "cc-name",
|
| 66 |
+
"53": "cc-given-name",
|
| 67 |
+
"54": "cc-additional-name",
|
| 68 |
+
"55": "cc-family-name",
|
| 69 |
+
"56": "cc-number",
|
| 70 |
+
"57": "cc-exp",
|
| 71 |
+
"58": "cc-exp-month",
|
| 72 |
+
"59": "cc-exp-year",
|
| 73 |
+
"60": "cc-csc",
|
| 74 |
+
"61": "cc-type",
|
| 75 |
+
"62": "sex",
|
| 76 |
+
"63": "id-number",
|
| 77 |
+
"64": "vat-number",
|
| 78 |
+
"65": "reference-point",
|
| 79 |
+
"66": "loginname"
|
| 80 |
+
},
|
| 81 |
+
"initializer_range": 0.02,
|
| 82 |
+
"intermediate_size": 1200,
|
| 83 |
+
"label2id": {
|
| 84 |
+
"additional-name": 5,
|
| 85 |
+
"address-extra": 51,
|
| 86 |
+
"address-extra-housesuffix": 24,
|
| 87 |
+
"address-housenumber": 23,
|
| 88 |
+
"address-level1": 17,
|
| 89 |
+
"address-level2": 18,
|
| 90 |
+
"address-level3": 19,
|
| 91 |
+
"address-level4": 20,
|
| 92 |
+
"address-line1": 14,
|
| 93 |
+
"address-line2": 15,
|
| 94 |
+
"address-line3": 16,
|
| 95 |
+
"address-lookup": 13,
|
| 96 |
+
"address-streetname": 22,
|
| 97 |
+
"apartment": 46,
|
| 98 |
+
"bday": 41,
|
| 99 |
+
"bday-day": 42,
|
| 100 |
+
"bday-month": 43,
|
| 101 |
+
"bday-year": 44,
|
| 102 |
+
"block": 50,
|
| 103 |
+
"building": 49,
|
| 104 |
+
"cc-additional-name": 54,
|
| 105 |
+
"cc-csc": 60,
|
| 106 |
+
"cc-exp": 57,
|
| 107 |
+
"cc-exp-month": 58,
|
| 108 |
+
"cc-exp-year": 59,
|
| 109 |
+
"cc-family-name": 55,
|
| 110 |
+
"cc-given-name": 53,
|
| 111 |
+
"cc-name": 52,
|
| 112 |
+
"cc-number": 56,
|
| 113 |
+
"cc-type": 61,
|
| 114 |
+
"country": 29,
|
| 115 |
+
"country-name": 30,
|
| 116 |
+
"email": 45,
|
| 117 |
+
"family-name": 3,
|
| 118 |
+
"floor": 47,
|
| 119 |
+
"given-name": 2,
|
| 120 |
+
"honorific-prefix": 9,
|
| 121 |
+
"honorific-suffix": 10,
|
| 122 |
+
"id-number": 63,
|
| 123 |
+
"loginname": 66,
|
| 124 |
+
"name": 4,
|
| 125 |
+
"nickname": 11,
|
| 126 |
+
"organization": 39,
|
| 127 |
+
"organization-title": 40,
|
| 128 |
+
"other": 1,
|
| 129 |
+
"phonetic-family-name": 7,
|
| 130 |
+
"phonetic-given-name": 6,
|
| 131 |
+
"phonetic-name": 8,
|
| 132 |
+
"postal-code": 25,
|
| 133 |
+
"postal-code-and-city": 27,
|
| 134 |
+
"postal-code-lookup": 26,
|
| 135 |
+
"postal-code-or-suburb": 28,
|
| 136 |
+
"reference-point": 65,
|
| 137 |
+
"sex": 62,
|
| 138 |
+
"stair": 48,
|
| 139 |
+
"street": 21,
|
| 140 |
+
"street-address": 12,
|
| 141 |
+
"tel": 31,
|
| 142 |
+
"tel-area-code": 34,
|
| 143 |
+
"tel-country-code": 32,
|
| 144 |
+
"tel-extension": 38,
|
| 145 |
+
"tel-local": 35,
|
| 146 |
+
"tel-local-prefix": 36,
|
| 147 |
+
"tel-local-suffix": 37,
|
| 148 |
+
"tel-national": 33,
|
| 149 |
+
"vat-number": 64
|
| 150 |
+
},
|
| 151 |
+
"layer_norm_eps": 1e-12,
|
| 152 |
+
"max_position_embeddings": 512,
|
| 153 |
+
"model_type": "bert",
|
| 154 |
+
"num_attention_heads": 12,
|
| 155 |
+
"num_hidden_layers": 4,
|
| 156 |
+
"pad_token_id": 0,
|
| 157 |
+
"position_embedding_type": "absolute",
|
| 158 |
+
"pre_trained": "",
|
| 159 |
+
"problem_type": "single_label_classification",
|
| 160 |
+
"structure": [],
|
| 161 |
+
"transformers_version": "4.57.6",
|
| 162 |
+
"type_vocab_size": 2,
|
| 163 |
+
"use_cache": true,
|
| 164 |
+
"vocab_size": 30522
|
| 165 |
+
}
|
onnx/model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ca907af8e24d3ec61567bdd42dfe31aeb873a9eeff8023df2aa1fc5b73a06d7
|
| 3 |
+
size 57560725
|
onnx/model_bnb4.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bb31133ea6c69123ba1273ee52bae452cb83e16d5d3f765d8d28675f6df96da1
|
| 3 |
+
size 41914512
|
onnx/model_fp16.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5192afb88fa3b4f64db2362f32518ee8ed445fba49fbe426310d0e8108a3aab6
|
| 3 |
+
size 28851649
|
onnx/model_int8.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d3b69884aab01e3af514935c70a24bf935ea87272ebb4d0dd6656fc8efe4eb6d
|
| 3 |
+
size 14563081
|
onnx/model_q4.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e13491b840aa0ae3666c443a280277455f84d3012fe33d031a3b13576610856b
|
| 3 |
+
size 42260304
|
onnx/model_q4f16.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:87f433c2775284502dd25c6ab8eaf281f3a01dca8797bfc695599939493dc230
|
| 3 |
+
size 22361887
|
onnx/model_quantized.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d3b69884aab01e3af514935c70a24bf935ea87272ebb4d0dd6656fc8efe4eb6d
|
| 3 |
+
size 14563081
|
onnx/model_uint8.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b9a6ff07e301d108e2d7e0e777b777182066c08556bdee3b26f381f26a9f9b4f
|
| 3 |
+
size 14563097
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": {
|
| 3 |
+
"content": "[CLS]",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"mask_token": {
|
| 10 |
+
"content": "[MASK]",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": {
|
| 17 |
+
"content": "[PAD]",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"sep_token": {
|
| 24 |
+
"content": "[SEP]",
|
| 25 |
+
"lstrip": false,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"unk_token": {
|
| 31 |
+
"content": "[UNK]",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
}
|
| 37 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"100": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"101": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"102": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"103": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": true,
|
| 45 |
+
"cls_token": "[CLS]",
|
| 46 |
+
"do_basic_tokenize": true,
|
| 47 |
+
"do_lower_case": true,
|
| 48 |
+
"extra_special_tokens": {},
|
| 49 |
+
"mask_token": "[MASK]",
|
| 50 |
+
"max_length": 512,
|
| 51 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 52 |
+
"never_split": null,
|
| 53 |
+
"pad_token": "[PAD]",
|
| 54 |
+
"sep_token": "[SEP]",
|
| 55 |
+
"stride": 0,
|
| 56 |
+
"strip_accents": null,
|
| 57 |
+
"tokenize_chinese_chars": true,
|
| 58 |
+
"tokenizer_class": "BertTokenizer",
|
| 59 |
+
"truncation_side": "right",
|
| 60 |
+
"truncation_strategy": "longest_first",
|
| 61 |
+
"unk_token": "[UNK]"
|
| 62 |
+
}
|
vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|