Add layoutlm-camembertv2-qa
Browse files- README +60 -0
- config.json +21 -0
- model.safetensors +3 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +16 -0
- tokenizer.json +0 -0
- tokenizer_config.json +64 -0
README
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# layoutlm-camembertv2-qa
|
| 2 |
+
|
| 3 |
+
This repository contains **layoutlm-camembertv2-qa** weights exported to `safetensors` format.
|
| 4 |
+
|
| 5 |
+
## Source
|
| 6 |
+
|
| 7 |
+
These weights are derived from pretrained models:
|
| 8 |
+
|
| 9 |
+
- **Layout encoder (LayoutLM)**: [`microsoft/layoutlm-base-uncased`](https://huggingface.co/microsoft/layoutlm-base-uncased) — pretrained on the IIT-CDIP document collection with the masked visual-language modeling objective (see the LayoutLM paper)
|
| 10 |
+
- **Text encoder**: [`almanach/camembertv2-base`](https://huggingface.co/almanach/camembertv2-base) — French language model (RoBERTa-like architecture)
|
| 11 |
+
|
| 12 |
+
## Methodology
|
| 13 |
+
|
| 14 |
+
This checkpoint was produced by **weight merging**, not end-to-end training.
|
| 15 |
+
|
| 16 |
+
1. Load the pretrained layout encoder weights (LayoutLM) — these are kept intact
|
| 17 |
+
2. Replace the text encoder weights (embeddings, attention layers, FFN) with those from the French model
|
| 18 |
+
3. Update the tokenizer and vocabulary configuration accordingly
|
| 19 |
+
|
| 20 |
+
No training or fine-tuning was performed at this stage.
|
| 21 |
+
This checkpoint is intended as a **starting point** for downstream fine-tuning on French document understanding tasks (NER, token classification, extractive QA…).
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
## Files
|
| 25 |
+
|
| 26 |
+
| File | Description |
|
| 27 |
+
|------|-------------|
|
| 28 |
+
| `model.safetensors` | Model weights (safetensors format) |
|
| 29 |
+
| `pytorch_model.bin` | Model weights (PyTorch format) |
|
| 30 |
+
| `config.json` | Model configuration |
|
| 31 |
+
| `tokenizer_config.json` | Tokenizer configuration |
|
| 32 |
+
| `README.md` | This model card |
|
| 33 |
+
|
| 34 |
+
## Usage
|
| 35 |
+
|
| 36 |
+
```python
|
| 37 |
+
from transformers import AutoTokenizer, AutoModel
|
| 38 |
+
|
| 39 |
+
tokenizer = AutoTokenizer.from_pretrained("USERNAME/layoutlm-camembertv2-qa")
|
| 40 |
+
model = AutoModel.from_pretrained("USERNAME/layoutlm-camembertv2-qa")
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## Limitations
|
| 44 |
+
|
| 45 |
+
- This model has **not been fine-tuned** on any French document dataset
|
| 46 |
+
- Performance on downstream tasks is **not guaranteed** without task-specific fine-tuning
|
| 47 |
+
- Intended for research and experimentation purposes
|
| 48 |
+
|
| 49 |
+
## License
|
| 50 |
+
|
| 51 |
+
Weights are derived from models released under the MIT and Apache-2.0 licenses.
|
| 52 |
+
Please refer to the original repositories for full license terms.
|
| 53 |
+
|
| 54 |
+
## Acknowledgements
|
| 55 |
+
|
| 56 |
+
- [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) — Xu et al., 2020
|
| 57 |
+
- [`microsoft/layoutlm-base-uncased`](https://huggingface.co/microsoft/layoutlm-base-uncased)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
> **Note**: This is not an official release from any of the above organizations.
|
config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LayoutLMForQuestionAnswering"
|
| 4 |
+
],
|
| 5 |
+
"model_type": "layoutlm",
|
| 6 |
+
"hidden_size": 768,
|
| 7 |
+
"num_hidden_layers": 12,
|
| 8 |
+
"num_attention_heads": 12,
|
| 9 |
+
"intermediate_size": 3072,
|
| 10 |
+
"hidden_act": "gelu",
|
| 11 |
+
"hidden_dropout_prob": 0.1,
|
| 12 |
+
"attention_probs_dropout_prob": 0.1,
|
| 13 |
+
"max_position_embeddings": 1025,
|
| 14 |
+
"max_2d_position_embeddings": 1024,
|
| 15 |
+
"type_vocab_size": 1,
|
| 16 |
+
"vocab_size": 32768,
|
| 17 |
+
"pad_token_id": 0,
|
| 18 |
+
"layer_norm_eps": 1e-12,
|
| 19 |
+
"initializer_range": 0.02,
|
| 20 |
+
"num_labels": 2
|
| 21 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ee3fb60eec62b6a9962c0737e20f45ba42e1f0e956a148483945c07ec0f9e45
|
| 3 |
+
size 457440704
|
pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:886011a878e2cd1ec388342141aec26820337dc2e7d7595d5d1c72bca987d770
|
| 3 |
+
size 457484503
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"[PAD]",
|
| 4 |
+
"[CLS]",
|
| 5 |
+
"[SEP]",
|
| 6 |
+
"[UNK]",
|
| 7 |
+
"[MASK]"
|
| 8 |
+
],
|
| 9 |
+
"bos_token": "[CLS]",
|
| 10 |
+
"cls_token": "[CLS]",
|
| 11 |
+
"eos_token": "[SEP]",
|
| 12 |
+
"mask_token": "[MASK]",
|
| 13 |
+
"pad_token": "[PAD]",
|
| 14 |
+
"sep_token": "[SEP]",
|
| 15 |
+
"unk_token": "[UNK]"
|
| 16 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": true,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"0": {
|
| 5 |
+
"content": "[PAD]",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": false,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
},
|
| 12 |
+
"1": {
|
| 13 |
+
"content": "[CLS]",
|
| 14 |
+
"lstrip": false,
|
| 15 |
+
"normalized": false,
|
| 16 |
+
"rstrip": false,
|
| 17 |
+
"single_word": false,
|
| 18 |
+
"special": true
|
| 19 |
+
},
|
| 20 |
+
"2": {
|
| 21 |
+
"content": "[SEP]",
|
| 22 |
+
"lstrip": false,
|
| 23 |
+
"normalized": false,
|
| 24 |
+
"rstrip": false,
|
| 25 |
+
"single_word": false,
|
| 26 |
+
"special": true
|
| 27 |
+
},
|
| 28 |
+
"3": {
|
| 29 |
+
"content": "[UNK]",
|
| 30 |
+
"lstrip": false,
|
| 31 |
+
"normalized": false,
|
| 32 |
+
"rstrip": false,
|
| 33 |
+
"single_word": false,
|
| 34 |
+
"special": true
|
| 35 |
+
},
|
| 36 |
+
"4": {
|
| 37 |
+
"content": "[MASK]",
|
| 38 |
+
"lstrip": false,
|
| 39 |
+
"normalized": false,
|
| 40 |
+
"rstrip": false,
|
| 41 |
+
"single_word": false,
|
| 42 |
+
"special": true
|
| 43 |
+
}
|
| 44 |
+
},
|
| 45 |
+
"additional_special_tokens": [
|
| 46 |
+
"[PAD]",
|
| 47 |
+
"[CLS]",
|
| 48 |
+
"[SEP]",
|
| 49 |
+
"[UNK]",
|
| 50 |
+
"[MASK]"
|
| 51 |
+
],
|
| 52 |
+
"bos_token": "[CLS]",
|
| 53 |
+
"clean_up_tokenization_spaces": true,
|
| 54 |
+
"cls_token": "[CLS]",
|
| 55 |
+
"eos_token": "[SEP]",
|
| 56 |
+
"errors": "replace",
|
| 57 |
+
"mask_token": "[MASK]",
|
| 58 |
+
"model_max_length": 1024,
|
| 59 |
+
"pad_token": "[PAD]",
|
| 60 |
+
"sep_token": "[SEP]",
|
| 61 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
| 62 |
+
"trim_offsets": true,
|
| 63 |
+
"unk_token": "[UNK]"
|
| 64 |
+
}
|