Upload folder using huggingface_hub
Browse files- README.md +72 -0
- config.cfg +43 -0
- meta.json +3 -0
- ner/parameters.safetensors +3 -0
- text_cnn/parameters.safetensors +3 -0
- tokenizer +0 -0
- transformer/config.json +29 -0
- transformer/model.safetensors +3 -0
- transformer/parameters.safetensors +3 -0
- transformer/tokenizer.json +0 -0
- transformer/tokenizer_config.json +20 -0
README.md
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- eds
|
| 4 |
+
- ner
|
| 5 |
+
- medical
|
| 6 |
+
- french
|
| 7 |
+
language:
|
| 8 |
+
- fr
|
| 9 |
+
---
|
| 10 |
+
# BRIGHT NER: EDS-NLP (CamemBERT + CRF) fine-tuned for molecular
|
| 11 |
+
|
| 12 |
+
## Description
|
| 13 |
+
This is an EDS-NLP (CamemBERT + CRF) model fine-tuned to extract clinical neuro-oncology entities related to the `molecular` semantic group. It was trained on a synthetic dataset generated for the properly de-identified BRIGHT project dataset (see the `generated_data` folder in the primary repository).
|
| 14 |
+
|
| 15 |
+
This model repository was specifically designed to fit within the `bright_db` overarching namespace.
|
| 16 |
+
|
| 17 |
+
## Fields
|
| 18 |
+
It extracts the following fields (described in French):
|
| 19 |
+
- **mol_idh1**: Statut mutation IDH1
|
| 20 |
+
- **mol_idh2**: Statut mutation IDH2
|
| 21 |
+
- **mol_mgmt**: Méthylation promoteur MGMT
|
| 22 |
+
- **mol_h3f3a**: Mutation H3F3A
|
| 23 |
+
- **mol_hist1h3b**: Mutation HIST1H3B
|
| 24 |
+
- **mol_tert**: Mutation promoteur TERT
|
| 25 |
+
- **mol_CDKN2A**: Délétion homozygote CDKN2A
|
| 26 |
+
- **mol_atrx**: Mutation ATRX
|
| 27 |
+
- **mol_cic**: Mutation CIC
|
| 28 |
+
- **mol_fubp1**: Mutation FUBP1
|
| 29 |
+
- **mol_fgfr1**: Mutation FGFR1
|
| 30 |
+
- **mol_egfr_mut**: Mutation EGFR
|
| 31 |
+
- **mol_prkca**: Mutation PRKCA
|
| 32 |
+
- **mol_pten**: Mutation PTEN
|
| 33 |
+
- **mol_p53**: Mutation p53
|
| 34 |
+
- **mol_braf**: Mutation BRAF
|
| 35 |
+
|
| 36 |
+
## Performance on Validation Set
|
| 37 |
+
**Aggregates**:
|
| 38 |
+
- Macro F1: 0.5636 (Precision: 0.5866, Recall: 0.5486)
|
| 39 |
+
- Micro F1: 0.8612 (Precision: 0.8680, Recall: 0.8544)
|
| 40 |
+
|
| 41 |
+
**Per-Label Breakdowns**:
|
| 42 |
+
| Label | Precision | Recall | F1 |
|
| 43 |
+
|---|---|---|---|
|
| 44 |
+
| mol_idh1 | 0.9481 | 0.9309 | 0.9394 |
|
| 45 |
+
| mol_idh2 | 0.7929 | 0.7929 | 0.7929 |
|
| 46 |
+
| mol_mgmt | 0.8317 | 0.8984 | 0.8638 |
|
| 47 |
+
| mol_h3f3a | 0.8571 | 0.9231 | 0.8889 |
|
| 48 |
+
| mol_hist1h3b | 0.0000 | 0.0000 | 0.0000 |
|
| 49 |
+
| mol_tert | 0.8283 | 0.8542 | 0.8410 |
|
| 50 |
+
| mol_CDKN2A | 0.8148 | 0.7333 | 0.7719 |
|
| 51 |
+
| mol_atrx | 0.9714 | 0.7556 | 0.8500 |
|
| 52 |
+
| mol_cic | 0.9130 | 0.7000 | 0.7925 |
|
| 53 |
+
| mol_fubp1 | 0.8276 | 0.9600 | 0.8889 |
|
| 54 |
+
| mol_fgfr1 | 0.0000 | 0.0000 | 0.0000 |
|
| 55 |
+
| mol_egfr_mut | 0.0000 | 0.0000 | 0.0000 |
|
| 56 |
+
| mol_prkca | 0.0000 | 0.0000 | 0.0000 |
|
| 57 |
+
| mol_pten | 0.6000 | 0.4286 | 0.5000 |
|
| 58 |
+
| mol_p53 | 0.0000 | 0.0000 | 0.0000 |
|
| 59 |
+
| mol_braf | 1.0000 | 0.8000 | 0.8889 |
|
| 60 |
+
|
| 61 |
+
## Usage
|
| 62 |
+
|
| 63 |
+
```python
|
| 64 |
+
# Inference Code
|
| 65 |
+
import edsnlp
|
| 66 |
+
|
| 67 |
+
nlp = edsnlp.load("raphael-r/bright-eds-molecular")
|
| 68 |
+
doc = nlp("Patient presenting with epileptic seizures...")
|
| 69 |
+
|
| 70 |
+
for ent in doc.ents:
|
| 71 |
+
print(ent.text, "=>", ent.label_)
|
| 72 |
+
```
|
config.cfg
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[nlp]
|
| 2 |
+
lang = "fr"
|
| 3 |
+
pipeline = ["normalizer", "sentences", "transformer", "text_cnn", "ner"]
|
| 4 |
+
|
| 5 |
+
[nlp.tokenizer]
|
| 6 |
+
@tokenizers = "spacy.Tokenizer.v1"
|
| 7 |
+
|
| 8 |
+
[components]
|
| 9 |
+
|
| 10 |
+
[components.normalizer]
|
| 11 |
+
@factory = "eds.normalizer"
|
| 12 |
+
|
| 13 |
+
[components.sentences]
|
| 14 |
+
@factory = "eds.sentences"
|
| 15 |
+
|
| 16 |
+
[components.transformer]
|
| 17 |
+
@factory = "eds.transformer"
|
| 18 |
+
model = "./transformer"
|
| 19 |
+
window = 510
|
| 20 |
+
stride = 382
|
| 21 |
+
|
| 22 |
+
[components.text_cnn]
|
| 23 |
+
@factory = "eds.text_cnn"
|
| 24 |
+
embedding = ${components.transformer}
|
| 25 |
+
kernel_sizes = (3, 12)
|
| 26 |
+
residual = true
|
| 27 |
+
normalize = "pre"
|
| 28 |
+
|
| 29 |
+
[components.ner]
|
| 30 |
+
@factory = "eds.ner_crf"
|
| 31 |
+
embedding = ${components.text_cnn}
|
| 32 |
+
labels = ["mol_idh1", "mol_idh2", "mol_mgmt", "mol_h3f3a", "mol_hist1h3b", "mol_tert", "mol_CDKN2A", "mol_atrx", "mol_cic", "mol_fubp1", "mol_fgfr1", "mol_egfr_mut", "mol_prkca", "mol_pten", "mol_p53", "mol_braf"]
|
| 33 |
+
infer_span_setter = true
|
| 34 |
+
mode = "joint"
|
| 35 |
+
window = 0
|
| 36 |
+
stride = 0
|
| 37 |
+
|
| 38 |
+
[components.ner.target_span_getter]
|
| 39 |
+
ents = true
|
| 40 |
+
|
| 41 |
+
[components.ner.span_setter]
|
| 42 |
+
ents = ["mol_CDKN2A", "mol_atrx", "mol_braf", "mol_cic", "mol_egfr_mut", "mol_fgfr1", "mol_fubp1", "mol_h3f3a", "mol_idh1", "mol_idh2", "mol_mgmt", "mol_p53", "mol_prkca", "mol_pten", "mol_tert"]
|
| 43 |
+
|
meta.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
|
| 3 |
+
}
|
ner/parameters.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3d810b1c29cb6802bca74523b0a25641ac4eaa953143c1e81379d04b28355ca3
|
| 3 |
+
size 246911
|
text_cnn/parameters.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e33ff0b146b3d273512aa08ff8dcdc24f6094e875a2492eae684bac1e48f857e
|
| 3 |
+
size 40117816
|
tokenizer
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
transformer/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_cross_attention": false,
|
| 3 |
+
"architectures": [
|
| 4 |
+
"CamembertModel"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"bos_token_id": 5,
|
| 8 |
+
"classifier_dropout": null,
|
| 9 |
+
"dtype": "float32",
|
| 10 |
+
"eos_token_id": 6,
|
| 11 |
+
"hidden_act": "gelu",
|
| 12 |
+
"hidden_dropout_prob": 0.1,
|
| 13 |
+
"hidden_size": 768,
|
| 14 |
+
"initializer_range": 0.02,
|
| 15 |
+
"intermediate_size": 3072,
|
| 16 |
+
"is_decoder": false,
|
| 17 |
+
"layer_norm_eps": 1e-05,
|
| 18 |
+
"max_position_embeddings": 514,
|
| 19 |
+
"model_type": "camembert",
|
| 20 |
+
"num_attention_heads": 12,
|
| 21 |
+
"num_hidden_layers": 12,
|
| 22 |
+
"output_past": true,
|
| 23 |
+
"pad_token_id": 1,
|
| 24 |
+
"position_embedding_type": "absolute",
|
| 25 |
+
"transformers_version": "5.0.0",
|
| 26 |
+
"type_vocab_size": 1,
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"vocab_size": 32005
|
| 29 |
+
}
|
transformer/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cbc56233e2439980c294eb577fb157a465b03d8e1b361f1d5082b6b1e7a57144
|
| 3 |
+
size 442510152
|
transformer/parameters.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:11f513f3f694fea58fb103c1507640d2b8a21eca9e4fdb6d9ca50a7385583e7e
|
| 3 |
+
size 11584
|
transformer/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
transformer/tokenizer_config.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": true,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": "<s>",
|
| 5 |
+
"clean_up_tokenization_spaces": true,
|
| 6 |
+
"cls_token": "<s>",
|
| 7 |
+
"eos_token": "</s>",
|
| 8 |
+
"extra_special_tokens": [
|
| 9 |
+
"<s>NOTUSED",
|
| 10 |
+
"</s>NOTUSED",
|
| 11 |
+
"<unk>NOTUSED"
|
| 12 |
+
],
|
| 13 |
+
"is_local": true,
|
| 14 |
+
"mask_token": "<mask>",
|
| 15 |
+
"model_max_length": 512,
|
| 16 |
+
"pad_token": "<pad>",
|
| 17 |
+
"sep_token": "</s>",
|
| 18 |
+
"tokenizer_class": "CamembertTokenizer",
|
| 19 |
+
"unk_token": "<unk>"
|
| 20 |
+
}
|