raphael-r committed on
Commit
d932b6f
·
verified ·
1 Parent(s): 0330a5c

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - eds
4
+ - ner
5
+ - medical
6
+ - french
7
+ language:
8
+ - fr
9
+ ---
10
+ # BRIGHT NER: EDS-NLP (CamemBERT + CRF) fine-tuned for molecular
11
+
12
+ ## Description
13
+ This model uses an EDS-NLP (CamemBERT + CRF) architecture fine-tuned to extract clinical neuro-oncology entities related to the `molecular` semantic group. It was trained on a synthetic dataset generated for the properly de-identified BRIGHT project dataset (see the `generated_data` folder in the primary repository).
14
+
15
+ This model repository was specifically designed to fit within the `bright_db` overarching namespace.
16
+
17
+ ## Fields
18
+ It extracts the following fields (described in French):
19
+ - **mol_idh1**: Statut mutation IDH1
20
+ - **mol_idh2**: Statut mutation IDH2
21
+ - **mol_mgmt**: Méthylation promoteur MGMT
22
+ - **mol_h3f3a**: Mutation H3F3A
23
+ - **mol_hist1h3b**: Mutation HIST1H3B
24
+ - **mol_tert**: Mutation promoteur TERT
25
+ - **mol_CDKN2A**: Délétion homozygote CDKN2A
26
+ - **mol_atrx**: Mutation ATRX
27
+ - **mol_cic**: Mutation CIC
28
+ - **mol_fubp1**: Mutation FUBP1
29
+ - **mol_fgfr1**: Mutation FGFR1
30
+ - **mol_egfr_mut**: Mutation EGFR
31
+ - **mol_prkca**: Mutation PRKCA
32
+ - **mol_pten**: Mutation PTEN
33
+ - **mol_p53**: Mutation p53
34
+ - **mol_braf**: Mutation BRAF
35
+
36
+ ## Performance on Validation Set
37
+ **Aggregates**:
38
+ - Macro F1: 0.5636 (Precision: 0.5866, Recall: 0.5486)
39
+ - Micro F1: 0.8612 (Precision: 0.8680, Recall: 0.8544)
40
+
41
+ **Per-Label Breakdowns**:
42
+ | Label | Precision | Recall | F1 |
43
+ |---|---|---|---|
44
+ | mol_idh1 | 0.9481 | 0.9309 | 0.9394 |
45
+ | mol_idh2 | 0.7929 | 0.7929 | 0.7929 |
46
+ | mol_mgmt | 0.8317 | 0.8984 | 0.8638 |
47
+ | mol_h3f3a | 0.8571 | 0.9231 | 0.8889 |
48
+ | mol_hist1h3b | 0.0000 | 0.0000 | 0.0000 |
49
+ | mol_tert | 0.8283 | 0.8542 | 0.8410 |
50
+ | mol_CDKN2A | 0.8148 | 0.7333 | 0.7719 |
51
+ | mol_atrx | 0.9714 | 0.7556 | 0.8500 |
52
+ | mol_cic | 0.9130 | 0.7000 | 0.7925 |
53
+ | mol_fubp1 | 0.8276 | 0.9600 | 0.8889 |
54
+ | mol_fgfr1 | 0.0000 | 0.0000 | 0.0000 |
55
+ | mol_egfr_mut | 0.0000 | 0.0000 | 0.0000 |
56
+ | mol_prkca | 0.0000 | 0.0000 | 0.0000 |
57
+ | mol_pten | 0.6000 | 0.4286 | 0.5000 |
58
+ | mol_p53 | 0.0000 | 0.0000 | 0.0000 |
59
+ | mol_braf | 1.0000 | 0.8000 | 0.8889 |
60
+
61
+ ## Usage
62
+
63
+ ```python
64
+ # Inference Code
65
+ import edsnlp
66
+
67
+ nlp = edsnlp.load("raphael-r/bright-eds-molecular")
68
+ doc = nlp("Patient presenting with epileptic seizures...")
69
+
70
+ for ent in doc.ents:
71
+ print(ent.text, "=>", ent.label_)
72
+ ```
config.cfg ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [nlp]
2
+ lang = "fr"
3
+ pipeline = ["normalizer", "sentences", "transformer", "text_cnn", "ner"]
4
+
5
+ [nlp.tokenizer]
6
+ @tokenizers = "spacy.Tokenizer.v1"
7
+
8
+ [components]
9
+
10
+ [components.normalizer]
11
+ @factory = "eds.normalizer"
12
+
13
+ [components.sentences]
14
+ @factory = "eds.sentences"
15
+
16
+ [components.transformer]
17
+ @factory = "eds.transformer"
18
+ model = "./transformer"
19
+ window = 510
20
+ stride = 382
21
+
22
+ [components.text_cnn]
23
+ @factory = "eds.text_cnn"
24
+ embedding = ${components.transformer}
25
+ kernel_sizes = (3, 12)
26
+ residual = true
27
+ normalize = "pre"
28
+
29
+ [components.ner]
30
+ @factory = "eds.ner_crf"
31
+ embedding = ${components.text_cnn}
32
+ labels = ["mol_idh1", "mol_idh2", "mol_mgmt", "mol_h3f3a", "mol_hist1h3b", "mol_tert", "mol_CDKN2A", "mol_atrx", "mol_cic", "mol_fubp1", "mol_fgfr1", "mol_egfr_mut", "mol_prkca", "mol_pten", "mol_p53", "mol_braf"]
33
+ infer_span_setter = true
34
+ mode = "joint"
35
+ window = 0
36
+ stride = 0
37
+
38
+ [components.ner.target_span_getter]
39
+ ents = true
40
+
41
+ [components.ner.span_setter]
42
+ ents = ["mol_CDKN2A", "mol_atrx", "mol_braf", "mol_cic", "mol_egfr_mut", "mol_fgfr1", "mol_fubp1", "mol_h3f3a", "mol_hist1h3b", "mol_idh1", "mol_idh2", "mol_mgmt", "mol_p53", "mol_prkca", "mol_pten", "mol_tert"]
43
+
meta.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+
3
+ }
ner/parameters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d810b1c29cb6802bca74523b0a25641ac4eaa953143c1e81379d04b28355ca3
3
+ size 246911
text_cnn/parameters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e33ff0b146b3d273512aa08ff8dcdc24f6094e875a2492eae684bac1e48f857e
3
+ size 40117816
tokenizer ADDED
The diff for this file is too large to render. See raw diff
 
transformer/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "CamembertModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 5,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": 6,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 3072,
16
+ "is_decoder": false,
17
+ "layer_norm_eps": 1e-05,
18
+ "max_position_embeddings": 514,
19
+ "model_type": "camembert",
20
+ "num_attention_heads": 12,
21
+ "num_hidden_layers": 12,
22
+ "output_past": true,
23
+ "pad_token_id": 1,
24
+ "position_embedding_type": "absolute",
25
+ "transformers_version": "5.0.0",
26
+ "type_vocab_size": 1,
27
+ "use_cache": true,
28
+ "vocab_size": 32005
29
+ }
transformer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbc56233e2439980c294eb577fb157a465b03d8e1b361f1d5082b6b1e7a57144
3
+ size 442510152
transformer/parameters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11f513f3f694fea58fb103c1507640d2b8a21eca9e4fdb6d9ca50a7385583e7e
3
+ size 11584
transformer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
transformer/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "clean_up_tokenization_spaces": true,
6
+ "cls_token": "<s>",
7
+ "eos_token": "</s>",
8
+ "extra_special_tokens": [
9
+ "<s>NOTUSED",
10
+ "</s>NOTUSED",
11
+ "<unk>NOTUSED"
12
+ ],
13
+ "is_local": true,
14
+ "mask_token": "<mask>",
15
+ "model_max_length": 512,
16
+ "pad_token": "<pad>",
17
+ "sep_token": "</s>",
18
+ "tokenizer_class": "CamembertTokenizer",
19
+ "unk_token": "<unk>"
20
+ }