raphael-r committed on
Commit
801819e
·
verified ·
1 Parent(s): d69ab71

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - gliner
4
+ - ner
5
+ - medical
6
+ - french
7
+ language:
8
+ - fr
9
+ ---
10
+ # BRIGHT NER: GLiNER2 fine-tuned for symptoms_evolution
11
+
12
+ ## Description
13
+ This is a GLiNER2 model fine-tuned to extract clinical neuro-oncology entities belonging to the `symptoms_evolution` semantic group. It was trained on a synthetic dataset generated for the properly de-identified BRIGHT project dataset (see the `generated_data` folder in the primary repository).
14
+
15
+ This model repository was specifically designed to fit within the overarching `bright_db` namespace.
16
+
17
+ ## Fields
18
+ It extracts the following fields (described in French):
19
+ - **epilepsie_1er_symptome**: Crises comme premier symptôme
20
+ - **ceph_hic_1er_symptome**: Céphalées/HIC comme premier symptôme
21
+ - **ceph_hic**: Mention céphalées/HIC
22
+ - **deficit_1er_symptome**: Déficit neurologique initial
23
+ - **deficit**: Mention déficit neurologique
24
+ - **cognitif_1er_symptome**: Troubles cognitifs comme premier symptôme
25
+ - **cognitif**: Mention troubles cognitifs
26
+ - **autre_trouble_1er_symptome**: Autres premiers symptômes
27
+ - **contraste_1er_symptome**: Prise de contraste sur première imagerie
28
+ - **prise_de_contraste**: Mention générale prise de contraste
29
+ - **oedeme_1er_symptome**: Présence œdème sur première imagerie
30
+ - **calcif_1er_symptome**: Présence calcification sur première imagerie
31
+ - **epilepsie**: Mention épilepsie/crises
32
+ - **autre_trouble**: Autres symptômes actuels
33
+ - **evol_clinique**: Évolution globale (stable, progression)
34
+ - **progress_clinique**: Aggravation symptômes
35
+ - **progress_radiologique**: Croissance tumorale imagerie
36
+ - **reponse_radiologique**: Réponse tumorale imagerie
37
+
38
+ ## Performance on Validation Set
39
+ **Aggregates**:
40
+ - Macro F1: 0.1530 (Precision: 0.1166, Recall: 0.4584)
41
+ - Micro F1: 0.1969 (Precision: 0.1104, Recall: 0.9105)
42
+
43
+ **Per-Label Breakdowns**:
44
+ | Label | Precision | Recall | F1 |
45
+ |---|---|---|---|
46
+ | epilepsie_1er_symptome | 0.0099 | 0.5000 | 0.0194 |
47
+ | ceph_hic_1er_symptome | 0.0100 | 1.0000 | 0.0198 |
48
+ | ceph_hic | 0.0000 | 0.0000 | 0.0000 |
49
+ | deficit_1er_symptome | 0.0000 | 0.0000 | 0.0000 |
50
+ | deficit | 0.0000 | 0.0000 | 0.0000 |
51
+ | cognitif_1er_symptome | 0.0000 | 0.0000 | 0.0000 |
52
+ | cognitif | 0.0000 | 0.0000 | 0.0000 |
53
+ | autre_trouble_1er_symptome | 0.5429 | 0.8261 | 0.6552 |
54
+ | contraste_1er_symptome | 0.0500 | 1.0000 | 0.0952 |
55
+ | prise_de_contraste | 0.2100 | 1.0000 | 0.3471 |
56
+ | oedeme_1er_symptome | 0.2115 | 0.9565 | 0.3465 |
57
+ | calcif_1er_symptome | 0.0000 | 0.0000 | 0.0000 |
58
+ | epilepsie | 0.0800 | 1.0000 | 0.1481 |
59
+ | autre_trouble | 0.1154 | 1.0000 | 0.2069 |
60
+ | evol_clinique | 0.8692 | 0.9688 | 0.9163 |
61
+ | progress_clinique | 0.0000 | 0.0000 | 0.0000 |
62
+ | progress_radiologique | 0.0000 | 0.0000 | 0.0000 |
63
+ | reponse_radiologique | 0.0000 | 0.0000 | 0.0000 |
64
+
65
+ ## Usage
66
+
67
+ ```python
68
+ # Inference Code
69
+ from gliner2 import GLiNER2
70
+
71
+ model = GLiNER2.from_pretrained("raphael-r/bright-gliner-symptoms_evolution")
72
+ text = "Patient presenting with epileptic seizures..."
73
+ entities = model.extract_entities(text)
74
+
75
+ for entity in entities:
76
+     print(entity["text"], "=>", entity["label"])
77
+ ```
config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "counting_layer": "count_lstm",
4
+ "max_width": 8,
5
+ "model_name": "microsoft/mdeberta-v3-base",
6
+ "model_type": "extractor",
7
+ "token_pooling": "first",
8
+ "transformers_version": "5.0.0"
9
+ }
encoder_config/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "bos_token_id": null,
5
+ "dtype": "float32",
6
+ "eos_token_id": null,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 3072,
12
+ "layer_norm_eps": 1e-07,
13
+ "legacy": true,
14
+ "max_position_embeddings": 512,
15
+ "max_relative_positions": -1,
16
+ "model_type": "deberta-v2",
17
+ "norm_rel_ebd": "layer_norm",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 0,
21
+ "pooler_dropout": 0,
22
+ "pooler_hidden_act": "gelu",
23
+ "pooler_hidden_size": 768,
24
+ "pos_att_type": [
25
+ "p2c",
26
+ "c2p"
27
+ ],
28
+ "position_biased_input": false,
29
+ "position_buckets": 256,
30
+ "relative_attention": true,
31
+ "share_att_key": true,
32
+ "tie_word_embeddings": true,
33
+ "transformers_version": "5.0.0",
34
+ "type_vocab_size": 0,
35
+ "vocab_size": 250112
36
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bbb18c664d8d67661c6ab1249b6439cf378ab6cf969e2f55b30a5da5b09a2c8
3
+ size 1228421964
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1c7ccb287623cccb7c03150953b6d2a09dd95122933393c9151c3a60095c97e
3
+ size 16337353
tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "[CLS]",
4
+ "clean_up_tokenization_spaces": false,
5
+ "cls_token": "[CLS]",
6
+ "do_lower_case": false,
7
+ "eos_token": "[SEP]",
8
+ "extra_special_tokens": [
9
+ "[SEP_STRUCT]",
10
+ "[SEP_TEXT]",
11
+ "[P]",
12
+ "[C]",
13
+ "[E]",
14
+ "[R]",
15
+ "[L]",
16
+ "[EXAMPLE]",
17
+ "[OUTPUT]",
18
+ "[DESCRIPTION]"
19
+ ],
20
+ "is_local": false,
21
+ "mask_token": "[MASK]",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {},
24
+ "pad_token": "[PAD]",
25
+ "sep_token": "[SEP]",
26
+ "sp_model_kwargs": {},
27
+ "split_by_punct": false,
28
+ "tokenizer_class": "TokenizersBackend",
29
+ "unk_token": "[UNK]",
30
+ "vocab_type": "spm"
31
+ }