raphael-r committed on
Commit
1c8950f
·
verified ·
1 Parent(s): 138a287

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - gliner
4
+ - ner
5
+ - medical
6
+ - french
7
+ language:
8
+ - fr
9
+ ---
10
+ # BRIGHT NER: GLiNER2 fine-tuned for dates_outcomes
11
+
12
+ ## Description
13
+ This is a GLiNER2 model fine-tuned to extract clinical neuro-oncology entities belonging to the `dates_outcomes` semantic group. It was trained on a synthetic dataset generated for the properly de-identified BRIGHT project dataset (see the `generated_data` folder in the primary repository).
14
+
15
+ This model repository is specifically designed to fit within the overarching `bright_db` namespace.
16
+
17
+ ## Fields
18
+ It extracts the following fields (described in French):
19
+ - **date_chir**: Date intervention neurochirurgicale ou résection
20
+ - **date_rcp**: Date réunion concertation pluridisciplinaire
21
+ - **dn_date**: Date dernières nouvelles ou dernier suivi
22
+ - **date_deces**: Date décès patient (seulement si décédé)
23
+ - **date_1er_symptome**: Date apparition premiers symptômes
24
+ - **exam_radio_date_decouverte**: Date premier examen découvrant la tumeur
25
+ - **date_progression**: Date récidive/progression
26
+ - **survie_globale**: Durée survie en mois
27
+ - **infos_deces**: Circonstances décès
28
+
29
+ ## Performance on Validation Set
30
+ **Aggregates**:
31
+ - Macro F1: 0.2458 (Precision: 0.2195, Recall: 0.6817)
32
+ - Micro F1: 0.3032 (Precision: 0.1861, Recall: 0.8171)
33
+
34
+ **Per-Label Breakdown**:
35
+ | Label | Precision | Recall | F1 |
36
+ |---|---|---|---|
37
+ | date_chir | 0.0615 | 1.0000 | 0.1159 |
38
+ | date_rcp | 0.7347 | 0.9231 | 0.8182 |
39
+ | dn_date | 0.0000 | 0.0000 | 0.0000 |
40
+ | date_deces | 0.0000 | 0.0000 | 0.0000 |
41
+ | date_1er_symptome | 0.0615 | 1.0000 | 0.1159 |
42
+ | exam_radio_date_decouverte | 0.0462 | 1.0000 | 0.0882 |
43
+ | date_progression | 0.0333 | 1.0000 | 0.0645 |
44
+ | survie_globale | 0.3714 | 0.8125 | 0.5098 |
45
+ | infos_deces | 0.6667 | 0.4000 | 0.5000 |
46
+
47
+ ## Usage
48
+
49
+ ```python
50
+ # Inference Code
51
+ from gliner2 import GLiNER2
52
+
53
+ model = GLiNER2.from_pretrained("raphael-r/bright-gliner-dates_outcomes")
54
+ text = "Patient presenting with epileptic seizures..."
55
+ entities = model.extract_entities(text)
56
+
57
+ for entity in entities:
58
+ print(entity["text"], "=>", entity["label"])
59
+ ```
config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "counting_layer": "count_lstm",
4
+ "max_width": 8,
5
+ "model_name": "microsoft/mdeberta-v3-base",
6
+ "model_type": "extractor",
7
+ "token_pooling": "first",
8
+ "transformers_version": "5.0.0"
9
+ }
encoder_config/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "bos_token_id": null,
5
+ "dtype": "float32",
6
+ "eos_token_id": null,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 3072,
12
+ "layer_norm_eps": 1e-07,
13
+ "legacy": true,
14
+ "max_position_embeddings": 512,
15
+ "max_relative_positions": -1,
16
+ "model_type": "deberta-v2",
17
+ "norm_rel_ebd": "layer_norm",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 0,
21
+ "pooler_dropout": 0,
22
+ "pooler_hidden_act": "gelu",
23
+ "pooler_hidden_size": 768,
24
+ "pos_att_type": [
25
+ "p2c",
26
+ "c2p"
27
+ ],
28
+ "position_biased_input": false,
29
+ "position_buckets": 256,
30
+ "relative_attention": true,
31
+ "share_att_key": true,
32
+ "tie_word_embeddings": true,
33
+ "transformers_version": "5.0.0",
34
+ "type_vocab_size": 0,
35
+ "vocab_size": 250112
36
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81d6b913c4a42ea6f4f46611f5f324e3e5008c3ff68c6b4fc46d5afe8d6021a9
3
+ size 1228421964
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1c7ccb287623cccb7c03150953b6d2a09dd95122933393c9151c3a60095c97e
3
+ size 16337353
tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "[CLS]",
4
+ "clean_up_tokenization_spaces": false,
5
+ "cls_token": "[CLS]",
6
+ "do_lower_case": false,
7
+ "eos_token": "[SEP]",
8
+ "extra_special_tokens": [
9
+ "[SEP_STRUCT]",
10
+ "[SEP_TEXT]",
11
+ "[P]",
12
+ "[C]",
13
+ "[E]",
14
+ "[R]",
15
+ "[L]",
16
+ "[EXAMPLE]",
17
+ "[OUTPUT]",
18
+ "[DESCRIPTION]"
19
+ ],
20
+ "is_local": false,
21
+ "mask_token": "[MASK]",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {},
24
+ "pad_token": "[PAD]",
25
+ "sep_token": "[SEP]",
26
+ "sp_model_kwargs": {},
27
+ "split_by_punct": false,
28
+ "tokenizer_class": "TokenizersBackend",
29
+ "unk_token": "[UNK]",
30
+ "vocab_type": "spm"
31
+ }