Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

.gitattributes +1 -0
README.md +59 -0
config.json +9 -0
encoder_config/config.json +36 -0
model.safetensors +3 -0
tokenizer.json +3 -0
tokenizer_config.json +31 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,59 @@

+---
+tags:
+- gliner
+- ner
+- medical
+- french
+language:
+- fr
+---
+# BRIGHT NER: GLiNER2 fine-tuned for dates_outcomes
+## Description
+This is a GLiNER2 model fine-tuned to extract clinical neuro-oncology entities belonging to the `dates_outcomes` semantic group. It was trained on a synthetic dataset generated for the properly de-identified BRIGHT project dataset (see the `generated_data` folder in the primary repository).
+This model repository is designed to fit within the overarching `bright_db` namespace.
+## Fields
+It extracts the following fields (described in French):
+- **date_chir**: Date intervention neurochirurgicale ou résection
+- **date_rcp**: Date réunion concertation pluridisciplinaire
+- **dn_date**: Date dernières nouvelles ou dernier suivi
+- **date_deces**: Date décès patient (seulement si décédé)
+- **date_1er_symptome**: Date apparition premiers symptômes
+- **exam_radio_date_decouverte**: Date premier examen découvrant la tumeur
+- **date_progression**: Date récidive/progression
+- **survie_globale**: Durée survie en mois
+- **infos_deces**: Circonstances décès
+## Performance on Validation Set
+**Aggregates**:
+- Macro F1: 0.2458 (Precision: 0.2195, Recall: 0.6817)
+- Micro F1: 0.3032 (Precision: 0.1861, Recall: 0.8171)
+**Per-Label Breakdowns**:
+| Label | Precision | Recall | F1 |
+|---|---|---|---|
+| date_chir | 0.0615 | 1.0000 | 0.1159 |
+| date_rcp | 0.7347 | 0.9231 | 0.8182 |
+| dn_date | 0.0000 | 0.0000 | 0.0000 |
+| date_deces | 0.0000 | 0.0000 | 0.0000 |
+| date_1er_symptome | 0.0615 | 1.0000 | 0.1159 |
+| exam_radio_date_decouverte | 0.0462 | 1.0000 | 0.0882 |
+| date_progression | 0.0333 | 1.0000 | 0.0645 |
+| survie_globale | 0.3714 | 0.8125 | 0.5098 |
+| infos_deces | 0.6667 | 0.4000 | 0.5000 |
+## Usage
+```python
+# Inference Code
+from gliner2 import GLiNER2
+model = GLiNER2.from_pretrained("raphael-r/bright-gliner-dates_outcomes")
+text = "Patient presenting with epileptic seizures..."
+entities = model.extract_entities(text)
+for entity in entities:
+    print(entity["text"], "=>", entity["label"])
+```

config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_attn_implementation_autoset": true,
+  "counting_layer": "count_lstm",
+  "max_width": 8,
+  "model_name": "microsoft/mdeberta-v3-base",
+  "model_type": "extractor",
+  "token_pooling": "first",
+  "transformers_version": "5.0.0"
+}

encoder_config/config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "_attn_implementation_autoset": true,
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-07,
+  "legacy": true,
+  "max_position_embeddings": 512,
+  "max_relative_positions": -1,
+  "model_type": "deberta-v2",
+  "norm_rel_ebd": "layer_norm",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooler_dropout": 0,
+  "pooler_hidden_act": "gelu",
+  "pooler_hidden_size": 768,
+  "pos_att_type": [
+    "p2c",
+    "c2p"
+  ],
+  "position_biased_input": false,
+  "position_buckets": 256,
+  "relative_attention": true,
+  "share_att_key": true,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.0.0",
+  "type_vocab_size": 0,
+  "vocab_size": 250112
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:81d6b913c4a42ea6f4f46611f5f324e3e5008c3ff68c6b4fc46d5afe8d6021a9
+size 1228421964

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a1c7ccb287623cccb7c03150953b6d2a09dd95122933393c9151c3a60095c97e
+size 16337353

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "eos_token": "[SEP]",
+  "extra_special_tokens": [
+    "[SEP_STRUCT]",
+    "[SEP_TEXT]",
+    "[P]",
+    "[C]",
+    "[E]",
+    "[R]",
+    "[L]",
+    "[EXAMPLE]",
+    "[OUTPUT]",
+    "[DESCRIPTION]"
+  ],
+  "is_local": false,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "model_specific_special_tokens": {},
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "sp_model_kwargs": {},
+  "split_by_punct": false,
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "[UNK]",
+  "vocab_type": "spm"
+}