Commit 406d54a
Parent(s): 987a753

feat: updated conll model

Files changed:
- conll2012_dataset_maker.py +120 -0
- models/o3-mini_20250218/README.md +0 -199
- models/o3-mini_20250218/added_tokens.json +0 -3
- models/o3-mini_20250218/config.json +0 -150
- models/o3-mini_20250218/model.safetensors +0 -3
- models/o3-mini_20250218/special_tokens_map.json +0 -15
- models/o3-mini_20250218/spm.model +0 -3
- models/o3-mini_20250218/tokenizer.json +0 -0
- models/o3-mini_20250218/tokenizer_config.json +0 -60
- models/o3-mini_20250218/training_args.bin +0 -3
- multi_head_trainer.py +1 -1
conll2012_dataset_maker.py
ADDED
@@ -0,0 +1,120 @@
+from datasets import load_dataset, DatasetDict
+import argparse
+import logging
+
+from utils import default_logging_config, get_uniq_training_labels, show_examples
+
+logger = logging.getLogger(__name__)
+
+
+allowed_pos = {'``', '$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'CC', 'CD', 'DT', 'EX',
+               'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS',
+               'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
+               'WDT', 'WP', 'WP$', 'WRB'}
+
+allowed_ner = {'O', 'B-PERSON', 'I-PERSON', 'B-NORP', 'I-NORP', 'B-FAC', 'I-FAC', 'B-ORG', 'I-ORG', 'B-GPE', 'I-GPE',
+               'B-LOC', 'I-LOC', 'B-PRODUCT', 'I-PRODUCT', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-PERCENT',
+               'I-PERCENT', 'B-MONEY', 'I-MONEY', 'B-QUANTITY', 'I-QUANTITY', 'B-ORDINAL', 'I-ORDINAL',
+               'B-CARDINAL', 'I-CARDINAL', 'B-EVENT', 'I-EVENT', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'B-LAW', 'I-LAW',
+               'B-LANGUAGE', 'I-LANGUAGE'}
+
+
+def is_valid_example(exp):
+    """
+    Simple filter that checks if all pos_tags are in allowed_pos
+    and all ner_tags are in allowed_ner. If you do not want any
+    filtering, simply return True.
+    """
+    # You can skip filtering by just returning True:
+    # return True
+
+    # If your dataset has multiple tokens with possibly different tags,
+    # check them all:
+    for pos_tag in exp["pos_tags"]:
+        if pos_tag not in allowed_pos:
+            return False
+
+    for ner_tag in exp["ner_tags"]:
+        if ner_tag not in allowed_ner:
+            return False
+
+    return True
+
+
+def transform_and_filter_dataset(onto_ds):
+    """
+    onto_ds is a DatasetDict with splits: 'train', 'validation', 'test', etc.
+    Return a new DatasetDict with the same splits but:
+      - Filter out unwanted examples
+      - Possibly rename or remove columns
+      - Possibly introduce new columns
+    """
+    pos_tag_int2str = onto_ds["train"].features["sentences"][0]["pos_tags"].feature.names
+    ner_tag_int2str = onto_ds["train"].features["sentences"][0]["named_entities"].feature.names
+
+    def flatten_ontonotes(batch):
+        out = {
+            "tokens": [],
+            "ner_tags": [],
+            "pos_tags": [],
+            "verb_predicate": [],
+        }
+        for doc_id, sents in zip(batch["document_id"], batch["sentences"]):
+            for sent_info in sents:
+                out["tokens"].append(sent_info["words"])
+                out["ner_tags"].append([ner_tag_int2str[i] for i in sent_info["named_entities"]])
+                out["pos_tags"].append([pos_tag_int2str[i] for i in sent_info["pos_tags"]])
+                out["verb_predicate"].append([("Yes" if s else "O") for s in sent_info["predicate_lemmas"]])
+        return out
+
+    new_splits = {}
+    for split_name, split_ds in onto_ds.items():
+        # Flatten
+        flattened_ds = split_ds.map(
+            flatten_ontonotes,
+            batched=True,
+            remove_columns=["sentences", "document_id"],  # remove old columns
+        )
+
+        # Filter out invalid examples
+        filtered_split = flattened_ds.filter(is_valid_example)
+        new_splits[split_name] = filtered_split
+
+    return DatasetDict(new_splits)
+
+
+# ------------------------------------------------------------------------------
+# 6) Main Script
+# ------------------------------------------------------------------------------
+if __name__ == "__main__":
+    import logging.config
+
+    arg_parser = argparse.ArgumentParser(description="Process OntoNotes CoNLL-2012 (English).")
+    arg_parser.add_argument("--log-level", help="Log level.", action="store",
+                            default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
+    arg_parser.add_argument("--save", help="Save final dataset to disk.", action="store_true", default=False)
+    arg_parser.add_argument("--save-path", help="Where to save final dataset.", default="./conll2012_en12_training_data")
+    arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>", default=None)
+    args = arg_parser.parse_args()
+
+    logging.config.dictConfig(default_logging_config)
+    logger.setLevel(args.log_level)
+
+    # 6a) Load OntoNotes (English) from the 'conll2012_ontonotesv5' script
+    # This usually yields "train", "validation", "test" splits.
+    ontonotes_ds = load_dataset("conll2012_ontonotesv5", "english_v12")
+    logger.info(f"Splits loaded: {ontonotes_ds}")
+
+    # 6b) Transform & Filter
+    final_dataset = transform_and_filter_dataset(ontonotes_ds)
+
+    # 6d) Show examples if user requested
+    show_examples(final_dataset, args.show)
+
+    # 6e) Log unique training labels (POS/NER) if you like
+    get_uniq_training_labels(final_dataset)
+
+    # 6f) Save to disk if requested
+    if args.save:
+        final_dataset.save_to_disk(args.save_path)
+        logger.info("Saved dataset to %s", args.save_path)
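
For orientation only (not part of the commit): once the script above has been run with `--save`, the resulting dataset can be read back with `datasets.load_from_disk`. A minimal sketch, assuming the default `--save-path` shown above:

```python
# Illustrative usage sketch (not part of this commit).
# Assumes the maker script was run as: python conll2012_dataset_maker.py --save
from datasets import load_from_disk

ds = load_from_disk("./conll2012_en12_training_data")  # default --save-path above
print(ds)                         # DatasetDict with train/validation/test splits

example = ds["train"][0]
print(example["tokens"])          # sentence tokens, flattened from OntoNotes documents
print(example["pos_tags"])        # string POS labels, e.g. 'NN', 'VBZ'
print(example["ner_tags"])        # string NER labels, e.g. 'B-PERSON', 'O'
print(example["verb_predicate"])  # 'Yes' where a predicate lemma is present, else 'O'
```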
models/o3-mini_20250218/README.md
DELETED
@@ -1,199 +0,0 @@
----
-license: bsd-2-clause
----
-
-### Dataset: o3-mini_20250218
-```text
-DatasetDict({
-    test: Dataset({
-        features: ['text', 'tokens', 'adj', 'adv', 'det', 'enc', 'func', 'misc', 'ner1', 'ner2', 'noun', 'pronoun', 'punct', 'verb', 'wh'],
-        num_rows: 2571
-    })
-    train: Dataset({
-        features: ['text', 'tokens', 'adj', 'adv', 'det', 'enc', 'func', 'misc', 'ner1', 'ner2', 'noun', 'pronoun', 'punct', 'verb', 'wh'],
-        num_rows: 23389
-    })
-    validation: Dataset({
-        features: ['text', 'tokens', 'adj', 'adv', 'det', 'enc', 'func', 'misc', 'ner1', 'ner2', 'noun', 'pronoun', 'punct', 'verb', 'wh'],
-        num_rows: 2599
-    })
-})
-```
-
-
-### Classification Reports
-```text
------ adj classification report -----
-              precision    recall  f1-score   support
-
-          JJ       0.90      0.87      0.88      3187
-         JJR       0.95      0.88      0.91       162
-         JJS       0.88      0.84      0.86       102
-           O       0.99      0.99      0.99     29414
-
-    accuracy                           0.98     32865
-   macro avg       0.93      0.89      0.91     32865
-weighted avg       0.98      0.98      0.98     32865
-
------ adv classification report -----
-              precision    recall  f1-score   support
-
-           O       0.99      0.99      0.99     30468
-          RB       0.91      0.91      0.91      2157
-         RBR       0.89      0.90      0.89       146
-         RBS       0.80      0.79      0.79        94
-
-    accuracy                           0.99     32865
-   macro avg       0.90      0.90      0.90     32865
-weighted avg       0.99      0.99      0.99     32865
-
------ det classification report -----
-              precision    recall  f1-score   support
-
-          DT       0.96      0.95      0.96      4447
-          EX       0.96      0.90      0.93        82
-           O       0.99      0.99      0.99     28163
-         PDT       0.63      0.55      0.59       173
-
-    accuracy                           0.99     32865
-   macro avg       0.89      0.85      0.87     32865
-weighted avg       0.99      0.99      0.99     32865
-
------ enc classification report -----
-              precision    recall  f1-score   support
-
-     BRACKET       0.79      0.89      0.84       385
-           O       0.99      0.99      0.99     31944
-       QUOTE       0.75      0.76      0.76       536
-
-    accuracy                           0.99     32865
-   macro avg       0.85      0.88      0.86     32865
-weighted avg       0.99      0.99      0.99     32865
-
------ func classification report -----
-              precision    recall  f1-score   support
-
-          CC       0.98      0.99      0.98      1153
-          IN       0.97      0.98      0.97      3805
-           O       0.99      0.99      0.99     26444
-          RP       0.87      0.77      0.82       373
-          TO       1.00      0.99      0.99       871
-          UH       0.77      0.68      0.72       219
-
-    accuracy                           0.99     32865
-   macro avg       0.93      0.90      0.91     32865
-weighted avg       0.99      0.99      0.99     32865
-
------ misc classification report -----
-              precision    recall  f1-score   support
-
-           $       0.92      0.86      0.89        64
-         ADD       0.77      0.71      0.74       719
-          CD       0.89      0.89      0.89       558
-       EMOJI       1.00      0.73      0.85        15
-           O       0.99      0.99      0.99     30608
-        TIME       0.88      0.90      0.89       901
-
-    accuracy                           0.98     32865
-   macro avg       0.91      0.85      0.87     32865
-weighted avg       0.98      0.98      0.98     32865
-
------ ner1 classification report -----
-              precision    recall  f1-score   support
-
-       B-GPE       0.87      0.90      0.89       473
-       B-ORG       0.86      0.82      0.84       424
-       B-PER       0.95      0.93      0.94       649
-       I-GPE       0.85      0.90      0.87       147
-       I-ORG       0.85      0.82      0.83       310
-       I-PER       0.96      0.96      0.96       261
-           O       0.99      0.99      0.99     30601
-
-    accuracy                           0.99     32865
-   macro avg       0.90      0.90      0.90     32865
-weighted avg       0.99      0.99      0.99     32865
-
------ ner2 classification report -----
-              precision    recall  f1-score   support
-
-     B-EVENT       0.62      0.52      0.56       621
-       B-LOC       0.78      0.78      0.78       909
-     I-EVENT       0.54      0.32      0.40      1033
-       I-LOC       0.73      0.66      0.70       597
-           O       0.96      0.98      0.97     29705
-
-    accuracy                           0.94     32865
-   macro avg       0.73      0.65      0.68     32865
-weighted avg       0.93      0.94      0.93     32865
-
------ noun classification report -----
-              precision    recall  f1-score   support
-
-          NN       0.96      0.96      0.96      4400
-         NNP       0.94      0.96      0.95      2410
-        NNPS       0.67      0.72      0.69        61
-         NNS       0.97      0.97      0.97      1698
-           O       0.99      0.99      0.99     24296
-
-    accuracy                           0.98     32865
-   macro avg       0.91      0.92      0.91     32865
-weighted avg       0.98      0.98      0.98     32865
-
------ pronoun classification report -----
-              precision    recall  f1-score   support
-
-           O       1.00      1.00      1.00     29952
-         POS       0.97      0.97      0.97       154
-         PRP       0.97      0.97      0.97      2139
-        PRP$       0.99      0.98      0.99       620
-
-    accuracy                           1.00     32865
-   macro avg       0.98      0.98      0.98     32865
-weighted avg       1.00      1.00      1.00     32865
-
------ punct classification report -----
-              precision    recall  f1-score   support
-
-       COLON       0.99      0.95      0.97       201
-       COMMA       0.99      1.00      0.99      1454
-     EXCLAIM       0.99      0.97      0.98       107
-        HYPH       0.96      0.95      0.95       321
-          LS       0.57      0.53      0.55        15
-           O       1.00      1.00      1.00     28545
-      PERIOD       0.98      0.99      0.99      2022
-    QUESTION       0.99      0.99      0.99       156
-         SEP       0.75      0.41      0.53        44
-
-    accuracy                           1.00     32865
-   macro avg       0.91      0.87      0.88     32865
-weighted avg       1.00      1.00      1.00     32865
-
------ verb classification report -----
-              precision    recall  f1-score   support
-
-          MD       1.00      0.98      0.99       527
-           O       1.00      0.99      0.99     26452
-          VB       0.95      0.94      0.94      1540
-         VBD       0.96      0.96      0.96      1330
-         VBG       0.94      0.96      0.95       625
-         VBN       0.88      0.93      0.90       766
-         VBP       0.88      0.92      0.90       766
-         VBZ       0.99      0.98      0.98       859
-
-    accuracy                           0.98     32865
-   macro avg       0.95      0.96      0.95     32865
-weighted avg       0.99      0.98      0.98     32865
-
------ wh classification report -----
-              precision    recall  f1-score   support
-
-           O       0.99      1.00      0.99     32019
-         WDT       0.75      0.57      0.65       186
-          WP       0.84      0.71      0.77       164
-         WP$       0.62      0.58      0.60       238
-         WRB       0.94      0.72      0.81       258
-
-    accuracy                           0.99     32865
-   macro avg       0.83      0.72      0.77     32865
-weighted avg       0.99      0.99      0.99     32865
-```
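
The per-head tables above follow the layout of scikit-learn's `classification_report`. As a hedged illustration of how one such table could be produced, assuming flat lists of gold and predicted string tags are available for a single head (the variable names and toy data here are hypothetical):

```python
# Illustrative sketch only: producing a report in the format shown above.
# y_true / y_pred are hypothetical flat lists of string tags for one head (e.g. "adj").
from sklearn.metrics import classification_report

y_true = ["JJ", "O", "O", "JJR", "O", "JJS"]   # toy gold tags
y_pred = ["JJ", "O", "JJ", "JJR", "O", "JJS"]  # toy predictions

print("----- adj classification report -----")
print(classification_report(y_true, y_pred, zero_division=0))
```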
models/o3-mini_20250218/added_tokens.json
DELETED
@@ -1,3 +0,0 @@
-{
-  "[MASK]": 128000
-}
models/o3-mini_20250218/config.json
DELETED
@@ -1,150 +0,0 @@
-{
-  "_name_or_path": "microsoft/deberta-v3-base",
-  "architectures": [
-    "MultiHeadModel"
-  ],
-  "attention_probs_dropout_prob": 0.1,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "label_maps": {
-    "adj": [
-      "JJ",
-      "JJS",
-      "JJR",
-      "O"
-    ],
-    "adv": [
-      "RBR",
-      "RB",
-      "RBS",
-      "O"
-    ],
-    "det": [
-      "PDT",
-      "DT",
-      "EX",
-      "O"
-    ],
-    "enc": [
-      "QUOTE",
-      "TICK",
-      "BRACKET",
-      "O"
-    ],
-    "func": [
-      "UH",
-      "RP",
-      "TO",
-      "O",
-      "IN",
-      "CC"
-    ],
-    "misc": [
-      "EMOJI",
-      "TIME",
-      "ADD",
-      "CD",
-      "O",
-      "$"
-    ],
-    "ner1": [
-      "I-ORG",
-      "B-ORG",
-      "I-GPE",
-      "B-PER",
-      "O",
-      "B-GPE",
-      "I-PER"
-    ],
-    "ner2": [
-      "I-LOC",
-      "B-LOC",
-      "I-EVENT",
-      "O",
-      "B-EVENT"
-    ],
-    "noun": [
-      "NNS",
-      "O",
-      "NNP",
-      "NN",
-      "NNPS"
-    ],
-    "pronoun": [
-      "PRP$",
-      "PRP",
-      "POS",
-      "O"
-    ],
-    "punct": [
-      "QUESTION",
-      "LS",
-      "COMMA",
-      "EXCLAIM",
-      "COLON",
-      "PERIOD",
-      "SEP",
-      "O",
-      "HYPH"
-    ],
-    "verb": [
-      "MD",
-      "VBG",
-      "O",
-      "VB",
-      "VBP",
-      "VBZ",
-      "VBN",
-      "VBD"
-    ],
-    "wh": [
-      "WP$",
-      "O",
-      "WP",
-      "WRB",
-      "WDT"
-    ]
-  },
-  "layer_norm_eps": 1e-07,
-  "legacy": true,
-  "max_position_embeddings": 512,
-  "max_relative_positions": -1,
-  "model_type": "deberta-v2",
-  "norm_rel_ebd": "layer_norm",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "num_labels_dict": {
-    "adj": 4,
-    "adv": 4,
-    "det": 4,
-    "enc": 4,
-    "func": 6,
-    "misc": 6,
-    "ner1": 7,
-    "ner2": 5,
-    "noun": 5,
-    "pronoun": 4,
-    "punct": 9,
-    "verb": 8,
-    "wh": 5
-  },
-  "pad_token_id": 0,
-  "pooler_dropout": 0,
-  "pooler_hidden_act": "gelu",
-  "pooler_hidden_size": 768,
-  "pos_att_type": [
-    "p2c",
-    "c2p"
-  ],
-  "position_biased_input": false,
-  "position_buckets": 256,
-  "relative_attention": true,
-  "share_att_key": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.48.2",
-  "type_vocab_size": 0,
-  "vocab_size": 128100
-}
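
One property worth noting in this config: each entry in `num_labels_dict` equals the length of the corresponding `label_maps` list (for example, `adj` has 4 labels). A small consistency check, sketched under the assumption that a config.json of this shape is on disk (the file path is illustrative):

```python
# Hypothetical sanity check for a multi-head config like the one removed above:
# every head's num_labels_dict entry should match the length of its label_maps list.
import json

with open("config.json") as f:  # path is an assumption for illustration
    cfg = json.load(f)

for head, labels in cfg["label_maps"].items():
    n = cfg["num_labels_dict"][head]
    assert len(labels) == n, f"{head}: {len(labels)} labels but num_labels_dict says {n}"
print("label_maps and num_labels_dict are consistent for all heads")
```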
models/o3-mini_20250218/model.safetensors
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d7fc80d3a8526faa41c3c79c97c87d72ca6f01fb6ef3812cd3a7764787b9949f
-size 735571028
models/o3-mini_20250218/special_tokens_map.json
DELETED
@@ -1,15 +0,0 @@
-{
-  "bos_token": "[CLS]",
-  "cls_token": "[CLS]",
-  "eos_token": "[SEP]",
-  "mask_token": "[MASK]",
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "unk_token": {
-    "content": "[UNK]",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
-}
models/o3-mini_20250218/spm.model
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
-size 2464616
models/o3-mini_20250218/tokenizer.json
DELETED
(The diff for this file is too large to render; see the raw diff.)
models/o3-mini_20250218/tokenizer_config.json
DELETED
@@ -1,60 +0,0 @@
-{
-  "add_prefix_space": true,
-  "added_tokens_decoder": {
-    "0": {
-      "content": "[PAD]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "[CLS]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "2": {
-      "content": "[SEP]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "3": {
-      "content": "[UNK]",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "128000": {
-      "content": "[MASK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "bos_token": "[CLS]",
-  "clean_up_tokenization_spaces": false,
-  "cls_token": "[CLS]",
-  "do_lower_case": false,
-  "eos_token": "[SEP]",
-  "extra_special_tokens": {},
-  "mask_token": "[MASK]",
-  "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "sp_model_kwargs": {},
-  "split_by_punct": false,
-  "tokenizer_class": "DebertaV2Tokenizer",
-  "unk_token": "[UNK]",
-  "vocab_type": "spm"
-}
models/o3-mini_20250218/training_args.bin
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:393cbff7e2678a2b8c4e3190f5be4af291a4d8e9e2ca5376460939e460fa5ce5
-size 5304
multi_head_trainer.py
CHANGED
@@ -305,7 +305,7 @@ if __name__ == "__main__":
     arg_parser.add_argument("--mini", help='Train model using small subset of examples for pipeline testing.',
                             action="store_true", default=False)
     arg_parser.add_argument("--save-path", help="Save final model to specified path.",
-                            action="store", default="./
+                            action="store", default="./final")
     arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>",
                             action="store", default=None)
     arg_parser.add_argument("--train", help='Train model using loaded examples.',
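
After this one-line change, running the trainer without `--save-path` writes the final model to `./final`. A hedged sketch of the resulting argparse behaviour, mirroring only the argument visible in this hunk:

```python
# Illustrative only: the corrected --save-path default as seen in the hunk above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--save-path", help="Save final model to specified path.",
                    action="store", default="./final")
args = parser.parse_args([])  # no CLI arguments supplied
print(args.save_path)         # -> ./final
```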