Commit
·
5a7f6ac
1
Parent(s):
eddcbd9
feat: example maker
Browse files- dataset_maker.py +429 -0
- examples.py +11 -0
- requirements.txt +1 -0
- util.py +73 -0
dataset_maker.py
ADDED
|
@@ -0,0 +1,429 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import curses
import logging
import os
import random
import uuid
from collections import Counter
from typing import Optional

from datasets import Dataset, load_dataset

from examples import custom_examples
from util import naive_sentence_end_pattern, naive_tokenize
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)

# Directory the labeled dataset is persisted to / reloaded from.
DATASET_PATH = "dataset"


def _bio(label_names):
    """Expand label names into a flat BIO tag list: [B-x, I-x, B-y, I-y, ...]."""
    return [tag for name in label_names for tag in (f"B-{name}", f"I-{name}")]


# Token-classification feature columns mapped to their BIO label inventories.
FEATURES = {
    "<>^": _bio(["LEFT", "RIGHT", "UP", "WRAP"]),

    "{}": _bio(["WRAP"]),

    "()": _bio(["WRAP"]),

    "[]": _bio(["WRAP"]),

    "''": _bio(["WRAP"]),

    '""': _bio(["WRAP"]),

    "``": _bio(["WRAP"]),

    "activity": _bio([
        "DANCE",  # dance
        "GAME",  # game
    ]),

    "addr": _bio([
        "CHAN",  # radio frequency, TV channel or station name, e.g. 107.7 "The Bone", CBS, CNN, PBS, etc.
        "DOOR",  # apt, door, or suite number
        "EMAIL",  # email address
        "FAC",  # facility address or specific physical building name
        "FILE",  # file name and path
        "GEO",  # geo-coordinates
        "IP",  # IP address or CIDR notation
        "MAIL",  # physical mailbox or p.o. box
        "PHONE",  # telephone or fax
        "SITE",  # DNS domain name or website name
        "URL",  # URL parts not EMAIL, FILE, IP, or SITE
    ]),

    "cardinal": _bio([
        "AGE",  # age
        "DIST",  # distance
        "FRAC",  # fraction
        "MASS",  # mass
        "MONEY",  # currency
        "PCT",  # percent
        "PCTILE",  # percentile
        "SPEED",  # speed
        "WEIGHT",  # weight, force due to gravity
    ]),

    "concept": _bio([
        "ART",  # art, music, or literary concept
        "BIO",  # biology or medical concept
        "BIZ",  # business or marketing concept
        "CHEM",  # chemistry or bio-chem concept
        "CLIM",  # climate or ocean science concept
        "ECON",  # economic concept
        "EDU",  # education concept
        "ENG",  # engineering concept
        "FIN",  # finance or investment concept
        "GEOG",  # geography concept
        "GEOL",  # geology concept
        "INFO",  # computing, data, or info sciences concept
        "LAW",  # legal concept
        "MATH",  # math concept
        "PHIL",  # ethical or philosophical concept
        "PHYS",  # physics concept
        "POLI",  # sociological or political concept
        "PROG",  # computer programming concept
        "PSY",  # psychological concept
        "RELI",  # religious concept
        "SPORTS",  # sports concept
        "WAR",  # military concept
    ]),

    "nature": _bio([
        "FAUNA",  # animal life
        "FLORA",  # plant life
        "PHENOM",  # phenomena
    ]),

    "media": _bio([
        "AUD",  # music and audio recordings
        "IMG",  # photos, paintings, and other images
        "SOFT",  # software
        "TXT",  # articles, books, papers, etc.
        "VID",  # film and other videos
    ]),

    "org": _bio([
        "ORG",  # organization
        "TITLE",  # title or role
    ]),

    "other": _bio([
        "MATH",  # math notation
        "OUT",  # computer program output, e.g. stderr/out, logs, etc.
        "PROG",  # computer programming notation
        "SCI",  # scientific notation outside math and programming
    ]),

    "people": _bio([
        "GPE",  # geopolitical entity e.g. countries, cities, states, or regions
        "LANG",  # language
        "NORP",  # nationalities, religious, or political groups. e.g. "American", "Muslim", or "Communist"
    ]),

    # person or personified being
    "person": _bio([
        "ALIAS",  # nickname or alternative name
        "HONOR",  # honorific
        "NAME",  # person name
        "PROF",  # professional designation
        "USER",  # username
    ]),

    "place": _bio([
        "BYTE",  # digital location
        "FIC",  # fictional locations
        "LOC",  # physical locations
        "UI",  # location on a user interface
        "VIRT",  # virtual location
        "WEB",  # web-connected location
    ]),

    "time": _bio([
        "TIME",  # years, dates, time values
        "EVENT",  # event in time
    ]),
}
# TODO: might be multi-label
FEATURES["zz_prime"] = ["_all", "_ambiguous", *FEATURES.keys()]  # primary feature, zz_ so it's labeled last
# Namespace for sentence signatures; changing it invalidates every cached signature.
UUID5_NS = uuid.UUID("246a5463-afae-4571-a6e0-f319d74147d3")
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def get_uniq_training_labels(ds: Dataset, columns_to_exclude: Optional[set[str]] = None):
    """Collect and log the unique token-level label values per trainable column.

    Every column of *ds* except those in *columns_to_exclude* (default:
    ``{"text", "tokens"}``) is assumed to hold one label string per token.

    Returns a dict mapping column name -> set of unique label values seen.
    """
    if columns_to_exclude is None:
        columns_to_exclude = {"text", "tokens"}
    columns_to_train_on = [k for k in ds.features.keys() if k not in columns_to_exclude]

    # Per-column label frequencies and unique label values.
    label_counters = {col: Counter() for col in columns_to_train_on}
    unique_label_values = {col: set() for col in columns_to_train_on}

    for example in ds:
        # Each of these columns is a list (one entry per token),
        # so we update our set/counter with each token-level value.
        for col in columns_to_train_on:
            unique_label_values[col].update(example[col])
            label_counters[col].update(example[col])

    logger.info("Columns:")
    for col in columns_to_train_on:
        logger.info("  %s:", col)
        # Convert to a sorted list just to have a nice, stable ordering
        vals = sorted(unique_label_values[col])
        logger.info("    %d labels: %s", len(vals),
                    [f"{v}:{label_counters[col][v]}" for v in vals])

    return unique_label_values
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def main(stdscr, args) -> Dataset:
    """Interactive curses loop for building a token-classification dataset.

    Shows the custom seed examples first, then sentences from random Wikipedia
    pages; for each accepted sentence the user labels every token for every
    feature column in FEATURES.  Returns the accumulated Dataset once the user
    presses Esc.  `args` is currently unused.

    Label-entry syntax at the ``:`` prompt (parsed below):
      - empty          -> label this token "O"
      - "N"            -> label this token feat_labels[N]
      - "N>k"          -> label this token and the next k tokens feat_labels[N]
      - "N>"           -> label this token and all remaining tokens feat_labels[N]
      - ">"            -> label this token and all remaining tokens "O"
    """
    wikipedia_dataset_name = "20231101.en"
    wikipedia_dataset = load_dataset("wikimedia/wikipedia", wikipedia_dataset_name)
    total_page_cnt = len(wikipedia_dataset["train"])

    stdscr.clear()
    stdscr.addstr(f"Loaded {wikipedia_dataset_name} containing {total_page_cnt} pages.")

    # UUID5 signatures of every sentence already labeled (dedupes across runs).
    signature_cache = set()

    new_dataset_dict = {k: [] for k in ["text", "tokens", *FEATURES.keys()]}
    if os.path.exists(DATASET_PATH):
        # Load previous examples
        for i, exp in enumerate(Dataset.load_from_disk(DATASET_PATH)):
            sig = uuid.uuid5(UUID5_NS, exp["text"])
            if sig in signature_cache:
                continue
            signature_cache.add(sig)
            for k, v in exp.items():
                new_dataset_dict[k].append(v)

    esc_pressed = False
    while not esc_pressed:
        # Select random Wikipedia page
        # NOTE(review): random.randint is inclusive on both ends, so this can
        # yield total_page_cnt and index one past the last page — confirm
        # whether randint(0, total_page_cnt - 1) / randrange was intended.
        page = wikipedia_dataset["train"][random.randint(0, total_page_cnt)]
        # If all custom examples are labeled, move on to Wikipedia
        for page_chunk in (custom_examples + page["text"].split("\n\n")):
            page_chunk = page_chunk.strip()
            if not page_chunk:
                continue
            page_chunk_lines = page_chunk.split("\n")
            for chunk_line in page_chunk_lines:
                chunk_line = chunk_line.strip()
                if not chunk_line:
                    continue
                # Peel one sentence at a time off the front of the line.
                while not esc_pressed and chunk_line:
                    sentence_end_match = naive_sentence_end_pattern.search(chunk_line)
                    if sentence_end_match:
                        sentence_blob = chunk_line[:sentence_end_match.end()]
                        chunk_line = chunk_line[sentence_end_match.end():].strip()
                    else:
                        # No terminator found: treat the rest as one sentence.
                        sentence_blob = chunk_line
                        chunk_line = ""

                    # Skip sentences that were already labeled (this run or a prior one).
                    sig = uuid.uuid5(UUID5_NS, sentence_blob)
                    if sig in signature_cache:
                        continue
                    signature_cache.add(sig)

                    # TODO: sentence context
                    # - prefix each text with a context blob that gets tokenized with the text
                    # - label context blobs as B-CTXT and I-CTXT
                    # - this way, contextual information from outside the direct text can be injected
                    # - this allows injecting contexts from what we've already processed on the page
                    # - use a unique signal sequences to signal contexts, e.g.:
                    #   - {{[[((prev:a,b;last:c,d))]]}}>>>

                    exp_idx = len(new_dataset_dict["text"])
                    stdscr.addstr(f"""\n\n>>>{sentence_blob}<<<

Press 'y' to accept or anything else to reject.
Press Esc to exit.
""")
                    ch = stdscr.getch()
                    stdscr.clear()
                    if ch == 27:  # Esc
                        esc_pressed = True
                    elif ch == ord("y"):
                        naive_tokens = naive_tokenize(sentence_blob)
                        tokens_len = len(naive_tokens)
                        last_idx = tokens_len - 1
                        new_exp = {
                            "text": sentence_blob,
                            "tokens": naive_tokens,
                        }

                        # Label every token once per feature column; repeat a
                        # column until the user accepts its label sequence.
                        for feat_name, feat_labels in FEATURES.items():
                            feat_labels_len = len(feat_labels)
                            labels_accepted = False
                            while not esc_pressed and not labels_accepted:
                                labels = []
                                skip_to_idx = None
                                skip_label = None
                                for token_idx, token in enumerate(naive_tokens):
                                    # Auto-fill while inside an active "N>k" span.
                                    if skip_to_idx is not None and skip_to_idx >= token_idx:
                                        labels.append(skip_label if skip_label is not None else "O")
                                        continue
                                    skip_to_idx = None
                                    skip_label = None
                                    token_len = len(token)

                                    enter_pressed = False
                                    idx_blob = ""
                                    stdscr.clear()
                                    stdscr.addstr(f"""Example {exp_idx}:

{"\n".join([f"{pad_to_desired_len(k)}{v}" for k, v in new_exp.items()])}
{pad_to_desired_len(feat_name)}{labels}

Labels: {", ".join([f"{i}:{l}" for i, l in enumerate(feat_labels)])}

{naive_tokens[token_idx:]}
{" " * (token_len + 1)}^
{" " * (token_len + 1)}{token_idx}

: """)
                                    # Read the label index string one key at a time.
                                    while not esc_pressed and not enter_pressed:
                                        ch = stdscr.getch()
                                        if ch in {8, 127, curses.KEY_BACKSPACE}:  # Delete
                                            idx_blob = idx_blob[:-1]
                                            y, x = stdscr.getyx()
                                            next_x = x - 1
                                            if next_x > 1:
                                                stdscr.move(y, x - 1)
                                                stdscr.clrtoeol()
                                                stdscr.refresh()
                                        elif ch == 27:  # Esc
                                            esc_pressed = True
                                        elif ch in {10, curses.KEY_ENTER}:  # Enter
                                            enter_pressed = True
                                        else:
                                            # Otherwise, add the character to the string
                                            ch_chr = chr(ch)
                                            stdscr.addstr(ch_chr)
                                            idx_blob += ch_chr
                                    if not idx_blob:
                                        # Empty entry defaults to the outside tag "O".
                                        # NOTE(review): the conditional is redundant here —
                                        # idx_blob is known falsy in this branch.
                                        label_blob = idx_blob if idx_blob else "O"
                                        labels.append(label_blob)
                                    elif ">" in idx_blob:
                                        # "N>k" form: label + span fill (see docstring).
                                        try:
                                            idx_blob, skip_distance = idx_blob.split(">")
                                            if idx_blob:
                                                label_idx = int(idx_blob)
                                                if 0 <= label_idx < feat_labels_len:
                                                    label_blob = feat_labels[label_idx]
                                                    labels.append(label_blob)
                                                    skip_label = label_blob
                                            else:
                                                labels.append("O")

                                            if skip_distance:
                                                skip_to_idx = token_idx + int(skip_distance)
                                                if skip_to_idx > last_idx:
                                                    skip_to_idx = last_idx
                                            else:
                                                # Bare ">" suffix: extend to the end of the sentence.
                                                skip_to_idx = last_idx
                                        except ValueError:
                                            stdscr.addstr(f"Could not convert {idx_blob} to an integer idx value.")
                                    else:
                                        # Plain "N" form: single-token label.
                                        try:
                                            label_idx = int(idx_blob)
                                            if 0 <= label_idx < feat_labels_len:
                                                label_blob = feat_labels[label_idx]
                                                labels.append(label_blob)
                                        except ValueError:
                                            stdscr.addstr(f"Could not convert {idx_blob} to an integer idx value.")
                                # Review the whole label sequence for this feature.
                                stdscr.clear()
                                stdscr.addstr(f"""Example {exp_idx}:
{"\n".join([f"{pad_to_desired_len(k)}{v}" for k, v in new_exp.items()])}
{pad_to_desired_len(feat_name)}{labels}

Press 'y' to accept or anything else to reject.
Press Esc to exit.
""")
                                ch = stdscr.getch()
                                stdscr.clear()
                                if ch == 27:  # Esc
                                    esc_pressed = True
                                elif ch == ord("y"):
                                    new_exp[feat_name] = labels
                                    labels_accepted = True
                            if esc_pressed:
                                break
                        # Add if complete
                        if sorted(new_exp.keys()) == sorted(new_dataset_dict.keys()):
                            for k, v in new_exp.items():
                                new_dataset_dict[k].append(v)
    # Exiting
    stdscr.clear()
    return Dataset.from_dict(new_dataset_dict)
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
def pad_to_desired_len(blob: str, desired: int = 15) -> str:
    """Right-pad *blob* with spaces to at least *desired* characters.

    Strings already *desired* or longer are returned unchanged.
    """
    # str.ljust does exactly the manual length-check-and-pad this replaced.
    return blob.ljust(desired)
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
def show_examples(ds: Dataset, show_expr: Optional[str]):
    """Log a sample of examples from *ds*.

    show_expr: optional "<split>/<col>/<label>/<count>" spec selecting examples
    whose *col* column contains *label*; when omitted, up to 10 random
    examples are shown.
    """
    if not show_expr:
        count_to_show = min(len(ds), 10)
        examples_to_show = ds.shuffle()[:count_to_show]
    else:
        # NOTE(review): <split> is parsed but never used — confirm whether
        # selecting a dataset split was intended.
        split_to_show, col_to_show, label_to_show, count_to_show = show_expr.split("/")
        count_to_show = int(count_to_show)
        examples_to_show = ds.filter(
            lambda exp: label_to_show in exp[col_to_show]).shuffle(seed=42)[:count_to_show]

    # Slicing a Dataset yields a dict of column -> list.  The filter may match
    # fewer rows than requested, so clamp to what is actually available to
    # avoid indexing past the end of the column lists.
    if examples_to_show:
        count_to_show = min([count_to_show] + [len(col) for col in examples_to_show.values()])
    else:
        count_to_show = 0

    for i in range(count_to_show):
        logger.info(f"Example {i}:")
        for feature in examples_to_show:
            logger.info(f"  {feature}: {examples_to_show[feature][i]}")
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
if __name__ == "__main__":
    import argparse
    import logging.config

    # NOTE(review): the description says "Train multi-task model." but this
    # script interactively builds a labeled dataset — confirm the wording.
    arg_parser = argparse.ArgumentParser(description="Train multi-task model.")
    arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>",
                            action="store", default=None)
    parsed_args = arg_parser.parse_args()

    # Root logger at INFO, timestamped, writing to the console.
    logging.config.dictConfig({
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {
            "default": {
                "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            },
        },
        "handlers": {
            "console": {
                "class": "logging.StreamHandler",
                "formatter": "default",
            },
        },
        "loggers": {
            "": {
                "level": "INFO",
                "handlers": ["console"],
            },
        },
    })

    # Run the interactive labeler inside a managed curses session, then
    # report on the resulting dataset and persist it.
    new_ds = curses.wrapper(main, parsed_args)
    logger.info(f"Writing dataset to disk...\n{new_ds}")
    show_examples(new_ds, parsed_args.show)
    get_uniq_training_labels(new_ds)
    new_ds.save_to_disk(DATASET_PATH)
|
examples.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hand-curated seed sentences; dataset_maker labels these before moving on to
# randomly selected Wikipedia paragraphs.
custom_examples = [
    "Brachyarthrum is a genus of true bugs belonging to the family Miridae.",
    "District 29 is a district in the Texas House of Representatives.",
    "Edgard Viseur (born 10 April 1905, date of death unknown) was a Belgian middle-distance runner.",
    "Greater Manchester bus route 192 runs between Hazel Grove in the Metropolitan Borough of Stockport and Piccadilly Gardens in Manchester city centre.",
    "He competed in the men's 3000 metres steeplechase at the 1928 Summer Olympics.",
    "It was described by Breuning in 1939.",
    "Ropica bicristata is a species of beetle in the family Cerambycidae.",
    "Tactical Force is a 2011 Canadian-American action film written and directed by Adamo Paolo Cultraro, and starring Steve Austin, Michael Jai White, Michael Shanks, Keith Jardine, Michael Eklund, Darren Shahlavi and Lexa Doig.",
    "The saline red bat (Lasiurus salinae) is a species of bat from the family Vespertilionidae.",
]
|
requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
datasets
|
util.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
non_terminal_periods = (
|
| 4 |
+
r"(?<!\sApt)"
|
| 5 |
+
r"(?<!\sBlvd)"
|
| 6 |
+
r"(?<!\sCapt)"
|
| 7 |
+
r"(?<!\sDr)"
|
| 8 |
+
r"(?<!\sJr)"
|
| 9 |
+
r"(?<!\sMr)"
|
| 10 |
+
r"(?<!\sMrs)"
|
| 11 |
+
r"(?<!\sMs)"
|
| 12 |
+
r"(?<!\sPh\.D)"
|
| 13 |
+
r"(?<!\sRd)"
|
| 14 |
+
r"(?<!\sSr)"
|
| 15 |
+
r"(?<!\sSt)"
|
| 16 |
+
r"(?<!\se\.g)"
|
| 17 |
+
r"(?<!\setc)"
|
| 18 |
+
r"(?<!\si\.e)"
|
| 19 |
+
r"(?<!\s[A-Z])"
|
| 20 |
+
r"(?<!^[a-zA-Z0-9])"
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
naive_sentence_end_pattern = re.compile(r"([\n\r]+"
|
| 24 |
+
r"|[!?]+\"?(?=\s|$)"
|
| 25 |
+
r"|" + non_terminal_periods + r"\.+\"?(?=\s|$))")
|
| 26 |
+
# Option 1:
|
| 27 |
+
# [\n\r]+ - Match consecutive newline and carriage returns
|
| 28 |
+
# Option 2:
|
| 29 |
+
# [!?]+ - Match ! or ?
|
| 30 |
+
# (?=\s|$) - Must be followed by \s or end-of-string
|
| 31 |
+
# Option 3:
|
| 32 |
+
# non_terminal_periods - Must not be preceded by non-terminal characters
|
| 33 |
+
# \.+ - Match .
|
| 34 |
+
# (?=\s|$) - Must be followed by \s or end-of-string
|
| 35 |
+
|
| 36 |
+
naive_tokenize_pattern = re.compile(
|
| 37 |
+
r"("
|
| 38 |
+
r"\s+"
|
| 39 |
+
r"|-+(?=\s|$)"
|
| 40 |
+
r"|(?<=\s)-+"
|
| 41 |
+
r"|-{2,}"
|
| 42 |
+
r"|–+"
|
| 43 |
+
r"|—+"
|
| 44 |
+
r"|(?<=[a-z])n’t(?=\s|$)"
|
| 45 |
+
r"|(?<=[a-z])n't(?=\s|$)"
|
| 46 |
+
r"|’[a-s,u-z]+(?=\s|$)"
|
| 47 |
+
r"|'[a-s,u-z]+(?=\s|$)"
|
| 48 |
+
r"|’+"
|
| 49 |
+
r"|'+"
|
| 50 |
+
r"|\"+"
|
| 51 |
+
r"|`+"
|
| 52 |
+
r"|,+(?=\"|\s|$)"
|
| 53 |
+
r"|" + non_terminal_periods + r"\.+(?=\"|\s|$)"
|
| 54 |
+
r"|:+"
|
| 55 |
+
r"|;+"
|
| 56 |
+
r"|[?!]+(?=\"|\s|$)"
|
| 57 |
+
r"|\(+"
|
| 58 |
+
r"|\)+"
|
| 59 |
+
r"|\[+"
|
| 60 |
+
r"|]+"
|
| 61 |
+
r"|\{+"
|
| 62 |
+
r"|}+"
|
| 63 |
+
r"|<+"
|
| 64 |
+
r"|>+"
|
| 65 |
+
r")"
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def naive_tokenize(text: str):
|
| 70 |
+
return [t for t in naive_tokenize_pattern.split(text)
|
| 71 |
+
if t != ""
|
| 72 |
+
and not t.startswith(" ")
|
| 73 |
+
and not t.startswith("\t")]
|