Commit
·
051eb53
1
Parent(s):
6cf0379
feat: functional CLI editor
Browse files- dataset_maker.py +103 -35
- examples.py +15 -9
- requirements.txt +2 -1
- util.py +3 -0
dataset_maker.py
CHANGED
|
@@ -5,6 +5,7 @@ import logging
|
|
| 5 |
import os
|
| 6 |
import random
|
| 7 |
import uuid
|
|
|
|
| 8 |
|
| 9 |
from examples import custom_examples
|
| 10 |
from util import naive_sentence_end_pattern, naive_tokenize
|
|
@@ -41,9 +42,10 @@ FEATURES = {
|
|
| 41 |
"WRAP",
|
| 42 |
]] for item in sublist],
|
| 43 |
|
| 44 |
-
"
|
| 45 |
"DANCE", # dance
|
| 46 |
"GAME", # game
|
|
|
|
| 47 |
]] for item in sublist],
|
| 48 |
|
| 49 |
"addr": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
|
@@ -60,18 +62,6 @@ FEATURES = {
|
|
| 60 |
"URL", # URL parts not EMAIL, FILE, IP, or SITE
|
| 61 |
]] for item in sublist],
|
| 62 |
|
| 63 |
-
"cardinal": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 64 |
-
"AGE", # age
|
| 65 |
-
"DIST", # distance
|
| 66 |
-
"FRAC", # faction
|
| 67 |
-
"MASS", # mass
|
| 68 |
-
"MONEY", # currency
|
| 69 |
-
"PCT", # percent
|
| 70 |
-
"PCTILE", # percentile
|
| 71 |
-
"SPEED", # speed
|
| 72 |
-
"WEIGHT", # weight, force due to gravity
|
| 73 |
-
]] for item in sublist],
|
| 74 |
-
|
| 75 |
"concept": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 76 |
"ART", # art, music, or literary concept
|
| 77 |
"BIO", # biology or medical concept
|
|
@@ -82,25 +72,49 @@ FEATURES = {
|
|
| 82 |
"EDU", # education concept
|
| 83 |
"ENG", # engineering concept
|
| 84 |
"FIN", # finance or investment concept
|
|
|
|
| 85 |
"GEOG", # geography concept
|
| 86 |
"GEOL", # geology concept
|
| 87 |
"INFO", # computing, data, or info sciences concept
|
|
|
|
| 88 |
"LAW", # legal concept
|
| 89 |
"MATH", # math concept
|
|
|
|
| 90 |
"PHIL", # ethical or philosophical concept
|
| 91 |
"PHYS", # physics concept
|
| 92 |
"POLI", # sociological or political concept
|
| 93 |
"PROG", # computer programming concept
|
| 94 |
"PSY", # psychological concept
|
| 95 |
"RELI", # religious concept
|
|
|
|
| 96 |
"SPORTS", # sports concept
|
| 97 |
"WAR", # military concept
|
| 98 |
]] for item in sublist],
|
| 99 |
|
| 100 |
-
"
|
| 101 |
-
"
|
| 102 |
-
"
|
| 103 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
]] for item in sublist],
|
| 105 |
|
| 106 |
"media": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
|
@@ -111,14 +125,42 @@ FEATURES = {
|
|
| 111 |
"VID", # film and other videos
|
| 112 |
]] for item in sublist],
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
"org": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 115 |
"ORG", # organization
|
| 116 |
"TITLE", # title or role
|
| 117 |
]] for item in sublist],
|
| 118 |
|
| 119 |
"other": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 120 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
"OUT", # computer program output, e.g. stderr/out, logs, etc.
|
|
|
|
| 122 |
"PROG", # computer programming notation
|
| 123 |
"SCI", # scientific notation outside math and programming
|
| 124 |
]] for item in sublist],
|
|
@@ -134,7 +176,7 @@ FEATURES = {
|
|
| 134 |
"ALIAS", # nickname or alternative name
|
| 135 |
"HONOR", # honorific
|
| 136 |
"NAME", # person name
|
| 137 |
-
"PROF", # professional designation
|
| 138 |
"USER", # username
|
| 139 |
]] for item in sublist],
|
| 140 |
|
|
@@ -147,6 +189,12 @@ FEATURES = {
|
|
| 147 |
"WEB", # web-connected location
|
| 148 |
]] for item in sublist],
|
| 149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
"time": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 151 |
"TIME", # years, dates, time values
|
| 152 |
"EVENT", # event in time
|
|
@@ -159,7 +207,7 @@ UUID5_NS = uuid.UUID("246a5463-afae-4571-a6e0-f319d74147d3") # Changes sentence
|
|
| 159 |
|
| 160 |
def get_uniq_training_labels(ds: Dataset, columns_to_exclude: set[str] = None):
|
| 161 |
columns_to_train_on = [k for k in ds.features.keys() if k not in (
|
| 162 |
-
{"text", "tokens"} if columns_to_exclude is None else columns_to_exclude)]
|
| 163 |
|
| 164 |
# Create a dictionary of sets, keyed by each column name
|
| 165 |
label_counters = {col: dict() for col in columns_to_train_on}
|
|
@@ -193,18 +241,28 @@ def main(stdscr, args):
|
|
| 193 |
stdscr.clear()
|
| 194 |
stdscr.addstr(f"Loaded {wikipedia_dataset_name} containing {total_page_cnt} pages.")
|
| 195 |
|
|
|
|
| 196 |
signature_cache = set()
|
| 197 |
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
| 199 |
if os.path.exists(DATASET_PATH):
|
| 200 |
# Load previous examples
|
| 201 |
for i, exp in enumerate(Dataset.load_from_disk(DATASET_PATH)):
|
| 202 |
-
sig = uuid.uuid5(UUID5_NS, exp["text"])
|
| 203 |
-
if sig in signature_cache:
|
| 204 |
continue
|
| 205 |
signature_cache.add(sig)
|
| 206 |
-
|
| 207 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
|
| 209 |
esc_pressed = False
|
| 210 |
while not esc_pressed:
|
|
@@ -229,7 +287,7 @@ def main(stdscr, args):
|
|
| 229 |
sentence_blob = chunk_line
|
| 230 |
chunk_line = ""
|
| 231 |
|
| 232 |
-
sig = uuid.uuid5(UUID5_NS, sentence_blob)
|
| 233 |
if sig in signature_cache:
|
| 234 |
continue
|
| 235 |
signature_cache.add(sig)
|
|
@@ -274,7 +332,9 @@ Press Esc to exit.
|
|
| 274 |
continue
|
| 275 |
skip_to_idx = None
|
| 276 |
skip_label = None
|
| 277 |
-
|
|
|
|
|
|
|
| 278 |
|
| 279 |
enter_pressed = False
|
| 280 |
idx_blob = ""
|
|
@@ -286,9 +346,9 @@ Press Esc to exit.
|
|
| 286 |
|
| 287 |
Labels: {", ".join([f"{i}:{l}" for i, l in enumerate(feat_labels)])}
|
| 288 |
|
| 289 |
-
{naive_tokens
|
| 290 |
-
{" " *
|
| 291 |
-
{" " *
|
| 292 |
|
| 293 |
: """)
|
| 294 |
while not esc_pressed and not enter_pressed:
|
|
@@ -343,12 +403,12 @@ Labels: {", ".join([f"{i}:{l}" for i, l in enumerate(feat_labels)])}
|
|
| 343 |
stdscr.addstr(f"Could not convert {idx_blob} to an integer idx value.")
|
| 344 |
stdscr.clear()
|
| 345 |
stdscr.addstr(f"""Example {exp_idx}:
|
|
|
|
| 346 |
{"\n".join([f"{pad_to_desired_len(k)}{v}" for k, v in new_exp.items()])}
|
| 347 |
{pad_to_desired_len(feat_name)}{labels}
|
| 348 |
|
| 349 |
Press 'y' to accept or anything else to reject.
|
| 350 |
-
Press Esc to exit.
|
| 351 |
-
""")
|
| 352 |
ch = stdscr.getch()
|
| 353 |
stdscr.clear()
|
| 354 |
if ch == 27: # Esc
|
|
@@ -359,6 +419,7 @@ Press Esc to exit.
|
|
| 359 |
if esc_pressed:
|
| 360 |
break
|
| 361 |
# Add if complete
|
|
|
|
| 362 |
if sorted(new_exp.keys()) == sorted(new_dataset_dict.keys()):
|
| 363 |
for k, v in new_exp.items():
|
| 364 |
new_dataset_dict[k].append(v)
|
|
@@ -377,11 +438,11 @@ def pad_to_desired_len(blob: str, desired: int = 15):
|
|
| 377 |
def show_examples(ds: Dataset, show_expr: Optional[str]):
|
| 378 |
if not show_expr:
|
| 379 |
ds_len = len(ds)
|
| 380 |
-
count_to_show = ds_len if ds_len <
|
| 381 |
examples_to_show = ds.shuffle()[:count_to_show]
|
| 382 |
else:
|
| 383 |
args_show_tokens = show_expr.split("/")
|
| 384 |
-
|
| 385 |
count_to_show = int(count_to_show)
|
| 386 |
examples_to_show = ds.filter(
|
| 387 |
lambda exp: label_to_show in exp[col_to_show]).shuffle(seed=42)[:count_to_show]
|
|
@@ -396,7 +457,14 @@ if __name__ == "__main__":
|
|
| 396 |
import logging.config
|
| 397 |
|
| 398 |
arg_parser = argparse.ArgumentParser(description="Train multi-task model.")
|
| 399 |
-
arg_parser.add_argument("--
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
action="store", default=None)
|
| 401 |
parsed_args = arg_parser.parse_args()
|
| 402 |
|
|
|
|
| 5 |
import os
|
| 6 |
import random
|
| 7 |
import uuid
|
| 8 |
+
import wcwidth
|
| 9 |
|
| 10 |
from examples import custom_examples
|
| 11 |
from util import naive_sentence_end_pattern, naive_tokenize
|
|
|
|
| 42 |
"WRAP",
|
| 43 |
]] for item in sublist],
|
| 44 |
|
| 45 |
+
"act": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 46 |
"DANCE", # dance
|
| 47 |
"GAME", # game
|
| 48 |
+
"PROJECT", # project
|
| 49 |
]] for item in sublist],
|
| 50 |
|
| 51 |
"addr": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
|
|
|
| 62 |
"URL", # URL parts not EMAIL, FILE, IP, or SITE
|
| 63 |
]] for item in sublist],
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
"concept": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 66 |
"ART", # art, music, or literary concept
|
| 67 |
"BIO", # biology or medical concept
|
|
|
|
| 72 |
"EDU", # education concept
|
| 73 |
"ENG", # engineering concept
|
| 74 |
"FIN", # finance or investment concept
|
| 75 |
+
"FORMAT", # formatting concept, e.g. list, outline, paragraph, table, figure, etc.
|
| 76 |
"GEOG", # geography concept
|
| 77 |
"GEOL", # geology concept
|
| 78 |
"INFO", # computing, data, or info sciences concept
|
| 79 |
+
"LANG", # linguistics concept
|
| 80 |
"LAW", # legal concept
|
| 81 |
"MATH", # math concept
|
| 82 |
+
"ORG", # organizational concept
|
| 83 |
"PHIL", # ethical or philosophical concept
|
| 84 |
"PHYS", # physics concept
|
| 85 |
"POLI", # sociological or political concept
|
| 86 |
"PROG", # computer programming concept
|
| 87 |
"PSY", # psychological concept
|
| 88 |
"RELI", # religious concept
|
| 89 |
+
"SOC", # sociology concept
|
| 90 |
"SPORTS", # sports concept
|
| 91 |
"WAR", # military concept
|
| 92 |
]] for item in sublist],
|
| 93 |
|
| 94 |
+
"coord": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 95 |
+
"AND",
|
| 96 |
+
"OR", # or, nor is negatives connected by AND
|
| 97 |
+
"NEG", # Negative
|
| 98 |
+
]] for item in sublist],
|
| 99 |
+
|
| 100 |
+
"error": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 101 |
+
"OMIT", # omitted or missing values due to formatting or redactions
|
| 102 |
+
"ORDER", # word order problem
|
| 103 |
+
"SPELL", # spelling error
|
| 104 |
+
]] for item in sublist],
|
| 105 |
+
|
| 106 |
+
"foreign": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 107 |
+
"ES", # Spanish
|
| 108 |
+
"FR", # French
|
| 109 |
+
"HANS", # Chinese simplified
|
| 110 |
+
"HANT", # Chinese traditional
|
| 111 |
+
"JA", # Japanese
|
| 112 |
+
"LA", # Latin
|
| 113 |
+
|
| 114 |
+
"LANG", # marker indicating language of subsequent foreign token
|
| 115 |
+
"LOAN", # loanword, English word based on foreign sound
|
| 116 |
+
"PHONE", # phonetic, formal (e.g. Hepburn romanization) or otherwise
|
| 117 |
+
"TRANS", # marker indicating translation
|
| 118 |
]] for item in sublist],
|
| 119 |
|
| 120 |
"media": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
|
|
|
| 125 |
"VID", # film and other videos
|
| 126 |
]] for item in sublist],
|
| 127 |
|
| 128 |
+
"nature": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 129 |
+
"FAUNA", # animal life
|
| 130 |
+
"FLORA", # plant life
|
| 131 |
+
"PHENOM", # phenomena
|
| 132 |
+
]] for item in sublist],
|
| 133 |
+
|
| 134 |
+
"num": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 135 |
+
"AGE", # age
|
| 136 |
+
"COUNT", # count
|
| 137 |
+
"DIST", # distance
|
| 138 |
+
"FRAC", # fraction
|
| 139 |
+
"MASS", # mass
|
| 140 |
+
"MONEY", # currency
|
| 141 |
+
"ORD", # ordinal
|
| 142 |
+
"PCT", # percent
|
| 143 |
+
"PCTILE", # percentile
|
| 144 |
+
"RANGE", # numeric range
|
| 145 |
+
"SPEED", # speed
|
| 146 |
+
"WEIGHT", # weight, force due to gravity
|
| 147 |
+
]] for item in sublist],
|
| 148 |
+
|
| 149 |
"org": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 150 |
"ORG", # organization
|
| 151 |
"TITLE", # title or role
|
| 152 |
]] for item in sublist],
|
| 153 |
|
| 154 |
"other": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 155 |
+
"DIV", # / or ÷
|
| 156 |
+
"EXP", # exponent, e.g. ^
|
| 157 |
+
"GT", # >
|
| 158 |
+
"LT", # <
|
| 159 |
+
"MATH", # non-arithmetic math notation
|
| 160 |
+
"MINUS", # -
|
| 161 |
+
"MULT", # x, X, or *
|
| 162 |
"OUT", # computer program output, e.g. stderr/out, logs, etc.
|
| 163 |
+
"PLUS", # +
|
| 164 |
"PROG", # computer programming notation
|
| 165 |
"SCI", # scientific notation outside math and programming
|
| 166 |
]] for item in sublist],
|
|
|
|
| 176 |
"ALIAS", # nickname or alternative name
|
| 177 |
"HONOR", # honorific
|
| 178 |
"NAME", # person name
|
| 179 |
+
"PROF", # profession or professional designation e.g. CFA, CPA, MD
|
| 180 |
"USER", # username
|
| 181 |
]] for item in sublist],
|
| 182 |
|
|
|
|
| 189 |
"WEB", # web-connected location
|
| 190 |
]] for item in sublist],
|
| 191 |
|
| 192 |
+
"thing": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 193 |
+
"AWARD", # named accolade or honorary award
|
| 194 |
+
"DEVICE", # device, tool, or toy
|
| 195 |
+
"FOOD", # food
|
| 196 |
+
]] for item in sublist],
|
| 197 |
+
|
| 198 |
"time": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 199 |
"TIME", # years, dates, time values
|
| 200 |
"EVENT", # event in time
|
|
|
|
| 207 |
|
| 208 |
def get_uniq_training_labels(ds: Dataset, columns_to_exclude: set[str] = None):
|
| 209 |
columns_to_train_on = [k for k in ds.features.keys() if k not in (
|
| 210 |
+
{"text", "tokens", "sig"} if columns_to_exclude is None else columns_to_exclude)]
|
| 211 |
|
| 212 |
# Create a dictionary of sets, keyed by each column name
|
| 213 |
label_counters = {col: dict() for col in columns_to_train_on}
|
|
|
|
| 241 |
stdscr.clear()
|
| 242 |
stdscr.addstr(f"Loaded {wikipedia_dataset_name} containing {total_page_cnt} pages.")
|
| 243 |
|
| 244 |
+
new_dataset_dict = {k: [] for k in ["text", "tokens", *FEATURES.keys(), "sig"]}
|
| 245 |
signature_cache = set()
|
| 246 |
|
| 247 |
+
target_sig, target_col, target_idx, new_label = None, None, None, None
|
| 248 |
+
if args.replace:
|
| 249 |
+
args_replace_tokens = args.replace.split("/")
|
| 250 |
+
target_sig, target_col, target_idx, new_label = args_replace_tokens
|
| 251 |
if os.path.exists(DATASET_PATH):
|
| 252 |
# Load previous examples
|
| 253 |
for i, exp in enumerate(Dataset.load_from_disk(DATASET_PATH)):
|
| 254 |
+
sig = str(uuid.uuid5(UUID5_NS, exp["text"]))
|
| 255 |
+
if sig in signature_cache or sig == args.redo:
|
| 256 |
continue
|
| 257 |
signature_cache.add(sig)
|
| 258 |
+
if sig == target_sig:
|
| 259 |
+
for k, v in exp.items():
|
| 260 |
+
if k == target_col:
|
| 261 |
+
v[int(target_idx)] = new_label
|
| 262 |
+
new_dataset_dict[k].append(v)
|
| 263 |
+
else:
|
| 264 |
+
for k, v in exp.items():
|
| 265 |
+
new_dataset_dict[k].append(v)
|
| 266 |
|
| 267 |
esc_pressed = False
|
| 268 |
while not esc_pressed:
|
|
|
|
| 287 |
sentence_blob = chunk_line
|
| 288 |
chunk_line = ""
|
| 289 |
|
| 290 |
+
sig = str(uuid.uuid5(UUID5_NS, sentence_blob))
|
| 291 |
if sig in signature_cache:
|
| 292 |
continue
|
| 293 |
signature_cache.add(sig)
|
|
|
|
| 332 |
continue
|
| 333 |
skip_to_idx = None
|
| 334 |
skip_label = None
|
| 335 |
+
padding_len = (
|
| 336 |
+
1 + wcwidth.wcswidth(", ".join([f"'{t}'" for t in naive_tokens[:token_idx]]))
|
| 337 |
+
+ wcwidth.wcswidth(token)) + (0 if token_idx == 0 else 2)
|
| 338 |
|
| 339 |
enter_pressed = False
|
| 340 |
idx_blob = ""
|
|
|
|
| 346 |
|
| 347 |
Labels: {", ".join([f"{i}:{l}" for i, l in enumerate(feat_labels)])}
|
| 348 |
|
| 349 |
+
{naive_tokens}
|
| 350 |
+
{" " * padding_len}^
|
| 351 |
+
{" " * padding_len}{token_idx}
|
| 352 |
|
| 353 |
: """)
|
| 354 |
while not esc_pressed and not enter_pressed:
|
|
|
|
| 403 |
stdscr.addstr(f"Could not convert {idx_blob} to an integer idx value.")
|
| 404 |
stdscr.clear()
|
| 405 |
stdscr.addstr(f"""Example {exp_idx}:
|
| 406 |
+
|
| 407 |
{"\n".join([f"{pad_to_desired_len(k)}{v}" for k, v in new_exp.items()])}
|
| 408 |
{pad_to_desired_len(feat_name)}{labels}
|
| 409 |
|
| 410 |
Press 'y' to accept or anything else to reject.
|
| 411 |
+
Press Esc to exit.""")
|
|
|
|
| 412 |
ch = stdscr.getch()
|
| 413 |
stdscr.clear()
|
| 414 |
if ch == 27: # Esc
|
|
|
|
| 419 |
if esc_pressed:
|
| 420 |
break
|
| 421 |
# Add if complete
|
| 422 |
+
new_exp["sig"] = sig
|
| 423 |
if sorted(new_exp.keys()) == sorted(new_dataset_dict.keys()):
|
| 424 |
for k, v in new_exp.items():
|
| 425 |
new_dataset_dict[k].append(v)
|
|
|
|
| 438 |
def show_examples(ds: Dataset, show_expr: Optional[str]):
|
| 439 |
if not show_expr:
|
| 440 |
ds_len = len(ds)
|
| 441 |
+
count_to_show = ds_len if ds_len < 25 else 25
|
| 442 |
examples_to_show = ds.shuffle()[:count_to_show]
|
| 443 |
else:
|
| 444 |
args_show_tokens = show_expr.split("/")
|
| 445 |
+
col_to_show, label_to_show, count_to_show = args_show_tokens
|
| 446 |
count_to_show = int(count_to_show)
|
| 447 |
examples_to_show = ds.filter(
|
| 448 |
lambda exp: label_to_show in exp[col_to_show]).shuffle(seed=42)[:count_to_show]
|
|
|
|
| 457 |
import logging.config
|
| 458 |
|
| 459 |
arg_parser = argparse.ArgumentParser(description="Train multi-task model.")
|
| 460 |
+
arg_parser.add_argument("--redo",
|
| 461 |
+
help="Redo example based on signature",
|
| 462 |
+
action="store", default=None)
|
| 463 |
+
arg_parser.add_argument("--replace",
|
| 464 |
+
help="Replace a label using a sig, col, idx, and new label",
|
| 465 |
+
action="store", default=None)
|
| 466 |
+
arg_parser.add_argument("--show",
|
| 467 |
+
help="Show examples: <col>/<label>/<count>",
|
| 468 |
action="store", default=None)
|
| 469 |
parsed_args = arg_parser.parse_args()
|
| 470 |
|
examples.py
CHANGED
|
@@ -1,11 +1,17 @@
|
|
| 1 |
custom_examples = [
|
| 2 |
-
"
|
| 3 |
-
"
|
| 4 |
-
"
|
| 5 |
-
"
|
| 6 |
-
"
|
| 7 |
-
"
|
| 8 |
-
"
|
| 9 |
-
"
|
| 10 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
]
|
|
|
|
| 1 |
custom_examples = [
|
| 2 |
+
"Sailaifengye () or Salafiyah refers to the Chinese Salafi Movement.",
|
| 3 |
+
"Sailaifengye (Chinese: 赛莱菲耶) or Salafiyah refers to the Chinese Salafi Movement.",
|
| 4 |
+
"Chinese Salafists are not a unified organization but \"a patchwork of relatively independent mosque / prayer-congregations\" loosely connected through overlapping networks of students, teachers, and Ulema from shared overseas institutions and circles of study.",
|
| 5 |
+
"Catch Without Arms is the third album from the Los Gatos, CA rock band Dredg, released on June 21, 2005.",
|
| 6 |
+
"Turbonilla lara is a species of sea snail, a marine gastropod mollusk in the family Pyramidellidae, the pyrams and their allies.",
|
| 7 |
+
"The Pro Patria Medal is a South African military campaign medal which was instituted by the Republic in 1974.",
|
| 8 |
+
"Sarath Nanda Silva PC served as the 41st Chief Justice of the Supreme Court of Sri Lanka.",
|
| 9 |
+
"The Truman Capote Award for Literary Criticism is awarded for literary criticism by the University of Iowa on behalf of the Truman Capote Literary Trust.",
|
| 10 |
+
"Charles Waldo Rezk (born 26 January 1969) is an American mathematician, specializing in algebraic topology, category theory, and spectral algebraic geometry.",
|
| 11 |
+
"In the 1990s he was part of the comic duo Sugar and Spice.",
|
| 12 |
+
"Shōta no Sushi (将太の寿司, lit. Shōta's Sushi) is a Japanese manga series written and illustrated by Daisuke Terasawa about a teen boy Shota Sekiguchi (関口将太, Sekiguchi Shōta) and his journey from an apprentice to become a sushi chef.",
|
| 13 |
+
"This is a list of cities with a population above 100,000, as listed in the 2011 Census of India in the Indian state of Odisha:",
|
| 14 |
+
"John Fredriksson (30 August 1923 – 29 May 2012) was a Swedish alpine skier.",
|
| 15 |
+
"Sandpines Golf Links is a public golf course in Florence, Oregon, United States, on the central Oregon Coast",
|
| 16 |
+
"The Rocky Mountain Dinosaur Resource Center is a fossil museum primarily exhibiting fossil organisms of North America's Late Cretaceous including dinosaurs, pterosaurs, marine reptiles, and fish.",
|
| 17 |
]
|
requirements.txt
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
-
datasets
|
|
|
|
|
|
| 1 |
+
datasets
|
| 2 |
+
wcwidth
|
util.py
CHANGED
|
@@ -16,7 +16,9 @@ non_terminal_periods = (
|
|
| 16 |
r"(?<!\se\.g)"
|
| 17 |
r"(?<!\setc)"
|
| 18 |
r"(?<!\si\.e)"
|
|
|
|
| 19 |
r"(?<!\s[A-Z])"
|
|
|
|
| 20 |
r"(?<!^[a-zA-Z0-9])"
|
| 21 |
)
|
| 22 |
|
|
@@ -62,6 +64,7 @@ naive_tokenize_pattern = re.compile(
|
|
| 62 |
r"|}+"
|
| 63 |
r"|<+"
|
| 64 |
r"|>+"
|
|
|
|
| 65 |
r")"
|
| 66 |
)
|
| 67 |
|
|
|
|
| 16 |
r"(?<!\se\.g)"
|
| 17 |
r"(?<!\setc)"
|
| 18 |
r"(?<!\si\.e)"
|
| 19 |
+
r"(?<!\slit)"
|
| 20 |
r"(?<!\s[A-Z])"
|
| 21 |
+
r"(?<!\(r)"
|
| 22 |
r"(?<!^[a-zA-Z0-9])"
|
| 23 |
)
|
| 24 |
|
|
|
|
| 64 |
r"|}+"
|
| 65 |
r"|<+"
|
| 66 |
r"|>+"
|
| 67 |
+
r"|[\u4e00-\u9fff]" # For Chinese characters, which are not space delimited
|
| 68 |
r")"
|
| 69 |
)
|
| 70 |
|