Commit
·
5a7f6ac
1
Parent(s):
eddcbd9
feat: example maker
Browse files- dataset_maker.py +429 -0
- examples.py +11 -0
- requirements.txt +1 -0
- util.py +73 -0
dataset_maker.py
ADDED
|
@@ -0,0 +1,429 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import curses
import logging
import os
import random
import uuid
from collections import Counter
from typing import Optional

from datasets import Dataset, load_dataset

from examples import custom_examples
from util import naive_sentence_end_pattern, naive_tokenize
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)

# Directory the labeled dataset is persisted to / reloaded from.
DATASET_PATH = "dataset"


def _bio(label_names):
    """Expand label names into a flat BIO tag list: [B-x, I-x, B-y, I-y, ...]."""
    return [tag for name in label_names for tag in (f"B-{name}", f"I-{name}")]


# Token-classification feature columns mapped to their BIO label inventories.
FEATURES = {
    "<>^": _bio(["LEFT", "RIGHT", "UP", "WRAP"]),

    "{}": _bio(["WRAP"]),

    "()": _bio(["WRAP"]),

    "[]": _bio(["WRAP"]),

    "''": _bio(["WRAP"]),

    '""': _bio(["WRAP"]),

    "``": _bio(["WRAP"]),

    "activity": _bio([
        "DANCE",  # dance
        "GAME",  # game
    ]),

    "addr": _bio([
        "CHAN",  # radio frequency, TV channel or station name, e.g. 107.7 "The Bone", CBS, CNN, PBS, etc.
        "DOOR",  # apt, door, or suite number
        "EMAIL",  # email address
        "FAC",  # facility address or specific physical building name
        "FILE",  # file name and path
        "GEO",  # geo-coordinates
        "IP",  # IP address or CIDR notation
        "MAIL",  # physical mailbox or p.o. box
        "PHONE",  # telephone or fax
        "SITE",  # DNS domain name or website name
        "URL",  # URL parts not EMAIL, FILE, IP, or SITE
    ]),

    "cardinal": _bio([
        "AGE",  # age
        "DIST",  # distance
        "FRAC",  # fraction
        "MASS",  # mass
        "MONEY",  # currency
        "PCT",  # percent
        "PCTILE",  # percentile
        "SPEED",  # speed
        "WEIGHT",  # weight, force due to gravity
    ]),

    "concept": _bio([
        "ART",  # art, music, or literary concept
        "BIO",  # biology or medical concept
        "BIZ",  # business or marketing concept
        "CHEM",  # chemistry or bio-chem concept
        "CLIM",  # climate or ocean science concept
        "ECON",  # economic concept
        "EDU",  # education concept
        "ENG",  # engineering concept
        "FIN",  # finance or investment concept
        "GEOG",  # geography concept
        "GEOL",  # geology concept
        "INFO",  # computing, data, or info sciences concept
        "LAW",  # legal concept
        "MATH",  # math concept
        "PHIL",  # ethical or philosophical concept
        "PHYS",  # physics concept
        "POLI",  # sociological or political concept
        "PROG",  # computer programming concept
        "PSY",  # psychological concept
        "RELI",  # religious concept
        "SPORTS",  # sports concept
        "WAR",  # military concept
    ]),

    "nature": _bio([
        "FAUNA",  # animal life
        "FLORA",  # plant life
        "PHENOM",  # phenomena
    ]),

    "media": _bio([
        "AUD",  # music and audio recordings
        "IMG",  # photos, paintings, and other images
        "SOFT",  # software
        "TXT",  # articles, books, papers, etc.
        "VID",  # film and other videos
    ]),

    "org": _bio([
        "ORG",  # organization
        "TITLE",  # title or role
    ]),

    "other": _bio([
        "MATH",  # math notation
        "OUT",  # computer program output, e.g. stderr/out, logs, etc.
        "PROG",  # computer programming notation
        "SCI",  # scientific notation outside math and programming
    ]),

    "people": _bio([
        "GPE",  # geopolitical entity e.g. countries, cities, states, or regions
        "LANG",  # language
        "NORP",  # nationalities, religious, or political groups. e.g. "American", "Muslim", or "Communist"
    ]),

    # person or personified being
    "person": _bio([
        "ALIAS",  # nickname or alternative name
        "HONOR",  # honorific
        "NAME",  # person name
        "PROF",  # professional designation
        "USER",  # username
    ]),

    "place": _bio([
        "BYTE",  # digital location
        "FIC",  # fictional locations
        "LOC",  # physical locations
        "UI",  # location on a user interface
        "VIRT",  # virtual location
        "WEB",  # web-connected location
    ]),

    "time": _bio([
        "TIME",  # years, dates, time values
        "EVENT",  # event in time
    ]),
}
# TODO: might be multi-label
FEATURES["zz_prime"] = ["_all", "_ambiguous", *FEATURES.keys()]  # primary feature, zz_ so it's labeled last
# Namespace for sentence signatures; changing it invalidates every cached signature.
UUID5_NS = uuid.UUID("246a5463-afae-4571-a6e0-f319d74147d3")
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def get_uniq_training_labels(ds: Dataset, columns_to_exclude: Optional[set[str]] = None):
    """Collect and log the unique token-level label values per trainable column.

    Every column of *ds* except those in *columns_to_exclude* (default:
    ``{"text", "tokens"}``) is assumed to hold one label string per token.

    Returns a dict mapping column name -> set of unique label values seen.
    """
    if columns_to_exclude is None:
        columns_to_exclude = {"text", "tokens"}
    columns_to_train_on = [k for k in ds.features.keys() if k not in columns_to_exclude]

    # Per-column label frequencies and unique label values.
    label_counters = {col: Counter() for col in columns_to_train_on}
    unique_label_values = {col: set() for col in columns_to_train_on}

    for example in ds:
        # Each of these columns is a list (one entry per token),
        # so we update our set/counter with each token-level value.
        for col in columns_to_train_on:
            unique_label_values[col].update(example[col])
            label_counters[col].update(example[col])

    logger.info("Columns:")
    for col in columns_to_train_on:
        logger.info("  %s:", col)
        # Convert to a sorted list just to have a nice, stable ordering
        vals = sorted(unique_label_values[col])
        logger.info("    %d labels: %s", len(vals),
                    [f"{v}:{label_counters[col][v]}" for v in vals])

    return unique_label_values
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def main(stdscr, args) -> Dataset:
    """Interactive curses loop for building a token-classification dataset.

    Shows the custom seed examples first, then sentences from random Wikipedia
    pages; for each accepted sentence the user labels every token for every
    feature column in FEATURES.  Returns the accumulated Dataset once the user
    presses Esc.  `args` is currently unused.

    Label-entry syntax at the ``:`` prompt (parsed below):
      - empty          -> label this token "O"
      - "N"            -> label this token feat_labels[N]
      - "N>k"          -> label this token and the next k tokens feat_labels[N]
      - "N>"           -> label this token and all remaining tokens feat_labels[N]
      - ">"            -> label this token and all remaining tokens "O"
    """
    wikipedia_dataset_name = "20231101.en"
    wikipedia_dataset = load_dataset("wikimedia/wikipedia", wikipedia_dataset_name)
    total_page_cnt = len(wikipedia_dataset["train"])

    stdscr.clear()
    stdscr.addstr(f"Loaded {wikipedia_dataset_name} containing {total_page_cnt} pages.")

    # UUID5 signatures of every sentence already labeled (dedupes across runs).
    signature_cache = set()

    new_dataset_dict = {k: [] for k in ["text", "tokens", *FEATURES.keys()]}
    if os.path.exists(DATASET_PATH):
        # Load previous examples
        for i, exp in enumerate(Dataset.load_from_disk(DATASET_PATH)):
            sig = uuid.uuid5(UUID5_NS, exp["text"])
            if sig in signature_cache:
                continue
            signature_cache.add(sig)
            for k, v in exp.items():
                new_dataset_dict[k].append(v)

    esc_pressed = False
    while not esc_pressed:
        # Select random Wikipedia page
        # NOTE(review): random.randint is inclusive on both ends, so this can
        # yield total_page_cnt and index one past the last page — confirm
        # whether randint(0, total_page_cnt - 1) / randrange was intended.
        page = wikipedia_dataset["train"][random.randint(0, total_page_cnt)]
        # If all custom examples are labeled, move on to Wikipedia
        for page_chunk in (custom_examples + page["text"].split("\n\n")):
            page_chunk = page_chunk.strip()
            if not page_chunk:
                continue
            page_chunk_lines = page_chunk.split("\n")
            for chunk_line in page_chunk_lines:
                chunk_line = chunk_line.strip()
                if not chunk_line:
                    continue
                # Peel one sentence at a time off the front of the line.
                while not esc_pressed and chunk_line:
                    sentence_end_match = naive_sentence_end_pattern.search(chunk_line)
                    if sentence_end_match:
                        sentence_blob = chunk_line[:sentence_end_match.end()]
                        chunk_line = chunk_line[sentence_end_match.end():].strip()
                    else:
                        # No terminator found: treat the rest as one sentence.
                        sentence_blob = chunk_line
                        chunk_line = ""

                    # Skip sentences that were already labeled (this run or a prior one).
                    sig = uuid.uuid5(UUID5_NS, sentence_blob)
                    if sig in signature_cache:
                        continue
                    signature_cache.add(sig)

                    # TODO: sentence context
                    # - prefix each text with a context blob that gets tokenized with the text
                    # - label context blobs as B-CTXT and I-CTXT
                    # - this way, contextual information from outside the direct text can be injected
                    # - this allows injecting contexts from what we've already processed on the page
                    # - use a unique signal sequences to signal contexts, e.g.:
                    #   - {{[[((prev:a,b;last:c,d))]]}}>>>

                    exp_idx = len(new_dataset_dict["text"])
                    stdscr.addstr(f"""\n\n>>>{sentence_blob}<<<

Press 'y' to accept or anything else to reject.
Press Esc to exit.
""")
                    ch = stdscr.getch()
                    stdscr.clear()
                    if ch == 27:  # Esc
                        esc_pressed = True
                    elif ch == ord("y"):
                        naive_tokens = naive_tokenize(sentence_blob)
                        tokens_len = len(naive_tokens)
                        last_idx = tokens_len - 1
                        new_exp = {
                            "text": sentence_blob,
                            "tokens": naive_tokens,
                        }

                        # Label every token once per feature column; repeat a
                        # column until the user accepts its label sequence.
                        for feat_name, feat_labels in FEATURES.items():
                            feat_labels_len = len(feat_labels)
                            labels_accepted = False
                            while not esc_pressed and not labels_accepted:
                                labels = []
                                skip_to_idx = None
                                skip_label = None
                                for token_idx, token in enumerate(naive_tokens):
                                    # Auto-fill while inside an active "N>k" span.
                                    if skip_to_idx is not None and skip_to_idx >= token_idx:
                                        labels.append(skip_label if skip_label is not None else "O")
                                        continue
                                    skip_to_idx = None
                                    skip_label = None
                                    token_len = len(token)

                                    enter_pressed = False
                                    idx_blob = ""
                                    stdscr.clear()
                                    stdscr.addstr(f"""Example {exp_idx}:

{"\n".join([f"{pad_to_desired_len(k)}{v}" for k, v in new_exp.items()])}
{pad_to_desired_len(feat_name)}{labels}

Labels: {", ".join([f"{i}:{l}" for i, l in enumerate(feat_labels)])}

{naive_tokens[token_idx:]}
{" " * (token_len + 1)}^
{" " * (token_len + 1)}{token_idx}

: """)
                                    # Read the label index string one key at a time.
                                    while not esc_pressed and not enter_pressed:
                                        ch = stdscr.getch()
                                        if ch in {8, 127, curses.KEY_BACKSPACE}:  # Delete
                                            idx_blob = idx_blob[:-1]
                                            y, x = stdscr.getyx()
                                            next_x = x - 1
                                            if next_x > 1:
                                                stdscr.move(y, x - 1)
                                                stdscr.clrtoeol()
                                                stdscr.refresh()
                                        elif ch == 27:  # Esc
                                            esc_pressed = True
                                        elif ch in {10, curses.KEY_ENTER}:  # Enter
                                            enter_pressed = True
                                        else:
                                            # Otherwise, add the character to the string
                                            ch_chr = chr(ch)
                                            stdscr.addstr(ch_chr)
                                            idx_blob += ch_chr
                                    if not idx_blob:
                                        # Empty entry defaults to the outside tag "O".
                                        # NOTE(review): the conditional is redundant here —
                                        # idx_blob is known falsy in this branch.
                                        label_blob = idx_blob if idx_blob else "O"
                                        labels.append(label_blob)
                                    elif ">" in idx_blob:
                                        # "N>k" form: label + span fill (see docstring).
                                        try:
                                            idx_blob, skip_distance = idx_blob.split(">")
                                            if idx_blob:
                                                label_idx = int(idx_blob)
                                                if 0 <= label_idx < feat_labels_len:
                                                    label_blob = feat_labels[label_idx]
                                                    labels.append(label_blob)
                                                    skip_label = label_blob
                                            else:
                                                labels.append("O")

                                            if skip_distance:
                                                skip_to_idx = token_idx + int(skip_distance)
                                                if skip_to_idx > last_idx:
                                                    skip_to_idx = last_idx
                                            else:
                                                # Bare ">" suffix: extend to the end of the sentence.
                                                skip_to_idx = last_idx
                                        except ValueError:
                                            stdscr.addstr(f"Could not convert {idx_blob} to an integer idx value.")
                                    else:
                                        # Plain "N" form: single-token label.
                                        try:
                                            label_idx = int(idx_blob)
                                            if 0 <= label_idx < feat_labels_len:
                                                label_blob = feat_labels[label_idx]
                                                labels.append(label_blob)
                                        except ValueError:
                                            stdscr.addstr(f"Could not convert {idx_blob} to an integer idx value.")
                                # Review the whole label sequence for this feature.
                                stdscr.clear()
                                stdscr.addstr(f"""Example {exp_idx}:
{"\n".join([f"{pad_to_desired_len(k)}{v}" for k, v in new_exp.items()])}
{pad_to_desired_len(feat_name)}{labels}

Press 'y' to accept or anything else to reject.
Press Esc to exit.
""")
                                ch = stdscr.getch()
                                stdscr.clear()
                                if ch == 27:  # Esc
                                    esc_pressed = True
                                elif ch == ord("y"):
                                    new_exp[feat_name] = labels
                                    labels_accepted = True
                            if esc_pressed:
                                break
                        # Add if complete
                        if sorted(new_exp.keys()) == sorted(new_dataset_dict.keys()):
                            for k, v in new_exp.items():
                                new_dataset_dict[k].append(v)
    # Exiting
    stdscr.clear()
    return Dataset.from_dict(new_dataset_dict)
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
def pad_to_desired_len(blob: str, desired: int = 15) -> str:
    """Right-pad *blob* with spaces to at least *desired* characters.

    Strings already *desired* or longer are returned unchanged.
    """
    # str.ljust does exactly the manual length-check-and-pad this replaced.
    return blob.ljust(desired)
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
def show_examples(ds: Dataset, show_expr: Optional[str]):
    """Log a sample of examples from *ds*.

    show_expr: optional "<split>/<col>/<label>/<count>" spec selecting examples
    whose *col* column contains *label*; when omitted, up to 10 random
    examples are shown.
    """
    if not show_expr:
        count_to_show = min(len(ds), 10)
        examples_to_show = ds.shuffle()[:count_to_show]
    else:
        # NOTE(review): <split> is parsed but never used — confirm whether
        # selecting a dataset split was intended.
        split_to_show, col_to_show, label_to_show, count_to_show = show_expr.split("/")
        count_to_show = int(count_to_show)
        examples_to_show = ds.filter(
            lambda exp: label_to_show in exp[col_to_show]).shuffle(seed=42)[:count_to_show]

    # Slicing a Dataset yields a dict of column -> list.  The filter may match
    # fewer rows than requested, so clamp to what is actually available to
    # avoid indexing past the end of the column lists.
    if examples_to_show:
        count_to_show = min([count_to_show] + [len(col) for col in examples_to_show.values()])
    else:
        count_to_show = 0

    for i in range(count_to_show):
        logger.info(f"Example {i}:")
        for feature in examples_to_show:
            logger.info(f"  {feature}: {examples_to_show[feature][i]}")
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
if __name__ == "__main__":
    import argparse
    import logging.config

    # NOTE(review): the description says "Train multi-task model." but this
    # script interactively builds a labeled dataset — confirm the wording.
    arg_parser = argparse.ArgumentParser(description="Train multi-task model.")
    arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>",
                            action="store", default=None)
    parsed_args = arg_parser.parse_args()

    # Root logger at INFO, timestamped, writing to the console.
    logging.config.dictConfig({
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {
            "default": {
                "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            },
        },
        "handlers": {
            "console": {
                "class": "logging.StreamHandler",
                "formatter": "default",
            },
        },
        "loggers": {
            "": {
                "level": "INFO",
                "handlers": ["console"],
            },
        },
    })

    # Run the interactive labeler inside a managed curses session, then
    # report on the resulting dataset and persist it.
    new_ds = curses.wrapper(main, parsed_args)
    logger.info(f"Writing dataset to disk...\n{new_ds}")
    show_examples(new_ds, parsed_args.show)
    get_uniq_training_labels(new_ds)
    new_ds.save_to_disk(DATASET_PATH)
|
examples.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hand-curated seed sentences; dataset_maker labels these before moving on to
# randomly selected Wikipedia paragraphs.
custom_examples = [
    "Brachyarthrum is a genus of true bugs belonging to the family Miridae.",
    "District 29 is a district in the Texas House of Representatives.",
    "Edgard Viseur (born 10 April 1905, date of death unknown) was a Belgian middle-distance runner.",
    "Greater Manchester bus route 192 runs between Hazel Grove in the Metropolitan Borough of Stockport and Piccadilly Gardens in Manchester city centre.",
    "He competed in the men's 3000 metres steeplechase at the 1928 Summer Olympics.",
    "It was described by Breuning in 1939.",
    "Ropica bicristata is a species of beetle in the family Cerambycidae.",
    "Tactical Force is a 2011 Canadian-American action film written and directed by Adamo Paolo Cultraro, and starring Steve Austin, Michael Jai White, Michael Shanks, Keith Jardine, Michael Eklund, Darren Shahlavi and Lexa Doig.",
    "The saline red bat (Lasiurus salinae) is a species of bat from the family Vespertilionidae.",
]
|
requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
datasets
|
util.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
non_terminal_periods = (
|
| 4 |
+
r"(?<!\sApt)"
|
| 5 |
+
r"(?<!\sBlvd)"
|
| 6 |
+
r"(?<!\sCapt)"
|
| 7 |
+
r"(?<!\sDr)"
|
| 8 |
+
r"(?<!\sJr)"
|
| 9 |
+
r"(?<!\sMr)"
|
| 10 |
+
r"(?<!\sMrs)"
|
| 11 |
+
r"(?<!\sMs)"
|
| 12 |
+
r"(?<!\sPh\.D)"
|
| 13 |
+
r"(?<!\sRd)"
|
| 14 |
+
r"(?<!\sSr)"
|
| 15 |
+
r"(?<!\sSt)"
|
| 16 |
+
r"(?<!\se\.g)"
|
| 17 |
+
r"(?<!\setc)"
|
| 18 |
+
r"(?<!\si\.e)"
|
| 19 |
+
r"(?<!\s[A-Z])"
|
| 20 |
+
r"(?<!^[a-zA-Z0-9])"
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
naive_sentence_end_pattern = re.compile(r"([\n\r]+"
|
| 24 |
+
r"|[!?]+\"?(?=\s|$)"
|
| 25 |
+
r"|" + non_terminal_periods + r"\.+\"?(?=\s|$))")
|
| 26 |
+
# Option 1:
|
| 27 |
+
# [\n\r]+ - Match consecutive newline and carriage returns
|
| 28 |
+
# Option 2:
|
| 29 |
+
# [!?]+ - Match ! or ?
|
| 30 |
+
# (?=\s|$) - Must be followed by \s or end-of-string
|
| 31 |
+
# Option 3:
|
| 32 |
+
# non_terminal_periods - Must not be preceded by non-terminal characters
|
| 33 |
+
# \.+ - Match .
|
| 34 |
+
# (?=\s|$) - Must be followed by \s or end-of-string
|
| 35 |
+
|
| 36 |
+
naive_tokenize_pattern = re.compile(
|
| 37 |
+
r"("
|
| 38 |
+
r"\s+"
|
| 39 |
+
r"|-+(?=\s|$)"
|
| 40 |
+
r"|(?<=\s)-+"
|
| 41 |
+
r"|-{2,}"
|
| 42 |
+
r"|–+"
|
| 43 |
+
r"|—+"
|
| 44 |
+
r"|(?<=[a-z])n’t(?=\s|$)"
|
| 45 |
+
r"|(?<=[a-z])n't(?=\s|$)"
|
| 46 |
+
r"|’[a-s,u-z]+(?=\s|$)"
|
| 47 |
+
r"|'[a-s,u-z]+(?=\s|$)"
|
| 48 |
+
r"|’+"
|
| 49 |
+
r"|'+"
|
| 50 |
+
r"|\"+"
|
| 51 |
+
r"|`+"
|
| 52 |
+
r"|,+(?=\"|\s|$)"
|
| 53 |
+
r"|" + non_terminal_periods + r"\.+(?=\"|\s|$)"
|
| 54 |
+
r"|:+"
|
| 55 |
+
r"|;+"
|
| 56 |
+
r"|[?!]+(?=\"|\s|$)"
|
| 57 |
+
r"|\(+"
|
| 58 |
+
r"|\)+"
|
| 59 |
+
r"|\[+"
|
| 60 |
+
r"|]+"
|
| 61 |
+
r"|\{+"
|
| 62 |
+
r"|}+"
|
| 63 |
+
r"|<+"
|
| 64 |
+
r"|>+"
|
| 65 |
+
r")"
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def naive_tokenize(text: str):
|
| 70 |
+
return [t for t in naive_tokenize_pattern.split(text)
|
| 71 |
+
if t != ""
|
| 72 |
+
and not t.startswith(" ")
|
| 73 |
+
and not t.startswith("\t")]
|