PyTorch
gpt2
achille-fusco committed on
Commit c2760fe · verified · 1 Parent(s): 6552f98

Upload folder using huggingface_hub
__init__.py ADDED
File without changes
bos_eos_patch.py ADDED
@@ -0,0 +1,42 @@
+ from transformers import AutoTokenizer
+ from tokenizers import Tokenizer
+ from tokenizers.processors import TemplateProcessing
+ import os, json
+
+ TOK_DIR = "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M"
+
+ # 0) sanity: what path will HF load?
+ tmp = AutoTokenizer.from_pretrained(TOK_DIR, trust_remote_code=True, local_files_only=True)
+ print("HF loads from:", tmp.name_or_path)
+
+ # 1) get bos/eos tokens & ids
+ bos, eos = tmp.bos_token, tmp.eos_token
+ assert bos and eos, "BOS/EOS not defined in special_tokens_map.json / tokenizer_config.json"
+ bos_id, eos_id = tmp.convert_tokens_to_ids([bos, eos])
+
+ # 2) patch tokenizer.json with a TemplateProcessing post-processor
+ tok_json = os.path.join(TOK_DIR, "tokenizer.json")
+ tk = Tokenizer.from_file(tok_json)
+ tk.post_processor = TemplateProcessing(
+     single=f"{bos} $A {eos}",
+     pair=f"{bos} $A {eos} $B:1 {eos}:1",
+     special_tokens=[(bos, bos_id), (eos, eos_id)],
+ )
+ tk.save(tok_json)
+
+ # 3) (optional) keep bos/eos also in tokenizer_config.json
+ cfg_path = os.path.join(TOK_DIR, "tokenizer_config.json")
+ with open(cfg_path, "r", encoding="utf-8") as f:
+     cfg = json.load(f)
+ cfg["bos_token"] = bos
+ cfg["eos_token"] = eos
+ with open(cfg_path, "w", encoding="utf-8") as f:
+     json.dump(cfg, f, indent=2)
+
+ # 4) verify post-processor is present after a fresh reload
+ tok = AutoTokenizer.from_pretrained(TOK_DIR, trust_remote_code=True, local_files_only=True)
+ print("post-processor:", tok.backend_tokenizer.post_processor)  # should NOT be None
+
+ # 5) final check: specials appear when requested
+ enc = tok("the singers were singing a very nice song!", add_special_tokens=True, return_attention_mask=False)
+ print(tok.convert_ids_to_tokens(enc["input_ids"]))
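
A quick way to re-check the patch from a fresh interpreter (not part of the file; it reuses the same TOK_DIR hard-coded above):

from transformers import AutoTokenizer

TOK_DIR = "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M"
tok = AutoTokenizer.from_pretrained(TOK_DIR, trust_remote_code=True, local_files_only=True)
ids = tok("the singers were singing", add_special_tokens=True)["input_ids"]
# After the TemplateProcessing patch, the first and last ids should be BOS and EOS.
assert ids[0] == tok.bos_token_id and ids[-1] == tok.eos_token_id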
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "tokenizer_dir": "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M",
+   "data_dir": "01-data",
+   "train_glob": "*.train",
+   "valid_glob": "*.valid",
+   "output_dir": "03-models/gpt2_ParFindFast_10M",
+   "datapoint_length": 512,
+   "training_type": "strict_small",
+   "n_epochs": 10,
+   "batch_size": 16,
+   "learning_rate": 0.00005,
+   "weight_decay": 0,
+   "num_training_steps": 200000,
+   "num_warmup_steps": 2000,
+   "sft_learning_rate": 0.00005,
+   "gradient_clip_norm": 1,
+   "seed": -1,
+   "base_folder": "03-models",
+   "experiment_name": "gpt2_ParFindFast_10M",
+   "use_wandb": false,
+   "wandb_experiment_name": "gpt2_ParFindFast",
+   "wandb_project_name": "BabyLM-2025",
+   "tokenizer_class": "ParadigmTokenizerWrapper",
+   "model_type": "gpt2",
+   "vocab_size": 29215
+ }
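
This config is consumed as a plain dict by data_utils.py and models.py below; a minimal load sketch (file name assumed to be this config.json):

import json

# Load the training configuration shipped with this folder.
with open("config.json", encoding="utf-8") as f:
    cfg = json.load(f)

# A few of the keys used downstream: datapoint_length (chunk size in data_utils.py),
# learning_rate / weight_decay / num_warmup_steps (optimizer and scheduler in models.py).
print(cfg["datapoint_length"], cfg["learning_rate"], cfg["num_warmup_steps"])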
data_utils.py ADDED
@@ -0,0 +1,108 @@
+ # File: data_utils.py
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from torch.nn.utils.rnn import pad_sequence
+
+ # from tokenizer import ParadigmTokenizerWrapper
+ from transformers import AutoTokenizer
+
+ import math
+ import os
+ from tqdm import tqdm
+ import pickle
+
+ TRAIN_PATH_10M = '01-data/clean_train_10M'
+ DATASETS = ['bnc_spoken', 'childes', 'gutenberg', 'open_subtitles', 'simple_wiki', 'switchboard']
+
+ class FullBabyLMDataset(Dataset):
+     def __init__(self, cfg, pretokenized_data=None):
+         tokenizer_path = cfg["tokenizer_dir"]
+
+         # Use HF loader so tokenizer_class + auto_map are honored
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             tokenizer_path,
+             trust_remote_code=True,
+             local_files_only=True
+         )
+
+         # Pull specials directly from the wrapper (it *is* a PreTrainedTokenizerFast)
+         self.model_bos = self.tokenizer.bos_token_id
+         self.model_eos = self.tokenizer.eos_token_id
+         self.model_pad = self.tokenizer.pad_token_id
+
+         if pretokenized_data is not None:
+             self.data = pretokenized_data
+             return
+
+         # Tokenize, split and reconstruct each dataset
+         self.data = []
+         dataset_folder = TRAIN_PATH_10M  # using the 10M setting here
+
+         for dataset in DATASETS:
+             dataset_path = os.path.join(dataset_folder, f'{dataset}.train')
+             with open(dataset_path, 'r', encoding='utf-8') as f:
+                 all_text = ' '.join(f.readlines())
+             print(f'Opened {dataset_path}')
+
+             # Tokenize in BATCH mode so indexing [0] is correct
+             tokenized_dataset = self.tokenizer([all_text])['input_ids'][0]
+             print(f'Tokenized {dataset_path}; {len(tokenized_dataset)} tokens total')
+
+             # Chunk into datapoints
+             chunk_size = cfg["datapoint_length"]
+             num_chunks = math.ceil(len(tokenized_dataset) / chunk_size)
+             for curr_chunk in tqdm(range(num_chunks), desc=f"Chunking {dataset}"):
+                 start = curr_chunk * chunk_size
+                 end = (curr_chunk + 1) * chunk_size
+                 chunk_tokens = tokenized_dataset[start:end]
+                 if isinstance(chunk_tokens, torch.Tensor):
+                     chunk_tokens = chunk_tokens.tolist()
+                 self.data.append(chunk_tokens)
+             print(f"Chunked {dataset_path}")
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         # Add BOS/EOS here (sequence length + 2)
+         return torch.as_tensor([self.model_bos] + self.data[idx] + [self.model_eos], dtype=torch.long)
+
+ ## General utilities ##
+ def load_babylm_data(cfg):
+     num_words = "100M" if cfg["training_type"] == "strict" else "10M"
+     cache_dir = '01-data/cached_train'
+     os.makedirs(cache_dir, exist_ok=True)
+     filename = os.path.join(cache_dir, f'train_gpt2_{num_words}.pkl')
+
+     # Cache ONLY the tokenized chunks, not the Dataset object
+     if os.path.exists(filename):
+         with open(filename, 'rb') as f:
+             token_chunks = pickle.load(f)
+         full_babylm_dset = FullBabyLMDataset(cfg, pretokenized_data=token_chunks)
+     else:
+         tmp_dataset = FullBabyLMDataset(cfg)
+         with open(filename, 'wb') as f:
+             pickle.dump(tmp_dataset.data, f)
+         full_babylm_dset = tmp_dataset
+
+     collate_fn = get_collate_fn(full_babylm_dset.model_eos, full_babylm_dset.model_pad)
+     dataloader = DataLoader(
+         full_babylm_dset,
+         batch_size=cfg["batch_size"],
+         shuffle=True,
+         collate_fn=collate_fn,
+         num_workers=0,   # set >0 if your env supports it
+         pin_memory=False # set True on GPUs if it helps
+     )
+     return dataloader
+
+ def get_collate_fn(model_eos, model_pad):
+     def collate_fn(batch):
+         tokens = pad_sequence(batch, padding_value=model_pad, batch_first=True)
+         input_tokens = tokens[:, :-1]
+         target_tokens = tokens[:, 1:]
+         target_mask = input_tokens != model_pad
+         # Ensure first position is always trainable
+         target_mask[:, 0] = True
+         return input_tokens, target_tokens, target_mask
+     return collate_fn
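
An illustrative check (not part of the file) of what get_collate_fn returns, using toy ids in place of real tokenizer output (0/1/2 stand in for BOS/EOS/PAD):

import torch
from data_utils import get_collate_fn

collate = get_collate_fn(model_eos=1, model_pad=2)
batch = [torch.tensor([0, 5, 6, 1]), torch.tensor([0, 7, 1])]
inputs, targets, mask = collate(batch)
# Sequences are right-padded to length 4, then shifted by one position,
# so each returned tensor has shape (2, 3).
print(inputs.shape, targets.shape, mask.shape)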
latest_optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2edd8392c1373169859829d3c0801f75ff860b53375c20cbb801e3b68578564e
+ size 866363865
latest_scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2591c628797378c411d922038d0262e546252fffcc965e0b922e08e478400d64
+ size 1507
latest_student.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2f66172c4f8517ab8db532ac2110a9ad28ac57f784f81d95a569653219fd062
+ size 433175117
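
These three .pt files are Git LFS pointers to the raw state_dicts written by save_epoch_checkpoint in models.py below. A minimal sketch of how the student weights might be restored, mirroring the config path used by initialize_model:

import torch
from transformers import GPT2Config, GPT2LMHeadModel

# Rebuild the architecture from the same config directory models.py loads from,
# then restore the raw state_dict saved as latest_student.pt.
config = GPT2Config.from_pretrained("./03-models/gpt2_ParFindFast_10M")
student = GPT2LMHeadModel(config)
student.load_state_dict(torch.load("latest_student.pt", map_location="cpu"))
student.eval()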
models.py ADDED
@@ -0,0 +1,90 @@
+ # File: models.py
+ # ---------------
+ # All functions related to loading and saving models
+
+ import os
+ import torch
+ from utils import mkdir
+ import gc
+
+ import transformers
+ from transformers import GPT2LMHeadModel, GPT2Config
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
+ from vllm import LLM, SamplingParams
+
+ DEVICE = torch.device('cuda') if torch.cuda.is_available() \
+     else torch.device('cpu')
+
+
+ ## INITIALIZATION ##
+ def initialize_model_and_optimizers(cfg):
+     student = initialize_model(cfg)
+     optimizer = initialize_optimizer(cfg, student)
+     scheduler = initialize_scheduler(cfg, student, optimizer)
+     return student, optimizer, scheduler
+
+ def initialize_model(cfg):
+     # First load the student
+     size = "100m" if cfg['training_type'] == 'strict' else '10m'
+     # NOTE: `size` is currently unused; the config path below is hard-coded to the 10M model.
+     config = GPT2Config.from_pretrained("./03-models/gpt2_ParFindFast_10M")
+     student = GPT2LMHeadModel(config).to(DEVICE)
+     return student
+
+ def get_parameter_names(model, forbidden_layer_types):
+     """
+     Returns the names of the model parameters that are not inside a forbidden layer.
+     """
+     result = []
+     for name, child in model.named_children():
+         result += [
+             f"{name}.{n}"
+             for n in get_parameter_names(child, forbidden_layer_types)
+             if not isinstance(child, tuple(forbidden_layer_types))
+         ]
+     # Add model specific parameters (defined with nn.Parameter) since they are not in any child.
+     result += list(model._parameters.keys())
+     return result
+
+ def initialize_optimizer(cfg, student):
+     lr = cfg['learning_rate']
+     decay_parameters = get_parameter_names(student, ALL_LAYERNORM_LAYERS)
+     decay_parameters = [name for name in decay_parameters if "bias" not in name]
+     optimizer_grouped_parameters = [
+         {
+             "params": [
+                 p for n, p in student.named_parameters() if (n in decay_parameters and p.requires_grad)
+             ],
+             "weight_decay": cfg["weight_decay"],
+         },
+         {
+             "params": [
+                 p for n, p in student.named_parameters() if (n not in decay_parameters and p.requires_grad)
+             ],
+             "weight_decay": 0.0,
+         },
+     ]
+
+     optimizer = torch.optim.AdamW(
+         optimizer_grouped_parameters, lr=lr, eps=1e-8, betas=(0.9, 0.999)
+     )
+
+     return optimizer
+
+ def initialize_scheduler(cfg, student, optimizer):
+     num_training_steps = cfg["num_training_steps"]
+     num_warmup_steps = cfg["num_warmup_steps"]
+     scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps,
+                                                              num_training_steps=num_training_steps)
+     return scheduler
+
+ ## SAVING AND LOADING ##
+ def save_epoch_checkpoint(student, optimizer, scheduler, epoch, checkpoint_dir):
+     # Open a folder for the round
+     folder = os.path.join(checkpoint_dir, f'epoch_{epoch}')
+     mkdir(folder)
+
+     # Save the metrics and model
+     torch.save(optimizer.state_dict(), os.path.join(folder, 'latest_optimizer.pt'))
+     torch.save(scheduler.state_dict(), os.path.join(folder, 'latest_scheduler.pt'))
+     torch.save(student.state_dict(), os.path.join(folder, 'latest_student.pt'))
+     torch.save(student.state_dict(), os.path.join(folder, 'pytorch_model.bin'))
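
Hypothetical glue code (the training loop itself is not part of this commit) showing how the config, dataloader, and model/optimizer helpers fit together and where checkpoints are written each epoch:

import json
from data_utils import load_babylm_data
from models import initialize_model_and_optimizers, save_epoch_checkpoint

with open("config.json", encoding="utf-8") as f:
    cfg = json.load(f)

dataloader = load_babylm_data(cfg)
student, optimizer, scheduler = initialize_model_and_optimizers(cfg)
for epoch in range(cfg["n_epochs"]):
    # ... forward/backward passes over `dataloader` would go here ...
    save_epoch_checkpoint(student, optimizer, scheduler, epoch, cfg["output_dir"])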
paradigm_utils.py ADDED
@@ -0,0 +1,430 @@
+ # paradigm_utils.py
+
+ import time
+ from collections import defaultdict
+ from tqdm import tqdm
+ import os
+ import math
+ import json
+ from typing import List, Tuple, Set, Dict, Any
+
+ def _serialize_suffixes(sfx_set):
+     flat = []
+     for s in sfx_set:
+         if isinstance(s, tuple):
+             base, nested = s
+             flat.append([base, sorted(list(nested))])  # JSON-safe pair
+         else:
+             flat.append(s)  # plain string
+     # stable order: strings first, then pairs; then lexicographic
+     def key(x):
+         return (0, x) if isinstance(x, str) else (1, x[0], tuple(x[1]))
+     return sorted(flat, key=key)
+
+ def paradigms_to_json(paradigms):
+     out = []
+     for stems, suffixes in paradigms:
+         out.append({
+             "stems": sorted(list(stems)),
+             "suffixes": _serialize_suffixes(suffixes),
+         })
+     return out
+
+ def save_paradigms_json(paradigms, path, meta=None):
+     payload = {
+         "schema_version": 1,
+         "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+         "meta": meta or {},
+         "paradigms": paradigms_to_json(paradigms),
+     }
+     with open(path, "w", encoding="utf-8") as f:
+         json.dump(payload, f, ensure_ascii=False, indent=2)
+
+ def _deserialize_suffixes(sfx_list):
+     out = set()
+     for item in sfx_list:
+         if isinstance(item, list):  # [base, nested_list]
+             base, nested = item
+             out.add((base, frozenset(nested)))
+         else:
+             out.add(item)
+     return out
+
+ def load_paradigms_json(path):
+     with open(path, "r", encoding="utf-8") as f:
+         payload = json.load(f)
+     paradigms = []
+     for p in payload["paradigms"]:
+         stems = set(p["stems"])
+         suffixes = _deserialize_suffixes(p["suffixes"])
+         paradigms.append((stems, suffixes))
+     meta = payload.get("meta", {})
+     return paradigms, meta
+
+ ### -----------------------------
+ ### 1. Extract (stem, suffix) pairs from vocabulary
+ ### -----------------------------
+
+ def extract_stem_suffix_pairs(vocab):
+     """Return a mapping from stems to all suffixes they occur with, including null suffix."""
+     stem_to_suffixes = defaultdict(set)
+     for word in tqdm(vocab, desc="[1/7] Extracting stem-suffix pairs"):
+         for i in range(0, len(word) + 1):  # include empty suffix
+             stem, suffix = word[:i], word[i:]
+             stem_to_suffixes[stem].add(suffix)
+     return stem_to_suffixes
+
+ ### -----------------------------
+ ### 2. Group stems by shared suffix sets and normalize by common prefix
+ ### -----------------------------
+
+ def group_stems_by_suffixes(stem_to_suffixes, min_shared_stems=2, min_suffixes=2):
+     suffix_to_stems = defaultdict(set)
+     for stem, suffixes in stem_to_suffixes.items():
+         suffix_key = frozenset(suffixes)
+         suffix_to_stems[suffix_key].add(stem)
+
+     normalized_suffix_map = defaultdict(set)
+
+     for suffixes, stems in tqdm(suffix_to_stems.items(), desc="[2/7] Grouping and normalizing"):
+         non_empty_suffixes = [s for s in suffixes if s]
+         if len(stems) >= min_shared_stems and len(suffixes) >= min_suffixes:
+             common_prefix = os.path.commonprefix(non_empty_suffixes) if non_empty_suffixes else ""
+
+             if common_prefix:
+                 normalized_stems = {stem + common_prefix for stem in stems}
+                 adjusted_suffixes = {s[len(common_prefix):] if s.startswith(common_prefix) else s for s in suffixes}
+             else:
+                 normalized_stems = stems
+                 adjusted_suffixes = suffixes
+
+             if len(adjusted_suffixes) >= min_suffixes:
+                 suffix_key = frozenset(adjusted_suffixes)
+                 normalized_suffix_map[suffix_key].update(normalized_stems)
+
+     paradigms = [(stems, set(suffixes)) for suffixes, stems in normalized_suffix_map.items()]
+     return paradigms
+
+ ### -----------------------------
+ ### 3. Expand stem sets based on suffix set coverage
+ ### -----------------------------
+
+ def stem_set_expansion(paradigms, stem_to_suffixes):
+     updated = 0
+     suffix_to_stems = {frozenset(suffixes): set(stems) for stems, suffixes in paradigms}
+
+     for stem, suffixes in tqdm(stem_to_suffixes.items(), desc="[3/7] Expanding stem sets"):
+         added = False
+         for paradigm_suffixes in sorted(suffix_to_stems.keys(), key=lambda x: (-len(x), tuple(sorted(x)))):
+             if paradigm_suffixes.issubset(suffixes):
+                 if stem not in suffix_to_stems[paradigm_suffixes]:
+                     suffix_to_stems[paradigm_suffixes].add(stem)
+                     updated += 1
+                 added = True
+         if not added and stem == 'design':
+             print(f"[DEBUG] No suitable paradigm for 'design' with suffixes {suffixes}")
+
+     enriched = [(stems, set(suffixes)) for suffixes, stems in suffix_to_stems.items()]
+     print(f"✅ Added {updated} stems via stem set expansion.")
+     return enriched
+
+ ### -----------------------------
+ ### 4. Expand suffix sets based on partial compatibility
+ ### -----------------------------
+
+ def harmonic_number(n):
+     return sum(1.0 / i for i in range(1, n + 1))
+
+ def suffix_set_expansion(paradigms):
+     base = paradigms[:]  # snapshot
+     merged = [(set(stems), set(suffixes)) for stems, suffixes in base]
+     enriched_count = 0
+
+     # Iterate in a deterministic order
+     for i, (stems_i, suffixes_i) in enumerate(sort_paradigms(merged)):
+         for j, (stems_j, suffixes_j) in enumerate(sort_paradigms(merged)):
+             if i == j:
+                 continue
+             if suffixes_i > suffixes_j:
+                 intersection = stems_i & stems_j
+                 denom = max(1, len(stems_j))  # guard
+                 if (len(stems_j) - len(intersection)) < (len(stems_j) / harmonic_number(denom)):
+                     stems_i |= stems_j
+                     enriched_count += 1
+                     # do not mutate stems_j/suffixes_j further
+
+     print(f"\n✅ Enriched {enriched_count} paradigms via suffix set expansion.")
+     # Return back in original tuple-of-sets form
+     return [(set(st), set(sf)) for st, sf in sort_paradigms(merged)]
+
+ ### -----------------------------
+ ### 5. Prune subsumed stems
+ ### -----------------------------
+
+ def prune_subsumed_stems(paradigms):
+     pruned_paradigms = []
+     for i, (stems_i, suffixes_i) in enumerate(paradigms):
+         pruned_stems = set(stems_i)
+         for j, (stems_j, suffixes_j) in enumerate(paradigms):
+             if i == j:
+                 continue
+             if suffixes_j >= suffixes_i:
+                 pruned_stems -= (stems_j & stems_i)
+         if pruned_stems:
+             pruned_paradigms.append((pruned_stems, suffixes_i))
+     print(f"✅ Pruned to {len(pruned_paradigms)} paradigms after removing subsumed stems.")
+     return sort_paradigms(pruned_paradigms)
+
+ ### -----------------------------
+ ### 6. Sort paradigms by size
+ ### -----------------------------
+
+ def sort_paradigms(paradigms):
+     """
+     Primary: log(len(stems)) * log(len(suffixes))  (DESC)
+     Ties: (-len(stems), -len(suffixes), lexicographic stems, lexicographic suffix heads)
+     """
+     def score(p):
+         stems, suffixes = p
+         if stems and suffixes:
+             return math.log(len(stems)) * math.log(len(suffixes))
+         return 0.0
+
+     def tie_key(p):
+         stems, suffixes = p
+         sfx_heads = []
+         for s in suffixes:
+             sfx_heads.append(s[0] if isinstance(s, tuple) else s)
+         return (-len(stems), -len(suffixes),
+                 " ".join(sorted(stems)),
+                 " ".join(sorted(sfx_heads)))
+
+     return sorted(paradigms, key=lambda p: (-score(p), tie_key(p)))
+
+ def sort_paradigms_by_suffix_count(paradigms):
+     def score(p):
+         stem_count = len(p[0])
+         suffix_count = len(p[1])
+         if stem_count > 0 and suffix_count > 0:
+             return suffix_count
+         return 0
+     return sorted(paradigms, key=score, reverse=True)
+
+ def nest_suffixes_from_paradigms(paradigms):
+     print("[7/7] Nesting suffixes based on reusable paradigms...")
+
+     suffix_set_index = {frozenset(suffixes): True for _, suffixes in paradigms}
+     nested_paradigms = []
+
+     for stems, suffixes in paradigms:
+         suffixes_list = list(suffixes)
+         nested_suffixes = set()
+         used = set()
+
+         # deterministic nested pairing
+         for i, s1 in enumerate(sorted(suffixes_list)):
+             for j, s2 in enumerate(sorted(suffixes_list)):
+                 if i == j or s2 in used or not isinstance(s1, str) or not isinstance(s2, str):
+                     continue
+                 if s2.startswith(s1) and s1 != '':
+                     remainder = s2[len(s1):]
+                     if remainder and frozenset({'', remainder}) in suffix_set_index:
+                         nested_suffixes.add((s1, frozenset({'', remainder})))
+                         used.add(s2)
+                         used.add(s1)
+                         break
+
+         for s in suffixes_list:
+             if s not in used:
+                 nested_suffixes.add(s)
+
+         nested_paradigms.append((set(stems), nested_suffixes))
+
+     print(f"✅ Nested structure created for {len(nested_paradigms)} paradigms.")
+     return sort_paradigms(nested_paradigms)
+
+
+ def refine_nested_stem_conflicts(paradigms):
+     """
+     Remove stems from higher-ranked paradigms if they are fully explained by nested structures
+     in lower-ranked paradigms.
+
+     Args:
+         paradigms: list of (stem_set, suffix_set), where suffix_set may contain nested (str, frozenset) tuples
+
+     Returns:
+         Refined list of paradigms with redundant derived stems removed
+     """
+     refined_paradigms = paradigms[:]
+     all_suffix_sets = {frozenset(suffixes) for _, suffixes in paradigms}
+
+     # Build a mapping from nested suffix sets to their parent prefixes
+     derived_stems = set()
+     for stems, suffixes in paradigms:
+         for sfx in suffixes:
+             if isinstance(sfx, tuple):
+                 base, nested_suffixes = sfx
+                 if frozenset(nested_suffixes) in all_suffix_sets:
+                     for stem in stems:
+                         derived_stems.add(stem + base)
+
+     # Remove derived stems from paradigms with simple suffix sets (like ['', 's'])
+     updated_paradigms = []
+     for stems, suffixes in refined_paradigms:
+         cleaned_stems = stems - derived_stems
+         updated_paradigms.append((cleaned_stems, suffixes))
+
+     print(f"✅ Removed {len(derived_stems)} derived stems explained by nested paradigms.")
+     return updated_paradigms
+
+
+ ### -----------------------------
+ ### 7. Segment word based on ranked paradigms
+ ### -----------------------------
+ def recursive_fallback(word, suffix_set):
+     for suffix in sorted(suffix_set, key=lambda s: -len(s)):
+         if suffix and word.endswith(suffix):
+             stem_candidate = word[:-len(suffix)]
+             rest = recursive_fallback(stem_candidate, suffix_set)
+             return rest + [suffix]
+     return [word]  # fallback to whole word if nothing matches
+
+
+ ### -----------------------------
+ ### Main runner
+ ### -----------------------------
+
+ def run_paradigm_extraction(vocab, min_shared_stems=2, min_suffixes=2, enrich_suffix_sets=True):
+     start = time.time()
+     stem_to_suffixes = extract_stem_suffix_pairs(vocab)
+     paradigms = group_stems_by_suffixes(stem_to_suffixes, min_shared_stems, min_suffixes)
+     paradigms = stem_set_expansion(paradigms, stem_to_suffixes)
+     paradigms = sort_paradigms(paradigms)
+     paradigms = prune_subsumed_stems(paradigms)
+     paradigms = sort_paradigms(paradigms)
+     paradigms = nest_suffixes_from_paradigms(paradigms)
+     paradigms = refine_nested_stem_conflicts(paradigms)
+
+     paradigms = sort_paradigms(paradigms)
+     if enrich_suffix_sets:
+         print("[4/7] Expanding suffix sets based on partial compatibility...")
+         paradigms = suffix_set_expansion(paradigms)
+
+     paradigms = sort_paradigms(paradigms)
+     paradigms = prune_subsumed_stems(paradigms)
+     paradigms = sort_paradigms(paradigms)
+
+     '''# Fallback paradigm for unassigned full words
+     vocab_words = set(vocab)
+     assigned_words = set()
+     for stems, suffixes in paradigms:
+         for stem in stems:
+             for suffix in suffixes:
+                 if isinstance(suffix, tuple):
+                     base, _ = suffix
+                     assigned_words.add(stem + base)
+                 else:
+                     assigned_words.add(stem + suffix)
+
+     unassigned_words = vocab_words - assigned_words
+     if unassigned_words:
+         print(f"✅ {len(unassigned_words)} full words were not assigned to any paradigm, added fallback paradigm.")
+         paradigms.append((set(unassigned_words), frozenset({""})))
+
+     paradigms = sort_paradigms(paradigms)'''
+
+     print(f"\n✅ Extracted {len(paradigms)} paradigms.")
+     print(f"⏱️ Finished in {time.time() - start:.2f} seconds.")
+     return paradigms
+
+ def segment_word_from_nested_paradigms(word, paradigms, fallback=True, top_k=300):
+     """
+     Segment a word based on nested paradigms with optional fallback.
+
+     Parameters:
+         word (str): The word to segment.
+         paradigms (list): A list of tuples (stems, suffixes) with optional nesting.
+         fallback (bool): Whether to fall back on longest suffix match from top_k paradigms.
+         top_k (int): Number of top paradigms to consider in fallback.
+
+     Returns:
+         List[str]: Segmented pieces of the word.
+     """
+
+     def match_suffixes(suffixes, remainder):
+         """Recursive helper to match nested suffix structures."""
+         for suffix in suffixes:
+             if isinstance(suffix, tuple):
+                 base, nested = suffix
+                 if remainder.startswith(base):
+                     sub = remainder[len(base):]
+                     nested_result = match_suffixes(nested, sub)
+                     if nested_result is not None:
+                         return [base] + nested_result
+             elif remainder == suffix:
+                 return [suffix] if suffix else []
+         return None
+
+     # First pass: try full nested match
+     for stems, suffixes in paradigms:
+         for stem in stems:
+             if word.startswith(stem):
+                 remainder = word[len(stem):]
+                 matched_suffix = match_suffixes(suffixes, remainder)
+                 if matched_suffix is not None:
+                     return [stem] + matched_suffix
+
+     # Fallback strategy: longest suffix among top_k paradigms
+     if fallback:
+         seen_suffixes = set()
+
+         def collect_suffixes(suffixes):
+             for s in suffixes:
+                 if isinstance(s, tuple):
+                     seen_suffixes.add(s[0])
+                     collect_suffixes(s[1])
+                 else:
+                     seen_suffixes.add(s)
+
+         for _, suffixes in paradigms[:top_k]:
+             collect_suffixes(suffixes)
+
+         # Try matching the longest suffix first
+         for suffix in sorted(seen_suffixes, key=lambda s: -len(s)):
+             if suffix and word.endswith(suffix):
+                 stem = word[:-len(suffix)]
+                 return [stem, suffix]
+         return [word]
+
+     return [word]
+
+
+ def segment_word_from_paradigms(word, paradigms, top_k=20):
+     """
+     Simpler fallback-only version: match longest suffix among top_k paradigms.
+
+     Parameters:
+         word (str): Word to segment.
+         paradigms (list): Paradigm structures.
+         top_k (int): How many paradigms to consider.
+
+     Returns:
+         List[str]: Segmentation result.
+     """
+     candidates = paradigms[:top_k]
+     best_split = None
+     for stems, suffixes in candidates:
+         for suffix in sorted(suffixes, key=lambda s: -len(s) if isinstance(s, str) else -len(s[0])):
+             if isinstance(suffix, tuple):
+                 suffix = suffix[0]  # ignore nested for fallback
+             if word.endswith(suffix):
+                 stem_candidate = word[:-len(suffix)] if suffix else word
+                 if stem_candidate in stems:
+                     split = [stem_candidate, suffix] if suffix else [stem_candidate]
+                     if best_split is None or len(suffix) > len(best_split[-1]):
+                         best_split = split
+     return best_split or [word]
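
A toy end-to-end run of the pipeline (illustrative vocabulary only; the real runs use the BabyLM vocabulary and produce the paradigms.json shipped below; the output file name here is hypothetical):

from paradigm_utils import run_paradigm_extraction, segment_word_from_nested_paradigms, save_paradigms_json

# Tiny stand-in vocabulary with two shared inflection patterns.
vocab = ["walk", "walks", "walked", "walking",
         "talk", "talks", "talked", "talking", "cat", "cats"]
paradigms = run_paradigm_extraction(vocab, min_shared_stems=2, min_suffixes=2)
print(segment_word_from_nested_paradigms("walking", paradigms))  # expected: ['walk', 'ing']
save_paradigms_json(paradigms, "toy_paradigms.json",
                    meta={"min_shared_stems": 2, "min_suffixes": 2})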
paradigms.json ADDED
@@ -0,0 +1,5050 @@
1
+ {
2
+ "schema_version": 1,
3
+ "created_at": "2025-08-13T19:03:27Z",
4
+ "meta": {
5
+ "min_shared_stems": 2,
6
+ "min_suffixes": 2,
7
+ "enrich_suffix_sets": true,
8
+ "fallback_top_k": 20,
9
+ "normalization": {
10
+ "lowercase": true,
11
+ "separate_apostrophes": false,
12
+ "separate_digits": true,
13
+ "separate_punctuation": true
14
+ }
15
+ },
16
+ "paradigms": [
17
+ {
18
+ "stems": [
19
+ "add",
20
+ "allow",
21
+ "answer",
22
+ "appear",
23
+ "ask",
24
+ "attack",
25
+ "attempt",
26
+ "back",
27
+ "bang",
28
+ "belong",
29
+ "block",
30
+ "call",
31
+ "check",
32
+ "cheer",
33
+ "claim",
34
+ "climb",
35
+ "color",
36
+ "concern",
37
+ "cook",
38
+ "cover",
39
+ "crawl",
40
+ "depend",
41
+ "dream",
42
+ "end",
43
+ "enter",
44
+ "exist",
45
+ "explain",
46
+ "extend",
47
+ "fear",
48
+ "film",
49
+ "gasp",
50
+ "groan",
51
+ "growl",
52
+ "hand",
53
+ "happen",
54
+ "head",
55
+ "interest",
56
+ "kick",
57
+ "knock",
58
+ "land",
59
+ "laugh",
60
+ "lean",
61
+ "lift",
62
+ "listen",
63
+ "look",
64
+ "lower",
65
+ "need",
66
+ "offer",
67
+ "order",
68
+ "pick",
69
+ "point",
70
+ "pretend",
71
+ "print",
72
+ "pull",
73
+ "question",
74
+ "rank",
75
+ "regard",
76
+ "remain",
77
+ "remember",
78
+ "repeat",
79
+ "represent",
80
+ "result",
81
+ "return",
82
+ "row",
83
+ "scream",
84
+ "seem",
85
+ "shout",
86
+ "sign",
87
+ "sort",
88
+ "sound",
89
+ "spell",
90
+ "start",
91
+ "stay",
92
+ "suggest",
93
+ "talk",
94
+ "test",
95
+ "train",
96
+ "want",
97
+ "whisper",
98
+ "wonder",
99
+ "yell"
100
+ ],
101
+ "suffixes": [
102
+ "",
103
+ "ed",
104
+ "ing",
105
+ "s"
106
+ ]
107
+ },
108
+ {
109
+ "stems": [
110
+ "command",
111
+ "contain",
112
+ "count",
113
+ "flow",
114
+ "form",
115
+ "help",
116
+ "jump",
117
+ "kill",
118
+ "light",
119
+ "park",
120
+ "play",
121
+ "record",
122
+ "report",
123
+ "roll",
124
+ "show",
125
+ "turn",
126
+ "walk",
127
+ "work"
128
+ ],
129
+ "suffixes": [
130
+ "",
131
+ "ed",
132
+ "er",
133
+ "ing",
134
+ "s"
135
+ ]
136
+ },
137
+ {
138
+ "stems": [
139
+ "'a",
140
+ "accident",
141
+ "account",
142
+ "acre",
143
+ "adult",
144
+ "advantage",
145
+ "adventure",
146
+ "affair",
147
+ "afterward",
148
+ "agent",
149
+ "aid",
150
+ "airplane",
151
+ "airport",
152
+ "album",
153
+ "alligator",
154
+ "american",
155
+ "amount",
156
+ "ancestor",
157
+ "angel",
158
+ "angle",
159
+ "animal",
160
+ "ant",
161
+ "anyway",
162
+ "appearance",
163
+ "application",
164
+ "area",
165
+ "argument",
166
+ "arrangement",
167
+ "arrondissement",
168
+ "arrow",
169
+ "art",
170
+ "article",
171
+ "aspect",
172
+ "athlete",
173
+ "backward",
174
+ "bag",
175
+ "ball",
176
+ "balloon",
177
+ "banana",
178
+ "band",
179
+ "bandage",
180
+ "bank",
181
+ "barrel",
182
+ "basket",
183
+ "battle",
184
+ "bean",
185
+ "beast",
186
+ "begin",
187
+ "being",
188
+ "belief",
189
+ "benefit",
190
+ "beside",
191
+ "bird",
192
+ "biscuit",
193
+ "black",
194
+ "blade",
195
+ "blanket",
196
+ "blue",
197
+ "boat",
198
+ "bond",
199
+ "bone",
200
+ "book",
201
+ "bottle",
202
+ "brain",
203
+ "brake",
204
+ "brick",
205
+ "bubble",
206
+ "buck",
207
+ "bug",
208
+ "bullet",
209
+ "bun",
210
+ "button",
211
+ "cadet",
212
+ "cake",
213
+ "camera",
214
+ "canadian",
215
+ "candidate",
216
+ "candle",
217
+ "canton",
218
+ "cap",
219
+ "captive",
220
+ "carrot",
221
+ "cartoon",
222
+ "case",
223
+ "cave",
224
+ "cd",
225
+ "cell",
226
+ "center",
227
+ "chain",
228
+ "chamber",
229
+ "chance",
230
+ "channel",
231
+ "chap",
232
+ "character",
233
+ "characteristic",
234
+ "chemical",
235
+ "chick",
236
+ "chicken",
237
+ "chip",
238
+ "chum",
239
+ "cigarette",
240
+ "circle",
241
+ "circumstance",
242
+ "citizen",
243
+ "client",
244
+ "clip",
245
+ "clock",
246
+ "cloud",
247
+ "club",
248
+ "coat",
249
+ "coin",
250
+ "college",
251
+ "column",
252
+ "comb",
253
+ "comic",
254
+ "committee",
255
+ "commune",
256
+ "competition",
257
+ "complaint",
258
+ "compound",
259
+ "computer",
260
+ "comrade",
261
+ "concert",
262
+ "condition",
263
+ "cone",
264
+ "consequence",
265
+ "contract",
266
+ "control",
267
+ "conversation",
268
+ "corner",
269
+ "corsair",
270
+ "cost",
271
+ "course",
272
+ "court",
273
+ "cousin",
274
+ "cow",
275
+ "cowboy",
276
+ "crab",
277
+ "crayon",
278
+ "creature",
279
+ "creek",
280
+ "crew",
281
+ "crime",
282
+ "criminal",
283
+ "crisp",
284
+ "crop",
285
+ "cup",
286
+ "curtain",
287
+ "cushion",
288
+ "customer",
289
+ "cut",
290
+ "d'",
291
+ "dane",
292
+ "death",
293
+ "debt",
294
+ "decade",
295
+ "decision",
296
+ "deck",
297
+ "deed",
298
+ "degree",
299
+ "department",
300
+ "depth",
301
+ "description",
302
+ "detective",
303
+ "device",
304
+ "dialect",
305
+ "diamond",
306
+ "diaper",
307
+ "dinner",
308
+ "dinosaur",
309
+ "disease",
310
+ "district",
311
+ "division",
312
+ "document",
313
+ "dodd",
314
+ "dollar",
315
+ "dolphin",
316
+ "door",
317
+ "dot",
318
+ "doughnut",
319
+ "down",
320
+ "dozen",
321
+ "dragon",
322
+ "drug",
323
+ "drum",
324
+ "dwelling",
325
+ "eagle",
326
+ "economic",
327
+ "edge",
328
+ "edward",
329
+ "effect",
330
+ "effort",
331
+ "egg",
332
+ "el",
333
+ "element",
334
+ "elephant",
335
+ "employee",
336
+ "envelope",
337
+ "episode",
338
+ "equation",
339
+ "error",
340
+ "european",
341
+ "evan",
342
+ "evening",
343
+ "event",
344
+ "example",
345
+ "exception",
346
+ "exercise",
347
+ "expense",
348
+ "expert",
349
+ "fact",
350
+ "fan",
351
+ "fault",
352
+ "favorite",
353
+ "feather",
354
+ "fella",
355
+ "fellow",
356
+ "female",
357
+ "field",
358
+ "finger",
359
+ "fit",
360
+ "flag",
361
+ "flame",
362
+ "flat",
363
+ "flight",
364
+ "foe",
365
+ "folk",
366
+ "food",
367
+ "forest",
368
+ "fork",
369
+ "fortune",
370
+ "forward",
371
+ "france",
372
+ "frog",
373
+ "fruit",
374
+ "function",
375
+ "galley",
376
+ "game",
377
+ "garden",
378
+ "gate",
379
+ "gene",
380
+ "generation",
381
+ "gesture",
382
+ "get",
383
+ "ghost",
384
+ "giant",
385
+ "gift",
386
+ "glove",
387
+ "goal",
388
+ "governor",
389
+ "grab",
390
+ "grape",
391
+ "greek",
392
+ "gro",
393
+ "gros",
394
+ "ground",
395
+ "group",
396
+ "guest",
397
+ "gun",
398
+ "habit",
399
+ "heel",
400
+ "height",
401
+ "helicopter",
402
+ "hill",
403
+ "his",
404
+ "historian",
405
+ "hit",
406
+ "holiday",
407
+ "hotel",
408
+ "hour",
409
+ "household",
410
+ "hum",
411
+ "hundred",
412
+ "hymn",
413
+ "idea",
414
+ "image",
415
+ "incident",
416
+ "indian",
417
+ "individual",
418
+ "infection",
419
+ "instrument",
420
+ "intention",
421
+ "interview",
422
+ "island",
423
+ "item",
424
+ "jacket",
425
+ "jap",
426
+ "jaw",
427
+ "jean",
428
+ "jet",
429
+ "job",
430
+ "joint",
431
+ "key",
432
+ "kitten",
433
+ "knee",
434
+ "knight",
435
+ "knot",
436
+ "label",
437
+ "ladder",
438
+ "lake",
439
+ "lamb",
440
+ "lamp",
441
+ "language",
442
+ "lawyer",
443
+ "league",
444
+ "lecture",
445
+ "legend",
446
+ "lesson",
447
+ "letter",
448
+ "level",
449
+ "limb",
450
+ "lip",
451
+ "lord",
452
+ "lot",
453
+ "luca",
454
+ "lung",
455
+ "machine",
456
+ "magazine",
457
+ "male",
458
+ "map",
459
+ "marble",
460
+ "marine",
461
+ "material",
462
+ "math",
463
+ "matter",
464
+ "meal",
465
+ "medal",
466
+ "message",
467
+ "metal",
468
+ "meter",
469
+ "method",
470
+ "mile",
471
+ "million",
472
+ "minister",
473
+ "minute",
474
+ "missile",
475
+ "model",
476
+ "monkey",
477
+ "monster",
478
+ "morning",
479
+ "mountain",
480
+ "mouth",
481
+ "mr",
482
+ "muscle",
483
+ "mushroom",
484
+ "musician",
485
+ "muslim",
486
+ "nail",
487
+ "needle",
488
+ "neighbor",
489
+ "nerve",
490
+ "net",
491
+ "network",
492
+ "newspaper",
493
+ "novel",
494
+ "oar",
495
+ "object",
496
+ "occur",
497
+ "odd",
498
+ "olympic",
499
+ "oop",
500
+ "opinion",
501
+ "option",
502
+ "orange",
503
+ "organisation",
504
+ "organism",
505
+ "organization",
506
+ "our",
507
+ "owl",
508
+ "package",
509
+ "page",
510
+ "pancake",
511
+ "paper",
512
+ "parcel",
513
+ "parent",
514
+ "parson",
515
+ "particle",
516
+ "passenger",
517
+ "path",
518
+ "patient",
519
+ "pattern",
520
+ "paw",
521
+ "peanut",
522
+ "pen",
523
+ "pencil",
524
+ "penguin",
525
+ "pension",
526
+ "pepper",
527
+ "performance",
528
+ "period",
529
+ "peter",
530
+ "photo",
531
+ "pickle",
532
+ "picture",
533
+ "piece",
534
+ "pig",
535
+ "pill",
536
+ "pillow",
537
+ "pilot",
538
+ "pin",
539
+ "pirate",
540
+ "planet",
541
+ "plate",
542
+ "pleasure",
543
+ "plebe",
544
+ "plum",
545
+ "pocket",
546
+ "poem",
547
+ "pole",
548
+ "politician",
549
+ "pop",
550
+ "position",
551
+ "pot",
552
+ "power",
553
+ "prefecture",
554
+ "preparation",
555
+ "price",
556
+ "priest",
557
+ "principle",
558
+ "prisoner",
559
+ "prize",
560
+ "problem",
561
+ "procedure",
562
+ "product",
563
+ "production",
564
+ "profit",
565
+ "program",
566
+ "project",
567
+ "proposal",
568
+ "prospect",
569
+ "province",
570
+ "provision",
571
+ "pupil",
572
+ "purpose",
573
+ "put",
574
+ "quarter",
575
+ "rabbit",
576
+ "rat",
577
+ "rate",
578
+ "ray",
579
+ "reason",
580
+ "refer",
581
+ "reference",
582
+ "representative",
583
+ "reptile",
584
+ "resource",
585
+ "restaurant",
586
+ "review",
587
+ "rhyme",
588
+ "ribbon",
589
+ "richard",
590
+ "risk",
591
+ "river",
592
+ "road",
593
+ "rocket",
594
+ "roger",
595
+ "role",
596
+ "roman",
597
+ "room",
598
+ "root",
599
+ "rope",
600
+ "round",
601
+ "route",
602
+ "run",
603
+ "russian",
604
+ "saint",
605
+ "sale",
606
+ "sample",
607
+ "sausage",
608
+ "savage",
609
+ "saving",
610
+ "scale",
611
+ "scene",
612
+ "scheme",
613
+ "school",
614
+ "science",
615
+ "scientist",
616
+ "scout",
617
+ "season",
618
+ "sense",
619
+ "servant",
620
+ "service",
621
+ "session",
622
+ "settlement",
623
+ "shadow",
624
+ "shark",
625
+ "sheet",
626
+ "shell",
627
+ "shirt",
628
+ "shoe",
629
+ "shop",
630
+ "shore",
631
+ "shot",
632
+ "shoulder",
633
+ "shriek",
634
+ "shrine",
635
+ "side",
636
+ "signal",
637
+ "sin",
638
+ "single",
639
+ "sit",
640
+ "site",
641
+ "skill",
642
+ "skirt",
643
+ "sleeve",
644
+ "smack",
645
+ "snake",
646
+ "sneeze",
647
+ "sniff",
648
+ "sock",
649
+ "soldier",
650
+ "sometime",
651
+ "song",
652
+ "soul",
653
+ "source",
654
+ "space",
655
+ "spear",
656
+ "spider",
657
+ "spirit",
658
+ "spoon",
659
+ "spot",
660
+ "square",
661
+ "squeak",
662
+ "squeeze",
663
+ "squirrel",
664
+ "stable",
665
+ "stack",
666
+ "stage",
667
+ "stamp",
668
+ "standard",
669
+ "statue",
670
+ "stone",
671
+ "storm",
672
+ "stranger",
673
+ "strap",
674
+ "street",
675
+ "string",
676
+ "strip",
677
+ "stroke",
678
+ "structure",
679
+ "student",
680
+ "studio",
681
+ "style",
682
+ "subject",
683
+ "suburb",
684
+ "sunday",
685
+ "surrounding",
686
+ "suspicion",
687
+ "sweet",
688
+ "symbol",
689
+ "system",
690
+ "table",
691
+ "tail",
692
+ "tale",
693
+ "tap",
694
+ "tape",
695
+ "target",
696
+ "task",
697
+ "team",
698
+ "technique",
699
+ "temperature",
700
+ "temple",
701
+ "tenant",
702
+ "their",
703
+ "thing",
704
+ "thought",
705
+ "thousand",
706
+ "threat",
707
+ "ticket",
708
+ "tiger",
709
+ "tip",
710
+ "toad",
711
+ "tool",
712
+ "topic",
713
+ "toward",
714
+ "towel",
715
+ "tower",
716
+ "toy",
717
+ "treasure",
718
+ "tree",
719
+ "trial",
720
+ "triangle",
721
+ "trip",
722
+ "truck",
723
+ "trunk",
724
+ "turtle",
725
+ "twin",
726
+ "type",
727
+ "union",
728
+ "up",
729
+ "upward",
730
+ "value",
731
+ "vegetable",
732
+ "vehicle",
733
+ "verse",
734
+ "vessel",
735
+ "victim",
736
+ "video",
737
+ "village",
738
+ "vocalize",
739
+ "vol",
740
+ "volume",
741
+ "wall",
742
+ "warrior",
743
+ "way",
744
+ "weapon",
745
+ "weed",
746
+ "well",
747
+ "whale",
748
+ "wheel",
749
+ "white",
750
+ "whoop",
751
+ "william",
752
+ "window",
753
+ "wing",
754
+ "winner",
755
+ "winter",
756
+ "worm",
757
+ "wrestler",
758
+ "writing",
759
+ "your"
760
+ ],
761
+ "suffixes": [
762
+ "",
763
+ "s"
764
+ ]
765
+ },
766
+ {
767
+ "stems": [
768
+ "arriv",
769
+ "believ",
770
+ "breath",
771
+ "caus",
772
+ "chang",
773
+ "charg",
774
+ "chuckl",
775
+ "clos",
776
+ "continu",
777
+ "creat",
778
+ "danc",
779
+ "dat",
780
+ "decid",
781
+ "describ",
782
+ "examin",
783
+ "fac",
784
+ "fir",
785
+ "forc",
786
+ "glanc",
787
+ "hop",
788
+ "includ",
789
+ "increas",
790
+ "indicat",
791
+ "involv",
792
+ "judg",
793
+ "lik",
794
+ "liv",
795
+ "lov",
796
+ "mov",
797
+ "notic",
798
+ "operat",
799
+ "ow",
800
+ "plac",
801
+ "produc",
802
+ "promis",
803
+ "provid",
804
+ "r",
805
+ "rais",
806
+ "remov",
807
+ "rul",
808
+ "serv",
809
+ "shar",
810
+ "smil",
811
+ "star",
812
+ "struggl",
813
+ "surpris",
814
+ "us",
815
+ "voic",
816
+ "vot",
817
+ "wav",
818
+ "wip"
819
+ ],
820
+ "suffixes": [
821
+ "es",
822
+ "ing",
823
+ [
824
+ "e",
825
+ [
826
+ "",
827
+ "d"
828
+ ]
829
+ ]
830
+ ]
831
+ },
832
+ {
833
+ "stems": [
834
+ "affect",
835
+ "arm",
836
+ "award",
837
+ "border",
838
+ "bound",
839
+ "bow",
840
+ "colour",
841
+ "comment",
842
+ "content",
843
+ "demand",
844
+ "design",
845
+ "detail",
846
+ "di",
847
+ "doubt",
848
+ "flood",
849
+ "fold",
850
+ "guard",
851
+ "hat",
852
+ "heart",
853
+ "honor",
854
+ "honour",
855
+ "hook",
856
+ "host",
857
+ "interrupt",
858
+ "limit",
859
+ "list",
860
+ "lock",
861
+ "mark",
862
+ "mention",
863
+ "mind",
864
+ "murder",
865
+ "own",
866
+ "plant",
867
+ "post",
868
+ "protest",
869
+ "remark",
870
+ "remind",
871
+ "request",
872
+ "respect",
873
+ "respond",
874
+ "ruin",
875
+ "screw",
876
+ "seal",
877
+ "seat",
878
+ "suit",
879
+ "thank",
880
+ "unit",
881
+ "view",
882
+ "volunteer",
883
+ "wound"
884
+ ],
885
+ "suffixes": [
886
+ "",
887
+ "ed",
888
+ "s"
889
+ ]
890
+ },
891
+ {
892
+ "stems": [
893
+ "approach",
894
+ "attend",
895
+ "avoid",
896
+ "boil",
897
+ "bother",
898
+ "complain",
899
+ "consider",
900
+ "crash",
901
+ "cross",
902
+ "dash",
903
+ "destroy",
904
+ "drown",
905
+ "earn",
906
+ "echo",
907
+ "expect",
908
+ "fill",
909
+ "fix",
910
+ "flash",
911
+ "float",
912
+ "fuck",
913
+ "gain",
914
+ "gather",
915
+ "guess",
916
+ "hang",
917
+ "heat",
918
+ "leap",
919
+ "lick",
920
+ "march",
921
+ "mess",
922
+ "mix",
923
+ "pack",
924
+ "perform",
925
+ "pour",
926
+ "rest",
927
+ "rush",
928
+ "search",
929
+ "shift",
930
+ "smash",
931
+ "starr",
932
+ "strain",
933
+ "stretch",
934
+ "suffer",
935
+ "trust",
936
+ "wander"
937
+ ],
938
+ "suffixes": [
939
+ "",
940
+ "ed",
941
+ "ing"
942
+ ]
943
+ },
944
+ {
945
+ "stems": [
946
+ "adam",
947
+ "boy",
948
+ "bro",
949
+ "brother",
950
+ "cat",
951
+ "chi",
952
+ "color",
953
+ "council",
954
+ "daughter",
955
+ "day",
956
+ "doctor",
957
+ "dog",
958
+ "doll",
959
+ "father",
960
+ "friend",
961
+ "girl",
962
+ "god",
963
+ "government",
964
+ "guy",
965
+ "it",
966
+ "king",
967
+ "let",
968
+ "mark",
969
+ "master",
970
+ "moment",
971
+ "mother",
972
+ "name",
973
+ "night",
974
+ "number",
975
+ "one",
976
+ "other",
977
+ "people",
978
+ "person",
979
+ "queen",
980
+ "ship",
981
+ "sister",
982
+ "son",
983
+ "steven",
984
+ "water",
985
+ "week",
986
+ "world",
987
+ "year"
988
+ ],
989
+ "suffixes": [
990
+ "",
991
+ "'s",
992
+ "s"
993
+ ]
994
+ },
995
+ {
996
+ "stems": [
997
+ "address",
998
+ "breath",
999
+ "brush",
1000
+ "dat",
1001
+ "dress",
1002
+ "finish",
1003
+ "hop",
1004
+ "kiss",
1005
+ "miss",
1006
+ "ow",
1007
+ "pass",
1008
+ "push",
1009
+ "reach",
1010
+ "star",
1011
+ "touch",
1012
+ "us",
1013
+ "watch",
1014
+ "wish"
1015
+ ],
1016
+ "suffixes": [
1017
+ "",
1018
+ "ed",
1019
+ "es",
1020
+ "ing"
1021
+ ]
1022
+ },
1023
+ {
1024
+ "stems": [
1025
+ "age",
1026
+ "base",
1027
+ "bu",
1028
+ "deserve",
1029
+ "desire",
1030
+ "di",
1031
+ "estimate",
1032
+ "experience",
1033
+ "eye",
1034
+ "feature",
1035
+ "fee",
1036
+ "figure",
1037
+ "file",
1038
+ "han",
1039
+ "hate",
1040
+ "her",
1041
+ "hi",
1042
+ "influence",
1043
+ "issue",
1044
+ "la",
1045
+ "lie",
1046
+ "measure",
1047
+ "name",
1048
+ "phone",
1049
+ "pile",
1050
+ "prove",
1051
+ "puzzle",
1052
+ "recognize",
1053
+ "release",
1054
+ "score",
1055
+ "sentence",
1056
+ "shape",
1057
+ "size",
1058
+ "tie",
1059
+ "tire",
1060
+ "title",
1061
+ "trouble",
1062
+ "win"
1063
+ ],
1064
+ "suffixes": [
1065
+ "",
1066
+ "d",
1067
+ "s"
1068
+ ]
1069
+ },
1070
+ {
1071
+ "stems": [
1072
+ "beep",
1073
+ "bend",
1074
+ "bit",
1075
+ "board",
1076
+ "bowl",
1077
+ "break",
1078
+ "click",
1079
+ "cough",
1080
+ "drink",
1081
+ "engineer",
1082
+ "feed",
1083
+ "fool",
1084
+ "fund",
1085
+ "grunt",
1086
+ "hurt",
1087
+ "mean",
1088
+ "neighbour",
1089
+ "pay",
1090
+ "pound",
1091
+ "ring",
1092
+ "send",
1093
+ "spend",
1094
+ "sport",
1095
+ "spring",
1096
+ "squeal",
1097
+ "stand",
1098
+ "stream",
1099
+ "suck",
1100
+ "swing",
1101
+ "tear",
1102
+ "tour",
1103
+ "track",
1104
+ "understand"
1105
+ ],
1106
+ "suffixes": [
1107
+ "",
1108
+ "ing",
1109
+ "s"
1110
+ ]
1111
+ },
1112
+ {
1113
+ "stems": [
1114
+ "arm",
1115
+ "bell",
1116
+ "bill",
1117
+ "bu",
1118
+ "cheek",
1119
+ "cop",
1120
+ "director",
1121
+ "doll",
1122
+ "duck",
1123
+ "factor",
1124
+ "ga",
1125
+ "german",
1126
+ "good",
1127
+ "ha",
1128
+ "heart",
1129
+ "jo",
1130
+ "la",
1131
+ "lad",
1132
+ "photograph",
1133
+ "ra",
1134
+ "sand",
1135
+ "smell",
1136
+ "ton",
1137
+ "trick",
1138
+ "wa",
1139
+ "wood"
1140
+ ],
1141
+ "suffixes": [
1142
+ "",
1143
+ "s",
1144
+ "y"
1145
+ ]
1146
+ },
1147
+ {
1148
+ "stems": [
1149
+ "bomb",
1150
+ "build",
1151
+ "draw",
1152
+ "farm",
1153
+ "fight",
1154
+ "flow",
1155
+ "lay",
1156
+ "play",
1157
+ "read",
1158
+ "sing",
1159
+ "speak",
1160
+ "stick",
1161
+ "work"
1162
+ ],
1163
+ "suffixes": [
1164
+ "",
1165
+ "ing",
1166
+ "s",
1167
+ [
1168
+ "er",
1169
+ [
1170
+ "",
1171
+ "s"
1172
+ ]
1173
+ ]
1174
+ ]
1175
+ },
1176
+ {
1177
+ "stems": [
1178
+ "cheer",
1179
+ "count",
1180
+ "hand",
1181
+ "part",
1182
+ "rain",
1183
+ "read",
1184
+ "rock",
1185
+ "sleep",
1186
+ "stick",
1187
+ "tell",
1188
+ "wear",
1189
+ "wind"
1190
+ ],
1191
+ "suffixes": [
1192
+ "",
1193
+ "ing",
1194
+ "s",
1195
+ "y"
1196
+ ]
1197
+ },
1198
+ {
1199
+ "stems": [
1200
+ "'i",
1201
+ "bridge",
1202
+ "bu",
1203
+ "ca",
1204
+ "co",
1205
+ "deal",
1206
+ "do",
1207
+ "fee",
1208
+ "ha",
1209
+ "hi",
1210
+ "les",
1211
+ "lo",
1212
+ "los",
1213
+ "mean",
1214
+ "nut",
1215
+ "pain",
1216
+ "plan",
1217
+ "plane",
1218
+ "po",
1219
+ "ra",
1220
+ "sea",
1221
+ "si",
1222
+ "ye"
1223
+ ],
1224
+ "suffixes": [
1225
+ "",
1226
+ "s",
1227
+ "t"
1228
+ ]
1229
+ },
1230
+ {
1231
+ "stems": [
1232
+ "absolute",
1233
+ "according",
1234
+ "actual",
1235
+ "anxious",
1236
+ "apparent",
1237
+ "awful",
1238
+ "bad",
1239
+ "bare",
1240
+ "beautiful",
1241
+ "bitter",
1242
+ "bold",
1243
+ "brief",
1244
+ "calm",
1245
+ "careful",
1246
+ "certain",
1247
+ "cheerful",
1248
+ "comparative",
1249
+ "constant",
1250
+ "curious",
1251
+ "current",
1252
+ "dead",
1253
+ "decided",
1254
+ "definite",
1255
+ "desperate",
1256
+ "dreadful",
1257
+ "eager",
1258
+ "earnest",
1259
+ "effective",
1260
+ "entire",
1261
+ "essential",
1262
+ "exact",
1263
+ "exceeding",
1264
+ "excited",
1265
+ "extreme",
1266
+ "fair",
1267
+ "fierce",
1268
+ "firm",
1269
+ "former",
1270
+ "fortunate",
1271
+ "frank",
1272
+ "frequent",
1273
+ "general",
1274
+ "glad",
1275
+ "grim",
1276
+ "ho",
1277
+ "honest",
1278
+ "hurried",
1279
+ "immediate",
1280
+ "impatient",
1281
+ "initial",
1282
+ "joyful",
1283
+ "main",
1284
+ "mere",
1285
+ "most",
1286
+ "natural",
1287
+ "neat",
1288
+ "normal",
1289
+ "obvious",
1290
+ "partial",
1291
+ "particular",
1292
+ "physical",
1293
+ "polite",
1294
+ "poor",
1295
+ "positive",
1296
+ "practical",
1297
+ "previous",
1298
+ "private",
1299
+ "proper",
1300
+ "proud",
1301
+ "quiet",
1302
+ "rapid",
1303
+ "rare",
1304
+ "recent",
1305
+ "regular",
1306
+ "repeated",
1307
+ "rough",
1308
+ "sad",
1309
+ "scarce",
1310
+ "serious",
1311
+ "severe",
1312
+ "sharp",
1313
+ "similar",
1314
+ "simultaneous",
1315
+ "sole",
1316
+ "solemn",
1317
+ "sore",
1318
+ "special",
1319
+ "stern",
1320
+ "strict",
1321
+ "successful",
1322
+ "sudden",
1323
+ "sufficient",
1324
+ "sure",
1325
+ "swift",
1326
+ "technical",
1327
+ "tel",
1328
+ "thorough",
1329
+ "thoughtful",
1330
+ "tick",
1331
+ "tight",
1332
+ "total",
1333
+ "typical",
1334
+ "ultimate",
1335
+ "unfortunate",
1336
+ "unlike",
1337
+ "usual",
1338
+ "wild",
1339
+ "willing",
1340
+ "wonderful"
1341
+ ],
1342
+ "suffixes": [
1343
+ "",
1344
+ "ly"
1345
+ ]
1346
+ },
1347
+ {
1348
+ "stems": [
1349
+ "abandon",
1350
+ "accept",
1351
+ "accomplish",
1352
+ "adopt",
1353
+ "afford",
1354
+ "aim",
1355
+ "air",
1356
+ "alarm",
1357
+ "appeal",
1358
+ "arrest",
1359
+ "assist",
1360
+ "betray",
1361
+ "borrow",
1362
+ "bump",
1363
+ "bust",
1364
+ "conduct",
1365
+ "convert",
1366
+ "crowd",
1367
+ "crown",
1368
+ "crush",
1369
+ "dart",
1370
+ "debut",
1371
+ "defeat",
1372
+ "delay",
1373
+ "delight",
1374
+ "deposit",
1375
+ "desert",
1376
+ "detect",
1377
+ "disappear",
1378
+ "display",
1379
+ "distinguish",
1380
+ "distress",
1381
+ "disturb",
1382
+ "doom",
1383
+ "draft",
1384
+ "dread",
1385
+ "drift",
1386
+ "dump",
1387
+ "expand",
1388
+ "ey",
1389
+ "fad",
1390
+ "fashion",
1391
+ "flush",
1392
+ "focus",
1393
+ "fri",
1394
+ "frighten",
1395
+ "grant",
1396
+ "grasp",
1397
+ "hail",
1398
+ "halt",
1399
+ "haul",
1400
+ "insist",
1401
+ "intend",
1402
+ "jar",
1403
+ "last",
1404
+ "launch",
1405
+ "leak",
1406
+ "lin",
1407
+ "link",
1408
+ "maintain",
1409
+ "melt",
1410
+ "mount",
1411
+ "obey",
1412
+ "obtain",
1413
+ "peep",
1414
+ "piss",
1415
+ "pitch",
1416
+ "poison",
1417
+ "polish",
1418
+ "premier",
1419
+ "prevent",
1420
+ "recall",
1421
+ "recommend",
1422
+ "reform",
1423
+ "register",
1424
+ "relax",
1425
+ "render",
1426
+ "repair",
1427
+ "retain",
1428
+ "reveal",
1429
+ "scratch",
1430
+ "se",
1431
+ "shock",
1432
+ "sight",
1433
+ "snatch",
1434
+ "spill",
1435
+ "spoil",
1436
+ "stuff",
1437
+ "succeed",
1438
+ "suspect",
1439
+ "swallow",
1440
+ "switch",
1441
+ "talent",
1442
+ "thrill",
1443
+ "ti",
1444
+ "toss",
1445
+ "tripp",
1446
+ "tuck",
1447
+ "twist",
1448
+ "we",
1449
+ "weigh",
1450
+ "witness",
1451
+ "wreck",
1452
+ "yield"
1453
+ ],
1454
+ "suffixes": [
1455
+ "",
1456
+ "ed"
1457
+ ]
1458
+ },
1459
+ {
1460
+ "stems": [
1461
+ "acknowledge",
1462
+ "ad",
1463
+ "advise",
1464
+ "an",
1465
+ "approve",
1466
+ "ar",
1467
+ "assume",
1468
+ "assure",
1469
+ "ban",
1470
+ "be",
1471
+ "behave",
1472
+ "ben",
1473
+ "bi",
1474
+ "blame",
1475
+ "capture",
1476
+ "cease",
1477
+ "cla",
1478
+ "col",
1479
+ "compare",
1480
+ "compete",
1481
+ "convince",
1482
+ "cor",
1483
+ "crow",
1484
+ "cure",
1485
+ "da",
1486
+ "damage",
1487
+ "decline",
1488
+ "define",
1489
+ "discharge",
1490
+ "divide",
1491
+ "divorce",
1492
+ "en",
1493
+ "enable",
1494
+ "escape",
1495
+ "exchange",
1496
+ "explode",
1497
+ "for",
1498
+ "fun",
1499
+ "go",
1500
+ "gran",
1501
+ "guide",
1502
+ "har",
1503
+ "hee",
1504
+ "hin",
1505
+ "hire",
1506
+ "hoo",
1507
+ "ignore",
1508
+ "induce",
1509
+ "inquire",
1510
+ "introduce",
1511
+ "ki",
1512
+ "kin",
1513
+ "lai",
1514
+ "lea",
1515
+ "lou",
1516
+ "men",
1517
+ "mi",
1518
+ "min",
1519
+ "moo",
1520
+ "mu",
1521
+ "nature",
1522
+ "nee",
1523
+ "nickname",
1524
+ "nor",
1525
+ "participate",
1526
+ "persuade",
1527
+ "please",
1528
+ "praise",
1529
+ "preserve",
1530
+ "propose",
1531
+ "purchase",
1532
+ "pursue",
1533
+ "realise",
1534
+ "recognise",
1535
+ "refuse",
1536
+ "rejoice",
1537
+ "relieve",
1538
+ "rescue",
1539
+ "reserve",
1540
+ "resolve",
1541
+ "restore",
1542
+ "ri",
1543
+ "san",
1544
+ "scare",
1545
+ "schedule",
1546
+ "secure",
1547
+ "she",
1548
+ "solve",
1549
+ "spare",
1550
+ "stan",
1551
+ "suite",
1552
+ "suppose",
1553
+ "ten",
1554
+ "tumble",
1555
+ "unite",
1556
+ "urge",
1557
+ "venture",
1558
+ "wan",
1559
+ "wee",
1560
+ "welcome",
1561
+ "wi",
1562
+ "woo"
1563
+ ],
1564
+ "suffixes": [
1565
+ "",
1566
+ "d"
1567
+ ]
1568
+ },
1569
+ {
1570
+ "stems": [
1571
+ "brave",
1572
+ "chief",
1573
+ "common",
1574
+ "ear",
1575
+ "elder",
1576
+ "equal",
1577
+ "final",
1578
+ "friend",
1579
+ "heaven",
1580
+ "kind",
1581
+ "month",
1582
+ "new",
1583
+ "official",
1584
+ "plain",
1585
+ "right",
1586
+ "short",
1587
+ "week",
1588
+ "year"
1589
+ ],
1590
+ "suffixes": [
1591
+ "",
1592
+ "ly",
1593
+ "s"
1594
+ ]
1595
+ },
1596
+ {
1597
+ "stems": [
1598
+ "'it",
1599
+ "'t",
1600
+ "alaeddin",
1601
+ "albert",
1602
+ "alexander",
1603
+ "america",
1604
+ "anne",
1605
+ "bart",
1606
+ "ben",
1607
+ "bessie",
1608
+ "bob",
1609
+ "catherine",
1610
+ "child",
1611
+ "children",
1612
+ "clown",
1613
+ "colin",
1614
+ "colonel",
1615
+ "company",
1616
+ "country",
1617
+ "cromer",
1618
+ "dad",
1619
+ "dada",
1620
+ "daddy",
1621
+ "dat",
1622
+ "david",
1623
+ "dolly",
1624
+ "donald",
1625
+ "earth",
1626
+ "edna",
1627
+ "eleanor",
1628
+ "ellie",
1629
+ "else",
1630
+ "eve",
1631
+ "everybody",
1632
+ "everyone",
1633
+ "everything",
1634
+ "family",
1635
+ "florence",
1636
+ "fraser",
1637
+ "grandma",
1638
+ "he",
1639
+ "here",
1640
+ "how",
1641
+ "husband",
1642
+ "irene",
1643
+ "jane",
1644
+ "john",
1645
+ "jumbo",
1646
+ "jwww",
1647
+ "kitty",
1648
+ "lady",
1649
+ "lara",
1650
+ "life",
1651
+ "lt",
1652
+ "maggie",
1653
+ "mama",
1654
+ "man",
1655
+ "men",
1656
+ "michael",
1657
+ "molly",
1658
+ "mom",
1659
+ "mot",
1660
+ "mum",
1661
+ "mummy",
1662
+ "nobody",
1663
+ "papa",
1664
+ "patty",
1665
+ "paul",
1666
+ "peggy",
1667
+ "rosamund",
1668
+ "sarah",
1669
+ "she",
1670
+ "somebody",
1671
+ "someone",
1672
+ "steve",
1673
+ "ted",
1674
+ "teddy",
1675
+ "that",
1676
+ "there",
1677
+ "today",
1678
+ "tom",
1679
+ "tonight",
1680
+ "uncle",
1681
+ "ursula",
1682
+ "what",
1683
+ "when",
1684
+ "where",
1685
+ "who",
1686
+ "wife",
1687
+ "woman",
1688
+ "women"
1689
+ ],
1690
+ "suffixes": [
1691
+ "",
1692
+ "'s"
1693
+ ]
1694
+ },
1695
+ {
1696
+ "stems": [
1697
+ "crack",
1698
+ "defend",
1699
+ "flow",
1700
+ "hunt",
1701
+ "play",
1702
+ "publish",
1703
+ "us",
1704
+ "wash",
1705
+ "work"
1706
+ ],
1707
+ "suffixes": [
1708
+ "",
1709
+ "ed",
1710
+ "ing",
1711
+ [
1712
+ "er",
1713
+ [
1714
+ "",
1715
+ "s"
1716
+ ]
1717
+ ]
1718
+ ]
1719
+ },
1720
+ {
1721
+ "stems": [
1722
+ "africa",
1723
+ "america",
1724
+ "an",
1725
+ "arabia",
1726
+ "asia",
1727
+ "australia",
1728
+ "austria",
1729
+ "ba",
1730
+ "be",
1731
+ "bi",
1732
+ "broke",
1733
+ "brow",
1734
+ "cha",
1735
+ "chose",
1736
+ "cla",
1737
+ "cor",
1738
+ "crow",
1739
+ "da",
1740
+ "dam",
1741
+ "did",
1742
+ "easter",
1743
+ "ed",
1744
+ "eva",
1745
+ "eve",
1746
+ "fa",
1747
+ "fu",
1748
+ "georgia",
1749
+ "gi",
1750
+ "glen",
1751
+ "gra",
1752
+ "have",
1753
+ "he",
1754
+ "ho",
1755
+ "ia",
1756
+ "in",
1757
+ "india",
1758
+ "is",
1759
+ "ji",
1760
+ "julie",
1761
+ "ki",
1762
+ "korea",
1763
+ "lea",
1764
+ "leo",
1765
+ "li",
1766
+ "ma",
1767
+ "mario",
1768
+ "me",
1769
+ "mi",
1770
+ "mo",
1771
+ "moo",
1772
+ "na",
1773
+ "no",
1774
+ "noo",
1775
+ "ow",
1776
+ "pa",
1777
+ "pi",
1778
+ "rise",
1779
+ "russia",
1780
+ "sa",
1781
+ "ski",
1782
+ "so",
1783
+ "soo",
1784
+ "spoke",
1785
+ "steve",
1786
+ "stole",
1787
+ "su",
1788
+ "ta",
1789
+ "te",
1790
+ "tee",
1791
+ "tha",
1792
+ "the",
1793
+ "ti",
1794
+ "to",
1795
+ "tow",
1796
+ "wi",
1797
+ "wo"
1798
+ ],
1799
+ "suffixes": [
1800
+ "",
1801
+ "n"
1802
+ ]
1803
+ },
1804
+ {
1805
+ "stems": [
1806
+ "am",
1807
+ "an",
1808
+ "and",
1809
+ "aunt",
1810
+ "ba",
1811
+ "bab",
1812
+ "bart",
1813
+ "blood",
1814
+ "bo",
1815
+ "brand",
1816
+ "bull",
1817
+ "bus",
1818
+ "carr",
1819
+ "chill",
1820
+ "cla",
1821
+ "da",
1822
+ "dave",
1823
+ "den",
1824
+ "difficult",
1825
+ "dirt",
1826
+ "dr",
1827
+ "dust",
1828
+ "earl",
1829
+ "ever",
1830
+ "fair",
1831
+ "fort",
1832
+ "frank",
1833
+ "frost",
1834
+ "full",
1835
+ "fur",
1836
+ "fuss",
1837
+ "gloom",
1838
+ "gra",
1839
+ "gravel",
1840
+ "guilt",
1841
+ "hard",
1842
+ "hast",
1843
+ "health",
1844
+ "iv",
1845
+ "jack",
1846
+ "jealous",
1847
+ "joe",
1848
+ "jud",
1849
+ "ka",
1850
+ "loft",
1851
+ "luc",
1852
+ "luck",
1853
+ "ma",
1854
+ "man",
1855
+ "might",
1856
+ "monarch",
1857
+ "na",
1858
+ "pa",
1859
+ "pit",
1860
+ "pre",
1861
+ "rick",
1862
+ "rub",
1863
+ "sa",
1864
+ "saxon",
1865
+ "scott",
1866
+ "sex",
1867
+ "snow",
1868
+ "stink",
1869
+ "th",
1870
+ "the",
1871
+ "thirst",
1872
+ "tin",
1873
+ "to",
1874
+ "tr",
1875
+ "var",
1876
+ "victor",
1877
+ "wealth",
1878
+ "wh",
1879
+ "worth",
1880
+ "ya",
1881
+ "yuck"
1882
+ ],
1883
+ "suffixes": [
1884
+ "",
1885
+ "y"
1886
+ ]
1887
+ },
1888
+ {
1889
+ "stems": [
1890
+ "arm",
1891
+ "cheer",
1892
+ "count",
1893
+ "deliver",
1894
+ "discover",
1895
+ "hair",
1896
+ "hand",
1897
+ "he",
1898
+ "heart",
1899
+ "mess",
1900
+ "part",
1901
+ "recover",
1902
+ "scar",
1903
+ "sh",
1904
+ "treat"
1905
+ ],
1906
+ "suffixes": [
1907
+ "",
1908
+ "ed",
1909
+ "y"
1910
+ ]
1911
+ },
1912
+ {
1913
+ "stems": [
1914
+ "as",
1915
+ "boot",
1916
+ "bu",
1917
+ "do",
1918
+ "ha",
1919
+ "hat",
1920
+ "heart",
1921
+ "hug",
1922
+ "ra",
1923
+ "set",
1924
+ "sing",
1925
+ "ss",
1926
+ "tent",
1927
+ "wa",
1928
+ "ye"
1929
+ ],
1930
+ "suffixes": [
1931
+ "",
1932
+ "h",
1933
+ "s"
1934
+ ]
1935
+ },
1936
+ {
1937
+ "stems": [
1938
+ "cent",
1939
+ "chart",
1940
+ "custom",
1941
+ "design",
1942
+ "engine",
1943
+ "horn",
1944
+ "mark",
1945
+ "mill",
1946
+ "murder",
1947
+ "own",
1948
+ "pet",
1949
+ "photograph",
1950
+ "port",
1951
+ "short",
1952
+ "tank"
1953
+ ],
1954
+ "suffixes": [
1955
+ "",
1956
+ "er",
1957
+ "s"
1958
+ ]
1959
+ },
1960
+ {
1961
+ "stems": [
1962
+ "'",
1963
+ "a",
1964
+ "bu",
1965
+ "e",
1966
+ "ha",
1967
+ "he'",
1968
+ "he’",
1969
+ "hi",
1970
+ "i",
1971
+ "it'",
1972
+ "she'",
1973
+ "that'",
1974
+ "there'",
1975
+ "what'"
1976
+ ],
1977
+ "suffixes": [
1978
+ "d",
1979
+ "ll",
1980
+ "s"
1981
+ ]
1982
+ },
1983
+ {
1984
+ "stems": [
1985
+ "bu",
1986
+ "ca",
1987
+ "chi",
1988
+ "co",
1989
+ "di",
1990
+ "ear",
1991
+ "ha",
1992
+ "hi",
1993
+ "law",
1994
+ "mistake",
1995
+ "ra",
1996
+ "sea",
1997
+ "si",
1998
+ "wa"
1999
+ ],
2000
+ "suffixes": [
2001
+ "",
2002
+ "n",
2003
+ "s"
2004
+ ]
2005
+ },
2006
+ {
2007
+ "stems": [
2008
+ "flow",
2009
+ "follow",
2010
+ "play",
2011
+ "support",
2012
+ "travel",
2013
+ "work"
2014
+ ],
2015
+ "suffixes": [
2016
+ "",
2017
+ "ed",
2018
+ "ers",
2019
+ "ing",
2020
+ "s"
2021
+ ]
2022
+ },
2023
+ {
2024
+ "stems": [
2025
+ "blow",
2026
+ "do",
2027
+ "draw",
2028
+ "flow",
2029
+ "grow",
2030
+ "know",
2031
+ "show",
2032
+ "throw"
2033
+ ],
2034
+ "suffixes": [
2035
+ "",
2036
+ "ing",
2037
+ "n",
2038
+ "s"
2039
+ ]
2040
+ },
2041
+ {
2042
+ "stems": [
2043
+ "do",
2044
+ "eat",
2045
+ "look",
2046
+ "play",
2047
+ "say",
2048
+ "talk",
2049
+ "tell",
2050
+ "think"
2051
+ ],
2052
+ "suffixes": [
2053
+ "",
2054
+ "in'",
2055
+ "ing",
2056
+ "s"
2057
+ ]
2058
+ },
2059
+ {
2060
+ "stems": [
2061
+ "abilit",
2062
+ "activit",
2063
+ "agenc",
2064
+ "appl",
2065
+ "arm",
2066
+ "authorit",
2067
+ "bab",
2068
+ "batter",
2069
+ "bod",
2070
+ "boundar",
2071
+ "butterfl",
2072
+ "carr",
2073
+ "categor",
2074
+ "centur",
2075
+ "ceremon",
2076
+ "cherr",
2077
+ "cit",
2078
+ "communit",
2079
+ "compan",
2080
+ "cop",
2081
+ "count",
2082
+ "countr",
2083
+ "cr",
2084
+ "deput",
2085
+ "difficult",
2086
+ "dut",
2087
+ "enem",
2088
+ "facilit",
2089
+ "factor",
2090
+ "fair",
2091
+ "famil",
2092
+ "fl",
2093
+ "foll",
2094
+ "fr",
2095
+ "grocer",
2096
+ "hand",
2097
+ "injur",
2098
+ "inquir",
2099
+ "lad",
2100
+ "missionar",
2101
+ "municipalit",
2102
+ "opportunit",
2103
+ "penn",
2104
+ "polic",
2105
+ "pon",
2106
+ "propert",
2107
+ "pupp",
2108
+ "qualit",
2109
+ "raspberr",
2110
+ "responsibilit",
2111
+ "sk",
2112
+ "stor",
2113
+ "strawberr",
2114
+ "stud",
2115
+ "suppl",
2116
+ "territor",
2117
+ "theor",
2118
+ "universit",
2119
+ "worr"
2120
+ ],
2121
+ "suffixes": [
2122
+ "ies",
2123
+ "y"
2124
+ ]
2125
+ },
2126
+ {
2127
+ "stems": [
2128
+ "bit",
2129
+ "catch",
2130
+ "cloth",
2131
+ "coach",
2132
+ "com",
2133
+ "do",
2134
+ "fuss",
2135
+ "go",
2136
+ "hid",
2137
+ "min",
2138
+ "process",
2139
+ "slid",
2140
+ "tim"
2141
+ ],
2142
+ "suffixes": [
2143
+ "",
2144
+ "es",
2145
+ "ing"
2146
+ ]
2147
+ },
2148
+ {
2149
+ "stems": [
2150
+ "ai",
2151
+ "ba",
2152
+ "bake",
2153
+ "dee",
2154
+ "fa",
2155
+ "ma",
2156
+ "no",
2157
+ "pa",
2158
+ "pitche",
2159
+ "sa",
2160
+ "te",
2161
+ "thrille"
2162
+ ],
2163
+ "suffixes": [
2164
+ "",
2165
+ "d",
2166
+ "r"
2167
+ ]
2168
+ },
2169
+ {
2170
+ "stems": [
2171
+ "can",
2172
+ "co",
2173
+ "critic",
2174
+ "di",
2175
+ "emotion",
2176
+ "experiment",
2177
+ "form",
2178
+ "leg",
2179
+ "occasion",
2180
+ "person",
2181
+ "region",
2182
+ "sign"
2183
+ ],
2184
+ "suffixes": [
2185
+ "",
2186
+ "al",
2187
+ "s"
2188
+ ]
2189
+ },
2190
+ {
2191
+ "stems": [
2192
+ "angel",
2193
+ "ash",
2194
+ "branch",
2195
+ "bus",
2196
+ "bush",
2197
+ "business",
2198
+ "cas",
2199
+ "child",
2200
+ "church",
2201
+ "class",
2202
+ "con",
2203
+ "dan",
2204
+ "di",
2205
+ "dish",
2206
+ "ey",
2207
+ "fox",
2208
+ "fri",
2209
+ "glass",
2210
+ "hat",
2211
+ "hero",
2212
+ "inch",
2213
+ "jam",
2214
+ "li",
2215
+ "lin",
2216
+ "loss",
2217
+ "mass",
2218
+ "mat",
2219
+ "match",
2220
+ "not",
2221
+ "on",
2222
+ "pi",
2223
+ "plan",
2224
+ "potato",
2225
+ "rang",
2226
+ "rat",
2227
+ "rich",
2228
+ "sandwich",
2229
+ "se",
2230
+ "sid",
2231
+ "sit",
2232
+ "ski",
2233
+ "speech",
2234
+ "strip",
2235
+ "tap",
2236
+ "tax",
2237
+ "ti",
2238
+ "to",
2239
+ "tomato",
2240
+ "ton",
2241
+ "trench",
2242
+ "witness"
2243
+ ],
2244
+ "suffixes": [
2245
+ "",
2246
+ "es"
2247
+ ]
2248
+ },
2249
+ {
2250
+ "stems": [
2251
+ "bath",
2252
+ "bo",
2253
+ "breed",
2254
+ "broadcast",
2255
+ "burst",
2256
+ "buy",
2257
+ "buzz",
2258
+ "camp",
2259
+ "carry",
2260
+ "cast",
2261
+ "charm",
2262
+ "chatter",
2263
+ "cheat",
2264
+ "chew",
2265
+ "comfort",
2266
+ "copy",
2267
+ "creep",
2268
+ "cry",
2269
+ "din",
2270
+ "disgust",
2271
+ "dwell",
2272
+ "even",
2273
+ "fly",
2274
+ "glow",
2275
+ "humm",
2276
+ "hurry",
2277
+ "iron",
2278
+ "lack",
2279
+ "market",
2280
+ "marry",
2281
+ "react",
2282
+ "seek",
2283
+ "sell",
2284
+ "sew",
2285
+ "sink",
2286
+ "snow",
2287
+ "splash",
2288
+ "spread",
2289
+ "st",
2290
+ "steal",
2291
+ "steer",
2292
+ "study",
2293
+ "sweep",
2294
+ "swell",
2295
+ "th",
2296
+ "tidy",
2297
+ "trail",
2298
+ "try",
2299
+ "will",
2300
+ "worry"
2301
+ ],
2302
+ "suffixes": [
2303
+ "",
2304
+ "ing"
2305
+ ]
2306
+ },
2307
+ {
2308
+ "stems": [
2309
+ "act",
2310
+ "collect",
2311
+ "discuss",
2312
+ "miss",
2313
+ "pass",
2314
+ "protect",
2315
+ "suggest"
2316
+ ],
2317
+ "suffixes": [
2318
+ "",
2319
+ "ed",
2320
+ "ing",
2321
+ "ion"
2322
+ ]
2323
+ },
2324
+ {
2325
+ "stems": [
2326
+ "burn",
2327
+ "join",
2328
+ "learn",
2329
+ "sigh",
2330
+ "star"
2331
+ ],
2332
+ "suffixes": [
2333
+ "",
2334
+ "ed",
2335
+ "ing",
2336
+ "s",
2337
+ "t"
2338
+ ]
2339
+ },
2340
+ {
2341
+ "stems": [
2342
+ "'",
2343
+ "ann",
2344
+ "argentin",
2345
+ "b",
2346
+ "bab",
2347
+ "bell",
2348
+ "c",
2349
+ "carolin",
2350
+ "clar",
2351
+ "cub",
2352
+ "d",
2353
+ "dan",
2354
+ "dat",
2355
+ "dian",
2356
+ "er",
2357
+ "ev",
2358
+ "g",
2359
+ "georgi",
2360
+ "h",
2361
+ "hast",
2362
+ "juli",
2363
+ "l",
2364
+ "m",
2365
+ "mari",
2366
+ "n",
2367
+ "nin",
2368
+ "r",
2369
+ "ros",
2370
+ "s",
2371
+ "se",
2372
+ "sophi",
2373
+ "t",
2374
+ "te",
2375
+ "th",
2376
+ "tun",
2377
+ "us",
2378
+ "w",
2379
+ "wa",
2380
+ "y"
2381
+ ],
2382
+ "suffixes": [
2383
+ "a",
2384
+ "e"
2385
+ ]
2386
+ },
2387
+ {
2388
+ "stems": [
2389
+ "affect",
2390
+ "confess",
2391
+ "connect",
2392
+ "direct",
2393
+ "elect",
2394
+ "express",
2395
+ "not",
2396
+ "possess",
2397
+ "reflect",
2398
+ "select"
2399
+ ],
2400
+ "suffixes": [
2401
+ "",
2402
+ "ed",
2403
+ "ion"
2404
+ ]
2405
+ },
2406
+ {
2407
+ "stems": [
2408
+ "a",
2409
+ "banne",
2410
+ "cleane",
2411
+ "commande",
2412
+ "compose",
2413
+ "containe",
2414
+ "controlle",
2415
+ "counte",
2416
+ "dea",
2417
+ "designe",
2418
+ "dumpe",
2419
+ "e",
2420
+ "employe",
2421
+ "forme",
2422
+ "founde",
2423
+ "helpe",
2424
+ "jumpe",
2425
+ "kille",
2426
+ "lighte",
2427
+ "longe",
2428
+ "marke",
2429
+ "merge",
2430
+ "murdere",
2431
+ "painte",
2432
+ "parke",
2433
+ "presente",
2434
+ "rea",
2435
+ "recorde",
2436
+ "reporte",
2437
+ "rolle",
2438
+ "rubbe",
2439
+ "showe",
2440
+ "t",
2441
+ "tucke",
2442
+ "turne",
2443
+ "waite",
2444
+ "walke",
2445
+ "warne"
2446
+ ],
2447
+ "suffixes": [
2448
+ "d",
2449
+ "r"
2450
+ ]
2451
+ },
2452
+ {
2453
+ "stems": [
2454
+ "bark",
2455
+ "deal",
2456
+ "hold",
2457
+ "keep",
2458
+ "los",
2459
+ "shoot"
2460
+ ],
2461
+ "suffixes": [
2462
+ "",
2463
+ "er",
2464
+ "ing",
2465
+ "s"
2466
+ ]
2467
+ },
2468
+ {
2469
+ "stems": [
2470
+ "be",
2471
+ "box",
2472
+ "fish",
2473
+ "los",
2474
+ "rid",
2475
+ "us"
2476
+ ],
2477
+ "suffixes": [
2478
+ "",
2479
+ "er",
2480
+ "es",
2481
+ "ing"
2482
+ ]
2483
+ },
2484
+ {
2485
+ "stems": [
2486
+ "clean",
2487
+ "found",
2488
+ "long",
2489
+ "paint",
2490
+ "wait",
2491
+ "warn"
2492
+ ],
2493
+ "suffixes": [
2494
+ "",
2495
+ "ed",
2496
+ "er",
2497
+ "ing"
2498
+ ]
2499
+ },
2500
+ {
2501
+ "stems": [
2502
+ "e",
2503
+ "they'",
2504
+ "we'",
2505
+ "what'",
2506
+ "you'",
2507
+ "you’"
2508
+ ],
2509
+ "suffixes": [
2510
+ "d",
2511
+ "ll",
2512
+ "re",
2513
+ "ve"
2514
+ ]
2515
+ },
2516
+ {
2517
+ "stems": [
2518
+ "a",
2519
+ "b",
2520
+ "bab",
2521
+ "bell",
2522
+ "comfortabl",
2523
+ "considerabl",
2524
+ "cop",
2525
+ "dr",
2526
+ "e",
2527
+ "eas",
2528
+ "gentl",
2529
+ "grad",
2530
+ "ha",
2531
+ "hast",
2532
+ "he",
2533
+ "incredibl",
2534
+ "inquir",
2535
+ "ja",
2536
+ "jo",
2537
+ "m",
2538
+ "polic",
2539
+ "possibl",
2540
+ "probabl",
2541
+ "reasonabl",
2542
+ "scar",
2543
+ "sh",
2544
+ "shad",
2545
+ "shin",
2546
+ "simpl",
2547
+ "terribl",
2548
+ "th",
2549
+ "the",
2550
+ "tid",
2551
+ "wa"
2552
+ ],
2553
+ "suffixes": [
2554
+ "e",
2555
+ "y"
2556
+ ]
2557
+ },
2558
+ {
2559
+ "stems": [
2560
+ "bright",
2561
+ "deep",
2562
+ "light",
2563
+ "loud",
2564
+ "quick",
2565
+ "short",
2566
+ "slow",
2567
+ "soft",
2568
+ "warm"
2569
+ ],
2570
+ "suffixes": [
2571
+ "",
2572
+ "er",
2573
+ "ly"
2574
+ ]
2575
+ },
2576
+ {
2577
+ "stems": [
2578
+ "ca",
2579
+ "co",
2580
+ "fa",
2581
+ "ha",
2582
+ "ma",
2583
+ "olde",
2584
+ "pa",
2585
+ "smalle",
2586
+ "te"
2587
+ ],
2588
+ "suffixes": [
2589
+ "",
2590
+ "r",
2591
+ "st"
2592
+ ]
2593
+ },
2594
+ {
2595
+ "stems": [
2596
+ "a",
2597
+ "ca",
2598
+ "co",
2599
+ "da",
2600
+ "dea",
2601
+ "e",
2602
+ "ear",
2603
+ "gai",
2604
+ "gi",
2605
+ "grai",
2606
+ "grow",
2607
+ "ha",
2608
+ "i",
2609
+ "k",
2610
+ "l",
2611
+ "mai",
2612
+ "me",
2613
+ "mea",
2614
+ "norma",
2615
+ "ow",
2616
+ "pa",
2617
+ "pai",
2618
+ "rai",
2619
+ "sea",
2620
+ "shaw",
2621
+ "te",
2622
+ "trai",
2623
+ "va",
2624
+ "vo"
2625
+ ],
2626
+ "suffixes": [
2627
+ "l",
2628
+ "n"
2629
+ ]
2630
+ },
2631
+ {
2632
+ "stems": [
2633
+ "babbl",
2634
+ "be",
2635
+ "becom",
2636
+ "bit",
2637
+ "com",
2638
+ "d",
2639
+ "driv",
2640
+ "giv",
2641
+ "hid",
2642
+ "hous",
2643
+ "imitat",
2644
+ "jok",
2645
+ "leav",
2646
+ "los",
2647
+ "mak",
2648
+ "min",
2649
+ "nurs",
2650
+ "practic",
2651
+ "rac",
2652
+ "rid",
2653
+ "slid",
2654
+ "strik",
2655
+ "tak",
2656
+ "tim",
2657
+ "wak",
2658
+ "whin",
2659
+ "whistl",
2660
+ "writ"
2661
+ ],
2662
+ "suffixes": [
2663
+ "ing",
2664
+ [
2665
+ "e",
2666
+ [
2667
+ "",
2668
+ "s"
2669
+ ]
2670
+ ]
2671
+ ]
2672
+ },
2673
+ {
2674
+ "stems": [
2675
+ "addresse",
2676
+ "allie",
2677
+ "applie",
2678
+ "brushe",
2679
+ "crie",
2680
+ "davi",
2681
+ "dresse",
2682
+ "finishe",
2683
+ "frie",
2684
+ "how'",
2685
+ "kisse",
2686
+ "misse",
2687
+ "passe",
2688
+ "pushe",
2689
+ "r",
2690
+ "reache",
2691
+ "studie",
2692
+ "supplie",
2693
+ "touche",
2694
+ "trie",
2695
+ "watche",
2696
+ "where'",
2697
+ "who'",
2698
+ "why'",
2699
+ "wishe",
2700
+ "witnesse",
2701
+ "worrie"
2702
+ ],
2703
+ "suffixes": [
2704
+ "d",
2705
+ "s"
2706
+ ]
2707
+ },
2708
+ {
2709
+ "stems": [
2710
+ "amaz",
2711
+ "annoy",
2712
+ "await",
2713
+ "begg",
2714
+ "clapp",
2715
+ "confus",
2716
+ "consist",
2717
+ "dragg",
2718
+ "dropp",
2719
+ "embarrass",
2720
+ "excit",
2721
+ "fitt",
2722
+ "kidnapp",
2723
+ "mutter",
2724
+ "oppos",
2725
+ "pant",
2726
+ "plann",
2727
+ "referr",
2728
+ "rubb",
2729
+ "slipp",
2730
+ "stepp",
2731
+ "stirr",
2732
+ "stopp",
2733
+ "surround",
2734
+ "threaten",
2735
+ "travell",
2736
+ "trembl"
2737
+ ],
2738
+ "suffixes": [
2739
+ "ed",
2740
+ "ing"
2741
+ ]
2742
+ },
2743
+ {
2744
+ "stems": [
2745
+ "ba",
2746
+ "bo",
2747
+ "boo",
2748
+ "ear",
2749
+ "eighteen",
2750
+ "eleven",
2751
+ "for",
2752
+ "four",
2753
+ "grow",
2754
+ "ha",
2755
+ "ma",
2756
+ "mon",
2757
+ "my",
2758
+ "nineteen",
2759
+ "nor",
2760
+ "pa",
2761
+ "se",
2762
+ "seven",
2763
+ "six",
2764
+ "tee",
2765
+ "ten",
2766
+ "too",
2767
+ "warm",
2768
+ "wi",
2769
+ "you"
2770
+ ],
2771
+ "suffixes": [
2772
+ "",
2773
+ "th"
2774
+ ]
2775
+ },
2776
+ {
2777
+ "stems": [
2778
+ "clap",
2779
+ "drop",
2780
+ "slip",
2781
+ "step",
2782
+ "stop"
2783
+ ],
2784
+ "suffixes": [
2785
+ "",
2786
+ "ped",
2787
+ "ping",
2788
+ "s"
2789
+ ]
2790
+ },
2791
+ {
2792
+ "stems": [
2793
+ "great",
2794
+ "hard",
2795
+ "high",
2796
+ "near",
2797
+ "strong"
2798
+ ],
2799
+ "suffixes": [
2800
+ "",
2801
+ "er",
2802
+ "est",
2803
+ "ly"
2804
+ ]
2805
+ },
2806
+ {
2807
+ "stems": [
2808
+ "clear",
2809
+ "light",
2810
+ "open",
2811
+ "part"
2812
+ ],
2813
+ "suffixes": [
2814
+ "",
2815
+ "ed",
2816
+ "ing",
2817
+ "ly",
2818
+ "s"
2819
+ ]
2820
+ },
2821
+ {
2822
+ "stems": [
2823
+ "achieve",
2824
+ "agree",
2825
+ "announce",
2826
+ "base",
2827
+ "engage",
2828
+ "improve",
2829
+ "replace"
2830
+ ],
2831
+ "suffixes": [
2832
+ "",
2833
+ "d",
2834
+ "ment"
2835
+ ]
2836
+ },
2837
+ {
2838
+ "stems": [
2839
+ "co",
2840
+ "home",
2841
+ "mate",
2842
+ "pie",
2843
+ "range",
2844
+ "star",
2845
+ "ye"
2846
+ ],
2847
+ "suffixes": [
2848
+ "",
2849
+ "r",
2850
+ "s"
2851
+ ]
2852
+ },
2853
+ {
2854
+ "stems": [
2855
+ "doubt",
2856
+ "end",
2857
+ "fear",
2858
+ "help",
2859
+ "home",
2860
+ "regard",
2861
+ "wire"
2862
+ ],
2863
+ "suffixes": [
2864
+ "",
2865
+ "less",
2866
+ "s"
2867
+ ]
2868
+ },
2869
+ {
2870
+ "stems": [
2871
+ "advanc",
2872
+ "argu",
2873
+ "bor",
2874
+ "br",
2875
+ "chas",
2876
+ "chok",
2877
+ "dar",
2878
+ "encourag",
2879
+ "gaz",
2880
+ "graduat",
2881
+ "handl",
2882
+ "invit",
2883
+ "paus",
2884
+ "prepar",
2885
+ "realiz",
2886
+ "receiv",
2887
+ "reduc",
2888
+ "sav",
2889
+ "seiz",
2890
+ "surviv",
2891
+ "wast"
2892
+ ],
2893
+ "suffixes": [
2894
+ "ing",
2895
+ [
2896
+ "e",
2897
+ [
2898
+ "",
2899
+ "d"
2900
+ ]
2901
+ ]
2902
+ ]
2903
+ },
2904
+ {
2905
+ "stems": [
2906
+ "a",
2907
+ "absen",
2908
+ "assistan",
2909
+ "confiden",
2910
+ "distan",
2911
+ "fa",
2912
+ "for",
2913
+ "gree",
2914
+ "i",
2915
+ "ignoran",
2916
+ "importan",
2917
+ "independen",
2918
+ "innocen",
2919
+ "intelligen",
2920
+ "pa",
2921
+ "patien",
2922
+ "prin",
2923
+ "ra",
2924
+ "referen",
2925
+ "residen"
2926
+ ],
2927
+ "suffixes": [
2928
+ "ce",
2929
+ "t"
2930
+ ]
2931
+ },
2932
+ {
2933
+ "stems": [
2934
+ "accompan",
2935
+ "appl",
2936
+ "bur",
2937
+ "carr",
2938
+ "cr",
2939
+ "den",
2940
+ "dr",
2941
+ "fanc",
2942
+ "fr",
2943
+ "hurr",
2944
+ "identif",
2945
+ "justif",
2946
+ "marr",
2947
+ "occup",
2948
+ "repl",
2949
+ "satisf",
2950
+ "stud",
2951
+ "suppl",
2952
+ "tr",
2953
+ "worr"
2954
+ ],
2955
+ "suffixes": [
2956
+ "ied",
2957
+ "y"
2958
+ ]
2959
+ },
2960
+ {
2961
+ "stems": [
2962
+ "bet",
2963
+ "cruel",
2964
+ "du",
2965
+ "dump",
2966
+ "for",
2967
+ "has",
2968
+ "jet",
2969
+ "kit",
2970
+ "loyal",
2971
+ "nine",
2972
+ "par",
2973
+ "pat",
2974
+ "pi",
2975
+ "pot",
2976
+ "proper",
2977
+ "rus",
2978
+ "safe",
2979
+ "seven",
2980
+ "six",
2981
+ "spot"
2982
+ ],
2983
+ "suffixes": [
2984
+ "",
2985
+ "ty"
2986
+ ]
2987
+ },
2988
+ {
2989
+ "stems": [
2990
+ "classic",
2991
+ "coast",
2992
+ "constitution",
2993
+ "continent",
2994
+ "education",
2995
+ "electric",
2996
+ "environment",
2997
+ "fat",
2998
+ "fiction",
2999
+ "go",
3000
+ "historic",
3001
+ "logic",
3002
+ "magic",
3003
+ "marsh",
3004
+ "me",
3005
+ "met",
3006
+ "music",
3007
+ "roy",
3008
+ "se",
3009
+ "verb"
3010
+ ],
3011
+ "suffixes": [
3012
+ "",
3013
+ "al"
3014
+ ]
3015
+ },
3016
+ {
3017
+ "stems": [
3018
+ "admir",
3019
+ "agricultur",
3020
+ "anim",
3021
+ "approv",
3022
+ "brut",
3023
+ "c",
3024
+ "can",
3025
+ "fat",
3026
+ "fin",
3027
+ "g",
3028
+ "glob",
3029
+ "h",
3030
+ "mor",
3031
+ "natur",
3032
+ "propos",
3033
+ "sever",
3034
+ "surviv",
3035
+ "univers",
3036
+ "v"
3037
+ ],
3038
+ "suffixes": [
3039
+ "al",
3040
+ "e"
3041
+ ]
3042
+ },
3043
+ {
3044
+ "stems": [
3045
+ "be",
3046
+ "clos",
3047
+ "d",
3048
+ "danc",
3049
+ "driv",
3050
+ "explor",
3051
+ "freez",
3052
+ "liv",
3053
+ "los",
3054
+ "lov",
3055
+ "mak",
3056
+ "manag",
3057
+ "produc",
3058
+ "receiv",
3059
+ "rid",
3060
+ "rul",
3061
+ "us",
3062
+ "wrestl",
3063
+ "writ"
3064
+ ],
3065
+ "suffixes": [
3066
+ "ing",
3067
+ [
3068
+ "e",
3069
+ [
3070
+ "",
3071
+ "r"
3072
+ ]
3073
+ ]
3074
+ ]
3075
+ },
3076
+ {
3077
+ "stems": [
3078
+ "an",
3079
+ "comin",
3080
+ "doin",
3081
+ "eatin",
3082
+ "fuckin",
3083
+ "gettin",
3084
+ "goin",
3085
+ "lookin",
3086
+ "makin",
3087
+ "mornin",
3088
+ "playin",
3089
+ "sayin",
3090
+ "sittin",
3091
+ "takin",
3092
+ "talkin",
3093
+ "tellin",
3094
+ "thinkin",
3095
+ "tryin"
3096
+ ],
3097
+ "suffixes": [
3098
+ "'",
3099
+ "g"
3100
+ ]
3101
+ },
3102
+ {
3103
+ "stems": [
3104
+ "amus",
3105
+ "arrang",
3106
+ "manag",
3107
+ "mov",
3108
+ "retir",
3109
+ "settl"
3110
+ ],
3111
+ "suffixes": [
3112
+ "ement",
3113
+ "ing",
3114
+ [
3115
+ "e",
3116
+ [
3117
+ "",
3118
+ "d"
3119
+ ]
3120
+ ]
3121
+ ]
3122
+ },
3123
+ {
3124
+ "stems": [
3125
+ "be",
3126
+ "gentlem",
3127
+ "policem",
3128
+ "se",
3129
+ "th",
3130
+ "wom"
3131
+ ],
3132
+ "suffixes": [
3133
+ "",
3134
+ "an",
3135
+ "en"
3136
+ ]
3137
+ },
3138
+ {
3139
+ "stems": [
3140
+ "bless",
3141
+ "br",
3142
+ "greet",
3143
+ "paint",
3144
+ "proceed",
3145
+ "record"
3146
+ ],
3147
+ "suffixes": [
3148
+ "",
3149
+ "ed",
3150
+ [
3151
+ "ing",
3152
+ [
3153
+ "",
3154
+ "s"
3155
+ ]
3156
+ ]
3157
+ ]
3158
+ },
3159
+ {
3160
+ "stems": [
3161
+ "brave",
3162
+ "bu",
3163
+ "count",
3164
+ "ga",
3165
+ "hen",
3166
+ "slave"
3167
+ ],
3168
+ "suffixes": [
3169
+ "",
3170
+ "ry",
3171
+ "s"
3172
+ ]
3173
+ },
3174
+ {
3175
+ "stems": [
3176
+ "can",
3177
+ "didn",
3178
+ "don",
3179
+ "haven",
3180
+ "isn",
3181
+ "won"
3182
+ ],
3183
+ "suffixes": [
3184
+ "",
3185
+ "'t",
3186
+ "’t"
3187
+ ]
3188
+ },
3189
+ {
3190
+ "stems": [
3191
+ "celebrat",
3192
+ "concentrat",
3193
+ "creat",
3194
+ "indicat",
3195
+ "operat",
3196
+ "relat"
3197
+ ],
3198
+ "suffixes": [
3199
+ "ing",
3200
+ "ion",
3201
+ [
3202
+ "e",
3203
+ [
3204
+ "",
3205
+ "d"
3206
+ ]
3207
+ ]
3208
+ ]
3209
+ },
3210
+ {
3211
+ "stems": [
3212
+ "chair",
3213
+ "horse",
3214
+ "new",
3215
+ "post",
3216
+ "rifle",
3217
+ "sea"
3218
+ ],
3219
+ "suffixes": [
3220
+ "",
3221
+ "man",
3222
+ "s"
3223
+ ]
3224
+ },
3225
+ {
3226
+ "stems": [
3227
+ "confirm",
3228
+ "consider",
3229
+ "form",
3230
+ "found",
3231
+ "inform",
3232
+ "resign"
3233
+ ],
3234
+ "suffixes": [
3235
+ "",
3236
+ "ation",
3237
+ "ed"
3238
+ ]
3239
+ },
3240
+ {
3241
+ "stems": [
3242
+ "ba",
3243
+ "bea",
3244
+ "cracke",
3245
+ "defende",
3246
+ "floo",
3247
+ "flowe",
3248
+ "hea",
3249
+ "hunte",
3250
+ "ma",
3251
+ "manne",
3252
+ "owne",
3253
+ "pai",
3254
+ "playe",
3255
+ "publishe",
3256
+ "roa",
3257
+ "washe",
3258
+ "worke"
3259
+ ],
3260
+ "suffixes": [
3261
+ "d",
3262
+ [
3263
+ "r",
3264
+ [
3265
+ "",
3266
+ "s"
3267
+ ]
3268
+ ]
3269
+ ]
3270
+ },
3271
+ {
3272
+ "stems": [
3273
+ "bea",
3274
+ "bigge",
3275
+ "ea",
3276
+ "earlie",
3277
+ "elde",
3278
+ "fea",
3279
+ "greate",
3280
+ "harde",
3281
+ "highe",
3282
+ "longe",
3283
+ "lowe",
3284
+ "neare",
3285
+ "roa",
3286
+ "stronge",
3287
+ "talle",
3288
+ "va",
3289
+ "younge"
3290
+ ],
3291
+ "suffixes": [
3292
+ "r",
3293
+ "st"
3294
+ ]
3295
+ },
3296
+ {
3297
+ "stems": [
3298
+ "cruise",
3299
+ "doo",
3300
+ "eve",
3301
+ "fu",
3302
+ "gab",
3303
+ "he",
3304
+ "hei",
3305
+ "ka",
3306
+ "ou",
3307
+ "pete",
3308
+ "pipe",
3309
+ "poo",
3310
+ "sauce",
3311
+ "su",
3312
+ "tea",
3313
+ "yea",
3314
+ "you"
3315
+ ],
3316
+ "suffixes": [
3317
+ "",
3318
+ "r"
3319
+ ]
3320
+ },
3321
+ {
3322
+ "stems": [
3323
+ "blaz",
3324
+ "choos",
3325
+ "cycl",
3326
+ "hav",
3327
+ "paddl",
3328
+ "programm",
3329
+ "rattl",
3330
+ "ris",
3331
+ "s",
3332
+ "shin",
3333
+ "smok",
3334
+ "teas",
3335
+ "th",
3336
+ "trad",
3337
+ "twinkl",
3338
+ "w"
3339
+ ],
3340
+ "suffixes": [
3341
+ "e",
3342
+ "ing"
3343
+ ]
3344
+ },
3345
+ {
3346
+ "stems": [
3347
+ "bum",
3348
+ "dum",
3349
+ "gas",
3350
+ "ho",
3351
+ "lea",
3352
+ "overlap",
3353
+ "pee",
3354
+ "pop",
3355
+ "rip",
3356
+ "snap",
3357
+ "trap",
3358
+ "trip",
3359
+ "whip",
3360
+ "wi",
3361
+ "worship",
3362
+ "wrap"
3363
+ ],
3364
+ "suffixes": [
3365
+ "",
3366
+ "ped"
3367
+ ]
3368
+ },
3369
+ {
3370
+ "stems": [
3371
+ "act",
3372
+ "attract",
3373
+ "collect",
3374
+ "impress"
3375
+ ],
3376
+ "suffixes": [
3377
+ "",
3378
+ "ed",
3379
+ "ion",
3380
+ "ive"
3381
+ ]
3382
+ },
3383
+ {
3384
+ "stems": [
3385
+ "carrie",
3386
+ "ha",
3387
+ "line",
3388
+ "si"
3389
+ ],
3390
+ "suffixes": [
3391
+ "",
3392
+ "d",
3393
+ "r",
3394
+ "s"
3395
+ ]
3396
+ },
3397
+ {
3398
+ "stems": [
3399
+ "develop",
3400
+ "enjoy",
3401
+ "entertain",
3402
+ "treat"
3403
+ ],
3404
+ "suffixes": [
3405
+ "",
3406
+ "ed",
3407
+ "ing",
3408
+ "ment"
3409
+ ]
3410
+ },
3411
+ {
3412
+ "stems": [
3413
+ "",
3414
+ "admir",
3415
+ "approv",
3416
+ "arriv",
3417
+ "buri",
3418
+ "deni",
3419
+ "di",
3420
+ "form",
3421
+ "natur",
3422
+ "propos",
3423
+ "se",
3424
+ "sign",
3425
+ "surviv",
3426
+ "tri"
3427
+ ],
3428
+ "suffixes": [
3429
+ "al",
3430
+ "ed"
3431
+ ]
3432
+ },
3433
+ {
3434
+ "stems": [
3435
+ "",
3436
+ "celebrati",
3437
+ "concentrati",
3438
+ "confusi",
3439
+ "creati",
3440
+ "discussi",
3441
+ "imitati",
3442
+ "indicati",
3443
+ "missi",
3444
+ "passi",
3445
+ "processi",
3446
+ "protecti",
3447
+ "reacti",
3448
+ "so"
3449
+ ],
3450
+ "suffixes": [
3451
+ "ng",
3452
+ "on"
3453
+ ]
3454
+ },
3455
+ {
3456
+ "stems": [
3457
+ "angr",
3458
+ "da",
3459
+ "eas",
3460
+ "happ",
3461
+ "hast",
3462
+ "heart",
3463
+ "heav",
3464
+ "luck",
3465
+ "merr",
3466
+ "necessar",
3467
+ "primar",
3468
+ "read",
3469
+ "stead",
3470
+ "temporar"
3471
+ ],
3472
+ "suffixes": [
3473
+ "ily",
3474
+ "y"
3475
+ ]
3476
+ },
3477
+ {
3478
+ "stems": [
3479
+ "assembl",
3480
+ "b",
3481
+ "curl",
3482
+ "enquir",
3483
+ "fl",
3484
+ "fr",
3485
+ "injur",
3486
+ "inquir",
3487
+ "nodd",
3488
+ "popp",
3489
+ "sp",
3490
+ "spott",
3491
+ "stor",
3492
+ "tast"
3493
+ ],
3494
+ "suffixes": [
3495
+ "ed",
3496
+ "y"
3497
+ ]
3498
+ },
3499
+ {
3500
+ "stems": [
3501
+ "",
3502
+ "act",
3503
+ "collect",
3504
+ "direct",
3505
+ "elect",
3506
+ "illustrat",
3507
+ "l",
3508
+ "locat",
3509
+ "operat",
3510
+ "relat",
3511
+ "situat",
3512
+ "stat",
3513
+ "suggest"
3514
+ ],
3515
+ "suffixes": [
3516
+ "ed",
3517
+ [
3518
+ "ion",
3519
+ [
3520
+ "",
3521
+ "s"
3522
+ ]
3523
+ ]
3524
+ ]
3525
+ },
3526
+ {
3527
+ "stems": [
3528
+ "",
3529
+ "acti",
3530
+ "administrati",
3531
+ "attracti",
3532
+ "collecti",
3533
+ "competiti",
3534
+ "creati",
3535
+ "executi",
3536
+ "explosi",
3537
+ "extensi",
3538
+ "impressi",
3539
+ "moti",
3540
+ "positi"
3541
+ ],
3542
+ "suffixes": [
3543
+ "on",
3544
+ "ve"
3545
+ ]
3546
+ },
3547
+ {
3548
+ "stems": [
3549
+ "a",
3550
+ "age",
3551
+ "ai",
3552
+ "be",
3553
+ "consiste",
3554
+ "depende",
3555
+ "hi",
3556
+ "i",
3557
+ "pai",
3558
+ "sai",
3559
+ "spe",
3560
+ "te",
3561
+ "urge"
3562
+ ],
3563
+ "suffixes": [
3564
+ "d",
3565
+ "nt"
3566
+ ]
3567
+ },
3568
+ {
3569
+ "stems": [
3570
+ "animat",
3571
+ "confus",
3572
+ "construct",
3573
+ "convict",
3574
+ "decorat",
3575
+ "depress",
3576
+ "devot",
3577
+ "distribut",
3578
+ "educat",
3579
+ "execut",
3580
+ "invent",
3581
+ "nominat",
3582
+ "translat"
3583
+ ],
3584
+ "suffixes": [
3585
+ "ed",
3586
+ "ion"
3587
+ ]
3588
+ },
3589
+ {
3590
+ "stems": [
3591
+ "artist",
3592
+ "democrat",
3593
+ "log",
3594
+ "magnet",
3595
+ "top"
3596
+ ],
3597
+ "suffixes": [
3598
+ "",
3599
+ "ic",
3600
+ "s"
3601
+ ]
3602
+ },
3603
+ {
3604
+ "stems": [
3605
+ "build",
3606
+ "draw",
3607
+ "feel",
3608
+ "meet",
3609
+ "record"
3610
+ ],
3611
+ "suffixes": [
3612
+ "",
3613
+ "s",
3614
+ [
3615
+ "ing",
3616
+ [
3617
+ "",
3618
+ "s"
3619
+ ]
3620
+ ]
3621
+ ]
3622
+ },
3623
+ {
3624
+ "stems": [
3625
+ "colon",
3626
+ "industr",
3627
+ "memor",
3628
+ "part",
3629
+ "tr"
3630
+ ],
3631
+ "suffixes": [
3632
+ "ial",
3633
+ "ies",
3634
+ "y"
3635
+ ]
3636
+ },
3637
+ {
3638
+ "stems": [
3639
+ "correct",
3640
+ "direct",
3641
+ "faint",
3642
+ "li",
3643
+ "utter"
3644
+ ],
3645
+ "suffixes": [
3646
+ "",
3647
+ "ed",
3648
+ "ly"
3649
+ ]
3650
+ },
3651
+ {
3652
+ "stems": [
3653
+ "direct",
3654
+ "distinct",
3655
+ "intent",
3656
+ "on",
3657
+ "perfect"
3658
+ ],
3659
+ "suffixes": [
3660
+ "",
3661
+ "ion",
3662
+ "ly"
3663
+ ]
3664
+ },
3665
+ {
3666
+ "stems": [
3667
+ "act",
3668
+ "sail",
3669
+ "visit"
3670
+ ],
3671
+ "suffixes": [
3672
+ "",
3673
+ "ed",
3674
+ "ing",
3675
+ "s",
3676
+ [
3677
+ "or",
3678
+ [
3679
+ "",
3680
+ "s"
3681
+ ]
3682
+ ]
3683
+ ]
3684
+ },
3685
+ {
3686
+ "stems": [
3687
+ "admir",
3688
+ "combin",
3689
+ "declar",
3690
+ "determin",
3691
+ "examin",
3692
+ "imagin",
3693
+ "invit",
3694
+ "n",
3695
+ "observ",
3696
+ "organiz",
3697
+ "prepar",
3698
+ "quot"
3699
+ ],
3700
+ "suffixes": [
3701
+ "ation",
3702
+ [
3703
+ "e",
3704
+ [
3705
+ "",
3706
+ "d"
3707
+ ]
3708
+ ]
3709
+ ]
3710
+ },
3711
+ {
3712
+ "stems": [
3713
+ "aunt",
3714
+ "carr",
3715
+ "dogg",
3716
+ "duck",
3717
+ "frank",
3718
+ "jack",
3719
+ "jenn",
3720
+ "jul",
3721
+ "mumm",
3722
+ "napp",
3723
+ "pott",
3724
+ "ros"
3725
+ ],
3726
+ "suffixes": [
3727
+ "ie",
3728
+ "y"
3729
+ ]
3730
+ },
3731
+ {
3732
+ "stems": [
3733
+ "be",
3734
+ "bea",
3735
+ "bu",
3736
+ "fin",
3737
+ "i",
3738
+ "ki",
3739
+ "lea",
3740
+ "loa",
3741
+ "no",
3742
+ "swor",
3743
+ "wor",
3744
+ "yar"
3745
+ ],
3746
+ "suffixes": [
3747
+ "n",
3748
+ [
3749
+ "d",
3750
+ [
3751
+ "",
3752
+ "s"
3753
+ ]
3754
+ ]
3755
+ ]
3756
+ },
3757
+ {
3758
+ "stems": [
3759
+ "bu",
3760
+ "c",
3761
+ "har",
3762
+ "ju",
3763
+ "lor",
3764
+ "ma",
3765
+ "ro",
3766
+ "robbe",
3767
+ "sala",
3768
+ "slippe",
3769
+ "t",
3770
+ "wor"
3771
+ ],
3772
+ "suffixes": [
3773
+ "d",
3774
+ "ry"
3775
+ ]
3776
+ },
3777
+ {
3778
+ "stems": [
3779
+ "a",
3780
+ "ashe",
3781
+ "boxe",
3782
+ "d",
3783
+ "e",
3784
+ "fishe",
3785
+ "l",
3786
+ "m",
3787
+ "p",
3788
+ "pea",
3789
+ "va"
3790
+ ],
3791
+ "suffixes": [
3792
+ "r",
3793
+ "s"
3794
+ ]
3795
+ },
3796
+ {
3797
+ "stems": [
3798
+ "appl",
3799
+ "cod",
3800
+ "grav",
3801
+ "hol",
3802
+ "nois",
3803
+ "ros",
3804
+ "stor",
3805
+ "tast",
3806
+ "to",
3807
+ "ton",
3808
+ "trac"
3809
+ ],
3810
+ "suffixes": [
3811
+ "y",
3812
+ [
3813
+ "e",
3814
+ [
3815
+ "",
3816
+ "s"
3817
+ ]
3818
+ ]
3819
+ ]
3820
+ },
3821
+ {
3822
+ "stems": [
3823
+ "art",
3824
+ "ass",
3825
+ "ex",
3826
+ "guitar",
3827
+ "journal",
3828
+ "mo",
3829
+ "res",
3830
+ "social",
3831
+ "terror",
3832
+ "tour",
3833
+ "wa"
3834
+ ],
3835
+ "suffixes": [
3836
+ "",
3837
+ "ist"
3838
+ ]
3839
+ },
3840
+ {
3841
+ "stems": [
3842
+ "",
3843
+ "a",
3844
+ "ba",
3845
+ "di",
3846
+ "flu",
3847
+ "fooli",
3848
+ "hu",
3849
+ "rubbi",
3850
+ "standi",
3851
+ "wi"
3852
+ ],
3853
+ "suffixes": [
3854
+ "ng",
3855
+ "sh"
3856
+ ]
3857
+ },
3858
+ {
3859
+ "stems": [
3860
+ "",
3861
+ "b",
3862
+ "d",
3863
+ "ev",
3864
+ "h",
3865
+ "horsem",
3866
+ "m",
3867
+ "p",
3868
+ "r",
3869
+ "t"
3870
+ ],
3871
+ "suffixes": [
3872
+ "an",
3873
+ "en"
3874
+ ]
3875
+ },
3876
+ {
3877
+ "stems": [
3878
+ "beck",
3879
+ "bett",
3880
+ "count",
3881
+ "hard",
3882
+ "jenn",
3883
+ "philosoph",
3884
+ "photograph",
3885
+ "pott",
3886
+ "read",
3887
+ "stick"
3888
+ ],
3889
+ "suffixes": [
3890
+ "er",
3891
+ "y"
3892
+ ]
3893
+ },
3894
+ {
3895
+ "stems": [
3896
+ "chatt",
3897
+ "cutt",
3898
+ "digg",
3899
+ "lett",
3900
+ "manufactur",
3901
+ "rubb",
3902
+ "runn",
3903
+ "swimm",
3904
+ "trail",
3905
+ "winn"
3906
+ ],
3907
+ "suffixes": [
3908
+ "er",
3909
+ "ing"
3910
+ ]
3911
+ },
3912
+ {
3913
+ "stems": [
3914
+ "creat",
3915
+ "d",
3916
+ "defens",
3917
+ "dr",
3918
+ "expens",
3919
+ "g",
3920
+ "l",
3921
+ "nat",
3922
+ "offens",
3923
+ "relat"
3924
+ ],
3925
+ "suffixes": [
3926
+ "e",
3927
+ "ive"
3928
+ ]
3929
+ },
3930
+ {
3931
+ "stems": [
3932
+ "",
3933
+ "coloni",
3934
+ "go",
3935
+ "industri",
3936
+ "memori",
3937
+ "parti",
3938
+ "seri",
3939
+ "speci",
3940
+ "tri"
3941
+ ],
3942
+ "suffixes": [
3943
+ "al",
3944
+ "es"
3945
+ ]
3946
+ },
3947
+ {
3948
+ "stems": [
3949
+ "a",
3950
+ "basi",
3951
+ "do",
3952
+ "eri",
3953
+ "heroi",
3954
+ "in",
3955
+ "ma",
3956
+ "mi",
3957
+ "toxi"
3958
+ ],
3959
+ "suffixes": [
3960
+ "c",
3961
+ "n"
3962
+ ]
3963
+ },
3964
+ {
3965
+ "stems": [
3966
+ "activ",
3967
+ "c",
3968
+ "captiv",
3969
+ "commun",
3970
+ "dens",
3971
+ "grav",
3972
+ "intens",
3973
+ "secur",
3974
+ "univers"
3975
+ ],
3976
+ "suffixes": [
3977
+ "e",
3978
+ "ity"
3979
+ ]
3980
+ },
3981
+ {
3982
+ "stems": [
3983
+ "as",
3984
+ "dis",
3985
+ "muc",
3986
+ "o",
3987
+ "ric",
3988
+ "suc",
3989
+ "tc",
3990
+ "u",
3991
+ "zac"
3992
+ ],
3993
+ "suffixes": [
3994
+ "h",
3995
+ "k"
3996
+ ]
3997
+ },
3998
+ {
3999
+ "stems": [
4000
+ "behavio",
4001
+ "colo",
4002
+ "favo",
4003
+ "fo",
4004
+ "harbo",
4005
+ "hono",
4006
+ "labo",
4007
+ "neighbo",
4008
+ "o"
4009
+ ],
4010
+ "suffixes": [
4011
+ "r",
4012
+ "ur"
4013
+ ]
4014
+ },
4015
+ {
4016
+ "stems": [
4017
+ "communicat",
4018
+ "institut",
4019
+ "l",
4020
+ "nat",
4021
+ "on",
4022
+ "operat",
4023
+ "relat",
4024
+ "stat",
4025
+ "vers"
4026
+ ],
4027
+ "suffixes": [
4028
+ "e",
4029
+ [
4030
+ "ion",
4031
+ [
4032
+ "",
4033
+ "s"
4034
+ ]
4035
+ ]
4036
+ ]
4037
+ },
4038
+ {
4039
+ "stems": [
4040
+ "creat",
4041
+ "edit",
4042
+ "elevat",
4043
+ "inspect",
4044
+ "investigat",
4045
+ "profess",
4046
+ "radiat",
4047
+ "success",
4048
+ "translat"
4049
+ ],
4050
+ "suffixes": [
4051
+ "ion",
4052
+ "or"
4053
+ ]
4054
+ },
4055
+ {
4056
+ "stems": [
4057
+ "author",
4058
+ "christian",
4059
+ "hospital",
4060
+ "human"
4061
+ ],
4062
+ "suffixes": [
4063
+ "",
4064
+ "ity",
4065
+ "s"
4066
+ ]
4067
+ },
4068
+ {
4069
+ "stems": [
4070
+ "nice",
4071
+ "safe",
4072
+ "strange",
4073
+ "wide"
4074
+ ],
4075
+ "suffixes": [
4076
+ "",
4077
+ "ly",
4078
+ "r"
4079
+ ]
4080
+ },
4081
+ {
4082
+ "stems": [
4083
+ "beat",
4084
+ "eat",
4085
+ "fall"
4086
+ ],
4087
+ "suffixes": [
4088
+ "",
4089
+ "en",
4090
+ "ing",
4091
+ "s"
4092
+ ]
4093
+ },
4094
+ {
4095
+ "stems": [
4096
+ "eight",
4097
+ "sevent",
4098
+ "sixt"
4099
+ ],
4100
+ "suffixes": [
4101
+ "",
4102
+ "een",
4103
+ "h",
4104
+ "y"
4105
+ ]
4106
+ },
4107
+ {
4108
+ "stems": [
4109
+ "end",
4110
+ "fail",
4111
+ "press"
4112
+ ],
4113
+ "suffixes": [
4114
+ "",
4115
+ "ed",
4116
+ "ing",
4117
+ "ure"
4118
+ ]
4119
+ },
4120
+ {
4121
+ "stems": [
4122
+ "admira",
4123
+ "anima",
4124
+ "corpora",
4125
+ "equa",
4126
+ "federa",
4127
+ "forma",
4128
+ "genera",
4129
+ "loca"
4130
+ ],
4131
+ "suffixes": [
4132
+ "l",
4133
+ "tion"
4134
+ ]
4135
+ },
4136
+ {
4137
+ "stems": [
4138
+ "arriv",
4139
+ "centr",
4140
+ "cultur",
4141
+ "d",
4142
+ "di",
4143
+ "practic",
4144
+ "se",
4145
+ "trib"
4146
+ ],
4147
+ "suffixes": [
4148
+ "al",
4149
+ [
4150
+ "e",
4151
+ [
4152
+ "",
4153
+ "s"
4154
+ ]
4155
+ ]
4156
+ ]
4157
+ },
4158
+ {
4159
+ "stems": [
4160
+ "composit",
4161
+ "contribut",
4162
+ "corporat",
4163
+ "definit",
4164
+ "imitat",
4165
+ "investigat",
4166
+ "opposit",
4167
+ "tens"
4168
+ ],
4169
+ "suffixes": [
4170
+ "e",
4171
+ "ion"
4172
+ ]
4173
+ },
4174
+ {
4175
+ "stems": [
4176
+ "consist",
4177
+ "d",
4178
+ "moan",
4179
+ "pant",
4180
+ "r",
4181
+ "s",
4182
+ "whimper",
4183
+ "yawn"
4184
+ ],
4185
+ "suffixes": [
4186
+ "ing",
4187
+ "s"
4188
+ ]
4189
+ },
4190
+ {
4191
+ "stems": [
4192
+ "d",
4193
+ "mathematic",
4194
+ "mechanic",
4195
+ "p",
4196
+ "physic",
4197
+ "politic",
4198
+ "statistic",
4199
+ "v"
4200
+ ],
4201
+ "suffixes": [
4202
+ "al",
4203
+ "s"
4204
+ ]
4205
+ },
4206
+ {
4207
+ "stems": [
4208
+ "",
4209
+ "act",
4210
+ "direct",
4211
+ "instruct",
4212
+ "mot",
4213
+ "operat",
4214
+ "sect"
4215
+ ],
4216
+ "suffixes": [
4217
+ "or",
4218
+ [
4219
+ "ion",
4220
+ [
4221
+ "",
4222
+ "s"
4223
+ ]
4224
+ ]
4225
+ ]
4226
+ },
4227
+ {
4228
+ "stems": [
4229
+ "",
4230
+ "be",
4231
+ "famili",
4232
+ "fe",
4233
+ "li",
4234
+ "molecul",
4235
+ "pol"
4236
+ ],
4237
+ "suffixes": [
4238
+ "ar",
4239
+ "es"
4240
+ ]
4241
+ },
4242
+ {
4243
+ "stems": [
4244
+ "",
4245
+ "cycli",
4246
+ "ha",
4247
+ "ho",
4248
+ "ju",
4249
+ "lo",
4250
+ "touri"
4251
+ ],
4252
+ "suffixes": [
4253
+ "ng",
4254
+ "st"
4255
+ ]
4256
+ },
4257
+ {
4258
+ "stems": [
4259
+ "affection",
4260
+ "deb",
4261
+ "imit",
4262
+ "kar",
4263
+ "passion",
4264
+ "pl",
4265
+ "st"
4266
+ ],
4267
+ "suffixes": [
4268
+ "",
4269
+ "ate"
4270
+ ]
4271
+ },
4272
+ {
4273
+ "stems": [
4274
+ "appoint",
4275
+ "argu",
4276
+ "astonish",
4277
+ "depart",
4278
+ "disappoint",
4279
+ "judg",
4280
+ "unemploy"
4281
+ ],
4282
+ "suffixes": [
4283
+ "ed",
4284
+ "ment"
4285
+ ]
4286
+ },
4287
+ {
4288
+ "stems": [
4289
+ "ain",
4290
+ "couldn",
4291
+ "doesn",
4292
+ "hadn",
4293
+ "wasn",
4294
+ "wouldn"
4295
+ ],
4296
+ "suffixes": [
4297
+ "'t",
4298
+ "’t"
4299
+ ]
4300
+ },
4301
+ {
4302
+ "stems": [
4303
+ "associat",
4304
+ "hesitat",
4305
+ "l",
4306
+ "not",
4307
+ "promot",
4308
+ "stat"
4309
+ ],
4310
+ "suffixes": [
4311
+ "ion",
4312
+ [
4313
+ "e",
4314
+ [
4315
+ "",
4316
+ "d"
4317
+ ]
4318
+ ]
4319
+ ]
4320
+ },
4321
+ {
4322
+ "stems": [
4323
+ "commerc",
4324
+ "d",
4325
+ "fac",
4326
+ "financ",
4327
+ "offic",
4328
+ "provinc"
4329
+ ],
4330
+ "suffixes": [
4331
+ "e",
4332
+ "ial"
4333
+ ]
4334
+ },
4335
+ {
4336
+ "stems": [
4337
+ "conservati",
4338
+ "li",
4339
+ "mo",
4340
+ "nati",
4341
+ "objecti",
4342
+ "relati"
4343
+ ],
4344
+ "suffixes": [
4345
+ "on",
4346
+ [
4347
+ "ve",
4348
+ [
4349
+ "",
4350
+ "s"
4351
+ ]
4352
+ ]
4353
+ ]
4354
+ },
4355
+ {
4356
+ "stems": [
4357
+ "da",
4358
+ "existe",
4359
+ "fe",
4360
+ "gla",
4361
+ "occurre",
4362
+ "si"
4363
+ ],
4364
+ "suffixes": [
4365
+ "d",
4366
+ "nce"
4367
+ ]
4368
+ },
4369
+ {
4370
+ "stems": [
4371
+ "electric",
4372
+ "major",
4373
+ "minor",
4374
+ "municipal",
4375
+ "popular",
4376
+ "prior"
4377
+ ],
4378
+ "suffixes": [
4379
+ "",
4380
+ "ity"
4381
+ ]
4382
+ },
4383
+ {
4384
+ "stems": [
4385
+ "bounc",
4386
+ "shak",
4387
+ "tickl"
4388
+ ],
4389
+ "suffixes": [
4390
+ "ing",
4391
+ "y",
4392
+ [
4393
+ "e",
4394
+ [
4395
+ "",
4396
+ "s"
4397
+ ]
4398
+ ]
4399
+ ]
4400
+ },
4401
+ {
4402
+ "stems": [
4403
+ "ca",
4404
+ "office",
4405
+ "wa"
4406
+ ],
4407
+ "suffixes": [
4408
+ "",
4409
+ "s",
4410
+ [
4411
+ "r",
4412
+ [
4413
+ "",
4414
+ "s"
4415
+ ]
4416
+ ]
4417
+ ]
4418
+ },
4419
+ {
4420
+ "stems": [
4421
+ "commi",
4422
+ "permi",
4423
+ "submi"
4424
+ ],
4425
+ "suffixes": [
4426
+ "ssion",
4427
+ "t",
4428
+ "tted"
4429
+ ]
4430
+ },
4431
+ {
4432
+ "stems": [
4433
+ "dark",
4434
+ "high",
4435
+ "weak"
4436
+ ],
4437
+ "suffixes": [
4438
+ "",
4439
+ "er",
4440
+ "ness"
4441
+ ]
4442
+ },
4443
+ {
4444
+ "stems": [
4445
+ "employ",
4446
+ "establish",
4447
+ "punish"
4448
+ ],
4449
+ "suffixes": [
4450
+ "",
4451
+ "ed",
4452
+ "ment"
4453
+ ]
4454
+ },
4455
+ {
4456
+ "stems": [
4457
+ "free",
4458
+ "ju",
4459
+ "li"
4460
+ ],
4461
+ "suffixes": [
4462
+ "",
4463
+ "d",
4464
+ "ly"
4465
+ ]
4466
+ },
4467
+ {
4468
+ "stems": [
4469
+ "part",
4470
+ "president",
4471
+ "resident"
4472
+ ],
4473
+ "suffixes": [
4474
+ "",
4475
+ "ial",
4476
+ "s"
4477
+ ]
4478
+ },
4479
+ {
4480
+ "stems": [
4481
+ "pray",
4482
+ "rid",
4483
+ "teach"
4484
+ ],
4485
+ "suffixes": [
4486
+ "",
4487
+ "ing",
4488
+ [
4489
+ "er",
4490
+ [
4491
+ "",
4492
+ "s"
4493
+ ]
4494
+ ]
4495
+ ]
4496
+ },
4497
+ {
4498
+ "stems": [
4499
+ "",
4500
+ "counti",
4501
+ "do",
4502
+ "handi",
4503
+ "parti"
4504
+ ],
4505
+ "suffixes": [
4506
+ "es",
4507
+ "ng"
4508
+ ]
4509
+ },
4510
+ {
4511
+ "stems": [
4512
+ "academ",
4513
+ "econom",
4514
+ "histor",
4515
+ "m",
4516
+ "strateg"
4517
+ ],
4518
+ "suffixes": [
4519
+ "ic",
4520
+ "y"
4521
+ ]
4522
+ },
4523
+ {
4524
+ "stems": [
4525
+ "acti",
4526
+ "collecti",
4527
+ "operati",
4528
+ "relati",
4529
+ "suggesti"
4530
+ ],
4531
+ "suffixes": [
4532
+ "ng",
4533
+ [
4534
+ "on",
4535
+ [
4536
+ "",
4537
+ "s"
4538
+ ]
4539
+ ]
4540
+ ]
4541
+ },
4542
+ {
4543
+ "stems": [
4544
+ "am",
4545
+ "bab",
4546
+ "mamm",
4547
+ "momm",
4548
+ "wh"
4549
+ ],
4550
+ "suffixes": [
4551
+ "a",
4552
+ [
4553
+ "y",
4554
+ [
4555
+ "",
4556
+ "'s"
4557
+ ]
4558
+ ]
4559
+ ]
4560
+ },
4561
+ {
4562
+ "stems": [
4563
+ "ar",
4564
+ "franch",
4565
+ "parad",
4566
+ "r",
4567
+ "w"
4568
+ ],
4569
+ "suffixes": [
4570
+ "e",
4571
+ "ise"
4572
+ ]
4573
+ },
4574
+ {
4575
+ "stems": [
4576
+ "belie",
4577
+ "lea",
4578
+ "thie",
4579
+ "wol",
4580
+ "yoursel"
4581
+ ],
4582
+ "suffixes": [
4583
+ "f",
4584
+ "ves"
4585
+ ]
4586
+ },
4587
+ {
4588
+ "stems": [
4589
+ "comed",
4590
+ "histor",
4591
+ "hungar",
4592
+ "ital",
4593
+ "lil"
4594
+ ],
4595
+ "suffixes": [
4596
+ "ian",
4597
+ "y"
4598
+ ]
4599
+ },
4600
+ {
4601
+ "stems": [
4602
+ "destroye",
4603
+ "followe",
4604
+ "slippe",
4605
+ "supporte",
4606
+ "travele"
4607
+ ],
4608
+ "suffixes": [
4609
+ "d",
4610
+ "rs"
4611
+ ]
4612
+ },
4613
+ {
4614
+ "stems": [
4615
+ "el",
4616
+ "gabriel",
4617
+ "isabel",
4618
+ "lo",
4619
+ "pau"
4620
+ ],
4621
+ "suffixes": [
4622
+ "",
4623
+ "la"
4624
+ ]
4625
+ },
4626
+ {
4627
+ "stems": [
4628
+ "eviden",
4629
+ "presen",
4630
+ "significan",
4631
+ "silen",
4632
+ "violen"
4633
+ ],
4634
+ "suffixes": [
4635
+ "ce",
4636
+ [
4637
+ "t",
4638
+ [
4639
+ "",
4640
+ "ly"
4641
+ ]
4642
+ ]
4643
+ ]
4644
+ },
4645
+ {
4646
+ "stems": [
4647
+ "form",
4648
+ "occasion",
4649
+ "origin",
4650
+ "person",
4651
+ "profession"
4652
+ ],
4653
+ "suffixes": [
4654
+ "",
4655
+ [
4656
+ "al",
4657
+ [
4658
+ "",
4659
+ "ly"
4660
+ ]
4661
+ ]
4662
+ ]
4663
+ },
4664
+ {
4665
+ "stems": [
4666
+ "friend",
4667
+ "member",
4668
+ "partner",
4669
+ "scholar",
4670
+ "town"
4671
+ ],
4672
+ "suffixes": [
4673
+ "",
4674
+ [
4675
+ "s",
4676
+ [
4677
+ "",
4678
+ "hip"
4679
+ ]
4680
+ ]
4681
+ ]
4682
+ },
4683
+ {
4684
+ "stems": [
4685
+ "accident",
4686
+ "automatic",
4687
+ "basic",
4688
+ "specific"
4689
+ ],
4690
+ "suffixes": [
4691
+ "",
4692
+ "ally"
4693
+ ]
4694
+ },
4695
+ {
4696
+ "stems": [
4697
+ "accus",
4698
+ "alter",
4699
+ "consult",
4700
+ "tempt"
4701
+ ],
4702
+ "suffixes": [
4703
+ "ation",
4704
+ "ed"
4705
+ ]
4706
+ },
4707
+ {
4708
+ "stems": [
4709
+ "ambitio",
4710
+ "cla",
4711
+ "religio",
4712
+ "suspicio"
4713
+ ],
4714
+ "suffixes": [
4715
+ "n",
4716
+ "us"
4717
+ ]
4718
+ },
4719
+ {
4720
+ "stems": [
4721
+ "clu",
4722
+ "cooki",
4723
+ "di",
4724
+ "movi"
4725
+ ],
4726
+ "suffixes": [
4727
+ "ng",
4728
+ [
4729
+ "e",
4730
+ [
4731
+ "",
4732
+ "s"
4733
+ ]
4734
+ ]
4735
+ ]
4736
+ },
4737
+ {
4738
+ "stems": [
4739
+ "easi",
4740
+ "happi",
4741
+ "heavi",
4742
+ "supp"
4743
+ ],
4744
+ "suffixes": [
4745
+ "er",
4746
+ "ly"
4747
+ ]
4748
+ },
4749
+ {
4750
+ "stems": [
4751
+ "could",
4752
+ "would"
4753
+ ],
4754
+ "suffixes": [
4755
+ "",
4756
+ "'ve",
4757
+ "n't",
4758
+ "n’t"
4759
+ ]
4760
+ },
4761
+ {
4762
+ "stems": [
4763
+ "large",
4764
+ "late"
4765
+ ],
4766
+ "suffixes": [
4767
+ "",
4768
+ "ly",
4769
+ "r",
4770
+ "st"
4771
+ ]
4772
+ },
4773
+ {
4774
+ "stems": [
4775
+ "north",
4776
+ "south"
4777
+ ],
4778
+ "suffixes": [
4779
+ "",
4780
+ "ern",
4781
+ [
4782
+ "east",
4783
+ [
4784
+ "",
4785
+ "ern"
4786
+ ]
4787
+ ],
4788
+ [
4789
+ "west",
4790
+ [
4791
+ "",
4792
+ "ern"
4793
+ ]
4794
+ ]
4795
+ ]
4796
+ },
4797
+ {
4798
+ "stems": [
4799
+ "second",
4800
+ "secret"
4801
+ ],
4802
+ "suffixes": [
4803
+ "",
4804
+ "ary",
4805
+ "ly",
4806
+ "s"
4807
+ ]
4808
+ },
4809
+ {
4810
+ "stems": [
4811
+ "biolog",
4812
+ "histor",
4813
+ "psycholog"
4814
+ ],
4815
+ "suffixes": [
4816
+ "ical",
4817
+ "y"
4818
+ ]
4819
+ },
4820
+ {
4821
+ "stems": [
4822
+ "compani",
4823
+ "li",
4824
+ "wag"
4825
+ ],
4826
+ "suffixes": [
4827
+ "es",
4828
+ [
4829
+ "on",
4830
+ [
4831
+ "",
4832
+ "s"
4833
+ ]
4834
+ ]
4835
+ ]
4836
+ },
4837
+ {
4838
+ "stems": [
4839
+ "east",
4840
+ "st",
4841
+ "west"
4842
+ ],
4843
+ "suffixes": [
4844
+ "",
4845
+ "ern"
4846
+ ]
4847
+ },
4848
+ {
4849
+ "stems": [
4850
+ "giggl",
4851
+ "mumbl",
4852
+ "vocalis"
4853
+ ],
4854
+ "suffixes": [
4855
+ "es",
4856
+ "ing"
4857
+ ]
4858
+ },
4859
+ {
4860
+ "stems": [
4861
+ "card",
4862
+ "term"
4863
+ ],
4864
+ "suffixes": [
4865
+ "",
4866
+ "inal",
4867
+ "s"
4868
+ ]
4869
+ },
4870
+ {
4871
+ "stems": [
4872
+ "complet",
4873
+ "separat"
4874
+ ],
4875
+ "suffixes": [
4876
+ "ely",
4877
+ "ion",
4878
+ [
4879
+ "e",
4880
+ [
4881
+ "",
4882
+ "d"
4883
+ ]
4884
+ ]
4885
+ ]
4886
+ },
4887
+ {
4888
+ "stems": [
4889
+ "danger",
4890
+ "joy"
4891
+ ],
4892
+ "suffixes": [
4893
+ "",
4894
+ "ous",
4895
+ "s"
4896
+ ]
4897
+ },
4898
+ {
4899
+ "stems": [
4900
+ "dear",
4901
+ "slight"
4902
+ ],
4903
+ "suffixes": [
4904
+ "",
4905
+ "est",
4906
+ "ly"
4907
+ ]
4908
+ },
4909
+ {
4910
+ "stems": [
4911
+ "possib",
4912
+ "probab"
4913
+ ],
4914
+ "suffixes": [
4915
+ "ility",
4916
+ "le",
4917
+ "ly"
4918
+ ]
4919
+ },
4920
+ {
4921
+ "stems": [
4922
+ "addi",
4923
+ "tradi"
4924
+ ],
4925
+ "suffixes": [
4926
+ "ng",
4927
+ [
4928
+ "tion",
4929
+ [
4930
+ "",
4931
+ "al"
4932
+ ]
4933
+ ]
4934
+ ]
4935
+ },
4936
+ {
4937
+ "stems": [
4938
+ "differen",
4939
+ "instan"
4940
+ ],
4941
+ "suffixes": [
4942
+ [
4943
+ "ce",
4944
+ [
4945
+ "",
4946
+ "s"
4947
+ ]
4948
+ ],
4949
+ [
4950
+ "t",
4951
+ [
4952
+ "",
4953
+ "ly"
4954
+ ]
4955
+ ]
4956
+ ]
4957
+ },
4958
+ {
4959
+ "stems": [
4960
+ "northe",
4961
+ "southe"
4962
+ ],
4963
+ "suffixes": [
4964
+ "rn",
4965
+ [
4966
+ "ast",
4967
+ [
4968
+ "",
4969
+ "ern"
4970
+ ]
4971
+ ]
4972
+ ]
4973
+ },
4974
+ {
4975
+ "stems": [
4976
+ "nothin",
4977
+ "somethin"
4978
+ ],
4979
+ "suffixes": [
4980
+ "'",
4981
+ [
4982
+ "g",
4983
+ [
4984
+ "",
4985
+ "'s"
4986
+ ]
4987
+ ]
4988
+ ]
4989
+ },
4990
+ {
4991
+ "stems": [
4992
+ "require"
4993
+ ],
4994
+ "suffixes": [
4995
+ "",
4996
+ "d",
4997
+ "s",
4998
+ [
4999
+ "ment",
5000
+ [
5001
+ "",
5002
+ "s"
5003
+ ]
5004
+ ]
5005
+ ]
5006
+ },
5007
+ {
5008
+ "stems": [
5009
+ "champion"
5010
+ ],
5011
+ "suffixes": [
5012
+ "",
5013
+ [
5014
+ "s",
5015
+ [
5016
+ "",
5017
+ "hip"
5018
+ ]
5019
+ ],
5020
+ [
5021
+ "ship",
5022
+ [
5023
+ "",
5024
+ "s"
5025
+ ]
5026
+ ]
5027
+ ]
5028
+ },
5029
+ {
5030
+ "stems": [
5031
+ "fift"
5032
+ ],
5033
+ "suffixes": [
5034
+ "een",
5035
+ "h",
5036
+ "y"
5037
+ ]
5038
+ },
5039
+ {
5040
+ "stems": [
5041
+ "public"
5042
+ ],
5043
+ "suffixes": [
5044
+ "",
5045
+ "ity",
5046
+ "ly"
5047
+ ]
5048
+ }
5049
+ ]
5050
+ }
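The nested entries in the suffix lists above (e.g. ["er", ["", "s"]] or ["ion", ["", "s"]]) encode a suffix that can itself be continued by a further suffix. A minimal expansion sketch, assuming a nested entry [base, continuations] licenses base followed by any of its continuations (the same shape that _deserialize_suffixes_from_json in tokenizer.py below reads into (base, frozenset(continuations)) pairs); the expand_* helpers are illustrative and not part of the uploaded code:

# Illustrative only: expand one paradigm entry into its surface forms.
def expand_suffixes(suffixes):
    """Flatten a suffix list that may contain nested [base, continuations] entries."""
    flat = []
    for item in suffixes:
        if isinstance(item, list):
            base, continuations = item  # nested: base suffix plus its possible continuations
            flat.extend(base + cont for cont in continuations)
        else:
            flat.append(item)  # plain string suffix such as "", "ing", "s"
    return flat

def expand_paradigm(paradigm):
    """Yield every stem+suffix surface form licensed by one paradigm entry."""
    for stem in paradigm["stems"]:
        for suffix in expand_suffixes(paradigm["suffixes"]):
            yield stem + suffix

# Using a (shortened) paradigm from the file above:
paradigm = {"stems": ["build", "draw"], "suffixes": ["", "ing", "s", ["er", ["", "s"]]]}
print(sorted(expand_paradigm(paradigm)))
# ['build', 'builder', 'builders', 'building', 'builds',
#  'draw', 'drawer', 'drawers', 'drawing', 'draws']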
preprocess_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "lowercase": true,
3
+ "separate_apostrophes": false,
4
+ "separate_digits": true,
5
+ "separate_punctuation": true
6
+ }
preprocessing.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # preprocessing.py
2
+
3
+ import re
4
+
5
+ class Preprocessor:
6
+ def __init__(self, lowercase=False, separate_apostrophes=True, separate_digits=True, separate_punctuation=True):
7
+ self.lowercase = lowercase
8
+ self.separate_apostrophes = separate_apostrophes
9
+ self.separate_punctuation = separate_punctuation
10
+ self.separate_digits = separate_digits
11
+
12
+ def preprocess(self, line: str) -> str:
13
+ if self.lowercase:
14
+ line = line.lower()
15
+ if self.separate_apostrophes:
16
+ # Add spaces around apostrophes
17
+ line = re.sub(r"([’'`])", r" \1 ", line)
18
+ # Add spaces around punctuation (except alphanumeric and apostrophes)
19
+ if self.separate_punctuation:
20
+ line = re.sub(r"([^A-Za-z0-9\s’'`])", r" \1 ", line)
21
+ if self.separate_digits:
22
+ line = re.sub(r"(\d)", r" \1 ", line)
23
+
24
+ # Normalize whitespace
25
+ line = re.sub(r"\s+", " ", line)
26
+ return line.strip()
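A short usage sketch tying preprocess_config.json to the Preprocessor above; the relative config path is only an assumption, the point is that the JSON keys map one-to-one onto the constructor keyword arguments:

# Illustrative usage of Preprocessor with preprocess_config.json (path assumed).
import json
from preprocessing import Preprocessor

with open("preprocess_config.json", "r", encoding="utf-8") as f:
    cfg = json.load(f)  # {"lowercase": true, "separate_apostrophes": false, ...}

pre = Preprocessor(**cfg)  # keys match the __init__ keyword arguments
print(pre.preprocess("The singer's 2 songs, very nice!"))
# with this config: "the singer's 2 songs , very nice !"
#   (lowercased, punctuation and digits spaced out, apostrophes left attached)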
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83cc6d0c3757dce499cb73dec58b5dd30586b574b82060be53e2b9d1cd984b91
3
+ size 433174963
sanity_check.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForCausalLM
2
+ import torch, torch.nn.functional as F
3
+
4
+ tok = AutoTokenizer.from_pretrained("/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M", trust_remote_code=True)
5
+ model = AutoModelForCausalLM.from_pretrained("/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M", trust_remote_code=True).eval().to("cuda" if torch.cuda.is_available() else "cpu")
6
+
7
+ print("pad_id:", tok.pad_token_id, "eos_id:", tok.eos_token_id, "bos_id:", tok.bos_token_id)
8
+ assert tok.pad_token_id is not None, "pad_token is None"
9
+ assert tok.pad_token_id != tok.eos_token_id, "BUG: pad_id == eos_id (this can bias sentence scores)"
10
+ assert tok.bos_token_id is None or isinstance(tok.bos_token_id, int)
11
+
12
+ # pick one JSONL example from this task
13
+ s_good = "Create a noun out of the following adjective: clangish. clangishity" # the grammatical one
14
+ s_bad = "Create a noun out of the following adjective: clangish. clangishness" # the ungrammatical one
15
+
16
+ print("\n--- Tokenization debug ---")
17
+ for label, sent in [("good", s_good), ("bad", s_bad)]:
18
+ toks = tok.tokenize(sent)
19
+ ids = tok.encode(sent, add_special_tokens=True)
20
+ print(f"{label} sentence: {sent}")
21
+ print(f" tokens: {toks}")
22
+ print(f" ids : {ids}")
23
+ print("--- End tokenization debug ---\n")
24
+
25
+ def sent_logprob(s):
26
+ # mimic eval: no special tokens
27
+ enc = tok(s, add_special_tokens=False, return_tensors="pt")
28
+ input_ids = enc["input_ids"].to(model.device)
29
+ attn_mask = enc["attention_mask"].to(model.device)
30
+ with torch.no_grad():
31
+ out = model(input_ids=input_ids, attention_mask=attn_mask)
32
+ logits = out.logits[:, :-1, :] # shift for next-token LM
33
+ targets = input_ids[:, 1:] # gold next tokens
34
+ lp = F.log_softmax(logits, dim=-1).gather(-1, targets.unsqueeze(-1)).squeeze(-1)
35
+ # mask out padding if any
36
+ if tok.pad_token_id is not None:
37
+ keep = (targets != tok.pad_token_id)
38
+ lp = lp * keep
39
+ return lp.sum().item()
40
+
41
+ print("LP(good) =", sent_logprob(s_good))
42
+ print("LP(bad) =", sent_logprob(s_bad))
segmentation_tests.py ADDED
@@ -0,0 +1,36 @@
1
+ #from tokenizer import ParadigmTokenizerWrapper
2
+
3
+ #tok = ParadigmTokenizerWrapper("/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M")
4
+ #enc = tok("the singers were singing a very nice song!")
5
+ #print(tok.tok.convert_ids_to_tokens(enc["input_ids"]))
6
+
7
+
8
+ from transformers import AutoTokenizer
9
+ tok = AutoTokenizer.from_pretrained("/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M", trust_remote_code=True, local_files_only=True)
10
+
11
+ print(type(tok))
12
+
13
+ print(tok("the singers were singing a very nice song!"))
14
+
15
+ print(tok.tokenize("the singers were singing a very nice song!"))
16
+
17
+ print(tok.special_tokens_map)
18
+ print(tok.bos_token, tok.eos_token, tok.bos_token_id, tok.eos_token_id)
19
+
20
+ enc = tok("the skibidiboppers were sdjnajning a very nice song!",
21
+ add_special_tokens=True,
22
+ return_attention_mask=True)
23
+ print(tok.convert_ids_to_tokens(enc["input_ids"]))
24
+
25
+ # via HF object
26
+ print(tok.backend_tokenizer.post_processor) # should NOT be None
27
+
28
+ # double-check by reading tokenizer.json directly
29
+ from tokenizers import Tokenizer
30
+ import os
31
+ tk = Tokenizer.from_file(os.path.join("/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M", "tokenizer.json"))
32
+ print(tk.post_processor) # should NOT be None
33
+
34
+ enc = tok(["a b", "a b c d"], padding="max_length", max_length=6, return_tensors="pt")
35
+ print(enc["input_ids"]) # shorter row should end with pad ids
36
+ print(enc["attention_mask"]) # 1 for real tokens, 0 for pads
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer.py ADDED
@@ -0,0 +1,291 @@
1
+ # tokenizer.py
2
+ # Wrapper for ParadigmFinder segmentation + portable HF tokenizer
3
+
4
+ from typing import List, Tuple, Optional, Union, Dict, Any
5
+ import os, json, re
6
+ from transformers import PreTrainedTokenizerFast
7
+
8
+ def _deserialize_suffixes_from_json(sfx_list):
9
+ out = set()
10
+ for item in sfx_list:
11
+ if isinstance(item, list):
12
+ # JSON nested: [base, nested_list]
13
+ base, nested = item
14
+ out.add((base, frozenset(nested)))
15
+ else:
16
+ out.add(item) # plain string like "", "ing", "s"
17
+ return out
18
+
19
+ def _load_paradigms_any(path):
20
+ import json
21
+ with open(path, "r", encoding="utf-8") as f:
22
+ payload = json.load(f)
23
+
24
+ # Case A: new schema with top-level dict {"paradigms": [...]}
25
+ if isinstance(payload, dict) and "paradigms" in payload:
26
+ paradigms = []
27
+ for p in payload["paradigms"]:
28
+ stems = set(p["stems"])
29
+ suffixes = _deserialize_suffixes_from_json(p["suffixes"])
30
+ paradigms.append((stems, suffixes))
31
+ meta = payload.get("meta", {})
32
+ return paradigms, meta
33
+
34
+ # Case B: older “list of pairs” JSON [[stems, suffixes], ...]
35
+ if isinstance(payload, list) and payload and isinstance(payload[0], list):
36
+ paradigms = []
37
+ for stems, suffixes in payload:
38
+ stems = set(stems)
39
+ # suffixes may be ["", ["er", ["", "s"]], "ing"] or already strings
40
+ norm = _deserialize_suffixes_from_json(suffixes)
41
+ paradigms.append((stems, norm))
42
+ return paradigms, {}
43
+
44
+ # Case C: already python-native structure (rare if not using JSON)
45
+ if isinstance(payload, list) and payload and isinstance(payload[0], (list, tuple)) and len(payload[0]) == 2:
46
+ return payload, {}
47
+
48
+ raise ValueError("Unrecognized paradigms.json format")
49
+
50
+ # ----------------------------
51
+ # Paradigm-based segmenter
52
+ # ----------------------------
53
+ class ParadigmFinderSegmenter:
54
+ def __init__(self, paradigms, lowercase=True, space_punct=True):
55
+ self.paradigms = paradigms
56
+ self.lowercase = lowercase
57
+ self.space_punct = space_punct
58
+
59
+ def _preprocess(self, text: str) -> str:
60
+ s = text
61
+ if self.lowercase:
62
+ s = s.lower()
63
+ if self.space_punct:
64
+ s = re.sub(r"([^\w\s'])", r" \1 ", s)
65
+ s = re.sub(r"\s+", " ", s).strip()
66
+ return s
67
+
68
+ # faithful to your segmentation logic
69
+ def _segment_word(self, word: str, fallback=True, top_k=20) -> List[str]:
70
+ def match_suffixes(suffixes, remainder):
71
+ for suffix in suffixes:
72
+ if isinstance(suffix, (tuple, list)):
73
+ base, nested = suffix
74
+ if remainder.startswith(base):
75
+ sub = remainder[len(base):]
76
+ nested_result = match_suffixes(nested, sub)
77
+ if nested_result is not None:
78
+ return [base] + nested_result
79
+ elif remainder == suffix:
80
+ return [suffix] if suffix else []
81
+ return None
82
+
83
+ for stems, suffixes in self.paradigms:
84
+ for stem in stems:
85
+ if word.startswith(stem):
86
+ remainder = word[len(stem):]
87
+ matched_suffix = match_suffixes(suffixes, remainder)
88
+ if matched_suffix is not None:
89
+ return [stem] + matched_suffix
90
+
91
+ if fallback:
92
+ candidates = self.paradigms[:top_k]
93
+ longest = ""
94
+ def collect_flat(sfx):
95
+ for s in sfx:
96
+ if isinstance(s, (tuple, list)):
97
+ yield s[0]
98
+ yield from collect_flat(s[1])
99
+ else:
100
+ yield s
101
+ for _, suffixes in candidates:
102
+ for suffix in collect_flat(suffixes):
103
+ if word.endswith(suffix) and len(suffix) > len(longest):
104
+ longest = suffix
105
+ if longest:
106
+ stem = word[:-len(longest)]
107
+ return [stem, longest]
108
+
109
+ return [word]
110
+
111
+ def segment_with_alignment(self, raw_text: str) -> Tuple[str, List[Optional[int]]]:
112
+ """
113
+ Preprocess + segment; return segmented text and a char map from segmented
114
+ text back to raw indices (None for inserted spaces).
115
+ """
116
+ # 1) Preprocess with alignment
117
+ pre_chars, pre_map = [], []
118
+ s = raw_text.lower() if self.lowercase else raw_text
119
+ out, out_map = [], []
120
+
121
+ # insert spaces around punctuation (if enabled), tracking alignment
122
+ for i, ch in enumerate(s):
123
+ if self.space_punct and re.match(r"[^\w\s']", ch):
124
+ out.append(" "); out_map.append(None)
125
+ out.append(ch); out_map.append(i)
126
+ out.append(" "); out_map.append(None)
127
+ else:
128
+ out.append(ch); out_map.append(i)
129
+
130
+ # collapse/strip spaces
131
+ pre = []
132
+ pre2raw = []
133
+ prev_space = False
134
+ for ch, m in zip(out, out_map):
135
+ if ch.isspace():
136
+ if not prev_space:
137
+ pre.append(" "); pre2raw.append(None)
138
+ prev_space = True
139
+ else:
140
+ pre.append(ch); pre2raw.append(m); prev_space = False
141
+ if pre and pre[0] == " ": pre.pop(0); pre2raw.pop(0)
142
+ if pre and pre[-1] == " ": pre.pop(); pre2raw.pop()
143
+ norm = "".join(pre)
144
+
145
+ # 2) Segment by paradigms, preserving alignment
146
+ seg_chars, seg_map = [], []
147
+ i = 0
148
+ n = len(norm)
149
+ while i < n:
150
+ while i < n and norm[i].isspace():
151
+ i += 1
152
+ if i >= n: break
153
+ j = i
154
+ while j < n and not norm[j].isspace():
155
+ j += 1
156
+ token = norm[i:j]
157
+ token_map = pre2raw[i:j]
158
+ parts = self._segment_word(token, fallback=True)
159
+
160
+ # robust emission: consume all chars exactly once
161
+ pos = 0
162
+ for p_index, part in enumerate(parts):
163
+ L = len(part)
164
+ # clamp to remaining length
165
+ L = min(L, len(token) - pos)
166
+ if L <= 0: continue
167
+ for k in range(L):
168
+ seg_chars.append(token[pos + k])
169
+ seg_map.append(token_map[pos + k])
170
+ pos += L
171
+ if p_index < len(parts) - 1:
172
+ seg_chars.append(" "); seg_map.append(None)
173
+ # inter-token space
174
+ i = j
175
+ while i < n and norm[i].isspace():
176
+ i += 1
177
+ if i < n:
178
+ seg_chars.append(" "); seg_map.append(None)
179
+
180
+ # final collapse (defensive)
181
+ final = []
182
+ final_map = []
183
+ prev_space = False
184
+ for ch, m in zip(seg_chars, seg_map):
185
+ if ch.isspace():
186
+ if not prev_space:
187
+ final.append(" "); final_map.append(None); prev_space = True
188
+ else:
189
+ final.append(ch); final_map.append(m); prev_space = False
190
+ if final and final[0] == " ": final.pop(0); final_map.pop(0)
191
+ if final and final[-1] == " ": final.pop(); final_map.pop()
192
+
193
+ return "".join(final), final_map
194
+
195
+ # ----------------------------
196
+ # Offset remapping helper
197
+ # ----------------------------
198
+ def remap_offsets_to_raw(offsets: List[Tuple[int,int]], pre2raw: List[Optional[int]]) -> List[Tuple[int,int]]:
199
+ mapped = []
200
+ L = len(pre2raw)
201
+ for s,e in offsets:
202
+ s = max(0, min(s, L)); e = max(0, min(e, L))
203
+ rs = re_ = None
204
+ t = s
205
+ while t < e and rs is None:
206
+ if pre2raw[t] is not None: rs = pre2raw[t]
207
+ t += 1
208
+ t = e - 1
209
+ while t >= s and re_ is None:
210
+ if pre2raw[t] is not None: re_ = pre2raw[t] + 1
211
+ t -= 1
212
+ mapped.append((rs if rs is not None else 0, re_ if re_ is not None else 0))
213
+ return mapped
214
+
215
+ # ----------------------------
216
+ # Public wrapper
217
+ # ----------------------------
218
+ class ParadigmTokenizerWrapper(PreTrainedTokenizerFast):
219
+ slow_tokenizer_class = None
220
+
221
+ def __init__(self, *args, **kwargs):
222
+ # ensure fast tokenizer is loaded directly (no slow->fast conversion)
223
+ name_or_path = kwargs.get("name_or_path", None)
224
+ if name_or_path is None and len(args) > 0 and isinstance(args[0], str):
225
+ name_or_path = args[0]
226
+
227
+ if "tokenizer_file" not in kwargs and "tokenizer_object" not in kwargs and name_or_path is not None:
228
+ tf = os.path.join(name_or_path, "tokenizer.json")
229
+ if not os.path.isfile(tf):
230
+ raise FileNotFoundError(f"Expected tokenizer.json at {tf}")
231
+ kwargs["tokenizer_file"] = tf
232
+
233
+ super().__init__(*args, **kwargs)
234
+
235
+ # The folder path AutoTokenizer passes becomes available as:
236
+ # - kwargs.get("name_or_path") on first init
237
+ # - or self.name_or_path after init
238
+ hf_dir = kwargs.get("name_or_path", getattr(self, "name_or_path", None))
239
+ if hf_dir is None:
240
+ # final fallback: derive from tokenizer_file path
241
+ tok_file = getattr(self, "tokenizer_file", None)
242
+ hf_dir = os.path.dirname(tok_file) if tok_file else "."
243
+
244
+ # Load paradigms
245
+ ppath = os.path.join(hf_dir, "paradigms.json")
246
+ if not os.path.exists(ppath):
247
+ raise FileNotFoundError(f"Missing paradigms.json in {hf_dir}")
248
+ self.paradigms, self.paradigms_meta = _load_paradigms_any(ppath)
249
+
250
+ # Load preprocessing flags
251
+ cfg = {"lowercase": True, "space_punct": True}
252
+ cpath = os.path.join(hf_dir, "preprocess_config.json")
253
+ if os.path.exists(cpath):
254
+ with open(cpath, "r", encoding="utf-8") as f:
255
+ cfg.update(json.load(f))
256
+
257
+ self.segmenter = ParadigmFinderSegmenter(
258
+ paradigms=self.paradigms,
259
+ lowercase=cfg.get("lowercase", True),
260
+ space_punct=cfg.get("space_punct", True),
261
+ )
262
+
263
+ # ---- main entry point ----
264
+ def __call__(self, text, **kwargs):
265
+ if isinstance(text, str):
266
+ seg, _ = self.segmenter.segment_with_alignment(text)
267
+ return super().__call__(seg, **kwargs)
268
+ elif isinstance(text, (list, tuple)):
269
+ segs = []
270
+ for t in text:
271
+ seg, _ = self.segmenter.segment_with_alignment(t)
272
+ segs.append(seg)
273
+ return super().__call__(segs, **kwargs)
274
+ else:
275
+ raise TypeError("text must be str or List[str]/Tuple[str]")
276
+
277
+
278
+ def tokenize(self, text, **kwargs):
279
+ # Intercept manual .tokenize() calls to ensure segmentation happens first
280
+ if isinstance(text, str):
281
+ seg, _ = self.segmenter.segment_with_alignment(text)
282
+ return super().tokenize(seg, **kwargs)
283
+ elif isinstance(text, list):
284
+ # Tokenize each string separately, then flatten (matches HF behavior)
285
+ out = []
286
+ for t in text:
287
+ seg, _ = self.segmenter.segment_with_alignment(t)
288
+ out.extend(super().tokenize(seg, **kwargs))
289
+ return out
290
+ else:
291
+ raise TypeError("tokenize() expects str or List[str]")
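To make the (stems, suffixes) structure concrete, here is a toy illustration (assumed paradigm, not taken from paradigms.json) of how ParadigmFinderSegmenter splits words; nested suffixes are (base, nested_set) pairs, and a match returns the stem followed by the suffix pieces:

    from tokenizer import ParadigmFinderSegmenter

    toy = [({"sing"}, {"", "ing", ("er", frozenset({"", "s"}))})]
    seg = ParadigmFinderSegmenter(paradigms=toy)

    print(seg._segment_word("singers"))  # ['sing', 'er', 's']
    print(seg._segment_word("singing"))  # ['sing', 'ing']
    print(seg.segment_with_alignment("The singers were singing!")[0])
    # -> "the sing er s were sing ing !"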
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "</s>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "eos_token": "</s>",
39
+ "extra_special_tokens": {},
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "<pad>",
42
+ "tokenizer_class": "ParadigmTokenizerWrapper",
43
+ "unk_token": "<unk>",
44
+ "auto_map": {
45
+ "AutoTokenizer": [
46
+ "tokenizer.ParadigmTokenizerWrapper",
47
+ null
48
+ ]
49
+ }
50
+ }
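The "auto_map" entry above is what lets AutoTokenizer resolve the custom class defined in tokenizer.py when remote code is trusted; a quick check (illustrative, same idea as segmentation_tests.py, path assumed):

    from transformers import AutoTokenizer
    tok = AutoTokenizer.from_pretrained("03-models/gpt2_ParFindFast_10M", trust_remote_code=True)
    print(type(tok).__name__)  # expected: ParadigmTokenizerWrapper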
training.py ADDED
@@ -0,0 +1,148 @@
1
+ # File: training.py
2
+ # -----------------------------
3
+ # Main script for pretraining an LM with the next-token prediction loss
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+ from time import time
11
+ from torch.utils.data import DataLoader
12
+ import os
13
+ import math
14
+ import wandb
15
+ import gc
16
+ import pickle
17
+ from transformers import AutoTokenizer
18
+
19
+ from utils import get_config, setup_experiment, setup_wandb
20
+ from models import initialize_model, initialize_optimizer, initialize_scheduler, initialize_model_and_optimizers, save_epoch_checkpoint
21
+ from data_utils import load_babylm_data
22
+
23
+ def full_train_loop(cfg, model, optimizer, scheduler):
24
+ # Load the BabyLM dataset
25
+ dataloader = load_babylm_data(cfg)
26
+
27
+ # Start the loop
28
+ start_time = time()
29
+ epoch_size = len(dataloader)
30
+ print(f"Steps per epoch: {epoch_size}")
31
+ for epoch in range(cfg["n_epochs"]):
32
+ # Clear cache
33
+ torch.cuda.empty_cache()
34
+
35
+ tr_metrics = train_epoch(cfg, model, optimizer, scheduler, dataloader, epoch, epoch_size, start_time)
36
+ print(f"Epoch {epoch}; train loss: {tr_metrics['loss']}")
37
+ metric_path = os.path.join(cfg["logdir"], f"epoch_{epoch}_metrics.pth")
38
+ torch.save(tr_metrics, metric_path)
39
+
40
+ checkpoint_dir = cfg["checkpoint_dir"]
41
+ save_epoch_checkpoint(model, optimizer, scheduler, epoch, checkpoint_dir)
42
+
43
+ def unpack_batch(minibatch, device):
44
+ input_tokens = minibatch[0].to(device)
45
+ target_tokens = minibatch[1].to(device)
46
+ target_mask = minibatch[2].to(device)
47
+
48
+ return input_tokens, target_tokens, target_mask
49
+
50
+ def train_epoch(cfg, model, optimizer, scheduler, dataloader, epoch, epoch_size, start_time):
51
+ model.train()
52
+ total_loss = 0
53
+ total_tokens = 0
54
+ temp_loss = 0
55
+ temp_tokens = 0
56
+
57
+ device = model.device
58
+ use_amp = device.type == "cuda"
59
+ amp_dtype = torch.bfloat16
60
+
61
+ num_steps = len(dataloader)
62
+ for train_step, minibatch in enumerate(tqdm(dataloader)):
63
+ input_tokens, target_tokens, target_mask = unpack_batch(minibatch, device)
64
+ num_tokens = torch.sum(target_mask).item()
65
+ B = input_tokens.shape[0]
66
+
67
+ # Forward pass (bfloat16 autocast on CUDA; full precision otherwise)
68
+ with torch.autocast(device_type="cuda", dtype=amp_dtype) if use_amp else torch.cuda.amp.autocast(enabled=False):
69
+ logits = model(input_tokens).logits
70
+ log_probs = F.log_softmax(logits, dim=2)
71
+ token_log_probs = torch.gather(log_probs, 2, target_tokens.unsqueeze(2)).squeeze(2)
72
+
73
+ # Masked next-token NLL loss, then backward
74
+ loss = - torch.sum(token_log_probs * target_mask) / torch.sum(target_mask)
75
+ loss.backward()
76
+ if cfg["gradient_clip_norm"] != -1:
77
+ nn.utils.clip_grad_norm_(model.parameters(), cfg['gradient_clip_norm'])
78
+ optimizer.step()
79
+ scheduler.step()
80
+ optimizer.zero_grad()
81
+
82
+ total_loss += loss.item() * num_tokens
83
+ total_tokens += num_tokens
84
+ temp_loss += loss.item() * num_tokens
85
+ temp_tokens += num_tokens
86
+
87
+ if cfg["use_wandb"] and (train_step % 10 == 0 and train_step > 0):
88
+ # Compute the steps
89
+ steps = epoch_size * epoch + train_step
90
+ wandb_train_epoch(
91
+ temp_loss / temp_tokens, steps, start_time
92
+ )
93
+
94
+ temp_loss = 0
95
+ temp_tokens = 0
96
+
97
+ # Intermediate checkpoint saving spot
98
+ if epoch == 0 and cfg["training_type"] == "strict_small" and train_step != 0:
99
+ one_million_steps = len(dataloader) // 10
100
+ if train_step % one_million_steps == 0:
101
+ curr_words = f"{train_step // one_million_steps}M"
102
+ save_epoch_checkpoint(model, optimizer, scheduler, curr_words, cfg["checkpoint_dir"])
103
+ if epoch == 0 and cfg["training_type"] == "strict" and train_step != 0:
104
+ one_million_steps = len(dataloader) // 100
105
+ if train_step % one_million_steps == 0 and train_step // one_million_steps < 10:
106
+ curr_words = f"{train_step // one_million_steps}M"
107
+ save_epoch_checkpoint(model, optimizer, scheduler, curr_words, cfg["checkpoint_dir"])
108
+
109
+ ten_million_steps = len(dataloader) // 10
110
+ if train_step % ten_million_steps == 0:
111
+ curr_words = f"{10 * (train_step // ten_million_steps)}M"
112
+ save_epoch_checkpoint(model, optimizer, scheduler, curr_words, cfg["checkpoint_dir"])
113
+
114
+ return {"loss" : total_loss / total_tokens}
115
+
116
+ def wandb_train_epoch(loss, step, start_time):
117
+ time_elapsed = (time() - start_time) / 60
118
+ curr_dict = {
119
+ f"train_metrics/time_elapsed" : time_elapsed,
120
+ f"train_metrics/batch_train_loss" : loss,
121
+ }
122
+ wandb.log(curr_dict, step=step)
123
+
124
+ def main():
125
+ cfg = get_config()
126
+ setup_experiment(cfg)
127
+ if cfg["use_wandb"]:
128
+ setup_wandb(cfg)
129
+
130
+ tok = AutoTokenizer.from_pretrained(cfg["tokenizer_dir"], trust_remote_code=True)
131
+
132
+ model = initialize_model(cfg)
133
+
134
+ model.resize_token_embeddings(len(tok))
135
+ model.config.vocab_size = len(tok)
136
+ model.config.bos_token_id = tok.bos_token_id
137
+ model.config.eos_token_id = tok.eos_token_id
138
+ model.config.pad_token_id = tok.pad_token_id
139
+
140
+
141
+ optimizer = initialize_optimizer(cfg, model)
142
+ scheduler = initialize_scheduler(cfg, model, optimizer)
143
+
144
+ full_train_loop(cfg, model, optimizer, scheduler)
145
+
146
+
147
+ if __name__ == "__main__":
148
+ main()
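The loss in train_epoch is a masked next-token negative log-likelihood, normalized by the number of unmasked target tokens. A standalone equivalence sketch (not part of training.py) showing it matches F.cross_entropy with an ignore_index on the padded positions:

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    B, T, V = 2, 5, 11
    logits = torch.randn(B, T, V)
    targets = torch.randint(0, V, (B, T))
    mask = (torch.rand(B, T) > 0.3).float()        # 1 for real targets, 0 for padding

    log_probs = F.log_softmax(logits, dim=2)
    token_lp = torch.gather(log_probs, 2, targets.unsqueeze(2)).squeeze(2)
    loss_manual = -torch.sum(token_lp * mask) / torch.sum(mask)

    ignored = targets.masked_fill(mask == 0, -100)  # -100 is cross_entropy's default ignore_index
    loss_ce = F.cross_entropy(logits.reshape(-1, V), ignored.reshape(-1), ignore_index=-100)

    print(torch.allclose(loss_manual, loss_ce))     # True (up to float tolerance)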
utils.py ADDED
@@ -0,0 +1,97 @@
1
+ # utils.py
2
+ import argparse
3
+ import os
4
+ import yaml
5
+ import json
6
+ import random
7
+ import numpy as np
8
+ import torch
9
+
10
+ def mkdir(dirpath):
11
+ os.makedirs(dirpath, exist_ok=True)
12
+
13
+ def get_config():
14
+ parser = argparse.ArgumentParser()
15
+
16
+ parser.add_argument('--config', type=str, default='config.json',
17
+ help="Path to base config file (json or yaml)")
18
+ parser.add_argument('--tokenizer_dir', type=str)
19
+ parser.add_argument('--data_dir', type=str)
20
+ parser.add_argument('--train_glob', type=str)
21
+ parser.add_argument('--valid_glob', type=str)
22
+ parser.add_argument('--output_dir', type=str)
23
+
24
+ # Training settings
25
+ parser.add_argument('--datapoint_length', type=int)
26
+ parser.add_argument('--training_type', type=str, choices=["strict", "strict_small"])
27
+ parser.add_argument('--n_epochs', type=int)
28
+ parser.add_argument('--batch_size', type=int)
29
+ parser.add_argument('--learning_rate', type=float)
30
+ parser.add_argument('--weight_decay', type=float)
31
+ parser.add_argument('--num_training_steps', type=int)
32
+ parser.add_argument('--num_warmup_steps', type=int)
33
+ parser.add_argument('--gradient_clip_norm', type=float)
34
+
35
+ # Experiment
36
+ parser.add_argument('--seed', type=int)
37
+ parser.add_argument('--base_folder', type=str)
38
+ parser.add_argument('--experiment_name', type=str)
39
+ parser.add_argument('--use_wandb', action='store_true', default=None)  # None keeps the config.json value unless the flag is passed
40
+ parser.add_argument('--wandb_project_name', type=str)
41
+ parser.add_argument('--wandb_experiment_name', type=str)
42
+
43
+
44
+
45
+ args = parser.parse_args()
46
+ config = construct_config(args)
47
+ return config
48
+
49
+ def setup_experiment(cfg):
50
+ # Seed
51
+ if cfg.get("seed", -1) == -1:
52
+ cfg["seed"] = random.randint(0, 10**9)
53
+ random.seed(cfg["seed"])
54
+ np.random.seed(cfg["seed"])
55
+ torch.manual_seed(cfg["seed"])
56
+ torch.cuda.manual_seed_all(cfg["seed"])
57
+ torch.backends.cudnn.deterministic = True
58
+ torch.backends.cudnn.benchmark = False
59
+ print(f"[utils] Using seed {cfg['seed']}")
60
+
61
+ # Folders
62
+ cfg["expdir"] = os.path.join(cfg["base_folder"], cfg["experiment_name"])
63
+ cfg["checkpoint_dir"] = os.path.join(cfg["expdir"], 'checkpoints')
64
+ cfg["logdir"] = os.path.join(cfg["expdir"], 'logging')
65
+ mkdir(cfg["expdir"]); mkdir(cfg["checkpoint_dir"]); mkdir(cfg["logdir"])
66
+
67
+ # Save resolved config
68
+ with open(os.path.join(cfg["logdir"], "exp_cfg.yaml"), 'w') as cfg_file:
69
+ yaml.safe_dump(cfg, cfg_file, sort_keys=False)
70
+
71
+ def setup_wandb(cfg):
72
+ try:
73
+ import wandb
74
+ except ImportError:
75
+ raise RuntimeError("use_wandb is true but wandb is not installed")
76
+ wandb.init(
77
+ project=cfg["wandb_project_name"],
78
+ name=cfg["wandb_experiment_name"]
79
+ )
80
+
81
+ def load_file_any(filepath):
82
+ ext = os.path.splitext(filepath)[1].lower()
83
+ with open(filepath, 'r') as f:
84
+ if ext in ['.yaml', '.yml']:
85
+ return yaml.safe_load(f)
86
+ else:
87
+ return json.load(f)
88
+
89
+ def construct_config(args):
90
+ base_cfg = load_file_any(args.config)
91
+ # Overlay CLI args when provided
92
+ for k, v in vars(args).items():
93
+ if k == "config":
94
+ continue
95
+ if v is not None:
96
+ base_cfg[k] = v
97
+ return base_cfg
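Config resolution is two-layered: values from config.json are used unless the corresponding CLI flag is passed (None means "not provided"). A tiny sketch of the overlay performed by construct_config:

    base_cfg = {"batch_size": 16, "learning_rate": 5e-5, "use_wandb": False}
    cli_args = {"batch_size": 32, "learning_rate": None, "use_wandb": None}

    for k, v in cli_args.items():
        if v is not None:
            base_cfg[k] = v

    print(base_cfg)  # {'batch_size': 32, 'learning_rate': 5e-05, 'use_wandb': False}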