PyTorch
gpt2
achille-fusco committed on
Commit c2760fe · verified · 1 Parent(s): 6552f98

Upload folder using huggingface_hub
__init__.py ADDED
File without changes
bos_eos_patch.py ADDED
@@ -0,0 +1,42 @@
+ from transformers import AutoTokenizer
+ from tokenizers import Tokenizer
+ from tokenizers.processors import TemplateProcessing
+ import os, json
+
+ TOK_DIR = "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M"
+
+ # 0) sanity: what path will HF load?
+ tmp = AutoTokenizer.from_pretrained(TOK_DIR, trust_remote_code=True, local_files_only=True)
+ print("HF loads from:", tmp.name_or_path)
+
+ # 1) get bos/eos tokens & ids
+ bos, eos = tmp.bos_token, tmp.eos_token
+ assert bos and eos, "BOS/EOS not defined in special_tokens_map.json / tokenizer_config.json"
+ bos_id, eos_id = tmp.convert_tokens_to_ids([bos, eos])
+
+ # 2) patch tokenizer.json with a TemplateProcessing post-processor
+ tok_json = os.path.join(TOK_DIR, "tokenizer.json")
+ tk = Tokenizer.from_file(tok_json)
+ tk.post_processor = TemplateProcessing(
+     single=f"{bos} $A {eos}",
+     pair=f"{bos} $A {eos} $B:1 {eos}:1",
+     special_tokens=[(bos, bos_id), (eos, eos_id)],
+ )
+ tk.save(tok_json)
+
+ # 3) (optional) keep bos/eos also in tokenizer_config.json
+ cfg_path = os.path.join(TOK_DIR, "tokenizer_config.json")
+ with open(cfg_path, "r", encoding="utf-8") as f:
+     cfg = json.load(f)
+ cfg["bos_token"] = bos
+ cfg["eos_token"] = eos
+ with open(cfg_path, "w", encoding="utf-8") as f:
+     json.dump(cfg, f, indent=2)
+
+ # 4) verify post-processor is present after a fresh reload
+ tok = AutoTokenizer.from_pretrained(TOK_DIR, trust_remote_code=True, local_files_only=True)
+ print("post-processor:", tok.backend_tokenizer.post_processor)  # should NOT be None
+
+ # 5) final check: specials appear when requested
+ enc = tok("the singers were singing a very nice song!", add_special_tokens=True, return_attention_mask=False)
+ print(tok.convert_ids_to_tokens(enc["input_ids"]))
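
A quick way to re-check the patch from a fresh interpreter (not part of the file; it reuses the same TOK_DIR hard-coded above):

from transformers import AutoTokenizer

TOK_DIR = "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M"
tok = AutoTokenizer.from_pretrained(TOK_DIR, trust_remote_code=True, local_files_only=True)
ids = tok("the singers were singing", add_special_tokens=True)["input_ids"]
# After the TemplateProcessing patch, the first and last ids should be BOS and EOS.
assert ids[0] == tok.bos_token_id and ids[-1] == tok.eos_token_id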
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "tokenizer_dir": "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M",
+   "data_dir": "01-data",
+   "train_glob": "*.train",
+   "valid_glob": "*.valid",
+   "output_dir": "03-models/gpt2_ParFindFast_10M",
+   "datapoint_length": 512,
+   "training_type": "strict_small",
+   "n_epochs": 10,
+   "batch_size": 16,
+   "learning_rate": 0.00005,
+   "weight_decay": 0,
+   "num_training_steps": 200000,
+   "num_warmup_steps": 2000,
+   "sft_learning_rate": 0.00005,
+   "gradient_clip_norm": 1,
+   "seed": -1,
+   "base_folder": "03-models",
+   "experiment_name": "gpt2_ParFindFast_10M",
+   "use_wandb": false,
+   "wandb_experiment_name": "gpt2_ParFindFast",
+   "wandb_project_name": "BabyLM-2025",
+   "tokenizer_class": "ParadigmTokenizerWrapper",
+   "model_type": "gpt2",
+   "vocab_size": 29215
+ }
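
This config is consumed as a plain dict by data_utils.py and models.py below; a minimal load sketch (file name assumed to be this config.json):

import json

# Load the training configuration shipped with this folder.
with open("config.json", encoding="utf-8") as f:
    cfg = json.load(f)

# A few of the keys used downstream: datapoint_length (chunk size in data_utils.py),
# learning_rate / weight_decay / num_warmup_steps (optimizer and scheduler in models.py).
print(cfg["datapoint_length"], cfg["learning_rate"], cfg["num_warmup_steps"])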
data_utils.py ADDED
@@ -0,0 +1,108 @@
+ # File: data_utils.py
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from torch.nn.utils.rnn import pad_sequence
+
+ # from tokenizer import ParadigmTokenizerWrapper
+ from transformers import AutoTokenizer
+
+ import math
+ import os
+ from tqdm import tqdm
+ import pickle
+
+ TRAIN_PATH_10M = '01-data/clean_train_10M'
+ DATASETS = ['bnc_spoken', 'childes', 'gutenberg', 'open_subtitles', 'simple_wiki', 'switchboard']
+
+ class FullBabyLMDataset(Dataset):
+     def __init__(self, cfg, pretokenized_data=None):
+         tokenizer_path = cfg["tokenizer_dir"]
+
+         # Use HF loader so tokenizer_class + auto_map are honored
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             tokenizer_path,
+             trust_remote_code=True,
+             local_files_only=True
+         )
+
+         # Pull specials directly from the wrapper (it *is* a PreTrainedTokenizerFast)
+         self.model_bos = self.tokenizer.bos_token_id
+         self.model_eos = self.tokenizer.eos_token_id
+         self.model_pad = self.tokenizer.pad_token_id
+
+         if pretokenized_data is not None:
+             self.data = pretokenized_data
+             return
+
+         # Tokenize, split and reconstruct each dataset
+         self.data = []
+         dataset_folder = TRAIN_PATH_10M  # using the 10M setting here
+
+         for dataset in DATASETS:
+             dataset_path = os.path.join(dataset_folder, f'{dataset}.train')
+             with open(dataset_path, 'r', encoding='utf-8') as f:
+                 all_text = ' '.join(f.readlines())
+             print(f'Opened {dataset_path}')
+
+             # Tokenize in BATCH mode so indexing [0] is correct
+             tokenized_dataset = self.tokenizer([all_text])['input_ids'][0]
+             print(f'Tokenized {dataset_path}; {len(tokenized_dataset)} tokens total')
+
+             # Chunk into datapoints
+             chunk_size = cfg["datapoint_length"]
+             num_chunks = math.ceil(len(tokenized_dataset) / chunk_size)
+             for curr_chunk in tqdm(range(num_chunks), desc=f"Chunking {dataset}"):
+                 start = curr_chunk * chunk_size
+                 end = (curr_chunk + 1) * chunk_size
+                 chunk_tokens = tokenized_dataset[start:end]
+                 if isinstance(chunk_tokens, torch.Tensor):
+                     chunk_tokens = chunk_tokens.tolist()
+                 self.data.append(chunk_tokens)
+             print(f"Chunked {dataset_path}")
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         # Add BOS/EOS here (sequence length + 2)
+         return torch.as_tensor([self.model_bos] + self.data[idx] + [self.model_eos], dtype=torch.long)
+
+ ## General utilities ##
+ def load_babylm_data(cfg):
+     num_words = "100M" if cfg["training_type"] == "strict" else "10M"
+     cache_dir = '01-data/cached_train'
+     os.makedirs(cache_dir, exist_ok=True)
+     filename = os.path.join(cache_dir, f'train_gpt2_{num_words}.pkl')
+
+     # Cache ONLY the tokenized chunks, not the Dataset object
+     if os.path.exists(filename):
+         with open(filename, 'rb') as f:
+             token_chunks = pickle.load(f)
+         full_babylm_dset = FullBabyLMDataset(cfg, pretokenized_data=token_chunks)
+     else:
+         tmp_dataset = FullBabyLMDataset(cfg)
+         with open(filename, 'wb') as f:
+             pickle.dump(tmp_dataset.data, f)
+         full_babylm_dset = tmp_dataset
+
+     collate_fn = get_collate_fn(full_babylm_dset.model_eos, full_babylm_dset.model_pad)
+     dataloader = DataLoader(
+         full_babylm_dset,
+         batch_size=cfg["batch_size"],
+         shuffle=True,
+         collate_fn=collate_fn,
+         num_workers=0,   # set >0 if your env supports it
+         pin_memory=False # set True on GPUs if it helps
+     )
+     return dataloader
+
+ def get_collate_fn(model_eos, model_pad):
+     def collate_fn(batch):
+         tokens = pad_sequence(batch, padding_value=model_pad, batch_first=True)
+         input_tokens = tokens[:, :-1]
+         target_tokens = tokens[:, 1:]
+         target_mask = input_tokens != model_pad
+         # Ensure first position is always trainable
+         target_mask[:, 0] = True
+         return input_tokens, target_tokens, target_mask
+     return collate_fn
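
An illustrative check (not part of the file) of what get_collate_fn returns, using toy ids in place of real tokenizer output (0/1/2 stand in for BOS/EOS/PAD):

import torch
from data_utils import get_collate_fn

collate = get_collate_fn(model_eos=1, model_pad=2)
batch = [torch.tensor([0, 5, 6, 1]), torch.tensor([0, 7, 1])]
inputs, targets, mask = collate(batch)
# Sequences are right-padded to length 4, then shifted by one position,
# so each returned tensor has shape (2, 3).
print(inputs.shape, targets.shape, mask.shape)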
latest_optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2edd8392c1373169859829d3c0801f75ff860b53375c20cbb801e3b68578564e
+ size 866363865
latest_scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2591c628797378c411d922038d0262e546252fffcc965e0b922e08e478400d64
+ size 1507
latest_student.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2f66172c4f8517ab8db532ac2110a9ad28ac57f784f81d95a569653219fd062
+ size 433175117
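
These three .pt files are Git LFS pointers to the raw state_dicts written by save_epoch_checkpoint in models.py below. A minimal sketch of how the student weights might be restored, mirroring the config path used by initialize_model:

import torch
from transformers import GPT2Config, GPT2LMHeadModel

# Rebuild the architecture from the same config directory models.py loads from,
# then restore the raw state_dict saved as latest_student.pt.
config = GPT2Config.from_pretrained("./03-models/gpt2_ParFindFast_10M")
student = GPT2LMHeadModel(config)
student.load_state_dict(torch.load("latest_student.pt", map_location="cpu"))
student.eval()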
models.py ADDED
@@ -0,0 +1,90 @@
+ # File: models.py
+ # ---------------
+ # All functions related to loading and saving models
+
+ import os
+ import torch
+ from utils import mkdir
+ import gc
+
+ import transformers
+ from transformers import GPT2LMHeadModel, GPT2Config
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
+ from vllm import LLM, SamplingParams
+
+ DEVICE = torch.device('cuda') if torch.cuda.is_available() \
+     else torch.device('cpu')
+
+
+ ## INITIALIZATION ##
+ def initialize_model_and_optimizers(cfg):
+     student = initialize_model(cfg)
+     optimizer = initialize_optimizer(cfg, student)
+     scheduler = initialize_scheduler(cfg, student, optimizer)
+     return student, optimizer, scheduler
+
+ def initialize_model(cfg):
+     # First load the student
+     size = "100m" if cfg['training_type'] == 'strict' else '10m'
+     # NOTE: `size` is currently unused; the config path below is hard-coded to the 10M model.
+     config = GPT2Config.from_pretrained("./03-models/gpt2_ParFindFast_10M")
+     student = GPT2LMHeadModel(config).to(DEVICE)
+     return student
+
+ def get_parameter_names(model, forbidden_layer_types):
+     """
+     Returns the names of the model parameters that are not inside a forbidden layer.
+     """
+     result = []
+     for name, child in model.named_children():
+         result += [
+             f"{name}.{n}"
+             for n in get_parameter_names(child, forbidden_layer_types)
+             if not isinstance(child, tuple(forbidden_layer_types))
+         ]
+     # Add model specific parameters (defined with nn.Parameter) since they are not in any child.
+     result += list(model._parameters.keys())
+     return result
+
+ def initialize_optimizer(cfg, student):
+     lr = cfg['learning_rate']
+     decay_parameters = get_parameter_names(student, ALL_LAYERNORM_LAYERS)
+     decay_parameters = [name for name in decay_parameters if "bias" not in name]
+     optimizer_grouped_parameters = [
+         {
+             "params": [
+                 p for n, p in student.named_parameters() if (n in decay_parameters and p.requires_grad)
+             ],
+             "weight_decay": cfg["weight_decay"],
+         },
+         {
+             "params": [
+                 p for n, p in student.named_parameters() if (n not in decay_parameters and p.requires_grad)
+             ],
+             "weight_decay": 0.0,
+         },
+     ]
+
+     optimizer = torch.optim.AdamW(
+         optimizer_grouped_parameters, lr=lr, eps=1e-8, betas=(0.9, 0.999)
+     )
+
+     return optimizer
+
+ def initialize_scheduler(cfg, student, optimizer):
+     num_training_steps = cfg["num_training_steps"]
+     num_warmup_steps = cfg["num_warmup_steps"]
+     scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps,
+                                                              num_training_steps=num_training_steps)
+     return scheduler
+
+ ## SAVING AND LOADING ##
+ def save_epoch_checkpoint(student, optimizer, scheduler, epoch, checkpoint_dir):
+     # Open a folder for the round
+     folder = os.path.join(checkpoint_dir, f'epoch_{epoch}')
+     mkdir(folder)
+
+     # Save the metrics and model
+     torch.save(optimizer.state_dict(), os.path.join(folder, 'latest_optimizer.pt'))
+     torch.save(scheduler.state_dict(), os.path.join(folder, 'latest_scheduler.pt'))
+     torch.save(student.state_dict(), os.path.join(folder, 'latest_student.pt'))
+     torch.save(student.state_dict(), os.path.join(folder, 'pytorch_model.bin'))
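
Hypothetical glue code (the training loop itself is not part of this commit) showing how the config, dataloader, and model/optimizer helpers fit together and where checkpoints are written each epoch:

import json
from data_utils import load_babylm_data
from models import initialize_model_and_optimizers, save_epoch_checkpoint

with open("config.json", encoding="utf-8") as f:
    cfg = json.load(f)

dataloader = load_babylm_data(cfg)
student, optimizer, scheduler = initialize_model_and_optimizers(cfg)
for epoch in range(cfg["n_epochs"]):
    # ... forward/backward passes over `dataloader` would go here ...
    save_epoch_checkpoint(student, optimizer, scheduler, epoch, cfg["output_dir"])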
paradigm_utils.py ADDED
@@ -0,0 +1,430 @@
+ # paradigm_utils.py
+
+ import time
+ from collections import defaultdict
+ from tqdm import tqdm
+ import os
+ import math
+ import json
+ from typing import List, Tuple, Set, Dict, Any
+
+ def _serialize_suffixes(sfx_set):
+     flat = []
+     for s in sfx_set:
+         if isinstance(s, tuple):
+             base, nested = s
+             flat.append([base, sorted(list(nested))])  # JSON-safe pair
+         else:
+             flat.append(s)  # plain string
+     # stable order: strings first, then pairs; then lexicographic
+     def key(x):
+         return (0, x) if isinstance(x, str) else (1, x[0], tuple(x[1]))
+     return sorted(flat, key=key)
+
+ def paradigms_to_json(paradigms):
+     out = []
+     for stems, suffixes in paradigms:
+         out.append({
+             "stems": sorted(list(stems)),
+             "suffixes": _serialize_suffixes(suffixes),
+         })
+     return out
+
+ def save_paradigms_json(paradigms, path, meta=None):
+     payload = {
+         "schema_version": 1,
+         "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+         "meta": meta or {},
+         "paradigms": paradigms_to_json(paradigms),
+     }
+     with open(path, "w", encoding="utf-8") as f:
+         json.dump(payload, f, ensure_ascii=False, indent=2)
+
+ def _deserialize_suffixes(sfx_list):
+     out = set()
+     for item in sfx_list:
+         if isinstance(item, list):  # [base, nested_list]
+             base, nested = item
+             out.add((base, frozenset(nested)))
+         else:
+             out.add(item)
+     return out
+
+ def load_paradigms_json(path):
+     with open(path, "r", encoding="utf-8") as f:
+         payload = json.load(f)
+     paradigms = []
+     for p in payload["paradigms"]:
+         stems = set(p["stems"])
+         suffixes = _deserialize_suffixes(p["suffixes"])
+         paradigms.append((stems, suffixes))
+     meta = payload.get("meta", {})
+     return paradigms, meta
+
+ ### -----------------------------
+ ### 1. Extract (stem, suffix) pairs from vocabulary
+ ### -----------------------------
+
+ def extract_stem_suffix_pairs(vocab):
+     """Return a mapping from stems to all suffixes they occur with, including null suffix."""
+     stem_to_suffixes = defaultdict(set)
+     for word in tqdm(vocab, desc="[1/7] Extracting stem-suffix pairs"):
+         for i in range(0, len(word) + 1):  # include empty suffix
+             stem, suffix = word[:i], word[i:]
+             stem_to_suffixes[stem].add(suffix)
+     return stem_to_suffixes
+
+ ### -----------------------------
+ ### 2. Group stems by shared suffix sets and normalize by common prefix
+ ### -----------------------------
+
+ def group_stems_by_suffixes(stem_to_suffixes, min_shared_stems=2, min_suffixes=2):
+     suffix_to_stems = defaultdict(set)
+     for stem, suffixes in stem_to_suffixes.items():
+         suffix_key = frozenset(suffixes)
+         suffix_to_stems[suffix_key].add(stem)
+
+     normalized_suffix_map = defaultdict(set)
+
+     for suffixes, stems in tqdm(suffix_to_stems.items(), desc="[2/7] Grouping and normalizing"):
+         non_empty_suffixes = [s for s in suffixes if s]
+         if len(stems) >= min_shared_stems and len(suffixes) >= min_suffixes:
+             common_prefix = os.path.commonprefix(non_empty_suffixes) if non_empty_suffixes else ""
+
+             if common_prefix:
+                 normalized_stems = {stem + common_prefix for stem in stems}
+                 adjusted_suffixes = {s[len(common_prefix):] if s.startswith(common_prefix) else s for s in suffixes}
+             else:
+                 normalized_stems = stems
+                 adjusted_suffixes = suffixes
+
+             if len(adjusted_suffixes) >= min_suffixes:
+                 suffix_key = frozenset(adjusted_suffixes)
+                 normalized_suffix_map[suffix_key].update(normalized_stems)
+
+     paradigms = [(stems, set(suffixes)) for suffixes, stems in normalized_suffix_map.items()]
+     return paradigms
+
+ ### -----------------------------
+ ### 3. Expand stem sets based on suffix set coverage
+ ### -----------------------------
+
+ def stem_set_expansion(paradigms, stem_to_suffixes):
+     updated = 0
+     suffix_to_stems = {frozenset(suffixes): set(stems) for stems, suffixes in paradigms}
+
+     for stem, suffixes in tqdm(stem_to_suffixes.items(), desc="[3/7] Expanding stem sets"):
+         added = False
+         for paradigm_suffixes in sorted(suffix_to_stems.keys(), key=lambda x: (-len(x), tuple(sorted(x)))):
+             if paradigm_suffixes.issubset(suffixes):
+                 if stem not in suffix_to_stems[paradigm_suffixes]:
+                     suffix_to_stems[paradigm_suffixes].add(stem)
+                     updated += 1
+                 added = True
+         if not added and stem == 'design':
+             print(f"[DEBUG] No suitable paradigm for 'design' with suffixes {suffixes}")
+
+     enriched = [(stems, set(suffixes)) for suffixes, stems in suffix_to_stems.items()]
+     print(f"✅ Added {updated} stems via stem set expansion.")
+     return enriched
+
+ ### -----------------------------
+ ### 4. Expand suffix sets based on partial compatibility
+ ### -----------------------------
+
+ def harmonic_number(n):
+     return sum(1.0 / i for i in range(1, n + 1))
+
+ def suffix_set_expansion(paradigms):
+     base = paradigms[:]  # snapshot
+     merged = [(set(stems), set(suffixes)) for stems, suffixes in base]
+     enriched_count = 0
+
+     # Iterate in a deterministic order
+     for i, (stems_i, suffixes_i) in enumerate(sort_paradigms(merged)):
+         for j, (stems_j, suffixes_j) in enumerate(sort_paradigms(merged)):
+             if i == j:
+                 continue
+             if suffixes_i > suffixes_j:
+                 intersection = stems_i & stems_j
+                 denom = max(1, len(stems_j))  # guard
+                 if (len(stems_j) - len(intersection)) < (len(stems_j) / harmonic_number(denom)):
+                     stems_i |= stems_j
+                     enriched_count += 1
+                     # do not mutate stems_j/suffixes_j further
+
+     print(f"\n✅ Enriched {enriched_count} paradigms via suffix set expansion.")
+     # Return back in original tuple-of-sets form
+     return [(set(st), set(sf)) for st, sf in sort_paradigms(merged)]
+
+ ### -----------------------------
+ ### 5. Prune subsumed stems
+ ### -----------------------------
+
+ def prune_subsumed_stems(paradigms):
+     pruned_paradigms = []
+     for i, (stems_i, suffixes_i) in enumerate(paradigms):
+         pruned_stems = set(stems_i)
+         for j, (stems_j, suffixes_j) in enumerate(paradigms):
+             if i == j:
+                 continue
+             if suffixes_j >= suffixes_i:
+                 pruned_stems -= (stems_j & stems_i)
+         if pruned_stems:
+             pruned_paradigms.append((pruned_stems, suffixes_i))
+     print(f"✅ Pruned to {len(pruned_paradigms)} paradigms after removing subsumed stems.")
+     return sort_paradigms(pruned_paradigms)
+
+ ### -----------------------------
+ ### 6. Sort paradigms by size
+ ### -----------------------------
+
+ def sort_paradigms(paradigms):
+     """
+     Primary: log(len(stems)) * log(len(suffixes))  (DESC)
+     Ties: (-len(stems), -len(suffixes), lexicographic stems, lexicographic suffix heads)
+     """
+     def score(p):
+         stems, suffixes = p
+         if stems and suffixes:
+             return math.log(len(stems)) * math.log(len(suffixes))
+         return 0.0
+
+     def tie_key(p):
+         stems, suffixes = p
+         sfx_heads = []
+         for s in suffixes:
+             sfx_heads.append(s[0] if isinstance(s, tuple) else s)
+         return (-len(stems), -len(suffixes),
+                 " ".join(sorted(stems)),
+                 " ".join(sorted(sfx_heads)))
+
+     return sorted(paradigms, key=lambda p: (-score(p), tie_key(p)))
+
+ def sort_paradigms_by_suffix_count(paradigms):
+     def score(p):
+         stem_count = len(p[0])
+         suffix_count = len(p[1])
+         if stem_count > 0 and suffix_count > 0:
+             return suffix_count
+         return 0
+     return sorted(paradigms, key=score, reverse=True)
+
+ def nest_suffixes_from_paradigms(paradigms):
+     print("[7/7] Nesting suffixes based on reusable paradigms...")
+
+     suffix_set_index = {frozenset(suffixes): True for _, suffixes in paradigms}
+     nested_paradigms = []
+
+     for stems, suffixes in paradigms:
+         suffixes_list = list(suffixes)
+         nested_suffixes = set()
+         used = set()
+
+         # deterministic nested pairing
+         for i, s1 in enumerate(sorted(suffixes_list)):
+             for j, s2 in enumerate(sorted(suffixes_list)):
+                 if i == j or s2 in used or not isinstance(s1, str) or not isinstance(s2, str):
+                     continue
+                 if s2.startswith(s1) and s1 != '':
+                     remainder = s2[len(s1):]
+                     if remainder and frozenset({'', remainder}) in suffix_set_index:
+                         nested_suffixes.add((s1, frozenset({'', remainder})))
+                         used.add(s2)
+                         used.add(s1)
+                         break
+
+         for s in suffixes_list:
+             if s not in used:
+                 nested_suffixes.add(s)
+
+         nested_paradigms.append((set(stems), nested_suffixes))
+
+     print(f"✅ Nested structure created for {len(nested_paradigms)} paradigms.")
+     return sort_paradigms(nested_paradigms)
+
+
+ def refine_nested_stem_conflicts(paradigms):
+     """
+     Remove stems from higher-ranked paradigms if they are fully explained by nested structures
+     in lower-ranked paradigms.
+
+     Args:
+         paradigms: list of (stem_set, suffix_set), where suffix_set may contain nested (str, frozenset) tuples
+
+     Returns:
+         Refined list of paradigms with redundant derived stems removed
+     """
+     refined_paradigms = paradigms[:]
+     all_suffix_sets = {frozenset(suffixes) for _, suffixes in paradigms}
+
+     # Build a mapping from nested suffix sets to their parent prefixes
+     derived_stems = set()
+     for stems, suffixes in paradigms:
+         for sfx in suffixes:
+             if isinstance(sfx, tuple):
+                 base, nested_suffixes = sfx
+                 if frozenset(nested_suffixes) in all_suffix_sets:
+                     for stem in stems:
+                         derived_stems.add(stem + base)
+
+     # Remove derived stems from paradigms with simple suffix sets (like ['', 's'])
+     updated_paradigms = []
+     for stems, suffixes in refined_paradigms:
+         cleaned_stems = stems - derived_stems
+         updated_paradigms.append((cleaned_stems, suffixes))
+
+     print(f"✅ Removed {len(derived_stems)} derived stems explained by nested paradigms.")
+     return updated_paradigms
+
+
+ ### -----------------------------
+ ### 7. Segment word based on ranked paradigms
+ ### -----------------------------
+ def recursive_fallback(word, suffix_set):
+     for suffix in sorted(suffix_set, key=lambda s: -len(s)):
+         if suffix and word.endswith(suffix):
+             stem_candidate = word[:-len(suffix)]
+             rest = recursive_fallback(stem_candidate, suffix_set)
+             return rest + [suffix]
+     return [word]  # fallback to whole word if nothing matches
+
+
+ ### -----------------------------
+ ### Main runner
+ ### -----------------------------
+
+ def run_paradigm_extraction(vocab, min_shared_stems=2, min_suffixes=2, enrich_suffix_sets=True):
+     start = time.time()
+     stem_to_suffixes = extract_stem_suffix_pairs(vocab)
+     paradigms = group_stems_by_suffixes(stem_to_suffixes, min_shared_stems, min_suffixes)
+     paradigms = stem_set_expansion(paradigms, stem_to_suffixes)
+     paradigms = sort_paradigms(paradigms)
+     paradigms = prune_subsumed_stems(paradigms)
+     paradigms = sort_paradigms(paradigms)
+     paradigms = nest_suffixes_from_paradigms(paradigms)
+     paradigms = refine_nested_stem_conflicts(paradigms)
+
+     paradigms = sort_paradigms(paradigms)
+     if enrich_suffix_sets:
+         print("[4/7] Expanding suffix sets based on partial compatibility...")
+         paradigms = suffix_set_expansion(paradigms)
+
+     paradigms = sort_paradigms(paradigms)
+     paradigms = prune_subsumed_stems(paradigms)
+     paradigms = sort_paradigms(paradigms)
+
+     '''# Fallback paradigm for unassigned full words
+     vocab_words = set(vocab)
+     assigned_words = set()
+     for stems, suffixes in paradigms:
+         for stem in stems:
+             for suffix in suffixes:
+                 if isinstance(suffix, tuple):
+                     base, _ = suffix
+                     assigned_words.add(stem + base)
+                 else:
+                     assigned_words.add(stem + suffix)
+
+     unassigned_words = vocab_words - assigned_words
+     if unassigned_words:
+         print(f"✅ {len(unassigned_words)} full words were not assigned to any paradigm, added fallback paradigm.")
+         paradigms.append((set(unassigned_words), frozenset({""})))
+
+     paradigms = sort_paradigms(paradigms)'''
+
+     print(f"\n✅ Extracted {len(paradigms)} paradigms.")
+     print(f"⏱️ Finished in {time.time() - start:.2f} seconds.")
+     return paradigms
+
+ def segment_word_from_nested_paradigms(word, paradigms, fallback=True, top_k=300):
+     """
+     Segment a word based on nested paradigms with optional fallback.
+
+     Parameters:
+         word (str): The word to segment.
+         paradigms (list): A list of tuples (stems, suffixes) with optional nesting.
+         fallback (bool): Whether to fall back on longest suffix match from top_k paradigms.
+         top_k (int): Number of top paradigms to consider in fallback.
+
+     Returns:
+         List[str]: Segmented pieces of the word.
+     """
+
+     def match_suffixes(suffixes, remainder):
+         """Recursive helper to match nested suffix structures."""
+         for suffix in suffixes:
+             if isinstance(suffix, tuple):
+                 base, nested = suffix
+                 if remainder.startswith(base):
+                     sub = remainder[len(base):]
+                     nested_result = match_suffixes(nested, sub)
+                     if nested_result is not None:
+                         return [base] + nested_result
+             elif remainder == suffix:
+                 return [suffix] if suffix else []
+         return None
+
+     # First pass: try full nested match
+     for stems, suffixes in paradigms:
+         for stem in stems:
+             if word.startswith(stem):
+                 remainder = word[len(stem):]
+                 matched_suffix = match_suffixes(suffixes, remainder)
+                 if matched_suffix is not None:
+                     return [stem] + matched_suffix
+
+     # Fallback strategy: longest suffix among top_k paradigms
+     if fallback:
+         seen_suffixes = set()
+
+         def collect_suffixes(suffixes):
+             for s in suffixes:
+                 if isinstance(s, tuple):
+                     seen_suffixes.add(s[0])
+                     collect_suffixes(s[1])
+                 else:
+                     seen_suffixes.add(s)
+
+         for _, suffixes in paradigms[:top_k]:
+             collect_suffixes(suffixes)
+
+         # Try matching the longest suffix first
+         for suffix in sorted(seen_suffixes, key=lambda s: -len(s)):
+             if suffix and word.endswith(suffix):
+                 stem = word[:-len(suffix)]
+                 return [stem, suffix]
+         return [word]
+
+     return [word]
+
+
+ def segment_word_from_paradigms(word, paradigms, top_k=20):
+     """
+     Simpler fallback-only version: match longest suffix among top_k paradigms.
+
+     Parameters:
+         word (str): Word to segment.
+         paradigms (list): Paradigm structures.
+         top_k (int): How many paradigms to consider.
+
+     Returns:
+         List[str]: Segmentation result.
+     """
+     candidates = paradigms[:top_k]
+     best_split = None
+     for stems, suffixes in candidates:
+         for suffix in sorted(suffixes, key=lambda s: -len(s) if isinstance(s, str) else -len(s[0])):
+             if isinstance(suffix, tuple):
+                 suffix = suffix[0]  # ignore nested for fallback
+             if word.endswith(suffix):
+                 stem_candidate = word[:-len(suffix)] if suffix else word
+                 if stem_candidate in stems:
+                     split = [stem_candidate, suffix] if suffix else [stem_candidate]
+                     if best_split is None or len(suffix) > len(best_split[-1]):
+                         best_split = split
+     return best_split or [word]
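
A toy end-to-end run of the pipeline (illustrative vocabulary only; the real runs use the BabyLM vocabulary and produce the paradigms.json shipped below; the output file name here is hypothetical):

from paradigm_utils import run_paradigm_extraction, segment_word_from_nested_paradigms, save_paradigms_json

# Tiny stand-in vocabulary with two shared inflection patterns.
vocab = ["walk", "walks", "walked", "walking",
         "talk", "talks", "talked", "talking", "cat", "cats"]
paradigms = run_paradigm_extraction(vocab, min_shared_stems=2, min_suffixes=2)
print(segment_word_from_nested_paradigms("walking", paradigms))  # expected: ['walk', 'ing']
save_paradigms_json(paradigms, "toy_paradigms.json",
                    meta={"min_shared_stems": 2, "min_suffixes": 2})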
paradigms.json ADDED
@@ -0,0 +1,5050 @@
1
+ {
2
+ "schema_version": 1,
3
+ "created_at": "2025-08-13T19:03:27Z",
4
+ "meta": {
5
+ "min_shared_stems": 2,
6
+ "min_suffixes": 2,
7
+ "enrich_suffix_sets": true,
8
+ "fallback_top_k": 20,
9
+ "normalization": {
10
+ "lowercase": true,
11
+ "separate_apostrophes": false,
12
+ "separate_digits": true,
13
+ "separate_punctuation": true
14
+ }
15
+ },
16
+ "paradigms": [
17
+ {
18
+ "stems": [
19
+ "add",
20
+ "allow",
21
+ "answer",
22
+ "appear",
23
+ "ask",
24
+ "attack",
25
+ "attempt",
26
+ "back",
27
+ "bang",
28
+ "belong",
29
+ "block",
30
+ "call",
31
+ "check",
32
+ "cheer",
33
+ "claim",
34
+ "climb",
35
+ "color",
36
+ "concern",
37
+ "cook",
38
+ "cover",
39
+ "crawl",
40
+ "depend",
41
+ "dream",
42
+ "end",
43
+ "enter",
44
+ "exist",
45
+ "explain",
46
+ "extend",
47
+ "fear",
48
+ "film",
49
+ "gasp",
50
+ "groan",
51
+ "growl",
52
+ "hand",
53
+ "happen",
54
+ "head",
55
+ "interest",
56
+ "kick",
57
+ "knock",
58
+ "land",
59
+ "laugh",
60
+ "lean",
61
+ "lift",
62
+ "listen",
63
+ "look",
64
+ "lower",
65
+ "need",
66
+ "offer",
67
+ "order",
68
+ "pick",
69
+ "point",
70
+ "pretend",
71
+ "print",
72
+ "pull",
73
+ "question",
74
+ "rank",
75
+ "regard",
76
+ "remain",
77
+ "remember",
78
+ "repeat",
79
+ "represent",
80
+ "result",
81
+ "return",
82
+ "row",
83
+ "scream",
84
+ "seem",
85
+ "shout",
86
+ "sign",
87
+ "sort",
88
+ "sound",
89
+ "spell",
90
+ "start",
91
+ "stay",
92
+ "suggest",
93
+ "talk",
94
+ "test",
95
+ "train",
96
+ "want",
97
+ "whisper",
98
+ "wonder",
99
+ "yell"
100
+ ],
101
+ "suffixes": [
102
+ "",
103
+ "ed",
104
+ "ing",
105
+ "s"
106
+ ]
107
+ },
108
+ {
109
+ "stems": [
110
+ "command",
111
+ "contain",
112
+ "count",
113
+ "flow",
114
+ "form",
115
+ "help",
116
+ "jump",
117
+ "kill",
118
+ "light",
119
+ "park",
120
+ "play",
121
+ "record",
122
+ "report",
123
+ "roll",
124
+ "show",
125
+ "turn",
126
+ "walk",
127
+ "work"
128
+ ],
129
+ "suffixes": [
130
+ "",
131
+ "ed",
132
+ "er",
133
+ "ing",
134
+ "s"
135
+ ]
136
+ },
137
+ {
138
+ "stems": [
139
+ "'a",
140
+ "accident",
141
+ "account",
142
+ "acre",
143
+ "adult",
144
+ "advantage",
145
+ "adventure",
146
+ "affair",
147
+ "afterward",
148
+ "agent",
149
+ "aid",
150
+ "airplane",
151
+ "airport",
152
+ "album",
153
+ "alligator",
154
+ "american",
155
+ "amount",
156
+ "ancestor",
157
+ "angel",
158
+ "angle",
159
+ "animal",
160
+ "ant",
161
+ "anyway",
162
+ "appearance",
163
+ "application",
164
+ "area",
165
+ "argument",
166
+ "arrangement",
167
+ "arrondissement",
168
+ "arrow",
169
+ "art",
170
+ "article",
171
+ "aspect",
172
+ "athlete",
173
+ "backward",
174
+ "bag",
175
+ "ball",
176
+ "balloon",
177
+ "banana",
178
+ "band",
179
+ "bandage",
180
+ "bank",
181
+ "barrel",
182
+ "basket",
183
+ "battle",
184
+ "bean",
185
+ "beast",
186
+ "begin",
187
+ "being",
188
+ "belief",
189
+ "benefit",
190
+ "beside",
191
+ "bird",
192
+ "biscuit",
193
+ "black",
194
+ "blade",
195
+ "blanket",
196
+ "blue",
197
+ "boat",
198
+ "bond",
199
+ "bone",
200
+ "book",
201
+ "bottle",
202
+ "brain",
203
+ "brake",
204
+ "brick",
205
+ "bubble",
206
+ "buck",
207
+ "bug",
208
+ "bullet",
209
+ "bun",
210
+ "button",
211
+ "cadet",
212
+ "cake",
213
+ "camera",
214
+ "canadian",
215
+ "candidate",
216
+ "candle",
217
+ "canton",
218
+ "cap",
219
+ "captive",
220
+ "carrot",
221
+ "cartoon",
222
+ "case",
223
+ "cave",
224
+ "cd",
225
+ "cell",
226
+ "center",
227
+ "chain",
228
+ "chamber",
229
+ "chance",
230
+ "channel",
231
+ "chap",
232
+ "character",
233
+ "characteristic",
234
+ "chemical",
235
+ "chick",
236
+ "chicken",
237
+ "chip",
238
+ "chum",
239
+ "cigarette",
240
+ "circle",
241
+ "circumstance",
242
+ "citizen",
243
+ "client",
244
+ "clip",
245
+ "clock",
246
+ "cloud",
247
+ "club",
248
+ "coat",
249
+ "coin",
250
+ "college",
251
+ "column",
252
+ "comb",
253
+ "comic",
254
+ "committee",
255
+ "commune",
256
+ "competition",
257
+ "complaint",
258
+ "compound",
259
+ "computer",
260
+ "comrade",
261
+ "concert",
262
+ "condition",
263
+ "cone",
264
+ "consequence",
265
+ "contract",
266
+ "control",
267
+ "conversation",
268
+ "corner",
269
+ "corsair",
270
+ "cost",
271
+ "course",
272
+ "court",
273
+ "cousin",
274
+ "cow",
275
+ "cowboy",
276
+ "crab",
277
+ "crayon",
278
+ "creature",
279
+ "creek",
280
+ "crew",
281
+ "crime",
282
+ "criminal",
283
+ "crisp",
284
+ "crop",
285
+ "cup",
286
+ "curtain",
287
+ "cushion",
288
+ "customer",
289
+ "cut",
290
+ "d'",
291
+ "dane",
292
+ "death",
293
+ "debt",
294
+ "decade",
295
+ "decision",
296
+ "deck",
297
+ "deed",
298
+ "degree",
299
+ "department",
300
+ "depth",
301
+ "description",
302
+ "detective",
303
+ "device",
304
+ "dialect",
305
+ "diamond",
306
+ "diaper",
307
+ "dinner",
308
+ "dinosaur",
309
+ "disease",
310
+ "district",
311
+ "division",
312
+ "document",
313
+ "dodd",
314
+ "dollar",
315
+ "dolphin",
316
+ "door",
317
+ "dot",
318
+ "doughnut",
319
+ "down",
320
+ "dozen",
321
+ "dragon",
322
+ "drug",
323
+ "drum",
324
+ "dwelling",
325
+ "eagle",
326
+ "economic",
327
+ "edge",
328
+ "edward",
329
+ "effect",
330
+ "effort",
331
+ "egg",
332
+ "el",
333
+ "element",
334
+ "elephant",
335
+ "employee",
336
+ "envelope",
337
+ "episode",
338
+ "equation",
339
+ "error",
340
+ "european",
341
+ "evan",
342
+ "evening",
343
+ "event",
344
+ "example",
345
+ "exception",
346
+ "exercise",
347
+ "expense",
348
+ "expert",
349
+ "fact",
350
+ "fan",
351
+ "fault",
352
+ "favorite",
353
+ "feather",
354
+ "fella",
355
+ "fellow",
356
+ "female",
357
+ "field",
358
+ "finger",
359
+ "fit",
360
+ "flag",
361
+ "flame",
362
+ "flat",
363
+ "flight",
364
+ "foe",
365
+ "folk",
366
+ "food",
367
+ "forest",
368
+ "fork",
369
+ "fortune",
370
+ "forward",
371
+ "france",
372
+ "frog",
373
+ "fruit",
374
+ "function",
375
+ "galley",
376
+ "game",
377
+ "garden",
378
+ "gate",
379
+ "gene",
380
+ "generation",
381
+ "gesture",
382
+ "get",
383
+ "ghost",
384
+ "giant",
385
+ "gift",
386
+ "glove",
387
+ "goal",
388
+ "governor",
389
+ "grab",
390
+ "grape",
391
+ "greek",
392
+ "gro",
393
+ "gros",
394
+ "ground",
395
+ "group",
396
+ "guest",
397
+ "gun",
398
+ "habit",
399
+ "heel",
400
+ "height",
401
+ "helicopter",
402
+ "hill",
403
+ "his",
404
+ "historian",
405
+ "hit",
406
+ "holiday",
407
+ "hotel",
408
+ "hour",
409
+ "household",
410
+ "hum",
411
+ "hundred",
412
+ "hymn",
413
+ "idea",
414
+ "image",
415
+ "incident",
416
+ "indian",
417
+ "individual",
418
+ "infection",
419
+ "instrument",
420
+ "intention",
421
+ "interview",
422
+ "island",
423
+ "item",
424
+ "jacket",
425
+ "jap",
426
+ "jaw",
427
+ "jean",
428
+ "jet",
429
+ "job",
430
+ "joint",
431
+ "key",
432
+ "kitten",
433
+ "knee",
434
+ "knight",
435
+ "knot",
436
+ "label",
437
+ "ladder",
438
+ "lake",
439
+ "lamb",
440
+ "lamp",
441
+ "language",
442
+ "lawyer",
443
+ "league",
444
+ "lecture",
445
+ "legend",
446
+ "lesson",
447
+ "letter",
448
+ "level",
449
+ "limb",
450
+ "lip",
451
+ "lord",
452
+ "lot",
453
+ "luca",
454
+ "lung",
455
+ "machine",
456
+ "magazine",
457
+ "male",
458
+ "map",
459
+ "marble",
460
+ "marine",
461
+ "material",
462
+ "math",
463
+ "matter",
464
+ "meal",
465
+ "medal",
466
+ "message",
467
+ "metal",
468
+ "meter",
469
+ "method",
470
+ "mile",
471
+ "million",
472
+ "minister",
473
+ "minute",
474
+ "missile",
475
+ "model",
476
+ "monkey",
477
+ "monster",
478
+ "morning",
479
+ "mountain",
480
+ "mouth",
481
+ "mr",
482
+ "muscle",
483
+ "mushroom",
484
+ "musician",
485
+ "muslim",
486
+ "nail",
487
+ "needle",
488
+ "neighbor",
489
+ "nerve",
490
+ "net",
491
+ "network",
492
+ "newspaper",
493
+ "novel",
494
+ "oar",
495
+ "object",
496
+ "occur",
497
+ "odd",
498
+ "olympic",
499
+ "oop",
500
+ "opinion",
501
+ "option",
502
+ "orange",
503
+ "organisation",
504
+ "organism",
505
+ "organization",
506
+ "our",
507
+ "owl",
508
+ "package",
509
+ "page",
510
+ "pancake",
511
+ "paper",
512
+ "parcel",
513
+ "parent",
514
+ "parson",
515
+ "particle",
516
+ "passenger",
517
+ "path",
518
+ "patient",
519
+ "pattern",
520
+ "paw",
521
+ "peanut",
522
+ "pen",
523
+ "pencil",
524
+ "penguin",
525
+ "pension",
526
+ "pepper",
527
+ "performance",
528
+ "period",
529
+ "peter",
530
+ "photo",
531
+ "pickle",
532
+ "picture",
533
+ "piece",
534
+ "pig",
535
+ "pill",
536
+ "pillow",
537
+ "pilot",
538
+ "pin",
539
+ "pirate",
540
+ "planet",
541
+ "plate",
542
+ "pleasure",
543
+ "plebe",
544
+ "plum",
545
+ "pocket",
546
+ "poem",
547
+ "pole",
548
+ "politician",
549
+ "pop",
550
+ "position",
551
+ "pot",
552
+ "power",
553
+ "prefecture",
554
+ "preparation",
555
+ "price",
556
+ "priest",
557
+ "principle",
558
+ "prisoner",
559
+ "prize",
560
+ "problem",
561
+ "procedure",
562
+ "product",
563
+ "production",
564
+ "profit",
565
+ "program",
566
+ "project",
567
+ "proposal",
568
+ "prospect",
569
+ "province",
570
+ "provision",
571
+ "pupil",
572
+ "purpose",
573
+ "put",
574
+ "quarter",
575
+ "rabbit",
576
+ "rat",
577
+ "rate",
578
+ "ray",
579
+ "reason",
580
+ "refer",
581
+ "reference",
582
+ "representative",
583
+ "reptile",
584
+ "resource",
585
+ "restaurant",
586
+ "review",
587
+ "rhyme",
588
+ "ribbon",
589
+ "richard",
590
+ "risk",
591
+ "river",
592
+ "road",
593
+ "rocket",
594
+ "roger",
595
+ "role",
596
+ "roman",
597
+ "room",
598
+ "root",
599
+ "rope",
600
+ "round",
601
+ "route",
602
+ "run",
603
+ "russian",
604
+ "saint",
605
+ "sale",
606
+ "sample",
607
+ "sausage",
608
+ "savage",
609
+ "saving",
610
+ "scale",
611
+ "scene",
612
+ "scheme",
613
+ "school",
614
+ "science",
615
+ "scientist",
616
+ "scout",
617
+ "season",
618
+ "sense",
619
+ "servant",
620
+ "service",
621
+ "session",
622
+ "settlement",
623
+ "shadow",
624
+ "shark",
625
+ "sheet",
626
+ "shell",
627
+ "shirt",
628
+ "shoe",
629
+ "shop",
630
+ "shore",
631
+ "shot",
632
+ "shoulder",
633
+ "shriek",
634
+ "shrine",
635
+ "side",
636
+ "signal",
637
+ "sin",
638
+ "single",
639
+ "sit",
640
+ "site",
641
+ "skill",
642
+ "skirt",
643
+ "sleeve",
644
+ "smack",
645
+ "snake",
646
+ "sneeze",
647
+ "sniff",
648
+ "sock",
649
+ "soldier",
650
+ "sometime",
651
+ "song",
652
+ "soul",
653
+ "source",
654
+ "space",
655
+ "spear",
656
+ "spider",
657
+ "spirit",
658
+ "spoon",
659
+ "spot",
660
+ "square",
661
+ "squeak",
662
+ "squeeze",
663
+ "squirrel",
664
+ "stable",
665
+ "stack",
666
+ "stage",
667
+ "stamp",
668
+ "standard",
669
+ "statue",
670
+ "stone",
671
+ "storm",
672
+ "stranger",
673
+ "strap",
674
+ "street",
675
+ "string",
676
+ "strip",
677
+ "stroke",
678
+ "structure",
679
+ "student",
680
+ "studio",
681
+ "style",
682
+ "subject",
683
+ "suburb",
684
+ "sunday",
685
+ "surrounding",
686
+ "suspicion",
687
+ "sweet",
688
+ "symbol",
689
+ "system",
690
+ "table",
691
+ "tail",
692
+ "tale",
693
+ "tap",
694
+ "tape",
695
+ "target",
696
+ "task",
697
+ "team",
698
+ "technique",
699
+ "temperature",
700
+ "temple",
701
+ "tenant",
702
+ "their",
703
+ "thing",
704
+ "thought",
705
+ "thousand",
706
+ "threat",
707
+ "ticket",
708
+ "tiger",
709
+ "tip",
710
+ "toad",
711
+ "tool",
712
+ "topic",
713
+ "toward",
714
+ "towel",
715
+ "tower",
716
+ "toy",
717
+ "treasure",
718
+ "tree",
719
+ "trial",
720
+ "triangle",
721
+ "trip",
722
+ "truck",
723
+ "trunk",
724
+ "turtle",
725
+ "twin",
726
+ "type",
727
+ "union",
728
+ "up",
729
+ "upward",
730
+ "value",
731
+ "vegetable",
732
+ "vehicle",
733
+ "verse",
734
+ "vessel",
735
+ "victim",
736
+ "video",
737
+ "village",
738
+ "vocalize",
739
+ "vol",
740
+ "volume",
741
+ "wall",
742
+ "warrior",
743
+ "way",
744
+ "weapon",
745
+ "weed",
746
+ "well",
747
+ "whale",
748
+ "wheel",
749
+ "white",
750
+ "whoop",
751
+ "william",
752
+ "window",
753
+ "wing",
754
+ "winner",
755
+ "winter",
756
+ "worm",
757
+ "wrestler",
758
+ "writing",
759
+ "your"
760
+ ],
761
+ "suffixes": [
762
+ "",
763
+ "s"
764
+ ]
765
+ },
766
+ {
767
+ "stems": [
768
+ "arriv",
769
+ "believ",
770
+ "breath",
771
+ "caus",
772
+ "chang",
773
+ "charg",
774
+ "chuckl",
775
+ "clos",
776
+ "continu",
777
+ "creat",
778
+ "danc",
779
+ "dat",
780
+ "decid",
781
+ "describ",
782
+ "examin",
783
+ "fac",
784
+ "fir",
785
+ "forc",
786
+ "glanc",
787
+ "hop",
788
+ "includ",
789
+ "increas",
790
+ "indicat",
791
+ "involv",
792
+ "judg",
793
+ "lik",
794
+ "liv",
795
+ "lov",
796
+ "mov",
797
+ "notic",
798
+ "operat",
799
+ "ow",
800
+ "plac",
801
+ "produc",
802
+ "promis",
803
+ "provid",
804
+ "r",
805
+ "rais",
806
+ "remov",
807
+ "rul",
808
+ "serv",
809
+ "shar",
810
+ "smil",
811
+ "star",
812
+ "struggl",
813
+ "surpris",
814
+ "us",
815
+ "voic",
816
+ "vot",
817
+ "wav",
818
+ "wip"
819
+ ],
820
+ "suffixes": [
821
+ "es",
822
+ "ing",
823
+ [
824
+ "e",
825
+ [
826
+ "",
827
+ "d"
828
+ ]
829
+ ]
830
+ ]
831
+ },
832
+ {
833
+ "stems": [
834
+ "affect",
835
+ "arm",
836
+ "award",
837
+ "border",
838
+ "bound",
839
+ "bow",
840
+ "colour",
841
+ "comment",
842
+ "content",
843
+ "demand",
844
+ "design",
845
+ "detail",
846
+ "di",
847
+ "doubt",
848
+ "flood",
849
+ "fold",
850
+ "guard",
851
+ "hat",
852
+ "heart",
853
+ "honor",
854
+ "honour",
855
+ "hook",
856
+ "host",
857
+ "interrupt",
858
+ "limit",
859
+ "list",
860
+ "lock",
861
+ "mark",
862
+ "mention",
863
+ "mind",
864
+ "murder",
865
+ "own",
866
+ "plant",
867
+ "post",
868
+ "protest",
869
+ "remark",
870
+ "remind",
871
+ "request",
872
+ "respect",
873
+ "respond",
874
+ "ruin",
875
+ "screw",
876
+ "seal",
877
+ "seat",
878
+ "suit",
879
+ "thank",
880
+ "unit",
881
+ "view",
882
+ "volunteer",
883
+ "wound"
884
+ ],
885
+ "suffixes": [
886
+ "",
887
+ "ed",
888
+ "s"
889
+ ]
890
+ },
891
+ {
892
+ "stems": [
893
+ "approach",
894
+ "attend",
895
+ "avoid",
896
+ "boil",
897
+ "bother",
898
+ "complain",
899
+ "consider",
900
+ "crash",
901
+ "cross",
902
+ "dash",
903
+ "destroy",
904
+ "drown",
905
+ "earn",
906
+ "echo",
907
+ "expect",
908
+ "fill",
909
+ "fix",
910
+ "flash",
911
+ "float",
912
+ "fuck",
913
+ "gain",
914
+ "gather",
915
+ "guess",
916
+ "hang",
917
+ "heat",
918
+ "leap",
919
+ "lick",
920
+ "march",
921
+ "mess",
922
+ "mix",
923
+ "pack",
924
+ "perform",
925
+ "pour",
926
+ "rest",
927
+ "rush",
928
+ "search",
929
+ "shift",
930
+ "smash",
931
+ "starr",
932
+ "strain",
933
+ "stretch",
934
+ "suffer",
935
+ "trust",
936
+ "wander"
937
+ ],
938
+ "suffixes": [
939
+ "",
940
+ "ed",
941
+ "ing"
942
+ ]
943
+ },
944
+ {
945
+ "stems": [
946
+ "adam",
947
+ "boy",
948
+ "bro",
949
+ "brother",
950
+ "cat",
951
+ "chi",
952
+ "color",
953
+ "council",
954
+ "daughter",
955
+ "day",
956
+ "doctor",
957
+ "dog",
958
+ "doll",
959
+ "father",
960
+ "friend",
961
+ "girl",
962
+ "god",
963
+ "government",
964
+ "guy",
965
+ "it",
966
+ "king",
967
+ "let",
968
+ "mark",
969
+ "master",
970
+ "moment",
971
+ "mother",
972
+ "name",
973
+ "night",
974
+ "number",
975
+ "one",
976
+ "other",
977
+ "people",
978
+ "person",
979
+ "queen",
980
+ "ship",
981
+ "sister",
982
+ "son",
983
+ "steven",
984
+ "water",
985
+ "week",
986
+ "world",
987
+ "year"
988
+ ],
989
+ "suffixes": [
990
+ "",
991
+ "'s",
992
+ "s"
993
+ ]
994
+ },
995
+ {
996
+ "stems": [
997
+ "address",
998
+ "breath",
999
+ "brush",
1000
+ "dat",
1001
+ "dress",
1002
+ "finish",
1003
+ "hop",
1004
+ "kiss",
1005
+ "miss",
1006
+ "ow",
1007
+ "pass",
1008
+ "push",
1009
+ "reach",
1010
+ "star",
1011
+ "touch",
1012
+ "us",
1013
+ "watch",
1014
+ "wish"
1015
+ ],
1016
+ "suffixes": [
1017
+ "",
1018
+ "ed",
1019
+ "es",
1020
+ "ing"
1021
+ ]
1022
+ },
1023
+ {
1024
+ "stems": [
1025
+ "age",
1026
+ "base",
1027
+ "bu",
1028
+ "deserve",
1029
+ "desire",
1030
+ "di",
1031
+ "estimate",
1032
+ "experience",
1033
+ "eye",
1034
+ "feature",
1035
+ "fee",
1036
+ "figure",
1037
+ "file",
1038
+ "han",
1039
+ "hate",
1040
+ "her",
1041
+ "hi",
1042
+ "influence",
1043
+ "issue",
1044
+ "la",
1045
+ "lie",
1046
+ "measure",
1047
+ "name",
1048
+ "phone",
1049
+ "pile",
1050
+ "prove",
1051
+ "puzzle",
1052
+ "recognize",
1053
+ "release",
1054
+ "score",
1055
+ "sentence",
1056
+ "shape",
1057
+ "size",
1058
+ "tie",
1059
+ "tire",
1060
+ "title",
1061
+ "trouble",
1062
+ "win"
1063
+ ],
1064
+ "suffixes": [
1065
+ "",
1066
+ "d",
1067
+ "s"
1068
+ ]
1069
+ },
1070
+ {
1071
+ "stems": [
1072
+ "beep",
1073
+ "bend",
1074
+ "bit",
1075
+ "board",
1076
+ "bowl",
1077
+ "break",
1078
+ "click",
1079
+ "cough",
1080
+ "drink",
1081
+ "engineer",
1082
+ "feed",
1083
+ "fool",
1084
+ "fund",
1085
+ "grunt",
1086
+ "hurt",
1087
+ "mean",
1088
+ "neighbour",
1089
+ "pay",
1090
+ "pound",
1091
+ "ring",
1092
+ "send",
1093
+ "spend",
1094
+ "sport",
1095
+ "spring",
1096
+ "squeal",
1097
+ "stand",
1098
+ "stream",
1099
+ "suck",
1100
+ "swing",
1101
+ "tear",
1102
+ "tour",
1103
+ "track",
1104
+ "understand"
1105
+ ],
1106
+ "suffixes": [
1107
+ "",
1108
+ "ing",
1109
+ "s"
1110
+ ]
1111
+ },
1112
+ {
1113
+ "stems": [
1114
+ "arm",
1115
+ "bell",
1116
+ "bill",
1117
+ "bu",
1118
+ "cheek",
1119
+ "cop",
1120
+ "director",
1121
+ "doll",
1122
+ "duck",
1123
+ "factor",
1124
+ "ga",
1125
+ "german",
1126
+ "good",
1127
+ "ha",
1128
+ "heart",
1129
+ "jo",
1130
+ "la",
1131
+ "lad",
1132
+ "photograph",
1133
+ "ra",
1134
+ "sand",
1135
+ "smell",
1136
+ "ton",
1137
+ "trick",
1138
+ "wa",
1139
+ "wood"
1140
+ ],
1141
+ "suffixes": [
1142
+ "",
1143
+ "s",
1144
+ "y"
1145
+ ]
1146
+ },
1147
+ {
1148
+ "stems": [
1149
+ "bomb",
1150
+ "build",
1151
+ "draw",
1152
+ "farm",
1153
+ "fight",
1154
+ "flow",
1155
+ "lay",
1156
+ "play",
1157
+ "read",
1158
+ "sing",
1159
+ "speak",
1160
+ "stick",
1161
+ "work"
1162
+ ],
1163
+ "suffixes": [
1164
+ "",
1165
+ "ing",
1166
+ "s",
1167
+ [
1168
+ "er",
1169
+ [
1170
+ "",
1171
+ "s"
1172
+ ]
1173
+ ]
1174
+ ]
1175
+ },
1176
+ {
1177
+ "stems": [
1178
+ "cheer",
1179
+ "count",
1180
+ "hand",
1181
+ "part",
1182
+ "rain",
1183
+ "read",
1184
+ "rock",
1185
+ "sleep",
1186
+ "stick",
1187
+ "tell",
1188
+ "wear",
1189
+ "wind"
1190
+ ],
1191
+ "suffixes": [
1192
+ "",
1193
+ "ing",
1194
+ "s",
1195
+ "y"
1196
+ ]
1197
+ },
1198
+ {
1199
+ "stems": [
1200
+ "'i",
1201
+ "bridge",
1202
+ "bu",
1203
+ "ca",
1204
+ "co",
1205
+ "deal",
1206
+ "do",
1207
+ "fee",
1208
+ "ha",
1209
+ "hi",
1210
+ "les",
1211
+ "lo",
1212
+ "los",
1213
+ "mean",
1214
+ "nut",
1215
+ "pain",
1216
+ "plan",
1217
+ "plane",
1218
+ "po",
1219
+ "ra",
1220
+ "sea",
1221
+ "si",
1222
+ "ye"
1223
+ ],
1224
+ "suffixes": [
1225
+ "",
1226
+ "s",
1227
+ "t"
1228
+ ]
1229
+ },
1230
+ {
1231
+ "stems": [
1232
+ "absolute",
1233
+ "according",
1234
+ "actual",
1235
+ "anxious",
1236
+ "apparent",
1237
+ "awful",
1238
+ "bad",
1239
+ "bare",
1240
+ "beautiful",
1241
+ "bitter",
1242
+ "bold",
1243
+ "brief",
1244
+ "calm",
1245
+ "careful",
1246
+ "certain",
1247
+ "cheerful",
1248
+ "comparative",
1249
+ "constant",
1250
+ "curious",
1251
+ "current",
1252
+ "dead",
1253
+ "decided",
1254
+ "definite",
1255
+ "desperate",
1256
+ "dreadful",
1257
+ "eager",
1258
+ "earnest",
1259
+ "effective",
1260
+ "entire",
1261
+ "essential",
1262
+ "exact",
1263
+ "exceeding",
1264
+ "excited",
1265
+ "extreme",
1266
+ "fair",
1267
+ "fierce",
1268
+ "firm",
1269
+ "former",
1270
+ "fortunate",
1271
+ "frank",
1272
+ "frequent",
1273
+ "general",
1274
+ "glad",
1275
+ "grim",
1276
+ "ho",
1277
+ "honest",
1278
+ "hurried",
1279
+ "immediate",
1280
+ "impatient",
1281
+ "initial",
1282
+ "joyful",
1283
+ "main",
1284
+ "mere",
1285
+ "most",
1286
+ "natural",
1287
+ "neat",
1288
+ "normal",
1289
+ "obvious",
1290
+ "partial",
1291
+ "particular",
1292
+ "physical",
1293
+ "polite",
1294
+ "poor",
1295
+ "positive",
1296
+ "practical",
1297
+ "previous",
1298
+ "private",
1299
+ "proper",
1300
+ "proud",
1301
+ "quiet",
1302
+ "rapid",
1303
+ "rare",
1304
+ "recent",
1305
+ "regular",
1306
+ "repeated",
1307
+ "rough",
1308
+ "sad",
1309
+ "scarce",
1310
+ "serious",
1311
+ "severe",
1312
+ "sharp",
1313
+ "similar",
1314
+ "simultaneous",
1315
+ "sole",
1316
+ "solemn",
1317
+ "sore",
1318
+ "special",
1319
+ "stern",
1320
+ "strict",
1321
+ "successful",
1322
+ "sudden",
1323
+ "sufficient",
1324
+ "sure",
1325
+ "swift",
1326
+ "technical",
1327
+ "tel",
1328
+ "thorough",
1329
+ "thoughtful",
1330
+ "tick",
1331
+ "tight",
1332
+ "total",
1333
+ "typical",
1334
+ "ultimate",
1335
+ "unfortunate",
1336
+ "unlike",
1337
+ "usual",
1338
+ "wild",
1339
+ "willing",
1340
+ "wonderful"
1341
+ ],
1342
+ "suffixes": [
1343
+ "",
1344
+ "ly"
1345
+ ]
1346
+ },
1347
+ {
1348
+ "stems": [
1349
+ "abandon",
1350
+ "accept",
1351
+ "accomplish",
1352
+ "adopt",
1353
+ "afford",
1354
+ "aim",
1355
+ "air",
1356
+ "alarm",
1357
+ "appeal",
1358
+ "arrest",
1359
+ "assist",
1360
+ "betray",
1361
+ "borrow",
1362
+ "bump",
1363
+ "bust",
1364
+ "conduct",
1365
+ "convert",
1366
+ "crowd",
1367
+ "crown",
1368
+ "crush",
1369
+ "dart",
1370
+ "debut",
1371
+ "defeat",
1372
+ "delay",
1373
+ "delight",
1374
+ "deposit",
1375
+ "desert",
1376
+ "detect",
1377
+ "disappear",
1378
+ "display",
1379
+ "distinguish",
1380
+ "distress",
1381
+ "disturb",
1382
+ "doom",
1383
+ "draft",
1384
+ "dread",
1385
+ "drift",
1386
+ "dump",
1387
+ "expand",
1388
+ "ey",
1389
+ "fad",
1390
+ "fashion",
1391
+ "flush",
1392
+ "focus",
1393
+ "fri",
1394
+ "frighten",
1395
+ "grant",
1396
+ "grasp",
1397
+ "hail",
1398
+ "halt",
1399
+ "haul",
1400
+ "insist",
1401
+ "intend",
1402
+ "jar",
1403
+ "last",
1404
+ "launch",
1405
+ "leak",
1406
+ "lin",
1407
+ "link",
1408
+ "maintain",
1409
+ "melt",
1410
+ "mount",
1411
+ "obey",
1412
+ "obtain",
1413
+ "peep",
1414
+ "piss",
1415
+ "pitch",
1416
+ "poison",
1417
+ "polish",
1418
+ "premier",
1419
+ "prevent",
1420
+ "recall",
1421
+ "recommend",
1422
+ "reform",
1423
+ "register",
1424
+ "relax",
1425
+ "render",
1426
+ "repair",
1427
+ "retain",
1428
+ "reveal",
1429
+ "scratch",
1430
+ "se",
1431
+ "shock",
1432
+ "sight",
1433
+ "snatch",
1434
+ "spill",
1435
+ "spoil",
1436
+ "stuff",
1437
+ "succeed",
1438
+ "suspect",
1439
+ "swallow",
1440
+ "switch",
1441
+ "talent",
1442
+ "thrill",
1443
+ "ti",
1444
+ "toss",
1445
+ "tripp",
1446
+ "tuck",
1447
+ "twist",
1448
+ "we",
1449
+ "weigh",
1450
+ "witness",
1451
+ "wreck",
1452
+ "yield"
1453
+ ],
1454
+ "suffixes": [
1455
+ "",
1456
+ "ed"
1457
+ ]
1458
+ },
1459
+ {
1460
+ "stems": [
1461
+ "acknowledge",
1462
+ "ad",
1463
+ "advise",
1464
+ "an",
1465
+ "approve",
1466
+ "ar",
1467
+ "assume",
1468
+ "assure",
1469
+ "ban",
1470
+ "be",
1471
+ "behave",
1472
+ "ben",
1473
+ "bi",
1474
+ "blame",
1475
+ "capture",
1476
+ "cease",
1477
+ "cla",
1478
+ "col",
1479
+ "compare",
1480
+ "compete",
1481
+ "convince",
1482
+ "cor",
1483
+ "crow",
1484
+ "cure",
1485
+ "da",
1486
+ "damage",
1487
+ "decline",
1488
+ "define",
1489
+ "discharge",
1490
+ "divide",
1491
+ "divorce",
1492
+ "en",
1493
+ "enable",
1494
+ "escape",
1495
+ "exchange",
1496
+ "explode",
1497
+ "for",
1498
+ "fun",
1499
+ "go",
1500
+ "gran",
1501
+ "guide",
1502
+ "har",
1503
+ "hee",
1504
+ "hin",
1505
+ "hire",
1506
+ "hoo",
1507
+ "ignore",
1508
+ "induce",
1509
+ "inquire",
1510
+ "introduce",
1511
+ "ki",
1512
+ "kin",
1513
+ "lai",
1514
+ "lea",
1515
+ "lou",
1516
+ "men",
1517
+ "mi",
1518
+ "min",
1519
+ "moo",
1520
+ "mu",
1521
+ "nature",
1522
+ "nee",
1523
+ "nickname",
1524
+ "nor",
1525
+ "participate",
1526
+ "persuade",
1527
+ "please",
1528
+ "praise",
1529
+ "preserve",
1530
+ "propose",
1531
+ "purchase",
1532
+ "pursue",
1533
+ "realise",
1534
+ "recognise",
1535
+ "refuse",
1536
+ "rejoice",
1537
+ "relieve",
1538
+ "rescue",
1539
+ "reserve",
1540
+ "resolve",
1541
+ "restore",
1542
+ "ri",
1543
+ "san",
1544
+ "scare",
1545
+ "schedule",
1546
+ "secure",
1547
+ "she",
1548
+ "solve",
1549
+ "spare",
1550
+ "stan",
1551
+ "suite",
1552
+ "suppose",
1553
+ "ten",
1554
+ "tumble",
1555
+ "unite",
1556
+ "urge",
1557
+ "venture",
1558
+ "wan",
1559
+ "wee",
1560
+ "welcome",
1561
+ "wi",
1562
+ "woo"
1563
+ ],
1564
+ "suffixes": [
1565
+ "",
1566
+ "d"
1567
+ ]
1568
+ },
1569
+ {
1570
+ "stems": [
1571
+ "brave",
1572
+ "chief",
1573
+ "common",
1574
+ "ear",
1575
+ "elder",
1576
+ "equal",
1577
+ "final",
1578
+ "friend",
1579
+ "heaven",
1580
+ "kind",
1581
+ "month",
1582
+ "new",
1583
+ "official",
1584
+ "plain",
1585
+ "right",
1586
+ "short",
1587
+ "week",
1588
+ "year"
1589
+ ],
1590
+ "suffixes": [
1591
+ "",
1592
+ "ly",
1593
+ "s"
1594
+ ]
1595
+ },
1596
+ {
1597
+ "stems": [
1598
+ "'it",
1599
+ "'t",
1600
+ "alaeddin",
1601
+ "albert",
1602
+ "alexander",
1603
+ "america",
1604
+ "anne",
1605
+ "bart",
1606
+ "ben",
1607
+ "bessie",
1608
+ "bob",
1609
+ "catherine",
1610
+ "child",
1611
+ "children",
1612
+ "clown",
1613
+ "colin",
1614
+ "colonel",
1615
+ "company",
1616
+ "country",
1617
+ "cromer",
1618
+ "dad",
1619
+ "dada",
1620
+ "daddy",
1621
+ "dat",
1622
+ "david",
1623
+ "dolly",
1624
+ "donald",
1625
+ "earth",
1626
+ "edna",
1627
+ "eleanor",
1628
+ "ellie",
1629
+ "else",
1630
+ "eve",
1631
+ "everybody",
1632
+ "everyone",
1633
+ "everything",
1634
+ "family",
1635
+ "florence",
1636
+ "fraser",
1637
+ "grandma",
1638
+ "he",
1639
+ "here",
1640
+ "how",
1641
+ "husband",
1642
+ "irene",
1643
+ "jane",
1644
+ "john",
1645
+ "jumbo",
1646
+ "jwww",
1647
+ "kitty",
1648
+ "lady",
1649
+ "lara",
1650
+ "life",
1651
+ "lt",
1652
+ "maggie",
1653
+ "mama",
1654
+ "man",
1655
+ "men",
1656
+ "michael",
1657
+ "molly",
1658
+ "mom",
1659
+ "mot",
1660
+ "mum",
1661
+ "mummy",
1662
+ "nobody",
1663
+ "papa",
1664
+ "patty",
1665
+ "paul",
1666
+ "peggy",
1667
+ "rosamund",
1668
+ "sarah",
1669
+ "she",
1670
+ "somebody",
1671
+ "someone",
1672
+ "steve",
1673
+ "ted",
1674
+ "teddy",
1675
+ "that",
1676
+ "there",
1677
+ "today",
1678
+ "tom",
1679
+ "tonight",
1680
+ "uncle",
1681
+ "ursula",
1682
+ "what",
1683
+ "when",
1684
+ "where",
1685
+ "who",
1686
+ "wife",
1687
+ "woman",
1688
+ "women"
1689
+ ],
1690
+ "suffixes": [
1691
+ "",
1692
+ "'s"
1693
+ ]
1694
+ },
1695
+ {
1696
+ "stems": [
1697
+ "crack",
1698
+ "defend",
1699
+ "flow",
1700
+ "hunt",
1701
+ "play",
1702
+ "publish",
1703
+ "us",
1704
+ "wash",
1705
+ "work"
1706
+ ],
1707
+ "suffixes": [
1708
+ "",
1709
+ "ed",
1710
+ "ing",
1711
+ [
1712
+ "er",
1713
+ [
1714
+ "",
1715
+ "s"
1716
+ ]
1717
+ ]
1718
+ ]
1719
+ },
1720
+ {
1721
+ "stems": [
1722
+ "africa",
1723
+ "america",
1724
+ "an",
1725
+ "arabia",
1726
+ "asia",
1727
+ "australia",
1728
+ "austria",
1729
+ "ba",
1730
+ "be",
1731
+ "bi",
1732
+ "broke",
1733
+ "brow",
1734
+ "cha",
1735
+ "chose",
1736
+ "cla",
1737
+ "cor",
1738
+ "crow",
1739
+ "da",
1740
+ "dam",
1741
+ "did",
1742
+ "easter",
1743
+ "ed",
1744
+ "eva",
1745
+ "eve",
1746
+ "fa",
1747
+ "fu",
1748
+ "georgia",
1749
+ "gi",
1750
+ "glen",
1751
+ "gra",
1752
+ "have",
1753
+ "he",
1754
+ "ho",
1755
+ "ia",
1756
+ "in",
1757
+ "india",
1758
+ "is",
1759
+ "ji",
1760
+ "julie",
1761
+ "ki",
1762
+ "korea",
1763
+ "lea",
1764
+ "leo",
1765
+ "li",
1766
+ "ma",
1767
+ "mario",
1768
+ "me",
1769
+ "mi",
1770
+ "mo",
1771
+ "moo",
1772
+ "na",
1773
+ "no",
1774
+ "noo",
1775
+ "ow",
1776
+ "pa",
1777
+ "pi",
1778
+ "rise",
1779
+ "russia",
1780
+ "sa",
1781
+ "ski",
1782
+ "so",
1783
+ "soo",
1784
+ "spoke",
1785
+ "steve",
1786
+ "stole",
1787
+ "su",
1788
+ "ta",
1789
+ "te",
1790
+ "tee",
1791
+ "tha",
1792
+ "the",
1793
+ "ti",
1794
+ "to",
1795
+ "tow",
1796
+ "wi",
1797
+ "wo"
1798
+ ],
1799
+ "suffixes": [
1800
+ "",
1801
+ "n"
1802
+ ]
1803
+ },
1804
+ {
1805
+ "stems": [
1806
+ "am",
1807
+ "an",
1808
+ "and",
1809
+ "aunt",
1810
+ "ba",
1811
+ "bab",
1812
+ "bart",
1813
+ "blood",
1814
+ "bo",
1815
+ "brand",
1816
+ "bull",
1817
+ "bus",
1818
+ "carr",
1819
+ "chill",
1820
+ "cla",
1821
+ "da",
1822
+ "dave",
1823
+ "den",
1824
+ "difficult",
1825
+ "dirt",
1826
+ "dr",
1827
+ "dust",
1828
+ "earl",
1829
+ "ever",
1830
+ "fair",
1831
+ "fort",
1832
+ "frank",
1833
+ "frost",
1834
+ "full",
1835
+ "fur",
1836
+ "fuss",
1837
+ "gloom",
1838
+ "gra",
1839
+ "gravel",
1840
+ "guilt",
1841
+ "hard",
1842
+ "hast",
1843
+ "health",
1844
+ "iv",
1845
+ "jack",
1846
+ "jealous",
1847
+ "joe",
1848
+ "jud",
1849
+ "ka",
1850
+ "loft",
1851
+ "luc",
1852
+ "luck",
1853
+ "ma",
1854
+ "man",
1855
+ "might",
1856
+ "monarch",
1857
+ "na",
1858
+ "pa",
1859
+ "pit",
1860
+ "pre",
1861
+ "rick",
1862
+ "rub",
1863
+ "sa",
1864
+ "saxon",
1865
+ "scott",
1866
+ "sex",
1867
+ "snow",
1868
+ "stink",
1869
+ "th",
1870
+ "the",
1871
+ "thirst",
1872
+ "tin",
1873
+ "to",
1874
+ "tr",
1875
+ "var",
1876
+ "victor",
1877
+ "wealth",
1878
+ "wh",
1879
+ "worth",
1880
+ "ya",
1881
+ "yuck"
1882
+ ],
1883
+ "suffixes": [
1884
+ "",
1885
+ "y"
1886
+ ]
1887
+ },
1888
+ {
1889
+ "stems": [
1890
+ "arm",
1891
+ "cheer",
1892
+ "count",
1893
+ "deliver",
1894
+ "discover",
1895
+ "hair",
1896
+ "hand",
1897
+ "he",
1898
+ "heart",
1899
+ "mess",
1900
+ "part",
1901
+ "recover",
1902
+ "scar",
1903
+ "sh",
1904
+ "treat"
1905
+ ],
1906
+ "suffixes": [
1907
+ "",
1908
+ "ed",
1909
+ "y"
1910
+ ]
1911
+ },
1912
+ {
1913
+ "stems": [
1914
+ "as",
1915
+ "boot",
1916
+ "bu",
1917
+ "do",
1918
+ "ha",
1919
+ "hat",
1920
+ "heart",
1921
+ "hug",
1922
+ "ra",
1923
+ "set",
1924
+ "sing",
1925
+ "ss",
1926
+ "tent",
1927
+ "wa",
1928
+ "ye"
1929
+ ],
1930
+ "suffixes": [
1931
+ "",
1932
+ "h",
1933
+ "s"
1934
+ ]
1935
+ },
1936
+ {
1937
+ "stems": [
1938
+ "cent",
1939
+ "chart",
1940
+ "custom",
1941
+ "design",
1942
+ "engine",
1943
+ "horn",
1944
+ "mark",
1945
+ "mill",
1946
+ "murder",
1947
+ "own",
1948
+ "pet",
1949
+ "photograph",
1950
+ "port",
1951
+ "short",
1952
+ "tank"
1953
+ ],
1954
+ "suffixes": [
1955
+ "",
1956
+ "er",
1957
+ "s"
1958
+ ]
1959
+ },
1960
+ {
1961
+ "stems": [
1962
+ "'",
1963
+ "a",
1964
+ "bu",
1965
+ "e",
1966
+ "ha",
1967
+ "he'",
1968
+ "he’",
1969
+ "hi",
1970
+ "i",
1971
+ "it'",
1972
+ "she'",
1973
+ "that'",
1974
+ "there'",
1975
+ "what'"
1976
+ ],
1977
+ "suffixes": [
1978
+ "d",
1979
+ "ll",
1980
+ "s"
1981
+ ]
1982
+ },
1983
+ {
1984
+ "stems": [
1985
+ "bu",
1986
+ "ca",
1987
+ "chi",
1988
+ "co",
1989
+ "di",
1990
+ "ear",
1991
+ "ha",
1992
+ "hi",
1993
+ "law",
1994
+ "mistake",
1995
+ "ra",
1996
+ "sea",
1997
+ "si",
1998
+ "wa"
1999
+ ],
2000
+ "suffixes": [
2001
+ "",
2002
+ "n",
2003
+ "s"
2004
+ ]
2005
+ },
2006
+ {
2007
+ "stems": [
2008
+ "flow",
2009
+ "follow",
2010
+ "play",
2011
+ "support",
2012
+ "travel",
2013
+ "work"
2014
+ ],
2015
+ "suffixes": [
2016
+ "",
2017
+ "ed",
2018
+ "ers",
2019
+ "ing",
2020
+ "s"
2021
+ ]
2022
+ },
2023
+ {
2024
+ "stems": [
2025
+ "blow",
2026
+ "do",
2027
+ "draw",
2028
+ "flow",
2029
+ "grow",
2030
+ "know",
2031
+ "show",
2032
+ "throw"
2033
+ ],
2034
+ "suffixes": [
2035
+ "",
2036
+ "ing",
2037
+ "n",
2038
+ "s"
2039
+ ]
2040
+ },
2041
+ {
2042
+ "stems": [
2043
+ "do",
2044
+ "eat",
2045
+ "look",
2046
+ "play",
2047
+ "say",
2048
+ "talk",
2049
+ "tell",
2050
+ "think"
2051
+ ],
2052
+ "suffixes": [
2053
+ "",
2054
+ "in'",
2055
+ "ing",
2056
+ "s"
2057
+ ]
2058
+ },
2059
+ {
2060
+ "stems": [
2061
+ "abilit",
2062
+ "activit",
2063
+ "agenc",
2064
+ "appl",
2065
+ "arm",
2066
+ "authorit",
2067
+ "bab",
2068
+ "batter",
2069
+ "bod",
2070
+ "boundar",
2071
+ "butterfl",
2072
+ "carr",
2073
+ "categor",
2074
+ "centur",
2075
+ "ceremon",
2076
+ "cherr",
2077
+ "cit",
2078
+ "communit",
2079
+ "compan",
2080
+ "cop",
2081
+ "count",
2082
+ "countr",
2083
+ "cr",
2084
+ "deput",
2085
+ "difficult",
2086
+ "dut",
2087
+ "enem",
2088
+ "facilit",
2089
+ "factor",
2090
+ "fair",
2091
+ "famil",
2092
+ "fl",
2093
+ "foll",
2094
+ "fr",
2095
+ "grocer",
2096
+ "hand",
2097
+ "injur",
2098
+ "inquir",
2099
+ "lad",
2100
+ "missionar",
2101
+ "municipalit",
2102
+ "opportunit",
2103
+ "penn",
2104
+ "polic",
2105
+ "pon",
2106
+ "propert",
2107
+ "pupp",
2108
+ "qualit",
2109
+ "raspberr",
2110
+ "responsibilit",
2111
+ "sk",
2112
+ "stor",
2113
+ "strawberr",
2114
+ "stud",
2115
+ "suppl",
2116
+ "territor",
2117
+ "theor",
2118
+ "universit",
2119
+ "worr"
2120
+ ],
2121
+ "suffixes": [
2122
+ "ies",
2123
+ "y"
2124
+ ]
2125
+ },
2126
+ {
2127
+ "stems": [
2128
+ "bit",
2129
+ "catch",
2130
+ "cloth",
2131
+ "coach",
2132
+ "com",
2133
+ "do",
2134
+ "fuss",
2135
+ "go",
2136
+ "hid",
2137
+ "min",
2138
+ "process",
2139
+ "slid",
2140
+ "tim"
2141
+ ],
2142
+ "suffixes": [
2143
+ "",
2144
+ "es",
2145
+ "ing"
2146
+ ]
2147
+ },
2148
+ {
2149
+ "stems": [
2150
+ "ai",
2151
+ "ba",
2152
+ "bake",
2153
+ "dee",
2154
+ "fa",
2155
+ "ma",
2156
+ "no",
2157
+ "pa",
2158
+ "pitche",
2159
+ "sa",
2160
+ "te",
2161
+ "thrille"
2162
+ ],
2163
+ "suffixes": [
2164
+ "",
2165
+ "d",
2166
+ "r"
2167
+ ]
2168
+ },
2169
+ {
2170
+ "stems": [
2171
+ "can",
2172
+ "co",
2173
+ "critic",
2174
+ "di",
2175
+ "emotion",
2176
+ "experiment",
2177
+ "form",
2178
+ "leg",
2179
+ "occasion",
2180
+ "person",
2181
+ "region",
2182
+ "sign"
2183
+ ],
2184
+ "suffixes": [
2185
+ "",
2186
+ "al",
2187
+ "s"
2188
+ ]
2189
+ },
2190
+ {
2191
+ "stems": [
2192
+ "angel",
2193
+ "ash",
2194
+ "branch",
2195
+ "bus",
2196
+ "bush",
2197
+ "business",
2198
+ "cas",
2199
+ "child",
2200
+ "church",
2201
+ "class",
2202
+ "con",
2203
+ "dan",
2204
+ "di",
2205
+ "dish",
2206
+ "ey",
2207
+ "fox",
2208
+ "fri",
2209
+ "glass",
2210
+ "hat",
2211
+ "hero",
2212
+ "inch",
2213
+ "jam",
2214
+ "li",
2215
+ "lin",
2216
+ "loss",
2217
+ "mass",
2218
+ "mat",
2219
+ "match",
2220
+ "not",
2221
+ "on",
2222
+ "pi",
2223
+ "plan",
2224
+ "potato",
2225
+ "rang",
2226
+ "rat",
2227
+ "rich",
2228
+ "sandwich",
2229
+ "se",
2230
+ "sid",
2231
+ "sit",
2232
+ "ski",
2233
+ "speech",
2234
+ "strip",
2235
+ "tap",
2236
+ "tax",
2237
+ "ti",
2238
+ "to",
2239
+ "tomato",
2240
+ "ton",
2241
+ "trench",
2242
+ "witness"
2243
+ ],
2244
+ "suffixes": [
2245
+ "",
2246
+ "es"
2247
+ ]
2248
+ },
2249
+ {
2250
+ "stems": [
2251
+ "bath",
2252
+ "bo",
2253
+ "breed",
2254
+ "broadcast",
2255
+ "burst",
2256
+ "buy",
2257
+ "buzz",
2258
+ "camp",
2259
+ "carry",
2260
+ "cast",
2261
+ "charm",
2262
+ "chatter",
2263
+ "cheat",
2264
+ "chew",
2265
+ "comfort",
2266
+ "copy",
2267
+ "creep",
2268
+ "cry",
2269
+ "din",
2270
+ "disgust",
2271
+ "dwell",
2272
+ "even",
2273
+ "fly",
2274
+ "glow",
2275
+ "humm",
2276
+ "hurry",
2277
+ "iron",
2278
+ "lack",
2279
+ "market",
2280
+ "marry",
2281
+ "react",
2282
+ "seek",
2283
+ "sell",
2284
+ "sew",
2285
+ "sink",
2286
+ "snow",
2287
+ "splash",
2288
+ "spread",
2289
+ "st",
2290
+ "steal",
2291
+ "steer",
2292
+ "study",
2293
+ "sweep",
2294
+ "swell",
2295
+ "th",
2296
+ "tidy",
2297
+ "trail",
2298
+ "try",
2299
+ "will",
2300
+ "worry"
2301
+ ],
2302
+ "suffixes": [
2303
+ "",
2304
+ "ing"
2305
+ ]
2306
+ },
2307
+ {
2308
+ "stems": [
2309
+ "act",
2310
+ "collect",
2311
+ "discuss",
2312
+ "miss",
2313
+ "pass",
2314
+ "protect",
2315
+ "suggest"
2316
+ ],
2317
+ "suffixes": [
2318
+ "",
2319
+ "ed",
2320
+ "ing",
2321
+ "ion"
2322
+ ]
2323
+ },
2324
+ {
2325
+ "stems": [
2326
+ "burn",
2327
+ "join",
2328
+ "learn",
2329
+ "sigh",
2330
+ "star"
2331
+ ],
2332
+ "suffixes": [
2333
+ "",
2334
+ "ed",
2335
+ "ing",
2336
+ "s",
2337
+ "t"
2338
+ ]
2339
+ },
2340
+ {
2341
+ "stems": [
2342
+ "'",
2343
+ "ann",
2344
+ "argentin",
2345
+ "b",
2346
+ "bab",
2347
+ "bell",
2348
+ "c",
2349
+ "carolin",
2350
+ "clar",
2351
+ "cub",
2352
+ "d",
2353
+ "dan",
2354
+ "dat",
2355
+ "dian",
2356
+ "er",
2357
+ "ev",
2358
+ "g",
2359
+ "georgi",
2360
+ "h",
2361
+ "hast",
2362
+ "juli",
2363
+ "l",
2364
+ "m",
2365
+ "mari",
2366
+ "n",
2367
+ "nin",
2368
+ "r",
2369
+ "ros",
2370
+ "s",
2371
+ "se",
2372
+ "sophi",
2373
+ "t",
2374
+ "te",
2375
+ "th",
2376
+ "tun",
2377
+ "us",
2378
+ "w",
2379
+ "wa",
2380
+ "y"
2381
+ ],
2382
+ "suffixes": [
2383
+ "a",
2384
+ "e"
2385
+ ]
2386
+ },
2387
+ {
2388
+ "stems": [
2389
+ "affect",
2390
+ "confess",
2391
+ "connect",
2392
+ "direct",
2393
+ "elect",
2394
+ "express",
2395
+ "not",
2396
+ "possess",
2397
+ "reflect",
2398
+ "select"
2399
+ ],
2400
+ "suffixes": [
2401
+ "",
2402
+ "ed",
2403
+ "ion"
2404
+ ]
2405
+ },
2406
+ {
2407
+ "stems": [
2408
+ "a",
2409
+ "banne",
2410
+ "cleane",
2411
+ "commande",
2412
+ "compose",
2413
+ "containe",
2414
+ "controlle",
2415
+ "counte",
2416
+ "dea",
2417
+ "designe",
2418
+ "dumpe",
2419
+ "e",
2420
+ "employe",
2421
+ "forme",
2422
+ "founde",
2423
+ "helpe",
2424
+ "jumpe",
2425
+ "kille",
2426
+ "lighte",
2427
+ "longe",
2428
+ "marke",
2429
+ "merge",
2430
+ "murdere",
2431
+ "painte",
2432
+ "parke",
2433
+ "presente",
2434
+ "rea",
2435
+ "recorde",
2436
+ "reporte",
2437
+ "rolle",
2438
+ "rubbe",
2439
+ "showe",
2440
+ "t",
2441
+ "tucke",
2442
+ "turne",
2443
+ "waite",
2444
+ "walke",
2445
+ "warne"
2446
+ ],
2447
+ "suffixes": [
2448
+ "d",
2449
+ "r"
2450
+ ]
2451
+ },
2452
+ {
2453
+ "stems": [
2454
+ "bark",
2455
+ "deal",
2456
+ "hold",
2457
+ "keep",
2458
+ "los",
2459
+ "shoot"
2460
+ ],
2461
+ "suffixes": [
2462
+ "",
2463
+ "er",
2464
+ "ing",
2465
+ "s"
2466
+ ]
2467
+ },
2468
+ {
2469
+ "stems": [
2470
+ "be",
2471
+ "box",
2472
+ "fish",
2473
+ "los",
2474
+ "rid",
2475
+ "us"
2476
+ ],
2477
+ "suffixes": [
2478
+ "",
2479
+ "er",
2480
+ "es",
2481
+ "ing"
2482
+ ]
2483
+ },
2484
+ {
2485
+ "stems": [
2486
+ "clean",
2487
+ "found",
2488
+ "long",
2489
+ "paint",
2490
+ "wait",
2491
+ "warn"
2492
+ ],
2493
+ "suffixes": [
2494
+ "",
2495
+ "ed",
2496
+ "er",
2497
+ "ing"
2498
+ ]
2499
+ },
2500
+ {
2501
+ "stems": [
2502
+ "e",
2503
+ "they'",
2504
+ "we'",
2505
+ "what'",
2506
+ "you'",
2507
+ "you’"
2508
+ ],
2509
+ "suffixes": [
2510
+ "d",
2511
+ "ll",
2512
+ "re",
2513
+ "ve"
2514
+ ]
2515
+ },
2516
+ {
2517
+ "stems": [
2518
+ "a",
2519
+ "b",
2520
+ "bab",
2521
+ "bell",
2522
+ "comfortabl",
2523
+ "considerabl",
2524
+ "cop",
2525
+ "dr",
2526
+ "e",
2527
+ "eas",
2528
+ "gentl",
2529
+ "grad",
2530
+ "ha",
2531
+ "hast",
2532
+ "he",
2533
+ "incredibl",
2534
+ "inquir",
2535
+ "ja",
2536
+ "jo",
2537
+ "m",
2538
+ "polic",
2539
+ "possibl",
2540
+ "probabl",
2541
+ "reasonabl",
2542
+ "scar",
2543
+ "sh",
2544
+ "shad",
2545
+ "shin",
2546
+ "simpl",
2547
+ "terribl",
2548
+ "th",
2549
+ "the",
2550
+ "tid",
2551
+ "wa"
2552
+ ],
2553
+ "suffixes": [
2554
+ "e",
2555
+ "y"
2556
+ ]
2557
+ },
2558
+ {
2559
+ "stems": [
2560
+ "bright",
2561
+ "deep",
2562
+ "light",
2563
+ "loud",
2564
+ "quick",
2565
+ "short",
2566
+ "slow",
2567
+ "soft",
2568
+ "warm"
2569
+ ],
2570
+ "suffixes": [
2571
+ "",
2572
+ "er",
2573
+ "ly"
2574
+ ]
2575
+ },
2576
+ {
2577
+ "stems": [
2578
+ "ca",
2579
+ "co",
2580
+ "fa",
2581
+ "ha",
2582
+ "ma",
2583
+ "olde",
2584
+ "pa",
2585
+ "smalle",
2586
+ "te"
2587
+ ],
2588
+ "suffixes": [
2589
+ "",
2590
+ "r",
2591
+ "st"
2592
+ ]
2593
+ },
2594
+ {
2595
+ "stems": [
2596
+ "a",
2597
+ "ca",
2598
+ "co",
2599
+ "da",
2600
+ "dea",
2601
+ "e",
2602
+ "ear",
2603
+ "gai",
2604
+ "gi",
2605
+ "grai",
2606
+ "grow",
2607
+ "ha",
2608
+ "i",
2609
+ "k",
2610
+ "l",
2611
+ "mai",
2612
+ "me",
2613
+ "mea",
2614
+ "norma",
2615
+ "ow",
2616
+ "pa",
2617
+ "pai",
2618
+ "rai",
2619
+ "sea",
2620
+ "shaw",
2621
+ "te",
2622
+ "trai",
2623
+ "va",
2624
+ "vo"
2625
+ ],
2626
+ "suffixes": [
2627
+ "l",
2628
+ "n"
2629
+ ]
2630
+ },
2631
+ {
2632
+ "stems": [
2633
+ "babbl",
2634
+ "be",
2635
+ "becom",
2636
+ "bit",
2637
+ "com",
2638
+ "d",
2639
+ "driv",
2640
+ "giv",
2641
+ "hid",
2642
+ "hous",
2643
+ "imitat",
2644
+ "jok",
2645
+ "leav",
2646
+ "los",
2647
+ "mak",
2648
+ "min",
2649
+ "nurs",
2650
+ "practic",
2651
+ "rac",
2652
+ "rid",
2653
+ "slid",
2654
+ "strik",
2655
+ "tak",
2656
+ "tim",
2657
+ "wak",
2658
+ "whin",
2659
+ "whistl",
2660
+ "writ"
2661
+ ],
2662
+ "suffixes": [
2663
+ "ing",
2664
+ [
2665
+ "e",
2666
+ [
2667
+ "",
2668
+ "s"
2669
+ ]
2670
+ ]
2671
+ ]
2672
+ },
2673
+ {
2674
+ "stems": [
2675
+ "addresse",
2676
+ "allie",
2677
+ "applie",
2678
+ "brushe",
2679
+ "crie",
2680
+ "davi",
2681
+ "dresse",
2682
+ "finishe",
2683
+ "frie",
2684
+ "how'",
2685
+ "kisse",
2686
+ "misse",
2687
+ "passe",
2688
+ "pushe",
2689
+ "r",
2690
+ "reache",
2691
+ "studie",
2692
+ "supplie",
2693
+ "touche",
2694
+ "trie",
2695
+ "watche",
2696
+ "where'",
2697
+ "who'",
2698
+ "why'",
2699
+ "wishe",
2700
+ "witnesse",
2701
+ "worrie"
2702
+ ],
2703
+ "suffixes": [
2704
+ "d",
2705
+ "s"
2706
+ ]
2707
+ },
2708
+ {
2709
+ "stems": [
2710
+ "amaz",
2711
+ "annoy",
2712
+ "await",
2713
+ "begg",
2714
+ "clapp",
2715
+ "confus",
2716
+ "consist",
2717
+ "dragg",
2718
+ "dropp",
2719
+ "embarrass",
2720
+ "excit",
2721
+ "fitt",
2722
+ "kidnapp",
2723
+ "mutter",
2724
+ "oppos",
2725
+ "pant",
2726
+ "plann",
2727
+ "referr",
2728
+ "rubb",
2729
+ "slipp",
2730
+ "stepp",
2731
+ "stirr",
2732
+ "stopp",
2733
+ "surround",
2734
+ "threaten",
2735
+ "travell",
2736
+ "trembl"
2737
+ ],
2738
+ "suffixes": [
2739
+ "ed",
2740
+ "ing"
2741
+ ]
2742
+ },
2743
+ {
2744
+ "stems": [
2745
+ "ba",
2746
+ "bo",
2747
+ "boo",
2748
+ "ear",
2749
+ "eighteen",
2750
+ "eleven",
2751
+ "for",
2752
+ "four",
2753
+ "grow",
2754
+ "ha",
2755
+ "ma",
2756
+ "mon",
2757
+ "my",
2758
+ "nineteen",
2759
+ "nor",
2760
+ "pa",
2761
+ "se",
2762
+ "seven",
2763
+ "six",
2764
+ "tee",
2765
+ "ten",
2766
+ "too",
2767
+ "warm",
2768
+ "wi",
2769
+ "you"
2770
+ ],
2771
+ "suffixes": [
2772
+ "",
2773
+ "th"
2774
+ ]
2775
+ },
2776
+ {
2777
+ "stems": [
2778
+ "clap",
2779
+ "drop",
2780
+ "slip",
2781
+ "step",
2782
+ "stop"
2783
+ ],
2784
+ "suffixes": [
2785
+ "",
2786
+ "ped",
2787
+ "ping",
2788
+ "s"
2789
+ ]
2790
+ },
2791
+ {
2792
+ "stems": [
2793
+ "great",
2794
+ "hard",
2795
+ "high",
2796
+ "near",
2797
+ "strong"
2798
+ ],
2799
+ "suffixes": [
2800
+ "",
2801
+ "er",
2802
+ "est",
2803
+ "ly"
2804
+ ]
2805
+ },
2806
+ {
2807
+ "stems": [
2808
+ "clear",
2809
+ "light",
2810
+ "open",
2811
+ "part"
2812
+ ],
2813
+ "suffixes": [
2814
+ "",
2815
+ "ed",
2816
+ "ing",
2817
+ "ly",
2818
+ "s"
2819
+ ]
2820
+ },
2821
+ {
2822
+ "stems": [
2823
+ "achieve",
2824
+ "agree",
2825
+ "announce",
2826
+ "base",
2827
+ "engage",
2828
+ "improve",
2829
+ "replace"
2830
+ ],
2831
+ "suffixes": [
2832
+ "",
2833
+ "d",
2834
+ "ment"
2835
+ ]
2836
+ },
2837
+ {
2838
+ "stems": [
2839
+ "co",
2840
+ "home",
2841
+ "mate",
2842
+ "pie",
2843
+ "range",
2844
+ "star",
2845
+ "ye"
2846
+ ],
2847
+ "suffixes": [
2848
+ "",
2849
+ "r",
2850
+ "s"
2851
+ ]
2852
+ },
2853
+ {
2854
+ "stems": [
2855
+ "doubt",
2856
+ "end",
2857
+ "fear",
2858
+ "help",
2859
+ "home",
2860
+ "regard",
2861
+ "wire"
2862
+ ],
2863
+ "suffixes": [
2864
+ "",
2865
+ "less",
2866
+ "s"
2867
+ ]
2868
+ },
2869
+ {
2870
+ "stems": [
2871
+ "advanc",
2872
+ "argu",
2873
+ "bor",
2874
+ "br",
2875
+ "chas",
2876
+ "chok",
2877
+ "dar",
2878
+ "encourag",
2879
+ "gaz",
2880
+ "graduat",
2881
+ "handl",
2882
+ "invit",
2883
+ "paus",
2884
+ "prepar",
2885
+ "realiz",
2886
+ "receiv",
2887
+ "reduc",
2888
+ "sav",
2889
+ "seiz",
2890
+ "surviv",
2891
+ "wast"
2892
+ ],
2893
+ "suffixes": [
2894
+ "ing",
2895
+ [
2896
+ "e",
2897
+ [
2898
+ "",
2899
+ "d"
2900
+ ]
2901
+ ]
2902
+ ]
2903
+ },
2904
+ {
2905
+ "stems": [
2906
+ "a",
2907
+ "absen",
2908
+ "assistan",
2909
+ "confiden",
2910
+ "distan",
2911
+ "fa",
2912
+ "for",
2913
+ "gree",
2914
+ "i",
2915
+ "ignoran",
2916
+ "importan",
2917
+ "independen",
2918
+ "innocen",
2919
+ "intelligen",
2920
+ "pa",
2921
+ "patien",
2922
+ "prin",
2923
+ "ra",
2924
+ "referen",
2925
+ "residen"
2926
+ ],
2927
+ "suffixes": [
2928
+ "ce",
2929
+ "t"
2930
+ ]
2931
+ },
2932
+ {
2933
+ "stems": [
2934
+ "accompan",
2935
+ "appl",
2936
+ "bur",
2937
+ "carr",
2938
+ "cr",
2939
+ "den",
2940
+ "dr",
2941
+ "fanc",
2942
+ "fr",
2943
+ "hurr",
2944
+ "identif",
2945
+ "justif",
2946
+ "marr",
2947
+ "occup",
2948
+ "repl",
2949
+ "satisf",
2950
+ "stud",
2951
+ "suppl",
2952
+ "tr",
2953
+ "worr"
2954
+ ],
2955
+ "suffixes": [
2956
+ "ied",
2957
+ "y"
2958
+ ]
2959
+ },
2960
+ {
2961
+ "stems": [
2962
+ "bet",
2963
+ "cruel",
2964
+ "du",
2965
+ "dump",
2966
+ "for",
2967
+ "has",
2968
+ "jet",
2969
+ "kit",
2970
+ "loyal",
2971
+ "nine",
2972
+ "par",
2973
+ "pat",
2974
+ "pi",
2975
+ "pot",
2976
+ "proper",
2977
+ "rus",
2978
+ "safe",
2979
+ "seven",
2980
+ "six",
2981
+ "spot"
2982
+ ],
2983
+ "suffixes": [
2984
+ "",
2985
+ "ty"
2986
+ ]
2987
+ },
2988
+ {
2989
+ "stems": [
2990
+ "classic",
2991
+ "coast",
2992
+ "constitution",
2993
+ "continent",
2994
+ "education",
2995
+ "electric",
2996
+ "environment",
2997
+ "fat",
2998
+ "fiction",
2999
+ "go",
3000
+ "historic",
3001
+ "logic",
3002
+ "magic",
3003
+ "marsh",
3004
+ "me",
3005
+ "met",
3006
+ "music",
3007
+ "roy",
3008
+ "se",
3009
+ "verb"
3010
+ ],
3011
+ "suffixes": [
3012
+ "",
3013
+ "al"
3014
+ ]
3015
+ },
3016
+ {
3017
+ "stems": [
3018
+ "admir",
3019
+ "agricultur",
3020
+ "anim",
3021
+ "approv",
3022
+ "brut",
3023
+ "c",
3024
+ "can",
3025
+ "fat",
3026
+ "fin",
3027
+ "g",
3028
+ "glob",
3029
+ "h",
3030
+ "mor",
3031
+ "natur",
3032
+ "propos",
3033
+ "sever",
3034
+ "surviv",
3035
+ "univers",
3036
+ "v"
3037
+ ],
3038
+ "suffixes": [
3039
+ "al",
3040
+ "e"
3041
+ ]
3042
+ },
3043
+ {
3044
+ "stems": [
3045
+ "be",
3046
+ "clos",
3047
+ "d",
3048
+ "danc",
3049
+ "driv",
3050
+ "explor",
3051
+ "freez",
3052
+ "liv",
3053
+ "los",
3054
+ "lov",
3055
+ "mak",
3056
+ "manag",
3057
+ "produc",
3058
+ "receiv",
3059
+ "rid",
3060
+ "rul",
3061
+ "us",
3062
+ "wrestl",
3063
+ "writ"
3064
+ ],
3065
+ "suffixes": [
3066
+ "ing",
3067
+ [
3068
+ "e",
3069
+ [
3070
+ "",
3071
+ "r"
3072
+ ]
3073
+ ]
3074
+ ]
3075
+ },
3076
+ {
3077
+ "stems": [
3078
+ "an",
3079
+ "comin",
3080
+ "doin",
3081
+ "eatin",
3082
+ "fuckin",
3083
+ "gettin",
3084
+ "goin",
3085
+ "lookin",
3086
+ "makin",
3087
+ "mornin",
3088
+ "playin",
3089
+ "sayin",
3090
+ "sittin",
3091
+ "takin",
3092
+ "talkin",
3093
+ "tellin",
3094
+ "thinkin",
3095
+ "tryin"
3096
+ ],
3097
+ "suffixes": [
3098
+ "'",
3099
+ "g"
3100
+ ]
3101
+ },
3102
+ {
3103
+ "stems": [
3104
+ "amus",
3105
+ "arrang",
3106
+ "manag",
3107
+ "mov",
3108
+ "retir",
3109
+ "settl"
3110
+ ],
3111
+ "suffixes": [
3112
+ "ement",
3113
+ "ing",
3114
+ [
3115
+ "e",
3116
+ [
3117
+ "",
3118
+ "d"
3119
+ ]
3120
+ ]
3121
+ ]
3122
+ },
3123
+ {
3124
+ "stems": [
3125
+ "be",
3126
+ "gentlem",
3127
+ "policem",
3128
+ "se",
3129
+ "th",
3130
+ "wom"
3131
+ ],
3132
+ "suffixes": [
3133
+ "",
3134
+ "an",
3135
+ "en"
3136
+ ]
3137
+ },
3138
+ {
3139
+ "stems": [
3140
+ "bless",
3141
+ "br",
3142
+ "greet",
3143
+ "paint",
3144
+ "proceed",
3145
+ "record"
3146
+ ],
3147
+ "suffixes": [
3148
+ "",
3149
+ "ed",
3150
+ [
3151
+ "ing",
3152
+ [
3153
+ "",
3154
+ "s"
3155
+ ]
3156
+ ]
3157
+ ]
3158
+ },
3159
+ {
3160
+ "stems": [
3161
+ "brave",
3162
+ "bu",
3163
+ "count",
3164
+ "ga",
3165
+ "hen",
3166
+ "slave"
3167
+ ],
3168
+ "suffixes": [
3169
+ "",
3170
+ "ry",
3171
+ "s"
3172
+ ]
3173
+ },
3174
+ {
3175
+ "stems": [
3176
+ "can",
3177
+ "didn",
3178
+ "don",
3179
+ "haven",
3180
+ "isn",
3181
+ "won"
3182
+ ],
3183
+ "suffixes": [
3184
+ "",
3185
+ "'t",
3186
+ "’t"
3187
+ ]
3188
+ },
3189
+ {
3190
+ "stems": [
3191
+ "celebrat",
3192
+ "concentrat",
3193
+ "creat",
3194
+ "indicat",
3195
+ "operat",
3196
+ "relat"
3197
+ ],
3198
+ "suffixes": [
3199
+ "ing",
3200
+ "ion",
3201
+ [
3202
+ "e",
3203
+ [
3204
+ "",
3205
+ "d"
3206
+ ]
3207
+ ]
3208
+ ]
3209
+ },
3210
+ {
3211
+ "stems": [
3212
+ "chair",
3213
+ "horse",
3214
+ "new",
3215
+ "post",
3216
+ "rifle",
3217
+ "sea"
3218
+ ],
3219
+ "suffixes": [
3220
+ "",
3221
+ "man",
3222
+ "s"
3223
+ ]
3224
+ },
3225
+ {
3226
+ "stems": [
3227
+ "confirm",
3228
+ "consider",
3229
+ "form",
3230
+ "found",
3231
+ "inform",
3232
+ "resign"
3233
+ ],
3234
+ "suffixes": [
3235
+ "",
3236
+ "ation",
3237
+ "ed"
3238
+ ]
3239
+ },
3240
+ {
3241
+ "stems": [
3242
+ "ba",
3243
+ "bea",
3244
+ "cracke",
3245
+ "defende",
3246
+ "floo",
3247
+ "flowe",
3248
+ "hea",
3249
+ "hunte",
3250
+ "ma",
3251
+ "manne",
3252
+ "owne",
3253
+ "pai",
3254
+ "playe",
3255
+ "publishe",
3256
+ "roa",
3257
+ "washe",
3258
+ "worke"
3259
+ ],
3260
+ "suffixes": [
3261
+ "d",
3262
+ [
3263
+ "r",
3264
+ [
3265
+ "",
3266
+ "s"
3267
+ ]
3268
+ ]
3269
+ ]
3270
+ },
3271
+ {
3272
+ "stems": [
3273
+ "bea",
3274
+ "bigge",
3275
+ "ea",
3276
+ "earlie",
3277
+ "elde",
3278
+ "fea",
3279
+ "greate",
3280
+ "harde",
3281
+ "highe",
3282
+ "longe",
3283
+ "lowe",
3284
+ "neare",
3285
+ "roa",
3286
+ "stronge",
3287
+ "talle",
3288
+ "va",
3289
+ "younge"
3290
+ ],
3291
+ "suffixes": [
3292
+ "r",
3293
+ "st"
3294
+ ]
3295
+ },
3296
+ {
3297
+ "stems": [
3298
+ "cruise",
3299
+ "doo",
3300
+ "eve",
3301
+ "fu",
3302
+ "gab",
3303
+ "he",
3304
+ "hei",
3305
+ "ka",
3306
+ "ou",
3307
+ "pete",
3308
+ "pipe",
3309
+ "poo",
3310
+ "sauce",
3311
+ "su",
3312
+ "tea",
3313
+ "yea",
3314
+ "you"
3315
+ ],
3316
+ "suffixes": [
3317
+ "",
3318
+ "r"
3319
+ ]
3320
+ },
3321
+ {
3322
+ "stems": [
3323
+ "blaz",
3324
+ "choos",
3325
+ "cycl",
3326
+ "hav",
3327
+ "paddl",
3328
+ "programm",
3329
+ "rattl",
3330
+ "ris",
3331
+ "s",
3332
+ "shin",
3333
+ "smok",
3334
+ "teas",
3335
+ "th",
3336
+ "trad",
3337
+ "twinkl",
3338
+ "w"
3339
+ ],
3340
+ "suffixes": [
3341
+ "e",
3342
+ "ing"
3343
+ ]
3344
+ },
3345
+ {
3346
+ "stems": [
3347
+ "bum",
3348
+ "dum",
3349
+ "gas",
3350
+ "ho",
3351
+ "lea",
3352
+ "overlap",
3353
+ "pee",
3354
+ "pop",
3355
+ "rip",
3356
+ "snap",
3357
+ "trap",
3358
+ "trip",
3359
+ "whip",
3360
+ "wi",
3361
+ "worship",
3362
+ "wrap"
3363
+ ],
3364
+ "suffixes": [
3365
+ "",
3366
+ "ped"
3367
+ ]
3368
+ },
3369
+ {
3370
+ "stems": [
3371
+ "act",
3372
+ "attract",
3373
+ "collect",
3374
+ "impress"
3375
+ ],
3376
+ "suffixes": [
3377
+ "",
3378
+ "ed",
3379
+ "ion",
3380
+ "ive"
3381
+ ]
3382
+ },
3383
+ {
3384
+ "stems": [
3385
+ "carrie",
3386
+ "ha",
3387
+ "line",
3388
+ "si"
3389
+ ],
3390
+ "suffixes": [
3391
+ "",
3392
+ "d",
3393
+ "r",
3394
+ "s"
3395
+ ]
3396
+ },
3397
+ {
3398
+ "stems": [
3399
+ "develop",
3400
+ "enjoy",
3401
+ "entertain",
3402
+ "treat"
3403
+ ],
3404
+ "suffixes": [
3405
+ "",
3406
+ "ed",
3407
+ "ing",
3408
+ "ment"
3409
+ ]
3410
+ },
3411
+ {
3412
+ "stems": [
3413
+ "",
3414
+ "admir",
3415
+ "approv",
3416
+ "arriv",
3417
+ "buri",
3418
+ "deni",
3419
+ "di",
3420
+ "form",
3421
+ "natur",
3422
+ "propos",
3423
+ "se",
3424
+ "sign",
3425
+ "surviv",
3426
+ "tri"
3427
+ ],
3428
+ "suffixes": [
3429
+ "al",
3430
+ "ed"
3431
+ ]
3432
+ },
3433
+ {
3434
+ "stems": [
3435
+ "",
3436
+ "celebrati",
3437
+ "concentrati",
3438
+ "confusi",
3439
+ "creati",
3440
+ "discussi",
3441
+ "imitati",
3442
+ "indicati",
3443
+ "missi",
3444
+ "passi",
3445
+ "processi",
3446
+ "protecti",
3447
+ "reacti",
3448
+ "so"
3449
+ ],
3450
+ "suffixes": [
3451
+ "ng",
3452
+ "on"
3453
+ ]
3454
+ },
3455
+ {
3456
+ "stems": [
3457
+ "angr",
3458
+ "da",
3459
+ "eas",
3460
+ "happ",
3461
+ "hast",
3462
+ "heart",
3463
+ "heav",
3464
+ "luck",
3465
+ "merr",
3466
+ "necessar",
3467
+ "primar",
3468
+ "read",
3469
+ "stead",
3470
+ "temporar"
3471
+ ],
3472
+ "suffixes": [
3473
+ "ily",
3474
+ "y"
3475
+ ]
3476
+ },
3477
+ {
3478
+ "stems": [
3479
+ "assembl",
3480
+ "b",
3481
+ "curl",
3482
+ "enquir",
3483
+ "fl",
3484
+ "fr",
3485
+ "injur",
3486
+ "inquir",
3487
+ "nodd",
3488
+ "popp",
3489
+ "sp",
3490
+ "spott",
3491
+ "stor",
3492
+ "tast"
3493
+ ],
3494
+ "suffixes": [
3495
+ "ed",
3496
+ "y"
3497
+ ]
3498
+ },
3499
+ {
3500
+ "stems": [
3501
+ "",
3502
+ "act",
3503
+ "collect",
3504
+ "direct",
3505
+ "elect",
3506
+ "illustrat",
3507
+ "l",
3508
+ "locat",
3509
+ "operat",
3510
+ "relat",
3511
+ "situat",
3512
+ "stat",
3513
+ "suggest"
3514
+ ],
3515
+ "suffixes": [
3516
+ "ed",
3517
+ [
3518
+ "ion",
3519
+ [
3520
+ "",
3521
+ "s"
3522
+ ]
3523
+ ]
3524
+ ]
3525
+ },
3526
+ {
3527
+ "stems": [
3528
+ "",
3529
+ "acti",
3530
+ "administrati",
3531
+ "attracti",
3532
+ "collecti",
3533
+ "competiti",
3534
+ "creati",
3535
+ "executi",
3536
+ "explosi",
3537
+ "extensi",
3538
+ "impressi",
3539
+ "moti",
3540
+ "positi"
3541
+ ],
3542
+ "suffixes": [
3543
+ "on",
3544
+ "ve"
3545
+ ]
3546
+ },
3547
+ {
3548
+ "stems": [
3549
+ "a",
3550
+ "age",
3551
+ "ai",
3552
+ "be",
3553
+ "consiste",
3554
+ "depende",
3555
+ "hi",
3556
+ "i",
3557
+ "pai",
3558
+ "sai",
3559
+ "spe",
3560
+ "te",
3561
+ "urge"
3562
+ ],
3563
+ "suffixes": [
3564
+ "d",
3565
+ "nt"
3566
+ ]
3567
+ },
3568
+ {
3569
+ "stems": [
3570
+ "animat",
3571
+ "confus",
3572
+ "construct",
3573
+ "convict",
3574
+ "decorat",
3575
+ "depress",
3576
+ "devot",
3577
+ "distribut",
3578
+ "educat",
3579
+ "execut",
3580
+ "invent",
3581
+ "nominat",
3582
+ "translat"
3583
+ ],
3584
+ "suffixes": [
3585
+ "ed",
3586
+ "ion"
3587
+ ]
3588
+ },
3589
+ {
3590
+ "stems": [
3591
+ "artist",
3592
+ "democrat",
3593
+ "log",
3594
+ "magnet",
3595
+ "top"
3596
+ ],
3597
+ "suffixes": [
3598
+ "",
3599
+ "ic",
3600
+ "s"
3601
+ ]
3602
+ },
3603
+ {
3604
+ "stems": [
3605
+ "build",
3606
+ "draw",
3607
+ "feel",
3608
+ "meet",
3609
+ "record"
3610
+ ],
3611
+ "suffixes": [
3612
+ "",
3613
+ "s",
3614
+ [
3615
+ "ing",
3616
+ [
3617
+ "",
3618
+ "s"
3619
+ ]
3620
+ ]
3621
+ ]
3622
+ },
3623
+ {
3624
+ "stems": [
3625
+ "colon",
3626
+ "industr",
3627
+ "memor",
3628
+ "part",
3629
+ "tr"
3630
+ ],
3631
+ "suffixes": [
3632
+ "ial",
3633
+ "ies",
3634
+ "y"
3635
+ ]
3636
+ },
3637
+ {
3638
+ "stems": [
3639
+ "correct",
3640
+ "direct",
3641
+ "faint",
3642
+ "li",
3643
+ "utter"
3644
+ ],
3645
+ "suffixes": [
3646
+ "",
3647
+ "ed",
3648
+ "ly"
3649
+ ]
3650
+ },
3651
+ {
3652
+ "stems": [
3653
+ "direct",
3654
+ "distinct",
3655
+ "intent",
3656
+ "on",
3657
+ "perfect"
3658
+ ],
3659
+ "suffixes": [
3660
+ "",
3661
+ "ion",
3662
+ "ly"
3663
+ ]
3664
+ },
3665
+ {
3666
+ "stems": [
3667
+ "act",
3668
+ "sail",
3669
+ "visit"
3670
+ ],
3671
+ "suffixes": [
3672
+ "",
3673
+ "ed",
3674
+ "ing",
3675
+ "s",
3676
+ [
3677
+ "or",
3678
+ [
3679
+ "",
3680
+ "s"
3681
+ ]
3682
+ ]
3683
+ ]
3684
+ },
3685
+ {
3686
+ "stems": [
3687
+ "admir",
3688
+ "combin",
3689
+ "declar",
3690
+ "determin",
3691
+ "examin",
3692
+ "imagin",
3693
+ "invit",
3694
+ "n",
3695
+ "observ",
3696
+ "organiz",
3697
+ "prepar",
3698
+ "quot"
3699
+ ],
3700
+ "suffixes": [
3701
+ "ation",
3702
+ [
3703
+ "e",
3704
+ [
3705
+ "",
3706
+ "d"
3707
+ ]
3708
+ ]
3709
+ ]
3710
+ },
3711
+ {
3712
+ "stems": [
3713
+ "aunt",
3714
+ "carr",
3715
+ "dogg",
3716
+ "duck",
3717
+ "frank",
3718
+ "jack",
3719
+ "jenn",
3720
+ "jul",
3721
+ "mumm",
3722
+ "napp",
3723
+ "pott",
3724
+ "ros"
3725
+ ],
3726
+ "suffixes": [
3727
+ "ie",
3728
+ "y"
3729
+ ]
3730
+ },
3731
+ {
3732
+ "stems": [
3733
+ "be",
3734
+ "bea",
3735
+ "bu",
3736
+ "fin",
3737
+ "i",
3738
+ "ki",
3739
+ "lea",
3740
+ "loa",
3741
+ "no",
3742
+ "swor",
3743
+ "wor",
3744
+ "yar"
3745
+ ],
3746
+ "suffixes": [
3747
+ "n",
3748
+ [
3749
+ "d",
3750
+ [
3751
+ "",
3752
+ "s"
3753
+ ]
3754
+ ]
3755
+ ]
3756
+ },
3757
+ {
3758
+ "stems": [
3759
+ "bu",
3760
+ "c",
3761
+ "har",
3762
+ "ju",
3763
+ "lor",
3764
+ "ma",
3765
+ "ro",
3766
+ "robbe",
3767
+ "sala",
3768
+ "slippe",
3769
+ "t",
3770
+ "wor"
3771
+ ],
3772
+ "suffixes": [
3773
+ "d",
3774
+ "ry"
3775
+ ]
3776
+ },
3777
+ {
3778
+ "stems": [
3779
+ "a",
3780
+ "ashe",
3781
+ "boxe",
3782
+ "d",
3783
+ "e",
3784
+ "fishe",
3785
+ "l",
3786
+ "m",
3787
+ "p",
3788
+ "pea",
3789
+ "va"
3790
+ ],
3791
+ "suffixes": [
3792
+ "r",
3793
+ "s"
3794
+ ]
3795
+ },
3796
+ {
3797
+ "stems": [
3798
+ "appl",
3799
+ "cod",
3800
+ "grav",
3801
+ "hol",
3802
+ "nois",
3803
+ "ros",
3804
+ "stor",
3805
+ "tast",
3806
+ "to",
3807
+ "ton",
3808
+ "trac"
3809
+ ],
3810
+ "suffixes": [
3811
+ "y",
3812
+ [
3813
+ "e",
3814
+ [
3815
+ "",
3816
+ "s"
3817
+ ]
3818
+ ]
3819
+ ]
3820
+ },
3821
+ {
3822
+ "stems": [
3823
+ "art",
3824
+ "ass",
3825
+ "ex",
3826
+ "guitar",
3827
+ "journal",
3828
+ "mo",
3829
+ "res",
3830
+ "social",
3831
+ "terror",
3832
+ "tour",
3833
+ "wa"
3834
+ ],
3835
+ "suffixes": [
3836
+ "",
3837
+ "ist"
3838
+ ]
3839
+ },
3840
+ {
3841
+ "stems": [
3842
+ "",
3843
+ "a",
3844
+ "ba",
3845
+ "di",
3846
+ "flu",
3847
+ "fooli",
3848
+ "hu",
3849
+ "rubbi",
3850
+ "standi",
3851
+ "wi"
3852
+ ],
3853
+ "suffixes": [
3854
+ "ng",
3855
+ "sh"
3856
+ ]
3857
+ },
3858
+ {
3859
+ "stems": [
3860
+ "",
3861
+ "b",
3862
+ "d",
3863
+ "ev",
3864
+ "h",
3865
+ "horsem",
3866
+ "m",
3867
+ "p",
3868
+ "r",
3869
+ "t"
3870
+ ],
3871
+ "suffixes": [
3872
+ "an",
3873
+ "en"
3874
+ ]
3875
+ },
3876
+ {
3877
+ "stems": [
3878
+ "beck",
3879
+ "bett",
3880
+ "count",
3881
+ "hard",
3882
+ "jenn",
3883
+ "philosoph",
3884
+ "photograph",
3885
+ "pott",
3886
+ "read",
3887
+ "stick"
3888
+ ],
3889
+ "suffixes": [
3890
+ "er",
3891
+ "y"
3892
+ ]
3893
+ },
3894
+ {
3895
+ "stems": [
3896
+ "chatt",
3897
+ "cutt",
3898
+ "digg",
3899
+ "lett",
3900
+ "manufactur",
3901
+ "rubb",
3902
+ "runn",
3903
+ "swimm",
3904
+ "trail",
3905
+ "winn"
3906
+ ],
3907
+ "suffixes": [
3908
+ "er",
3909
+ "ing"
3910
+ ]
3911
+ },
3912
+ {
3913
+ "stems": [
3914
+ "creat",
3915
+ "d",
3916
+ "defens",
3917
+ "dr",
3918
+ "expens",
3919
+ "g",
3920
+ "l",
3921
+ "nat",
3922
+ "offens",
3923
+ "relat"
3924
+ ],
3925
+ "suffixes": [
3926
+ "e",
3927
+ "ive"
3928
+ ]
3929
+ },
3930
+ {
3931
+ "stems": [
3932
+ "",
3933
+ "coloni",
3934
+ "go",
3935
+ "industri",
3936
+ "memori",
3937
+ "parti",
3938
+ "seri",
3939
+ "speci",
3940
+ "tri"
3941
+ ],
3942
+ "suffixes": [
3943
+ "al",
3944
+ "es"
3945
+ ]
3946
+ },
3947
+ {
3948
+ "stems": [
3949
+ "a",
3950
+ "basi",
3951
+ "do",
3952
+ "eri",
3953
+ "heroi",
3954
+ "in",
3955
+ "ma",
3956
+ "mi",
3957
+ "toxi"
3958
+ ],
3959
+ "suffixes": [
3960
+ "c",
3961
+ "n"
3962
+ ]
3963
+ },
3964
+ {
3965
+ "stems": [
3966
+ "activ",
3967
+ "c",
3968
+ "captiv",
3969
+ "commun",
3970
+ "dens",
3971
+ "grav",
3972
+ "intens",
3973
+ "secur",
3974
+ "univers"
3975
+ ],
3976
+ "suffixes": [
3977
+ "e",
3978
+ "ity"
3979
+ ]
3980
+ },
3981
+ {
3982
+ "stems": [
3983
+ "as",
3984
+ "dis",
3985
+ "muc",
3986
+ "o",
3987
+ "ric",
3988
+ "suc",
3989
+ "tc",
3990
+ "u",
3991
+ "zac"
3992
+ ],
3993
+ "suffixes": [
3994
+ "h",
3995
+ "k"
3996
+ ]
3997
+ },
3998
+ {
3999
+ "stems": [
4000
+ "behavio",
4001
+ "colo",
4002
+ "favo",
4003
+ "fo",
4004
+ "harbo",
4005
+ "hono",
4006
+ "labo",
4007
+ "neighbo",
4008
+ "o"
4009
+ ],
4010
+ "suffixes": [
4011
+ "r",
4012
+ "ur"
4013
+ ]
4014
+ },
4015
+ {
4016
+ "stems": [
4017
+ "communicat",
4018
+ "institut",
4019
+ "l",
4020
+ "nat",
4021
+ "on",
4022
+ "operat",
4023
+ "relat",
4024
+ "stat",
4025
+ "vers"
4026
+ ],
4027
+ "suffixes": [
4028
+ "e",
4029
+ [
4030
+ "ion",
4031
+ [
4032
+ "",
4033
+ "s"
4034
+ ]
4035
+ ]
4036
+ ]
4037
+ },
4038
+ {
4039
+ "stems": [
4040
+ "creat",
4041
+ "edit",
4042
+ "elevat",
4043
+ "inspect",
4044
+ "investigat",
4045
+ "profess",
4046
+ "radiat",
4047
+ "success",
4048
+ "translat"
4049
+ ],
4050
+ "suffixes": [
4051
+ "ion",
4052
+ "or"
4053
+ ]
4054
+ },
4055
+ {
4056
+ "stems": [
4057
+ "author",
4058
+ "christian",
4059
+ "hospital",
4060
+ "human"
4061
+ ],
4062
+ "suffixes": [
4063
+ "",
4064
+ "ity",
4065
+ "s"
4066
+ ]
4067
+ },
4068
+ {
4069
+ "stems": [
4070
+ "nice",
4071
+ "safe",
4072
+ "strange",
4073
+ "wide"
4074
+ ],
4075
+ "suffixes": [
4076
+ "",
4077
+ "ly",
4078
+ "r"
4079
+ ]
4080
+ },
4081
+ {
4082
+ "stems": [
4083
+ "beat",
4084
+ "eat",
4085
+ "fall"
4086
+ ],
4087
+ "suffixes": [
4088
+ "",
4089
+ "en",
4090
+ "ing",
4091
+ "s"
4092
+ ]
4093
+ },
4094
+ {
4095
+ "stems": [
4096
+ "eight",
4097
+ "sevent",
4098
+ "sixt"
4099
+ ],
4100
+ "suffixes": [
4101
+ "",
4102
+ "een",
4103
+ "h",
4104
+ "y"
4105
+ ]
4106
+ },
4107
+ {
4108
+ "stems": [
4109
+ "end",
4110
+ "fail",
4111
+ "press"
4112
+ ],
4113
+ "suffixes": [
4114
+ "",
4115
+ "ed",
4116
+ "ing",
4117
+ "ure"
4118
+ ]
4119
+ },
4120
+ {
4121
+ "stems": [
4122
+ "admira",
4123
+ "anima",
4124
+ "corpora",
4125
+ "equa",
4126
+ "federa",
4127
+ "forma",
4128
+ "genera",
4129
+ "loca"
4130
+ ],
4131
+ "suffixes": [
4132
+ "l",
4133
+ "tion"
4134
+ ]
4135
+ },
4136
+ {
4137
+ "stems": [
4138
+ "arriv",
4139
+ "centr",
4140
+ "cultur",
4141
+ "d",
4142
+ "di",
4143
+ "practic",
4144
+ "se",
4145
+ "trib"
4146
+ ],
4147
+ "suffixes": [
4148
+ "al",
4149
+ [
4150
+ "e",
4151
+ [
4152
+ "",
4153
+ "s"
4154
+ ]
4155
+ ]
4156
+ ]
4157
+ },
4158
+ {
4159
+ "stems": [
4160
+ "composit",
4161
+ "contribut",
4162
+ "corporat",
4163
+ "definit",
4164
+ "imitat",
4165
+ "investigat",
4166
+ "opposit",
4167
+ "tens"
4168
+ ],
4169
+ "suffixes": [
4170
+ "e",
4171
+ "ion"
4172
+ ]
4173
+ },
4174
+ {
4175
+ "stems": [
4176
+ "consist",
4177
+ "d",
4178
+ "moan",
4179
+ "pant",
4180
+ "r",
4181
+ "s",
4182
+ "whimper",
4183
+ "yawn"
4184
+ ],
4185
+ "suffixes": [
4186
+ "ing",
4187
+ "s"
4188
+ ]
4189
+ },
4190
+ {
4191
+ "stems": [
4192
+ "d",
4193
+ "mathematic",
4194
+ "mechanic",
4195
+ "p",
4196
+ "physic",
4197
+ "politic",
4198
+ "statistic",
4199
+ "v"
4200
+ ],
4201
+ "suffixes": [
4202
+ "al",
4203
+ "s"
4204
+ ]
4205
+ },
4206
+ {
4207
+ "stems": [
4208
+ "",
4209
+ "act",
4210
+ "direct",
4211
+ "instruct",
4212
+ "mot",
4213
+ "operat",
4214
+ "sect"
4215
+ ],
4216
+ "suffixes": [
4217
+ "or",
4218
+ [
4219
+ "ion",
4220
+ [
4221
+ "",
4222
+ "s"
4223
+ ]
4224
+ ]
4225
+ ]
4226
+ },
4227
+ {
4228
+ "stems": [
4229
+ "",
4230
+ "be",
4231
+ "famili",
4232
+ "fe",
4233
+ "li",
4234
+ "molecul",
4235
+ "pol"
4236
+ ],
4237
+ "suffixes": [
4238
+ "ar",
4239
+ "es"
4240
+ ]
4241
+ },
4242
+ {
4243
+ "stems": [
4244
+ "",
4245
+ "cycli",
4246
+ "ha",
4247
+ "ho",
4248
+ "ju",
4249
+ "lo",
4250
+ "touri"
4251
+ ],
4252
+ "suffixes": [
4253
+ "ng",
4254
+ "st"
4255
+ ]
4256
+ },
4257
+ {
4258
+ "stems": [
4259
+ "affection",
4260
+ "deb",
4261
+ "imit",
4262
+ "kar",
4263
+ "passion",
4264
+ "pl",
4265
+ "st"
4266
+ ],
4267
+ "suffixes": [
4268
+ "",
4269
+ "ate"
4270
+ ]
4271
+ },
4272
+ {
4273
+ "stems": [
4274
+ "appoint",
4275
+ "argu",
4276
+ "astonish",
4277
+ "depart",
4278
+ "disappoint",
4279
+ "judg",
4280
+ "unemploy"
4281
+ ],
4282
+ "suffixes": [
4283
+ "ed",
4284
+ "ment"
4285
+ ]
4286
+ },
4287
+ {
4288
+ "stems": [
4289
+ "ain",
4290
+ "couldn",
4291
+ "doesn",
4292
+ "hadn",
4293
+ "wasn",
4294
+ "wouldn"
4295
+ ],
4296
+ "suffixes": [
4297
+ "'t",
4298
+ "’t"
4299
+ ]
4300
+ },
4301
+ {
4302
+ "stems": [
4303
+ "associat",
4304
+ "hesitat",
4305
+ "l",
4306
+ "not",
4307
+ "promot",
4308
+ "stat"
4309
+ ],
4310
+ "suffixes": [
4311
+ "ion",
4312
+ [
4313
+ "e",
4314
+ [
4315
+ "",
4316
+ "d"
4317
+ ]
4318
+ ]
4319
+ ]
4320
+ },
4321
+ {
4322
+ "stems": [
4323
+ "commerc",
4324
+ "d",
4325
+ "fac",
4326
+ "financ",
4327
+ "offic",
4328
+ "provinc"
4329
+ ],
4330
+ "suffixes": [
4331
+ "e",
4332
+ "ial"
4333
+ ]
4334
+ },
4335
+ {
4336
+ "stems": [
4337
+ "conservati",
4338
+ "li",
4339
+ "mo",
4340
+ "nati",
4341
+ "objecti",
4342
+ "relati"
4343
+ ],
4344
+ "suffixes": [
4345
+ "on",
4346
+ [
4347
+ "ve",
4348
+ [
4349
+ "",
4350
+ "s"
4351
+ ]
4352
+ ]
4353
+ ]
4354
+ },
4355
+ {
4356
+ "stems": [
4357
+ "da",
4358
+ "existe",
4359
+ "fe",
4360
+ "gla",
4361
+ "occurre",
4362
+ "si"
4363
+ ],
4364
+ "suffixes": [
4365
+ "d",
4366
+ "nce"
4367
+ ]
4368
+ },
4369
+ {
4370
+ "stems": [
4371
+ "electric",
4372
+ "major",
4373
+ "minor",
4374
+ "municipal",
4375
+ "popular",
4376
+ "prior"
4377
+ ],
4378
+ "suffixes": [
4379
+ "",
4380
+ "ity"
4381
+ ]
4382
+ },
4383
+ {
4384
+ "stems": [
4385
+ "bounc",
4386
+ "shak",
4387
+ "tickl"
4388
+ ],
4389
+ "suffixes": [
4390
+ "ing",
4391
+ "y",
4392
+ [
4393
+ "e",
4394
+ [
4395
+ "",
4396
+ "s"
4397
+ ]
4398
+ ]
4399
+ ]
4400
+ },
4401
+ {
4402
+ "stems": [
4403
+ "ca",
4404
+ "office",
4405
+ "wa"
4406
+ ],
4407
+ "suffixes": [
4408
+ "",
4409
+ "s",
4410
+ [
4411
+ "r",
4412
+ [
4413
+ "",
4414
+ "s"
4415
+ ]
4416
+ ]
4417
+ ]
4418
+ },
4419
+ {
4420
+ "stems": [
4421
+ "commi",
4422
+ "permi",
4423
+ "submi"
4424
+ ],
4425
+ "suffixes": [
4426
+ "ssion",
4427
+ "t",
4428
+ "tted"
4429
+ ]
4430
+ },
4431
+ {
4432
+ "stems": [
4433
+ "dark",
4434
+ "high",
4435
+ "weak"
4436
+ ],
4437
+ "suffixes": [
4438
+ "",
4439
+ "er",
4440
+ "ness"
4441
+ ]
4442
+ },
4443
+ {
4444
+ "stems": [
4445
+ "employ",
4446
+ "establish",
4447
+ "punish"
4448
+ ],
4449
+ "suffixes": [
4450
+ "",
4451
+ "ed",
4452
+ "ment"
4453
+ ]
4454
+ },
4455
+ {
4456
+ "stems": [
4457
+ "free",
4458
+ "ju",
4459
+ "li"
4460
+ ],
4461
+ "suffixes": [
4462
+ "",
4463
+ "d",
4464
+ "ly"
4465
+ ]
4466
+ },
4467
+ {
4468
+ "stems": [
4469
+ "part",
4470
+ "president",
4471
+ "resident"
4472
+ ],
4473
+ "suffixes": [
4474
+ "",
4475
+ "ial",
4476
+ "s"
4477
+ ]
4478
+ },
4479
+ {
4480
+ "stems": [
4481
+ "pray",
4482
+ "rid",
4483
+ "teach"
4484
+ ],
4485
+ "suffixes": [
4486
+ "",
4487
+ "ing",
4488
+ [
4489
+ "er",
4490
+ [
4491
+ "",
4492
+ "s"
4493
+ ]
4494
+ ]
4495
+ ]
4496
+ },
4497
+ {
4498
+ "stems": [
4499
+ "",
4500
+ "counti",
4501
+ "do",
4502
+ "handi",
4503
+ "parti"
4504
+ ],
4505
+ "suffixes": [
4506
+ "es",
4507
+ "ng"
4508
+ ]
4509
+ },
4510
+ {
4511
+ "stems": [
4512
+ "academ",
4513
+ "econom",
4514
+ "histor",
4515
+ "m",
4516
+ "strateg"
4517
+ ],
4518
+ "suffixes": [
4519
+ "ic",
4520
+ "y"
4521
+ ]
4522
+ },
4523
+ {
4524
+ "stems": [
4525
+ "acti",
4526
+ "collecti",
4527
+ "operati",
4528
+ "relati",
4529
+ "suggesti"
4530
+ ],
4531
+ "suffixes": [
4532
+ "ng",
4533
+ [
4534
+ "on",
4535
+ [
4536
+ "",
4537
+ "s"
4538
+ ]
4539
+ ]
4540
+ ]
4541
+ },
4542
+ {
4543
+ "stems": [
4544
+ "am",
4545
+ "bab",
4546
+ "mamm",
4547
+ "momm",
4548
+ "wh"
4549
+ ],
4550
+ "suffixes": [
4551
+ "a",
4552
+ [
4553
+ "y",
4554
+ [
4555
+ "",
4556
+ "'s"
4557
+ ]
4558
+ ]
4559
+ ]
4560
+ },
4561
+ {
4562
+ "stems": [
4563
+ "ar",
4564
+ "franch",
4565
+ "parad",
4566
+ "r",
4567
+ "w"
4568
+ ],
4569
+ "suffixes": [
4570
+ "e",
4571
+ "ise"
4572
+ ]
4573
+ },
4574
+ {
4575
+ "stems": [
4576
+ "belie",
4577
+ "lea",
4578
+ "thie",
4579
+ "wol",
4580
+ "yoursel"
4581
+ ],
4582
+ "suffixes": [
4583
+ "f",
4584
+ "ves"
4585
+ ]
4586
+ },
4587
+ {
4588
+ "stems": [
4589
+ "comed",
4590
+ "histor",
4591
+ "hungar",
4592
+ "ital",
4593
+ "lil"
4594
+ ],
4595
+ "suffixes": [
4596
+ "ian",
4597
+ "y"
4598
+ ]
4599
+ },
4600
+ {
4601
+ "stems": [
4602
+ "destroye",
4603
+ "followe",
4604
+ "slippe",
4605
+ "supporte",
4606
+ "travele"
4607
+ ],
4608
+ "suffixes": [
4609
+ "d",
4610
+ "rs"
4611
+ ]
4612
+ },
4613
+ {
4614
+ "stems": [
4615
+ "el",
4616
+ "gabriel",
4617
+ "isabel",
4618
+ "lo",
4619
+ "pau"
4620
+ ],
4621
+ "suffixes": [
4622
+ "",
4623
+ "la"
4624
+ ]
4625
+ },
4626
+ {
4627
+ "stems": [
4628
+ "eviden",
4629
+ "presen",
4630
+ "significan",
4631
+ "silen",
4632
+ "violen"
4633
+ ],
4634
+ "suffixes": [
4635
+ "ce",
4636
+ [
4637
+ "t",
4638
+ [
4639
+ "",
4640
+ "ly"
4641
+ ]
4642
+ ]
4643
+ ]
4644
+ },
4645
+ {
4646
+ "stems": [
4647
+ "form",
4648
+ "occasion",
4649
+ "origin",
4650
+ "person",
4651
+ "profession"
4652
+ ],
4653
+ "suffixes": [
4654
+ "",
4655
+ [
4656
+ "al",
4657
+ [
4658
+ "",
4659
+ "ly"
4660
+ ]
4661
+ ]
4662
+ ]
4663
+ },
4664
+ {
4665
+ "stems": [
4666
+ "friend",
4667
+ "member",
4668
+ "partner",
4669
+ "scholar",
4670
+ "town"
4671
+ ],
4672
+ "suffixes": [
4673
+ "",
4674
+ [
4675
+ "s",
4676
+ [
4677
+ "",
4678
+ "hip"
4679
+ ]
4680
+ ]
4681
+ ]
4682
+ },
4683
+ {
4684
+ "stems": [
4685
+ "accident",
4686
+ "automatic",
4687
+ "basic",
4688
+ "specific"
4689
+ ],
4690
+ "suffixes": [
4691
+ "",
4692
+ "ally"
4693
+ ]
4694
+ },
4695
+ {
4696
+ "stems": [
4697
+ "accus",
4698
+ "alter",
4699
+ "consult",
4700
+ "tempt"
4701
+ ],
4702
+ "suffixes": [
4703
+ "ation",
4704
+ "ed"
4705
+ ]
4706
+ },
4707
+ {
4708
+ "stems": [
4709
+ "ambitio",
4710
+ "cla",
4711
+ "religio",
4712
+ "suspicio"
4713
+ ],
4714
+ "suffixes": [
4715
+ "n",
4716
+ "us"
4717
+ ]
4718
+ },
4719
+ {
4720
+ "stems": [
4721
+ "clu",
4722
+ "cooki",
4723
+ "di",
4724
+ "movi"
4725
+ ],
4726
+ "suffixes": [
4727
+ "ng",
4728
+ [
4729
+ "e",
4730
+ [
4731
+ "",
4732
+ "s"
4733
+ ]
4734
+ ]
4735
+ ]
4736
+ },
4737
+ {
4738
+ "stems": [
4739
+ "easi",
4740
+ "happi",
4741
+ "heavi",
4742
+ "supp"
4743
+ ],
4744
+ "suffixes": [
4745
+ "er",
4746
+ "ly"
4747
+ ]
4748
+ },
4749
+ {
4750
+ "stems": [
4751
+ "could",
4752
+ "would"
4753
+ ],
4754
+ "suffixes": [
4755
+ "",
4756
+ "'ve",
4757
+ "n't",
4758
+ "n’t"
4759
+ ]
4760
+ },
4761
+ {
4762
+ "stems": [
4763
+ "large",
4764
+ "late"
4765
+ ],
4766
+ "suffixes": [
4767
+ "",
4768
+ "ly",
4769
+ "r",
4770
+ "st"
4771
+ ]
4772
+ },
4773
+ {
4774
+ "stems": [
4775
+ "north",
4776
+ "south"
4777
+ ],
4778
+ "suffixes": [
4779
+ "",
4780
+ "ern",
4781
+ [
4782
+ "east",
4783
+ [
4784
+ "",
4785
+ "ern"
4786
+ ]
4787
+ ],
4788
+ [
4789
+ "west",
4790
+ [
4791
+ "",
4792
+ "ern"
4793
+ ]
4794
+ ]
4795
+ ]
4796
+ },
4797
+ {
4798
+ "stems": [
4799
+ "second",
4800
+ "secret"
4801
+ ],
4802
+ "suffixes": [
4803
+ "",
4804
+ "ary",
4805
+ "ly",
4806
+ "s"
4807
+ ]
4808
+ },
4809
+ {
4810
+ "stems": [
4811
+ "biolog",
4812
+ "histor",
4813
+ "psycholog"
4814
+ ],
4815
+ "suffixes": [
4816
+ "ical",
4817
+ "y"
4818
+ ]
4819
+ },
4820
+ {
4821
+ "stems": [
4822
+ "compani",
4823
+ "li",
4824
+ "wag"
4825
+ ],
4826
+ "suffixes": [
4827
+ "es",
4828
+ [
4829
+ "on",
4830
+ [
4831
+ "",
4832
+ "s"
4833
+ ]
4834
+ ]
4835
+ ]
4836
+ },
4837
+ {
4838
+ "stems": [
4839
+ "east",
4840
+ "st",
4841
+ "west"
4842
+ ],
4843
+ "suffixes": [
4844
+ "",
4845
+ "ern"
4846
+ ]
4847
+ },
4848
+ {
4849
+ "stems": [
4850
+ "giggl",
4851
+ "mumbl",
4852
+ "vocalis"
4853
+ ],
4854
+ "suffixes": [
4855
+ "es",
4856
+ "ing"
4857
+ ]
4858
+ },
4859
+ {
4860
+ "stems": [
4861
+ "card",
4862
+ "term"
4863
+ ],
4864
+ "suffixes": [
4865
+ "",
4866
+ "inal",
4867
+ "s"
4868
+ ]
4869
+ },
4870
+ {
4871
+ "stems": [
4872
+ "complet",
4873
+ "separat"
4874
+ ],
4875
+ "suffixes": [
4876
+ "ely",
4877
+ "ion",
4878
+ [
4879
+ "e",
4880
+ [
4881
+ "",
4882
+ "d"
4883
+ ]
4884
+ ]
4885
+ ]
4886
+ },
4887
+ {
4888
+ "stems": [
4889
+ "danger",
4890
+ "joy"
4891
+ ],
4892
+ "suffixes": [
4893
+ "",
4894
+ "ous",
4895
+ "s"
4896
+ ]
4897
+ },
4898
+ {
4899
+ "stems": [
4900
+ "dear",
4901
+ "slight"
4902
+ ],
4903
+ "suffixes": [
4904
+ "",
4905
+ "est",
4906
+ "ly"
4907
+ ]
4908
+ },
4909
+ {
4910
+ "stems": [
4911
+ "possib",
4912
+ "probab"
4913
+ ],
4914
+ "suffixes": [
4915
+ "ility",
4916
+ "le",
4917
+ "ly"
4918
+ ]
4919
+ },
4920
+ {
4921
+ "stems": [
4922
+ "addi",
4923
+ "tradi"
4924
+ ],
4925
+ "suffixes": [
4926
+ "ng",
4927
+ [
4928
+ "tion",
4929
+ [
4930
+ "",
4931
+ "al"
4932
+ ]
4933
+ ]
4934
+ ]
4935
+ },
4936
+ {
4937
+ "stems": [
4938
+ "differen",
4939
+ "instan"
4940
+ ],
4941
+ "suffixes": [
4942
+ [
4943
+ "ce",
4944
+ [
4945
+ "",
4946
+ "s"
4947
+ ]
4948
+ ],
4949
+ [
4950
+ "t",
4951
+ [
4952
+ "",
4953
+ "ly"
4954
+ ]
4955
+ ]
4956
+ ]
4957
+ },
4958
+ {
4959
+ "stems": [
4960
+ "northe",
4961
+ "southe"
4962
+ ],
4963
+ "suffixes": [
4964
+ "rn",
4965
+ [
4966
+ "ast",
4967
+ [
4968
+ "",
4969
+ "ern"
4970
+ ]
4971
+ ]
4972
+ ]
4973
+ },
4974
+ {
4975
+ "stems": [
4976
+ "nothin",
4977
+ "somethin"
4978
+ ],
4979
+ "suffixes": [
4980
+ "'",
4981
+ [
4982
+ "g",
4983
+ [
4984
+ "",
4985
+ "'s"
4986
+ ]
4987
+ ]
4988
+ ]
4989
+ },
4990
+ {
4991
+ "stems": [
4992
+ "require"
4993
+ ],
4994
+ "suffixes": [
4995
+ "",
4996
+ "d",
4997
+ "s",
4998
+ [
4999
+ "ment",
5000
+ [
5001
+ "",
5002
+ "s"
5003
+ ]
5004
+ ]
5005
+ ]
5006
+ },
5007
+ {
5008
+ "stems": [
5009
+ "champion"
5010
+ ],
5011
+ "suffixes": [
5012
+ "",
5013
+ [
5014
+ "s",
5015
+ [
5016
+ "",
5017
+ "hip"
5018
+ ]
5019
+ ],
5020
+ [
5021
+ "ship",
5022
+ [
5023
+ "",
5024
+ "s"
5025
+ ]
5026
+ ]
5027
+ ]
5028
+ },
5029
+ {
5030
+ "stems": [
5031
+ "fift"
5032
+ ],
5033
+ "suffixes": [
5034
+ "een",
5035
+ "h",
5036
+ "y"
5037
+ ]
5038
+ },
5039
+ {
5040
+ "stems": [
5041
+ "public"
5042
+ ],
5043
+ "suffixes": [
5044
+ "",
5045
+ "ity",
5046
+ "ly"
5047
+ ]
5048
+ }
5049
+ ]
5050
+ }
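The nested entries in the suffix lists above (e.g. ["er", ["", "s"]] or ["ion", ["", "s"]]) encode a suffix that can itself be continued by a further suffix. A minimal expansion sketch, assuming a nested entry [base, continuations] licenses base followed by any of its continuations (the same shape that _deserialize_suffixes_from_json in tokenizer.py below reads into (base, frozenset(continuations)) pairs); the expand_* helpers are illustrative and not part of the uploaded code:

# Illustrative only: expand one paradigm entry into its surface forms.
def expand_suffixes(suffixes):
    """Flatten a suffix list that may contain nested [base, continuations] entries."""
    flat = []
    for item in suffixes:
        if isinstance(item, list):
            base, continuations = item  # nested: base suffix plus its possible continuations
            flat.extend(base + cont for cont in continuations)
        else:
            flat.append(item)  # plain string suffix such as "", "ing", "s"
    return flat

def expand_paradigm(paradigm):
    """Yield every stem+suffix surface form licensed by one paradigm entry."""
    for stem in paradigm["stems"]:
        for suffix in expand_suffixes(paradigm["suffixes"]):
            yield stem + suffix

# Using a (shortened) paradigm from the file above:
paradigm = {"stems": ["build", "draw"], "suffixes": ["", "ing", "s", ["er", ["", "s"]]]}
print(sorted(expand_paradigm(paradigm)))
# ['build', 'builder', 'builders', 'building', 'builds',
#  'draw', 'drawer', 'drawers', 'drawing', 'draws']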
preprocess_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "lowercase": true,
3
+ "separate_apostrophes": false,
4
+ "separate_digits": true,
5
+ "separate_punctuation": true
6
+ }
preprocessing.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # preprocessing.py
2
+
3
+ import re
4
+
5
+ class Preprocessor:
6
+ def __init__(self, lowercase=False, separate_apostrophes=True, separate_digits=True, separate_punctuation=True):
7
+ self.lowercase = lowercase
8
+ self.separate_apostrophes = separate_apostrophes
9
+ self.separate_punctuation = separate_punctuation
10
+ self.separate_digits = separate_digits
11
+
12
+ def preprocess(self, line: str) -> str:
13
+ if self.lowercase:
14
+ line = line.lower()
15
+ if self.separate_apostrophes:
16
+ # Add spaces around apostrophes
17
+ line = re.sub(r"([’'`])", r" \1 ", line)
18
+ # Add spaces around punctuation (except alphanumeric and apostrophes)
19
+ if self.separate_punctuation:
20
+ line = re.sub(r"([^A-Za-z0-9\s’'`])", r" \1 ", line)
21
+ if self.separate_digits:
22
+ line = re.sub(r"(\d)", r" \1 ", line)
23
+
24
+ # Normalize whitespace
25
+ line = re.sub(r"\s+", " ", line)
26
+ return line.strip()
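A short usage sketch tying preprocess_config.json to the Preprocessor above; the relative config path is only an assumption, the point is that the JSON keys map one-to-one onto the constructor keyword arguments:

# Illustrative usage of Preprocessor with preprocess_config.json (path assumed).
import json
from preprocessing import Preprocessor

with open("preprocess_config.json", "r", encoding="utf-8") as f:
    cfg = json.load(f)  # {"lowercase": true, "separate_apostrophes": false, ...}

pre = Preprocessor(**cfg)  # keys match the __init__ keyword arguments
print(pre.preprocess("The singer's 2 songs, very nice!"))
# with this config: "the singer's 2 songs , very nice !"
#   (lowercased, punctuation and digits spaced out, apostrophes left attached)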
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83cc6d0c3757dce499cb73dec58b5dd30586b574b82060be53e2b9d1cd984b91
3
+ size 433174963
sanity_check.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForCausalLM
2
+ import torch, torch.nn.functional as F
3
+
4
+ tok = AutoTokenizer.from_pretrained("/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M", trust_remote_code=True)
5
+ model = AutoModelForCausalLM.from_pretrained("/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M", trust_remote_code=True).eval().to("cuda" if torch.cuda.is_available() else "cpu")
6
+
7
+ print("pad_id:", tok.pad_token_id, "eos_id:", tok.eos_token_id, "bos_id:", tok.bos_token_id)
8
+ assert tok.pad_token_id is not None, "pad_token is None"
9
+ assert tok.pad_token_id != tok.eos_token_id, "BUG: pad_id == eos_id (this can bias sentence scores)"
10
+ assert tok.bos_token_id is None or isinstance(tok.bos_token_id, int)
11
+
12
+ # pick one JSONL example from this task
13
+ s_good = "Create a noun out of the following adjective: clangish. clangishity" # the grammatical one
14
+ s_bad = "Create a noun out of the following adjective: clangish. clangishness" # the ungrammatical one
15
+
16
+ print("\n--- Tokenization debug ---")
17
+ for label, sent in [("good", s_good), ("bad", s_bad)]:
18
+ toks = tok.tokenize(sent)
19
+ ids = tok.encode(sent, add_special_tokens=True)
20
+ print(f"{label} sentence: {sent}")
21
+ print(f" tokens: {toks}")
22
+ print(f" ids : {ids}")
23
+ print("--- End tokenization debug ---\n")
24
+
25
+ def sent_logprob(s):
26
+ # mimic eval: no special tokens
27
+ enc = tok(s, add_special_tokens=False, return_tensors="pt")
28
+ input_ids = enc["input_ids"].to(model.device)
29
+ attn_mask = enc["attention_mask"].to(model.device)
30
+ with torch.no_grad():
31
+ out = model(input_ids=input_ids, attention_mask=attn_mask)
32
+ logits = out.logits[:, :-1, :] # shift for next-token LM
33
+ targets = input_ids[:, 1:] # gold next tokens
34
+ lp = F.log_softmax(logits, dim=-1).gather(-1, targets.unsqueeze(-1)).squeeze(-1)
35
+ # mask out padding if any
36
+ if tok.pad_token_id is not None:
37
+ keep = (targets != tok.pad_token_id)
38
+ lp = lp * keep
39
+ return lp.sum().item()
40
+
41
+ print("LP(good) =", sent_logprob(s_good))
42
+ print("LP(bad) =", sent_logprob(s_bad))
segmentation_tests.py ADDED
@@ -0,0 +1,36 @@
1
+ #from tokenizer import ParadigmTokenizerWrapper
2
+
3
+ #tok = ParadigmTokenizerWrapper("/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M")
4
+ #enc = tok("the singers were singing a very nice song!")
5
+ #print(tok.tok.convert_ids_to_tokens(enc["input_ids"]))
6
+
7
+
8
+ from transformers import AutoTokenizer
9
+ tok = AutoTokenizer.from_pretrained("/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M", trust_remote_code=True, local_files_only=True)
10
+
11
+ print(type(tok))
12
+
13
+ print(tok("the singers were singing a very nice song!"))
14
+
15
+ print(tok.tokenize("the singers were singing a very nice song!"))
16
+
17
+ print(tok.special_tokens_map)
18
+ print(tok.bos_token, tok.eos_token, tok.bos_token_id, tok.eos_token_id)
19
+
20
+ enc = tok("the skibidiboppers were sdjnajning a very nice song!",
21
+ add_special_tokens=True,
22
+ return_attention_mask=True)
23
+ print(tok.convert_ids_to_tokens(enc["input_ids"]))
24
+
25
+ # via HF object
26
+ print(tok.backend_tokenizer.post_processor) # should NOT be None
27
+
28
+ # double-check by reading tokenizer.json directly
29
+ from tokenizers import Tokenizer
30
+ import os
31
+ tk = Tokenizer.from_file(os.path.join("/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M", "tokenizer.json"))
32
+ print(tk.post_processor) # should NOT be None
33
+
34
+ enc = tok(["a b", "a b c d"], padding="max_length", max_length=6, return_tensors="pt")
35
+ print(enc["input_ids"]) # shorter row should end with pad ids
36
+ print(enc["attention_mask"]) # 1 for real tokens, 0 for pads
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer.py ADDED
@@ -0,0 +1,291 @@
1
+ # tokenizer.py
2
+ # Wrapper for ParadigmFinder segmentation + portable HF tokenizer
3
+
4
+ from typing import List, Tuple, Optional, Union, Dict, Any
5
+ import os, json, re
6
+ from transformers import PreTrainedTokenizerFast
7
+
8
+ def _deserialize_suffixes_from_json(sfx_list):
9
+ out = set()
10
+ for item in sfx_list:
11
+ if isinstance(item, list):
12
+ # JSON nested: [base, nested_list]
13
+ base, nested = item
14
+ out.add((base, frozenset(nested)))
15
+ else:
16
+ out.add(item) # plain string like "", "ing", "s"
17
+ return out
18
+
19
+ def _load_paradigms_any(path):
20
+ import json
21
+ with open(path, "r", encoding="utf-8") as f:
22
+ payload = json.load(f)
23
+
24
+ # Case A: new schema with top-level dict {"paradigms": [...]}
25
+ if isinstance(payload, dict) and "paradigms" in payload:
26
+ paradigms = []
27
+ for p in payload["paradigms"]:
28
+ stems = set(p["stems"])
29
+ suffixes = _deserialize_suffixes_from_json(p["suffixes"])
30
+ paradigms.append((stems, suffixes))
31
+ meta = payload.get("meta", {})
32
+ return paradigms, meta
33
+
34
+ # Case B: older “list of pairs” JSON [[stems, suffixes], ...]
35
+ if isinstance(payload, list) and payload and isinstance(payload[0], list):
36
+ paradigms = []
37
+ for stems, suffixes in payload:
38
+ stems = set(stems)
39
+ # suffixes may be ["", ["er", ["", "s"]], "ing"] or already strings
40
+ norm = _deserialize_suffixes_from_json(suffixes)
41
+ paradigms.append((stems, norm))
42
+ return paradigms, {}
43
+
44
+ # Case C: already python-native structure (rare if not using JSON)
45
+ if isinstance(payload, list) and payload and isinstance(payload[0], (list, tuple)) and len(payload[0]) == 2:
46
+ return payload, {}
47
+
48
+ raise ValueError("Unrecognized paradigms.json format")
49
+
50
+ # ----------------------------
51
+ # Paradigm-based segmenter
52
+ # ----------------------------
53
+ class ParadigmFinderSegmenter:
54
+ def __init__(self, paradigms, lowercase=True, space_punct=True):
55
+ self.paradigms = paradigms
56
+ self.lowercase = lowercase
57
+ self.space_punct = space_punct
58
+
59
+ def _preprocess(self, text: str) -> str:
60
+ s = text
61
+ if self.lowercase:
62
+ s = s.lower()
63
+ if self.space_punct:
64
+ s = re.sub(r"([^\w\s'])", r" \1 ", s)
65
+ s = re.sub(r"\s+", " ", s).strip()
66
+ return s
67
+
68
+ # faithful to your segmentation logic
69
+ def _segment_word(self, word: str, fallback=True, top_k=20) -> List[str]:
70
+ def match_suffixes(suffixes, remainder):
71
+ for suffix in suffixes:
72
+ if isinstance(suffix, (tuple, list)):
73
+ base, nested = suffix
74
+ if remainder.startswith(base):
75
+ sub = remainder[len(base):]
76
+ nested_result = match_suffixes(nested, sub)
77
+ if nested_result is not None:
78
+ return [base] + nested_result
79
+ elif remainder == suffix:
80
+ return [suffix] if suffix else []
81
+ return None
82
+
83
+ for stems, suffixes in self.paradigms:
84
+ for stem in stems:
85
+ if word.startswith(stem):
86
+ remainder = word[len(stem):]
87
+ matched_suffix = match_suffixes(suffixes, remainder)
88
+ if matched_suffix is not None:
89
+ return [stem] + matched_suffix
90
+
91
+ if fallback:
92
+ candidates = self.paradigms[:top_k]
93
+ longest = ""
94
+ def collect_flat(sfx):
95
+ for s in sfx:
96
+ if isinstance(s, (tuple, list)):
97
+ yield s[0]
98
+ yield from collect_flat(s[1])
99
+ else:
100
+ yield s
101
+ for _, suffixes in candidates:
102
+ for suffix in collect_flat(suffixes):
103
+ if word.endswith(suffix) and len(suffix) > len(longest):
104
+ longest = suffix
105
+ if longest:
106
+ stem = word[:-len(longest)]
107
+ return [stem, longest]
108
+
109
+ return [word]
110
+
111
+ def segment_with_alignment(self, raw_text: str) -> Tuple[str, List[Optional[int]]]:
112
+ """
113
+ Preprocess + segment; return segmented text and a char map from segmented
114
+ text back to raw indices (None for inserted spaces).
115
+ """
116
+ # 1) Preprocess with alignment
117
+ pre_chars, pre_map = [], []
118
+ s = raw_text.lower() if self.lowercase else raw_text
119
+ out, out_map = [], []
120
+
121
+ # insert spaces around punctuation (if enabled), tracking alignment
122
+ for i, ch in enumerate(s):
123
+ if self.space_punct and re.match(r"[^\w\s']", ch):
124
+ out.append(" "); out_map.append(None)
125
+ out.append(ch); out_map.append(i)
126
+ out.append(" "); out_map.append(None)
127
+ else:
128
+ out.append(ch); out_map.append(i)
129
+
130
+ # collapse/strip spaces
131
+ pre = []
132
+ pre2raw = []
133
+ prev_space = False
134
+ for ch, m in zip(out, out_map):
135
+ if ch.isspace():
136
+ if not prev_space:
137
+ pre.append(" "); pre2raw.append(None)
138
+ prev_space = True
139
+ else:
140
+ pre.append(ch); pre2raw.append(m); prev_space = False
141
+ if pre and pre[0] == " ": pre.pop(0); pre2raw.pop(0)
142
+ if pre and pre[-1] == " ": pre.pop(); pre2raw.pop()
143
+ norm = "".join(pre)
144
+
145
+ # 2) Segment by paradigms, preserving alignment
146
+ seg_chars, seg_map = [], []
147
+ i = 0
148
+ n = len(norm)
149
+ while i < n:
150
+ while i < n and norm[i].isspace():
151
+ i += 1
152
+ if i >= n: break
153
+ j = i
154
+ while j < n and not norm[j].isspace():
155
+ j += 1
156
+ token = norm[i:j]
157
+ token_map = pre2raw[i:j]
158
+ parts = self._segment_word(token, fallback=True)
159
+
160
+ # robust emission: consume all chars exactly once
161
+ pos = 0
162
+ for p_index, part in enumerate(parts):
163
+ L = len(part)
164
+ # clamp to remaining length
165
+ L = min(L, len(token) - pos)
166
+ if L <= 0: continue
167
+ for k in range(L):
168
+ seg_chars.append(token[pos + k])
169
+ seg_map.append(token_map[pos + k])
170
+ pos += L
171
+ if p_index < len(parts) - 1:
172
+ seg_chars.append(" "); seg_map.append(None)
173
+ # inter-token space
174
+ i = j
175
+ while i < n and norm[i].isspace():
176
+ i += 1
177
+ if i < n:
178
+ seg_chars.append(" "); seg_map.append(None)
179
+
180
+ # final collapse (defensive)
181
+ final = []
182
+ final_map = []
183
+ prev_space = False
184
+ for ch, m in zip(seg_chars, seg_map):
185
+ if ch.isspace():
186
+ if not prev_space:
187
+ final.append(" "); final_map.append(None); prev_space = True
188
+ else:
189
+ final.append(ch); final_map.append(m); prev_space = False
190
+ if final and final[0] == " ": final.pop(0); final_map.pop(0)
191
+ if final and final[-1] == " ": final.pop(); final_map.pop()
192
+
193
+ return "".join(final), final_map
194
+
195
+ # ----------------------------
196
+ # Offset remapping helper
197
+ # ----------------------------
198
+ def remap_offsets_to_raw(offsets: List[Tuple[int,int]], pre2raw: List[Optional[int]]) -> List[Tuple[int,int]]:
199
+ mapped = []
200
+ L = len(pre2raw)
201
+ for s,e in offsets:
202
+ s = max(0, min(s, L)); e = max(0, min(e, L))
203
+ rs = re_ = None
204
+ t = s
205
+ while t < e and rs is None:
206
+ if pre2raw[t] is not None: rs = pre2raw[t]
207
+ t += 1
208
+ t = e - 1
209
+ while t >= s and re_ is None:
210
+ if pre2raw[t] is not None: re_ = pre2raw[t] + 1
211
+ t -= 1
212
+ mapped.append((rs if rs is not None else 0, re_ if re_ is not None else 0))
213
+ return mapped
214
+
215
+ # ----------------------------
216
+ # Public wrapper
217
+ # ----------------------------
218
+ class ParadigmTokenizerWrapper(PreTrainedTokenizerFast):
219
+ slow_tokenizer_class = None
220
+
221
+ def __init__(self, *args, **kwargs):
222
+ # ensure fast tokenizer is loaded directly (no slow->fast conversion)
223
+ name_or_path = kwargs.get("name_or_path", None)
224
+ if name_or_path is None and len(args) > 0 and isinstance(args[0], str):
225
+ name_or_path = args[0]
226
+
227
+ if "tokenizer_file" not in kwargs and "tokenizer_object" not in kwargs and name_or_path is not None:
228
+ tf = os.path.join(name_or_path, "tokenizer.json")
229
+ if not os.path.isfile(tf):
230
+ raise FileNotFoundError(f"Expected tokenizer.json at {tf}")
231
+ kwargs["tokenizer_file"] = tf
232
+
233
+ super().__init__(*args, **kwargs)
234
+
235
+ # The folder path AutoTokenizer passes becomes available as:
236
+ # - kwargs.get("name_or_path") on first init
237
+ # - or self.name_or_path after init
238
+ hf_dir = kwargs.get("name_or_path", getattr(self, "name_or_path", None))
239
+ if hf_dir is None:
240
+ # final fallback: derive from tokenizer_file path
241
+ tok_file = getattr(self, "tokenizer_file", None)
242
+ hf_dir = os.path.dirname(tok_file) if tok_file else "."
243
+
244
+ # Load paradigms
245
+ ppath = os.path.join(hf_dir, "paradigms.json")
246
+ if not os.path.exists(ppath):
247
+ raise FileNotFoundError(f"Missing paradigms.json in {hf_dir}")
248
+ self.paradigms, self.paradigms_meta = _load_paradigms_any(ppath)
249
+
250
+ # Load preprocessing flags
251
+ cfg = {"lowercase": True, "space_punct": True}
252
+ cpath = os.path.join(hf_dir, "preprocess_config.json")
253
+ if os.path.exists(cpath):
254
+ with open(cpath, "r", encoding="utf-8") as f:
255
+ cfg.update(json.load(f))
256
+
257
+ self.segmenter = ParadigmFinderSegmenter(
258
+ paradigms=self.paradigms,
259
+ lowercase=cfg.get("lowercase", True),
260
+ space_punct=cfg.get("space_punct", True),
261
+ )
262
+
263
+ # ---- main entry point ----
264
+ def __call__(self, text, **kwargs):
265
+ if isinstance(text, str):
266
+ seg, _ = self.segmenter.segment_with_alignment(text)
267
+ return super().__call__(seg, **kwargs)
268
+ elif isinstance(text, (list, tuple)):
269
+ segs = []
270
+ for t in text:
271
+ seg, _ = self.segmenter.segment_with_alignment(t)
272
+ segs.append(seg)
273
+ return super().__call__(segs, **kwargs)
274
+ else:
275
+ raise TypeError("text must be str or List[str]/Tuple[str]")
276
+
277
+
278
+ def tokenize(self, text, **kwargs):
279
+ # Intercept manual .tokenize() calls to ensure segmentation happens first
280
+ if isinstance(text, str):
281
+ seg, _ = self.segmenter.segment_with_alignment(text)
282
+ return super().tokenize(seg, **kwargs)
283
+ elif isinstance(text, list):
284
+ # Tokenize each string separately, then flatten (matches HF behavior)
285
+ out = []
286
+ for t in text:
287
+ seg, _ = self.segmenter.segment_with_alignment(t)
288
+ out.extend(super().tokenize(seg, **kwargs))
289
+ return out
290
+ else:
291
+ raise TypeError("tokenize() expects str or List[str]")
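To make the (stems, suffixes) structure concrete, here is a toy illustration (assumed paradigm, not taken from paradigms.json) of how ParadigmFinderSegmenter splits words; nested suffixes are (base, nested_set) pairs, and a match returns the stem followed by the suffix pieces:

    from tokenizer import ParadigmFinderSegmenter

    toy = [({"sing"}, {"", "ing", ("er", frozenset({"", "s"}))})]
    seg = ParadigmFinderSegmenter(paradigms=toy)

    print(seg._segment_word("singers"))  # ['sing', 'er', 's']
    print(seg._segment_word("singing"))  # ['sing', 'ing']
    print(seg.segment_with_alignment("The singers were singing!")[0])
    # -> "the sing er s were sing ing !"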
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "</s>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "eos_token": "</s>",
39
+ "extra_special_tokens": {},
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "<pad>",
42
+ "tokenizer_class": "ParadigmTokenizerWrapper",
43
+ "unk_token": "<unk>",
44
+ "auto_map": {
45
+ "AutoTokenizer": [
46
+ "tokenizer.ParadigmTokenizerWrapper",
47
+ null
48
+ ]
49
+ }
50
+ }
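The "auto_map" entry above is what lets AutoTokenizer resolve the custom class defined in tokenizer.py when remote code is trusted; a quick check (illustrative, same idea as segmentation_tests.py, path assumed):

    from transformers import AutoTokenizer
    tok = AutoTokenizer.from_pretrained("03-models/gpt2_ParFindFast_10M", trust_remote_code=True)
    print(type(tok).__name__)  # expected: ParadigmTokenizerWrapper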
training.py ADDED
@@ -0,0 +1,148 @@
1
+ # File: training.py
2
+ # -----------------------------
3
+ # Main script for pretraining an LM with the next-token prediction loss
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+ from time import time
11
+ from torch.utils.data import DataLoader
12
+ import os
13
+ import math
14
+ import wandb
15
+ import gc
16
+ import pickle
17
+ from transformers import AutoTokenizer
18
+
19
+ from utils import get_config, setup_experiment, setup_wandb
20
+ from models import initialize_model, initialize_optimizer, initialize_scheduler, initialize_model_and_optimizers, save_epoch_checkpoint
21
+ from data_utils import load_babylm_data
22
+
23
+ def full_train_loop(cfg, model, optimizer, scheduler):
24
+ # Load the BabyLM dataset
25
+ dataloader = load_babylm_data(cfg)
26
+
27
+ # Start the loop
28
+ start_time = time()
29
+ epoch_size = len(dataloader)
30
+ print(f"Steps per epoch: {epoch_size}")
31
+ for epoch in range(cfg["n_epochs"]):
32
+ # Clear cache
33
+ torch.cuda.empty_cache()
34
+
35
+ tr_metrics = train_epoch(cfg, model, optimizer, scheduler, dataloader, epoch, epoch_size, start_time)
36
+ print(f"Epoch {epoch}; train loss: {tr_metrics['loss']}")
37
+ metric_path = os.path.join(cfg["logdir"], f"epoch_{epoch}_metrics.pth")
38
+ torch.save(tr_metrics, metric_path)
39
+
40
+ checkpoint_dir = cfg["checkpoint_dir"]
41
+ save_epoch_checkpoint(model, optimizer, scheduler, epoch, checkpoint_dir)
42
+
43
+ def unpack_batch(minibatch, device):
44
+ input_tokens = minibatch[0].to(device)
45
+ target_tokens = minibatch[1].to(device)
46
+ target_mask = minibatch[2].to(device)
47
+
48
+ return input_tokens, target_tokens, target_mask
49
+
50
+ def train_epoch(cfg, model, optimizer, scheduler, dataloader, epoch, epoch_size, start_time):
51
+ model.train()
52
+ total_loss = 0
53
+ total_tokens = 0
54
+ temp_loss = 0
55
+ temp_tokens = 0
56
+
57
+ device = model.device
58
+ use_amp = device.type == "cuda"
59
+ amp_dtype = torch.bfloat16
60
+
61
+ num_steps = len(dataloader)
62
+ for train_step, minibatch in enumerate(tqdm(dataloader)):
63
+ input_tokens, target_tokens, target_mask = unpack_batch(minibatch, device)
64
+ num_tokens = torch.sum(target_mask).item()
65
+ B = input_tokens.shape[0]
66
+
67
+ # Forward pass (bfloat16 autocast on CUDA; full precision otherwise)
68
+ with torch.autocast(device_type="cuda", dtype=amp_dtype) if use_amp else torch.cuda.amp.autocast(enabled=False):
69
+ logits = model(input_tokens).logits
70
+ log_probs = F.log_softmax(logits, dim=2)
71
+ token_log_probs = torch.gather(log_probs, 2, target_tokens.unsqueeze(2)).squeeze(2)
72
+
73
+ # Masked next-token NLL loss, then backward
74
+ loss = - torch.sum(token_log_probs * target_mask) / torch.sum(target_mask)
75
+ loss.backward()
76
+ if cfg["gradient_clip_norm"] != -1:
77
+ nn.utils.clip_grad_norm_(model.parameters(), cfg['gradient_clip_norm'])
78
+ optimizer.step()
79
+ scheduler.step()
80
+ optimizer.zero_grad()
81
+
82
+ total_loss += loss.item() * num_tokens
83
+ total_tokens += num_tokens
84
+ temp_loss += loss.item() * num_tokens
85
+ temp_tokens += num_tokens
86
+
87
+ if cfg["use_wandb"] and (train_step % 10 == 0 and train_step > 0):
88
+ # Compute the steps
89
+ steps = epoch_size * epoch + train_step
90
+ wandb_train_epoch(
91
+ temp_loss / temp_tokens, steps, start_time
92
+ )
93
+
94
+ temp_loss = 0
95
+ temp_tokens = 0
96
+
97
+ # Intermediate checkpoint saving spot
98
+ if epoch == 0 and cfg["training_type"] == "strict_small" and train_step != 0:
99
+ one_million_steps = len(dataloader) // 10
100
+ if train_step % one_million_steps == 0:
101
+ curr_words = f"{train_step // one_million_steps}M"
102
+ save_epoch_checkpoint(model, optimizer, scheduler, curr_words, cfg["checkpoint_dir"])
103
+ if epoch == 0 and cfg["training_type"] == "strict" and train_step != 0:
104
+ one_million_steps = len(dataloader) // 100
105
+ if train_step % one_million_steps == 0 and train_step // one_million_steps < 10:
106
+ curr_words = f"{train_step // one_million_steps}M"
107
+ save_epoch_checkpoint(model, optimizer, scheduler, curr_words, cfg["checkpoint_dir"])
108
+
109
+ ten_million_steps = len(dataloader) // 10
110
+ if train_step % ten_million_steps == 0:
111
+ curr_words = f"{10 * (train_step // ten_million_steps)}M"
112
+ save_epoch_checkpoint(model, optimizer, scheduler, curr_words, cfg["checkpoint_dir"])
113
+
114
+ return {"loss" : total_loss / total_tokens}
115
+
116
+ def wandb_train_epoch(loss, step, start_time):
117
+ time_elapsed = (time() - start_time) / 60
118
+ curr_dict = {
119
+ f"train_metrics/time_elapsed" : time_elapsed,
120
+ f"train_metrics/batch_train_loss" : loss,
121
+ }
122
+ wandb.log(curr_dict, step=step)
123
+
124
+ def main():
125
+ cfg = get_config()
126
+ setup_experiment(cfg)
127
+ if cfg["use_wandb"]:
128
+ setup_wandb(cfg)
129
+
130
+ tok = AutoTokenizer.from_pretrained(cfg["tokenizer_dir"], trust_remote_code=True)
131
+
132
+ model = initialize_model(cfg)
133
+
134
+ model.resize_token_embeddings(len(tok))
135
+ model.config.vocab_size = len(tok)
136
+ model.config.bos_token_id = tok.bos_token_id
137
+ model.config.eos_token_id = tok.eos_token_id
138
+ model.config.pad_token_id = tok.pad_token_id
139
+
140
+
141
+ optimizer = initialize_optimizer(cfg, model)
142
+ scheduler = initialize_scheduler(cfg, model, optimizer)
143
+
144
+ full_train_loop(cfg, model, optimizer, scheduler)
145
+
146
+
147
+ if __name__ == "__main__":
148
+ main()
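The loss in train_epoch is a masked next-token negative log-likelihood, normalized by the number of unmasked target tokens. A standalone equivalence sketch (not part of training.py) showing it matches F.cross_entropy with an ignore_index on the padded positions:

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    B, T, V = 2, 5, 11
    logits = torch.randn(B, T, V)
    targets = torch.randint(0, V, (B, T))
    mask = (torch.rand(B, T) > 0.3).float()        # 1 for real targets, 0 for padding

    log_probs = F.log_softmax(logits, dim=2)
    token_lp = torch.gather(log_probs, 2, targets.unsqueeze(2)).squeeze(2)
    loss_manual = -torch.sum(token_lp * mask) / torch.sum(mask)

    ignored = targets.masked_fill(mask == 0, -100)  # -100 is cross_entropy's default ignore_index
    loss_ce = F.cross_entropy(logits.reshape(-1, V), ignored.reshape(-1), ignore_index=-100)

    print(torch.allclose(loss_manual, loss_ce))     # True (up to float tolerance)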
utils.py ADDED
@@ -0,0 +1,97 @@
1
+ # utils.py
2
+ import argparse
3
+ import os
4
+ import yaml
5
+ import json
6
+ import random
7
+ import numpy as np
8
+ import torch
9
+
10
+ def mkdir(dirpath):
11
+ os.makedirs(dirpath, exist_ok=True)
12
+
13
+ def get_config():
14
+ parser = argparse.ArgumentParser()
15
+
16
+ parser.add_argument('--config', type=str, default='config.json',
17
+ help="Path to base config file (json or yaml)")
18
+ parser.add_argument('--tokenizer_dir', type=str)
19
+ parser.add_argument('--data_dir', type=str)
20
+ parser.add_argument('--train_glob', type=str)
21
+ parser.add_argument('--valid_glob', type=str)
22
+ parser.add_argument('--output_dir', type=str)
23
+
24
+ # Training settings
25
+ parser.add_argument('--datapoint_length', type=int)
26
+ parser.add_argument('--training_type', type=str, choices=["strict", "strict_small"])
27
+ parser.add_argument('--n_epochs', type=int)
28
+ parser.add_argument('--batch_size', type=int)
29
+ parser.add_argument('--learning_rate', type=float)
30
+ parser.add_argument('--weight_decay', type=float)
31
+ parser.add_argument('--num_training_steps', type=int)
32
+ parser.add_argument('--num_warmup_steps', type=int)
33
+ parser.add_argument('--gradient_clip_norm', type=float)
34
+
35
+ # Experiment
36
+ parser.add_argument('--seed', type=int)
37
+ parser.add_argument('--base_folder', type=str)
38
+ parser.add_argument('--experiment_name', type=str)
39
+ parser.add_argument('--use_wandb', action='store_true', default=None)  # None keeps the config.json value unless the flag is passed
40
+ parser.add_argument('--wandb_project_name', type=str)
41
+ parser.add_argument('--wandb_experiment_name', type=str)
42
+
43
+
44
+
45
+ args = parser.parse_args()
46
+ config = construct_config(args)
47
+ return config
48
+
49
+ def setup_experiment(cfg):
50
+ # Seed
51
+ if cfg.get("seed", -1) == -1:
52
+ cfg["seed"] = random.randint(0, 10**9)
53
+ random.seed(cfg["seed"])
54
+ np.random.seed(cfg["seed"])
55
+ torch.manual_seed(cfg["seed"])
56
+ torch.cuda.manual_seed_all(cfg["seed"])
57
+ torch.backends.cudnn.deterministic = True
58
+ torch.backends.cudnn.benchmark = False
59
+ print(f"[utils] Using seed {cfg['seed']}")
60
+
61
+ # Folders
62
+ cfg["expdir"] = os.path.join(cfg["base_folder"], cfg["experiment_name"])
63
+ cfg["checkpoint_dir"] = os.path.join(cfg["expdir"], 'checkpoints')
64
+ cfg["logdir"] = os.path.join(cfg["expdir"], 'logging')
65
+ mkdir(cfg["expdir"]); mkdir(cfg["checkpoint_dir"]); mkdir(cfg["logdir"])
66
+
67
+ # Save resolved config
68
+ with open(os.path.join(cfg["logdir"], "exp_cfg.yaml"), 'w') as cfg_file:
69
+ yaml.safe_dump(cfg, cfg_file, sort_keys=False)
70
+
71
+ def setup_wandb(cfg):
72
+ try:
73
+ import wandb
74
+ except ImportError:
75
+ raise RuntimeError("use_wandb is true but wandb is not installed")
76
+ wandb.init(
77
+ project=cfg["wandb_project_name"],
78
+ name=cfg["wandb_experiment_name"]
79
+ )
80
+
81
+ def load_file_any(filepath):
82
+ ext = os.path.splitext(filepath)[1].lower()
83
+ with open(filepath, 'r') as f:
84
+ if ext in ['.yaml', '.yml']:
85
+ return yaml.safe_load(f)
86
+ else:
87
+ return json.load(f)
88
+
89
+ def construct_config(args):
90
+ base_cfg = load_file_any(args.config)
91
+ # Overlay CLI args when provided
92
+ for k, v in vars(args).items():
93
+ if k == "config":
94
+ continue
95
+ if v is not None:
96
+ base_cfg[k] = v
97
+ return base_cfg
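Config resolution is two-layered: values from config.json are used unless the corresponding CLI flag is passed (None means "not provided"). A tiny sketch of the overlay performed by construct_config:

    base_cfg = {"batch_size": 16, "learning_rate": 5e-5, "use_wandb": False}
    cli_args = {"batch_size": 32, "learning_rate": None, "use_wandb": None}

    for k, v in cli_args.items():
        if v is not None:
            base_cfg[k] = v

    print(base_cfg)  # {'batch_size': 32, 'learning_rate': 5e-05, 'use_wandb': False}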