import time
from collections import defaultdict
from tqdm import tqdm
import os
import math
import json
from typing import List, Tuple, Set, Dict, Any


def _serialize_suffixes(sfx_set):
    """Flatten a suffix set into a JSON-serializable, deterministically ordered list.

    Plain string suffixes pass through unchanged; nested (base, frozenset)
    tuples become [base, sorted_nested_list] pairs.
    """
    flat = []
    for s in sfx_set:
        if isinstance(s, tuple):
            base, nested = s
            flat.append([base, sorted(nested)])
        else:
            flat.append(s)

    def key(x):
        # Order plain strings before nested pairs so output is stable across runs.
        return (0, x) if isinstance(x, str) else (1, x[0], tuple(x[1]))

    return sorted(flat, key=key)


def paradigms_to_json(paradigms):
    """Convert in-memory paradigms to a JSON-friendly structure."""
    out = []
    for stems, suffixes in paradigms:
        out.append({
            "stems": sorted(stems),
            "suffixes": _serialize_suffixes(suffixes),
        })
    return out


def save_paradigms_json(paradigms, path, meta=None):
    """Serialize paradigms to a JSON file with a small metadata header."""
    payload = {
        "schema_version": 1,
        "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "meta": meta or {},
        "paradigms": paradigms_to_json(paradigms),
    }
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


def _deserialize_suffixes(sfx_list):
    """Inverse of _serialize_suffixes: rebuild the suffix set from JSON data."""
    out = set()
    for item in sfx_list:
        if isinstance(item, list):
            base, nested = item
            out.add((base, frozenset(nested)))
        else:
            out.add(item)
    return out


def load_paradigms_json(path):
    """Load paradigms and metadata from a JSON file written by save_paradigms_json."""
    with open(path, "r", encoding="utf-8") as f:
        payload = json.load(f)
    paradigms = []
    for p in payload["paradigms"]:
        stems = set(p["stems"])
        suffixes = _deserialize_suffixes(p["suffixes"])
        paradigms.append((stems, suffixes))
    meta = payload.get("meta", {})
    return paradigms, meta


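# Illustrative save/load round-trip sketch (the file name is hypothetical).
# Nested (base, frozenset) suffixes survive the trip intact:
#
#     paradigms = [({"walk", "talk"}, {"s", "ed", ("ing", frozenset({"", "ly"}))})]
#     save_paradigms_json(paradigms, "paradigms.json", meta={"vocab_size": 2})
#     loaded, meta = load_paradigms_json("paradigms.json")
#     assert loaded == paradigms
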
def extract_stem_suffix_pairs(vocab):
    """Return a mapping from candidate stems to every suffix they occur with.

    Every split point of every word is considered, so the mapping includes
    the empty stem (suffix = whole word) and the empty suffix (stem = whole word).
    """
    stem_to_suffixes = defaultdict(set)
    for word in tqdm(vocab, desc="[1/7] Extracting stem-suffix pairs"):
        for i in range(len(word) + 1):
            stem, suffix = word[:i], word[i:]
            stem_to_suffixes[stem].add(suffix)
    return stem_to_suffixes


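# Minimal sketch of the mapping this step produces, on a toy vocabulary:
#
#     pairs = extract_stem_suffix_pairs(["walks", "walked"])
#     pairs["walk"]  == {"s", "ed"}
#     pairs["walks"] == {""}
#     pairs[""]      == {"walks", "walked"}
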
def group_stems_by_suffixes(stem_to_suffixes, min_shared_stems=2, min_suffixes=2):
    """Group stems that share an identical suffix signature into candidate paradigms.

    Signatures whose non-empty suffixes share a common prefix are normalized
    by pushing that prefix onto the stems, letting otherwise distinct
    signatures (e.g. {"ks", "ked"} and {"s", "ed"}) collapse together.
    """
    suffix_to_stems = defaultdict(set)
    for stem, suffixes in stem_to_suffixes.items():
        suffix_key = frozenset(suffixes)
        suffix_to_stems[suffix_key].add(stem)

    normalized_suffix_map = defaultdict(set)

    for suffixes, stems in tqdm(suffix_to_stems.items(), desc="[2/7] Grouping and normalizing"):
        non_empty_suffixes = [s for s in suffixes if s]
        if len(stems) >= min_shared_stems and len(suffixes) >= min_suffixes:
            common_prefix = os.path.commonprefix(non_empty_suffixes) if non_empty_suffixes else ""

            if common_prefix:
                # Move the shared prefix onto the stems: ("wal", {"ks", "ked"})
                # becomes ("walk", {"s", "ed"}).
                normalized_stems = {stem + common_prefix for stem in stems}
                adjusted_suffixes = {s[len(common_prefix):] if s.startswith(common_prefix) else s for s in suffixes}
            else:
                normalized_stems = stems
                adjusted_suffixes = suffixes

            if len(adjusted_suffixes) >= min_suffixes:
                suffix_key = frozenset(adjusted_suffixes)
                normalized_suffix_map[suffix_key].update(normalized_stems)

    paradigms = [(stems, set(suffixes)) for suffixes, stems in normalized_suffix_map.items()]
    return paradigms


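# Toy run of the grouping step. With the default thresholds the four words
# collapse into a single paradigm once shared suffix prefixes are normalized:
#
#     pairs = extract_stem_suffix_pairs(["walks", "walked", "talks", "talked"])
#     group_stems_by_suffixes(pairs)
#     # -> [({"walk", "talk"}, {"s", "ed"})]
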
def stem_set_expansion(paradigms, stem_to_suffixes):
    """Add a stem to every paradigm whose suffix set it is compatible with.

    A stem is compatible with a paradigm if the paradigm's suffix set is a
    subset of the suffixes observed for that stem.
    """
    updated = 0
    suffix_to_stems = {frozenset(suffixes): set(stems) for stems, suffixes in paradigms}

    for stem, suffixes in tqdm(stem_to_suffixes.items(), desc="[3/7] Expanding stem sets"):
        # Try larger suffix sets first so the most specific paradigms win ties.
        for paradigm_suffixes in sorted(suffix_to_stems.keys(), key=lambda x: (-len(x), tuple(sorted(x)))):
            if paradigm_suffixes.issubset(suffixes):
                if stem not in suffix_to_stems[paradigm_suffixes]:
                    suffix_to_stems[paradigm_suffixes].add(stem)
                    updated += 1

    enriched = [(stems, set(suffixes)) for suffixes, stems in suffix_to_stems.items()]
    print(f"✅ Added {updated} stems via stem set expansion.")
    return enriched


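# Sketch of the expansion: "jump" was never grouped with the paradigm, but its
# observed suffixes are a superset of the paradigm's, so it gets pulled in:
#
#     paradigms = [({"walk", "talk"}, {"s", "ed"})]
#     pairs = extract_stem_suffix_pairs(["walks", "walked", "jumps", "jumped", "jumping"])
#     stem_set_expansion(paradigms, pairs)
#     # -> "jump" (observed suffixes {"s", "ed", "ing"}) joins
#     #    ({"walk", "talk", "jump"}, {"s", "ed"})
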
def harmonic_number(n):
    """n-th harmonic number H(n) = 1 + 1/2 + ... + 1/n (e.g. H(4) ≈ 2.083)."""
    return sum(1.0 / i for i in range(1, n + 1))


def suffix_set_expansion(paradigms):
    """Merge stems across paradigms whose suffix sets stand in a superset relation.

    If paradigm i's suffixes properly contain paradigm j's, and nearly all of
    j's stems already appear in i (within a harmonic-number tolerance),
    j's stems are absorbed into i.
    """
    merged = [(set(stems), set(suffixes)) for stems, suffixes in paradigms]
    enriched_count = 0

    # Sort once up front; the tuples are shared with `merged`, so the in-place
    # stem updates below are visible through `merged` as well.
    ordered = sort_paradigms(merged)
    for i, (stems_i, suffixes_i) in enumerate(ordered):
        for j, (stems_j, suffixes_j) in enumerate(ordered):
            if i == j:
                continue
            if suffixes_i > suffixes_j:
                intersection = stems_i & stems_j
                denom = max(1, len(stems_j))
                if (len(stems_j) - len(intersection)) < (len(stems_j) / harmonic_number(denom)):
                    stems_i |= stems_j
                    enriched_count += 1

    print(f"\n✅ Enriched {enriched_count} paradigms via suffix set expansion.")
    return [(set(st), set(sf)) for st, sf in sort_paradigms(merged)]


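# Worked check of the absorption criterion. Here stems_j = {"walk", "talk", "chat"}
# overlaps stems_i in 2 of 3 stems: 3 - 2 = 1 < 3 / H(3) ≈ 1.64, so the smaller
# paradigm's stems are absorbed into the larger one:
#
#     paradigms = [
#         ({"walk", "talk", "jump"}, {"s", "ed", "ing"}),
#         ({"walk", "talk", "chat"}, {"s", "ed"}),
#     ]
#     suffix_set_expansion(paradigms)
#     # -> the first paradigm's stems become {"walk", "talk", "jump", "chat"}
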
def prune_subsumed_stems(paradigms):
    """Drop a stem from a paradigm when another paradigm with a strictly larger
    suffix set already accounts for it; drop paradigms left with no stems.
    """
    pruned_paradigms = []
    for i, (stems_i, suffixes_i) in enumerate(paradigms):
        pruned_stems = set(stems_i)
        for j, (stems_j, suffixes_j) in enumerate(paradigms):
            if i == j:
                continue
            # Strict superset: two paradigms with identical suffix sets must not
            # prune each other's stems away entirely.
            if suffixes_j > suffixes_i:
                pruned_stems -= (stems_j & stems_i)
        if pruned_stems:
            pruned_paradigms.append((pruned_stems, suffixes_i))
    print(f"✅ Pruned to {len(pruned_paradigms)} paradigms after removing subsumed stems.")
    return sort_paradigms(pruned_paradigms)


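# Small pruning example: "walk" is explained by the richer paradigm, so it is
# removed from the smaller one, which survives with its remaining stem:
#
#     paradigms = [
#         ({"walk", "talk"}, {"s", "ed"}),
#         ({"walk"}, {"s", "ed", "ing"}),
#     ]
#     prune_subsumed_stems(paradigms)
#     # -> [({"walk"}, {"s", "ed", "ing"}), ({"talk"}, {"s", "ed"})]
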
def sort_paradigms(paradigms):
    """
    Primary: log(len(stems)) * log(len(suffixes)) (DESC)
    Ties: (-len(stems), -len(suffixes), lexicographic stems, lexicographic suffix heads)
    """
    def score(p):
        stems, suffixes = p
        if stems and suffixes:
            return math.log(len(stems)) * math.log(len(suffixes))
        return 0.0

    def tie_key(p):
        stems, suffixes = p
        sfx_heads = []
        for s in suffixes:
            # Nested (base, frozenset) suffixes are keyed by their base string.
            sfx_heads.append(s[0] if isinstance(s, tuple) else s)
        return (-len(stems), -len(suffixes),
                " ".join(sorted(stems)),
                " ".join(sorted(sfx_heads)))

    return sorted(paradigms, key=lambda p: (-score(p), tie_key(p)))


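# Scoring sketch: a paradigm with 10 stems and 4 suffixes scores
# log(10) * log(4) ≈ 2.303 * 1.386 ≈ 3.19, so it outranks one with
# 100 stems but a single suffix (log(100) * log(1) = 0).
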
def sort_paradigms_by_suffix_count(paradigms):
    """Order paradigms by raw suffix count, descending (degenerate ones last)."""
    def score(p):
        stem_count = len(p[0])
        suffix_count = len(p[1])
        if stem_count > 0 and suffix_count > 0:
            return suffix_count
        return 0
    return sorted(paradigms, key=score, reverse=True)


def nest_suffixes_from_paradigms(paradigms):
    """Rewrite suffix pairs like ("ed", "edly") as nested ("ed", {"", "ly"})
    whenever the remainder set {"", "ly"} is itself a known paradigm's suffix set.
    """
    print("[7/7] Nesting suffixes based on reusable paradigms...")

    suffix_set_index = {frozenset(suffixes) for _, suffixes in paradigms}
    nested_paradigms = []

    for stems, suffixes in paradigms:
        suffixes_list = list(suffixes)
        nested_suffixes = set()
        used = set()

        def _sort_key(s):
            # Tuples sort after plain strings, by their base string, so mixed
            # suffix sets do not break the sort.
            return (isinstance(s, tuple), s[0] if isinstance(s, tuple) else s)

        for i, s1 in enumerate(sorted(suffixes_list, key=_sort_key)):
            for j, s2 in enumerate(sorted(suffixes_list, key=_sort_key)):
                if i == j or s2 in used or not isinstance(s1, str) or not isinstance(s2, str):
                    continue
                if s2.startswith(s1) and s1 != '':
                    remainder = s2[len(s1):]
                    if remainder and frozenset({'', remainder}) in suffix_set_index:
                        nested_suffixes.add((s1, frozenset({'', remainder})))
                        used.add(s2)
                        used.add(s1)
                        break

        # Carry over any suffix that did not participate in a nesting.
        for s in suffixes_list:
            if s not in used:
                nested_suffixes.add(s)

        nested_paradigms.append((set(stems), nested_suffixes))

    print(f"✅ Nested structure created for {len(nested_paradigms)} paradigms.")
    return sort_paradigms(nested_paradigms)


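# Nesting sketch: "edly" = "ed" + "ly", and {"", "ly"} is itself a paradigm's
# suffix set, so the pair collapses into a nested suffix:
#
#     paradigms = [
#         ({"walk"}, {"", "ed", "edly"}),
#         ({"quick"}, {"", "ly"}),
#     ]
#     nest_suffixes_from_paradigms(paradigms)
#     # -> ({"walk"}, {"", ("ed", frozenset({"", "ly"}))}) plus the unchanged
#     #    ({"quick"}, {"", "ly"}) paradigm
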
def refine_nested_stem_conflicts(paradigms):
    """
    Remove stems that are fully explained as stem+base derivations of nested
    structures elsewhere in the paradigm list.

    Args:
        paradigms: list of (stem_set, suffix_set), where suffix_set may contain
            nested (str, frozenset) tuples

    Returns:
        Refined list of paradigms with redundant derived stems removed
    """
    all_suffix_sets = {frozenset(suffixes) for _, suffixes in paradigms}

    # Collect every stem a nested suffix can generate, e.g. stem "walk" with
    # nested ("ed", {"", "ly"}) generates the derived stem "walked".
    derived_stems = set()
    for stems, suffixes in paradigms:
        for sfx in suffixes:
            if isinstance(sfx, tuple):
                base, nested_suffixes = sfx
                if frozenset(nested_suffixes) in all_suffix_sets:
                    for stem in stems:
                        derived_stems.add(stem + base)

    # Strip those derived stems from every paradigm.
    updated_paradigms = []
    for stems, suffixes in paradigms:
        cleaned_stems = stems - derived_stems
        updated_paradigms.append((cleaned_stems, suffixes))

    print(f"✅ Identified {len(derived_stems)} derived stems explained by nested paradigms.")
    return updated_paradigms


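# Conflict-refinement sketch: "walked" is derivable as "walk" + "ed" via the
# nested suffix, so it is dropped as a standalone stem:
#
#     paradigms = [
#         ({"walk"}, {"", ("ed", frozenset({"", "ly"}))}),
#         ({"quick", "walked"}, {"", "ly"}),
#     ]
#     refine_nested_stem_conflicts(paradigms)
#     # -> the second paradigm's stems shrink to {"quick"}
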
def recursive_fallback(word, suffix_set):
    """Greedily peel the longest matching suffix off `word`, recursing on the
    remaining stem; returns the pieces left-to-right.
    """
    for suffix in sorted(suffix_set, key=lambda s: -len(s)):
        if suffix and word.endswith(suffix):
            stem_candidate = word[:-len(suffix)]
            rest = recursive_fallback(stem_candidate, suffix_set)
            return rest + [suffix]
    return [word]


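# Fallback segmentation sketch: suffixes are peeled longest-first, recursively:
#
#     recursive_fallback("walkings", {"", "s", "ing"})
#     # -> ["walk", "ing", "s"]
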
def run_paradigm_extraction(vocab, min_shared_stems=2, min_suffixes=2, enrich_suffix_sets=True):
    """Full extraction pipeline: extract pairs, group, expand, prune, nest, refine."""
    start = time.time()
    stem_to_suffixes = extract_stem_suffix_pairs(vocab)
    paradigms = group_stems_by_suffixes(stem_to_suffixes, min_shared_stems, min_suffixes)
    paradigms = stem_set_expansion(paradigms, stem_to_suffixes)
    paradigms = sort_paradigms(paradigms)
    paradigms = prune_subsumed_stems(paradigms)
    paradigms = sort_paradigms(paradigms)
    paradigms = nest_suffixes_from_paradigms(paradigms)
    paradigms = refine_nested_stem_conflicts(paradigms)

    paradigms = sort_paradigms(paradigms)
    if enrich_suffix_sets:
        print("[4/7] Expanding suffix sets based on partial compatibility...")
        paradigms = suffix_set_expansion(paradigms)

    paradigms = sort_paradigms(paradigms)
    paradigms = prune_subsumed_stems(paradigms)
    paradigms = sort_paradigms(paradigms)

    # Disabled: fallback paradigm for full words not covered by any paradigm.
    '''
    vocab_words = set(vocab)
    assigned_words = set()
    for stems, suffixes in paradigms:
        for stem in stems:
            for suffix in suffixes:
                if isinstance(suffix, tuple):
                    base, _ = suffix
                    assigned_words.add(stem + base)
                else:
                    assigned_words.add(stem + suffix)

    unassigned_words = vocab_words - assigned_words
    if unassigned_words:
        print(f"✅ {len(unassigned_words)} full words were not assigned to any paradigm, added fallback paradigm.")
        paradigms.append((set(unassigned_words), frozenset({""})))

    paradigms = sort_paradigms(paradigms)
    '''

    print(f"\n✅ Extracted {len(paradigms)} paradigms.")
    print(f"⏱️ Finished in {time.time() - start:.2f} seconds.")
    return paradigms


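# End-to-end usage sketch on a toy vocabulary (real runs use a full word list;
# exact output depends on the thresholds and vocabulary):
#
#     vocab = ["walks", "walked", "walking", "talks", "talked", "talking"]
#     paradigms = run_paradigm_extraction(vocab)
#     # Expect a dominant paradigm along the lines of
#     # ({"walk", "talk"}, {"s", "ed", "ing"})
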
def segment_word_from_nested_paradigms(word, paradigms, fallback=True, top_k=300):
    """
    Segment a word based on nested paradigms with optional fallback.

    Parameters:
        word (str): The word to segment.
        paradigms (list): A list of tuples (stems, suffixes) with optional nesting.
        fallback (bool): Whether to fall back on longest suffix match from top_k paradigms.
        top_k (int): Number of top paradigms to consider in fallback.

    Returns:
        List[str]: Segmented pieces of the word.
    """

    def match_suffixes(suffixes, remainder):
        """Recursive helper to match nested suffix structures."""
        for suffix in suffixes:
            if isinstance(suffix, tuple):
                base, nested = suffix
                if remainder.startswith(base):
                    sub = remainder[len(base):]
                    nested_result = match_suffixes(nested, sub)
                    if nested_result is not None:
                        return [base] + nested_result
            elif remainder == suffix:
                return [suffix] if suffix else []
        return None

    # First pass: exact stem + (possibly nested) suffix match.
    for stems, suffixes in paradigms:
        for stem in stems:
            if word.startswith(stem):
                remainder = word[len(stem):]
                matched_suffix = match_suffixes(suffixes, remainder)
                if matched_suffix is not None:
                    return [stem] + matched_suffix

    # Second pass: longest-suffix fallback over the top_k paradigms.
    if fallback:
        seen_suffixes = set()

        def collect_suffixes(suffixes):
            for s in suffixes:
                if isinstance(s, tuple):
                    seen_suffixes.add(s[0])
                    collect_suffixes(s[1])
                else:
                    seen_suffixes.add(s)

        for _, suffixes in paradigms[:top_k]:
            collect_suffixes(suffixes)

        for suffix in sorted(seen_suffixes, key=lambda s: -len(s)):
            if suffix and word.endswith(suffix):
                stem = word[:-len(suffix)]
                return [stem, suffix]
        return [word]

    return [word]


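# Nested-segmentation sketch, reusing the nested paradigm from above:
#
#     paradigms = [({"walk"}, {"", ("ed", frozenset({"", "ly"}))})]
#     segment_word_from_nested_paradigms("walkedly", paradigms)
#     # -> ["walk", "ed", "ly"]
#     segment_word_from_nested_paradigms("walked", paradigms)
#     # -> ["walk", "ed"]
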
def segment_word_from_paradigms(word, paradigms, top_k=20):
    """
    Simpler fallback-only version: match the longest suffix among top_k paradigms.

    Parameters:
        word (str): Word to segment.
        paradigms (list): Paradigm structures.
        top_k (int): How many paradigms to consider.

    Returns:
        List[str]: Segmentation result.
    """
    candidates = paradigms[:top_k]
    best_split = None
    best_suffix_len = -1
    for stems, suffixes in candidates:
        for suffix in sorted(suffixes, key=lambda s: -len(s) if isinstance(s, str) else -len(s[0])):
            if isinstance(suffix, tuple):
                # Nested suffixes are matched by their base string only.
                suffix = suffix[0]
            if word.endswith(suffix):
                stem_candidate = word[:-len(suffix)] if suffix else word
                if stem_candidate in stems:
                    split = [stem_candidate, suffix] if suffix else [stem_candidate]
                    # Track the best suffix length explicitly; comparing against
                    # split[-1] would wrongly measure the stem when the suffix is empty.
                    if len(suffix) > best_suffix_len:
                        best_split = split
                        best_suffix_len = len(suffix)
    return best_split or [word]
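
# Fallback-only sketch: the longest suffix whose stem is known wins.
#
#     paradigms = [({"walk", "talk"}, {"", "s", "ed"})]
#     segment_word_from_paradigms("walked", paradigms)
#     # -> ["walk", "ed"]
#     segment_word_from_paradigms("walk", paradigms)
#     # -> ["walk"]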