# paradigm_utils.py
"""JSON (de)serialization helpers for morphological paradigms.

A paradigm is a ``(stems, suffixes)`` pair where ``stems`` is a set of
strings and ``suffixes`` is a set containing plain suffix strings and/or
nested ``(base, frozenset_of_remainders)`` tuples.
"""
import time
from collections import defaultdict
import os
import math
import json
from typing import List, Tuple, Set, Dict, Any

# tqdm is only a progress-bar nicety; fall back to a no-op wrapper so the
# module still imports (and all pipeline functions still run) without it.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """No-op stand-in for tqdm: return the iterable unchanged."""
        return iterable


def _serialize_suffixes(sfx_set):
    """Return *sfx_set* as a JSON-safe list in a deterministic order.

    Plain suffix strings pass through; nested ``(base, frozenset)`` tuples
    become ``[base, sorted_list]`` pairs. Strings sort before pairs so the
    output is stable across runs regardless of set iteration order.
    """
    flat = []
    for s in sfx_set:
        if isinstance(s, tuple):
            base, nested = s
            flat.append([base, sorted(list(nested))])  # JSON-safe pair
        else:
            flat.append(s)  # plain string

    # stable order: strings first, then pairs; then lexicographic
    def key(x):
        return (0, x) if isinstance(x, str) else (1, x[0], tuple(x[1]))

    return sorted(flat, key=key)


def paradigms_to_json(paradigms):
    """Convert a list of (stems, suffixes) paradigms to JSON-ready dicts.

    Stems are sorted for deterministic output; suffixes go through
    ``_serialize_suffixes``.
    """
    out = []
    for stems, suffixes in paradigms:
        out.append({
            "stems": sorted(list(stems)),
            "suffixes": _serialize_suffixes(suffixes),
        })
    return out


def save_paradigms_json(paradigms, path, meta=None):
    """Write paradigms to *path* as UTF-8 JSON.

    Args:
        paradigms: list of (stem_set, suffix_set) pairs.
        path: destination file path.
        meta: optional dict stored verbatim under the "meta" key.
    """
    payload = {
        "schema_version": 1,
        # UTC timestamp in ISO-8601 "Z" form.
        "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "meta": meta or {},
        "paradigms": paradigms_to_json(paradigms),
    }
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


def _deserialize_suffixes(sfx_list):
    """Inverse of ``_serialize_suffixes``: rebuild the in-memory suffix set.

    ``[base, nested_list]`` pairs become ``(base, frozenset)`` tuples;
    plain strings pass through.
    """
    out = set()
    for item in sfx_list:
        if isinstance(item, list):  # [base, nested_list]
            base, nested = item
            out.add((base, frozenset(nested)))
        else:
            out.add(item)
    return out


def load_paradigms_json(path):
    """Load paradigms previously written by ``save_paradigms_json``.

    Returns:
        (paradigms, meta) where paradigms is a list of
        (stem_set, suffix_set) pairs and meta is the stored meta dict.
    """
    with open(path, "r", encoding="utf-8") as f:
        payload = json.load(f)
    paradigms = []
    for p in payload["paradigms"]:
        stems = set(p["stems"])
        suffixes = _deserialize_suffixes(p["suffixes"])
        paradigms.append((stems, suffixes))
    meta = payload.get("meta", {})
    return paradigms, meta
### -----------------------------
### 1. Extract (stem, suffix) pairs from vocabulary
### -----------------------------
def extract_stem_suffix_pairs(vocab):
    """Return a mapping from stems to all suffixes they occur with, including null suffix."""
    result = defaultdict(set)
    for word in tqdm(vocab, desc="[1/7] Extracting stem-suffix pairs"):
        # Every split point, including the one past the last character,
        # so the empty suffix (and the empty stem) are always recorded.
        for cut in range(len(word) + 1):
            result[word[:cut]].add(word[cut:])
    return result


### -----------------------------
### 2. Group stems by shared suffix sets and normalize by common prefix
### -----------------------------
def group_stems_by_suffixes(stem_to_suffixes, min_shared_stems=2, min_suffixes=2):
    """Group stems that share an identical suffix set, then fold any prefix
    common to all non-empty suffixes back onto the stems.

    Args:
        stem_to_suffixes: mapping stem -> set of observed suffixes.
        min_shared_stems: keep groups backed by at least this many stems.
        min_suffixes: keep groups with at least this many suffixes
            (checked again after prefix normalization).

    Returns:
        List of (stem_set, suffix_set) paradigms.
    """
    by_suffix_set = defaultdict(set)
    for stem, sfx in stem_to_suffixes.items():
        by_suffix_set[frozenset(sfx)].add(stem)

    normalized = defaultdict(set)
    for sfx_set, stems in tqdm(by_suffix_set.items(), desc="[2/7] Grouping and normalizing"):
        if len(stems) < min_shared_stems or len(sfx_set) < min_suffixes:
            continue
        non_empty = [s for s in sfx_set if s]
        prefix = os.path.commonprefix(non_empty) if non_empty else ""
        if prefix:
            # Move the shared suffix prefix onto the stems.
            stems_out = {st + prefix for st in stems}
            sfx_out = {s[len(prefix):] if s.startswith(prefix) else s for s in sfx_set}
        else:
            stems_out, sfx_out = stems, sfx_set
        if len(sfx_out) >= min_suffixes:
            normalized[frozenset(sfx_out)].update(stems_out)

    return [(st, set(sfx)) for sfx, st in normalized.items()]
### -----------------------------
### 3. Expand stem sets based on suffix set coverage
### -----------------------------
def stem_set_expansion(paradigms, stem_to_suffixes):
    """Add each stem to every paradigm whose suffix set is fully covered
    by the suffixes observed for that stem.

    Args:
        paradigms: list of (stem_set, suffix_set) pairs.
        stem_to_suffixes: mapping stem -> set of observed suffixes.

    Returns:
        New list of (stem_set, suffix_set) pairs with enlarged stem sets.
    """
    updated = 0
    suffix_to_stems = {frozenset(suffixes): set(stems) for stems, suffixes in paradigms}
    # The dict KEYS never change inside the loop (only the stem-set values
    # are mutated), so sort them once up front instead of once per stem:
    # larger suffix sets first, then lexicographic, for determinism.
    ordered_keys = sorted(suffix_to_stems.keys(), key=lambda x: (-len(x), tuple(sorted(x))))
    for stem, suffixes in tqdm(stem_to_suffixes.items(), desc="[3/7] Expanding stem sets"):
        for paradigm_suffixes in ordered_keys:
            if paradigm_suffixes.issubset(suffixes):
                if stem not in suffix_to_stems[paradigm_suffixes]:
                    suffix_to_stems[paradigm_suffixes].add(stem)
                    updated += 1
    enriched = [(stems, set(suffixes)) for suffixes, stems in suffix_to_stems.items()]
    print(f"✅ Added {updated} stems via stem set expansion.")
    return enriched


### -----------------------------
### 4. Expand suffix sets based on partial compatibility
### -----------------------------
def harmonic_number(n):
    """Return the n-th harmonic number H_n = 1 + 1/2 + ... + 1/n."""
    return sum(1.0 / i for i in range(1, n + 1))


def suffix_set_expansion(paradigms):
    """Merge stems upward: when paradigm i's suffix set strictly contains
    paradigm j's and most of j's stems already occur in i, absorb j's
    stems into i.

    The absorption threshold scales with 1/H(|stems_j|), so larger
    paradigms must overlap proportionally more before merging.

    Args:
        paradigms: list of (stem_set, suffix_set) pairs.

    Returns:
        New sorted list of (stem_set, suffix_set) pairs.
    """
    base = paradigms[:]  # snapshot
    merged = [(set(stems), set(suffixes)) for stems, suffixes in base]
    enriched_count = 0
    # Iterate in a deterministic order.
    # NOTE(review): the inner sort_paradigms(merged) is re-evaluated for every
    # outer iteration, and because stems_i grows during the pass the inner
    # ordering (and the i == j index skip across two differently-ordered
    # lists) can drift from the outer one. Left as-is to preserve behavior;
    # confirm intent before hoisting the sort.
    for i, (stems_i, suffixes_i) in enumerate(sort_paradigms(merged)):
        for j, (stems_j, suffixes_j) in enumerate(sort_paradigms(merged)):
            if i == j:
                continue
            if suffixes_i > suffixes_j:  # strict superset
                intersection = stems_i & stems_j
                denom = max(1, len(stems_j))  # guard
                if (len(stems_j) - len(intersection)) < (len(stems_j) / harmonic_number(denom)):
                    stems_i |= stems_j
                    enriched_count += 1
                    # do not mutate stems_j/suffixes_j further
    print(f"\n✅ Enriched {enriched_count} paradigms via suffix set expansion.")
    # Return back in original tuple-of-sets form
    return [(set(st), set(sf)) for st, sf in sort_paradigms(merged)]
### -----------------------------
### 5. Prune subsumed stems
### -----------------------------
def prune_subsumed_stems(paradigms):
    """Drop stems from a paradigm when another paradigm with a superset
    suffix set already contains them; paradigms left with no stems are
    removed entirely. Returns the pruned list re-sorted by sort_paradigms.
    """
    pruned_paradigms = []
    for i, (stems_i, suffixes_i) in enumerate(paradigms):
        pruned_stems = set(stems_i)  # copy so the input paradigms are untouched
        for j, (stems_j, suffixes_j) in enumerate(paradigms):
            if i == j:
                continue
            # j explains at least everything i does (>= also matches equal
            # suffix sets, so duplicates strip each other's shared stems).
            if suffixes_j >= suffixes_i:
                pruned_stems -= (stems_j & stems_i)
        if pruned_stems:
            pruned_paradigms.append((pruned_stems, suffixes_i))
    print(f"✅ Pruned to {len(pruned_paradigms)} paradigms after removing subsumed stems.")
    return sort_paradigms(pruned_paradigms)


### -----------------------------
### 6. Sort paradigms by size
### -----------------------------
def sort_paradigms(paradigms):
    """
    Primary: log(len(stems)) * log(len(suffixes)) (DESC)
    Ties: (-len(stems), -len(suffixes), lexicographic stems, lexicographic suffix heads)
    """
    def score(p):
        stems, suffixes = p
        if stems and suffixes:
            # log(1) == 0, so singleton paradigms tie with empty ones here;
            # the tie key below breaks that tie by raw counts.
            return math.log(len(stems)) * math.log(len(suffixes))
        return 0.0

    def tie_key(p):
        stems, suffixes = p
        # "head" of a nested (base, frozenset) suffix is its base string.
        sfx_heads = []
        for s in suffixes:
            sfx_heads.append(s[0] if isinstance(s, tuple) else s)
        return (-len(stems), -len(suffixes), " ".join(sorted(stems)), " ".join(sorted(sfx_heads)))

    # Negate score so higher scores sort first; ties fall through to tie_key.
    return sorted(paradigms, key=lambda p: (-score(p), tie_key(p)))


def sort_paradigms_by_suffix_count(paradigms):
    """Sort paradigms by suffix count (descending); paradigms with no
    stems or no suffixes score 0 and sink to the end."""
    def score(p):
        stem_count = len(p[0])
        suffix_count = len(p[1])
        if stem_count > 0 and suffix_count > 0:
            return suffix_count
        return 0
    return sorted(paradigms, key=score, reverse=True)


def nest_suffixes_from_paradigms(paradigms):
    """Rewrite suffix pairs (s1, s2) where s2 == s1 + remainder as a nested
    (s1, frozenset({'', remainder})) entry, but only when {'', remainder}
    is itself an existing paradigm suffix set. Unpaired suffixes pass
    through unchanged.

    NOTE(review): sorted(suffixes_list) assumes all suffixes are plain
    strings at this stage; a mix of strings and nested tuples would raise
    on comparison — confirm this is only called before nesting exists.
    """
    print("[7/7] Nesting suffixes based on reusable paradigms...")
    # Membership index of every existing suffix set (value unused).
    suffix_set_index = {frozenset(suffixes): True for _, suffixes in paradigms}
    nested_paradigms = []
    for stems, suffixes in paradigms:
        suffixes_list = list(suffixes)
        nested_suffixes = set()
        used = set()  # suffixes consumed by a nested pairing
        # deterministic nested pairing
        for i, s1 in enumerate(sorted(suffixes_list)):
            for j, s2 in enumerate(sorted(suffixes_list)):
                if i == j or s2 in used or not isinstance(s1, str) or not isinstance(s2, str):
                    continue
                if s2.startswith(s1) and s1 != '':
                    remainder = s2[len(s1):]
                    # Only nest when the remainder paradigm {'' , remainder}
                    # is independently attested.
                    if remainder and frozenset({'', remainder}) in suffix_set_index:
                        nested_suffixes.add((s1, frozenset({'', remainder})))
                        used.add(s2)
                        used.add(s1)
                        break  # s1 is consumed; move to the next s1
        # Carry over everything that was not folded into a nested pair.
        for s in suffixes_list:
            if s not in used:
                nested_suffixes.add(s)
        nested_paradigms.append((set(stems), nested_suffixes))
    print(f"✅ Nested structure created for {len(nested_paradigms)} paradigms.")
    return sort_paradigms(nested_paradigms)


def refine_nested_stem_conflicts(paradigms):
    """
    Remove stems from higher-ranked paradigms if they are fully explained by nested structures in lower-ranked paradigms.

    Args:
        paradigms: list of (stem_set, suffix_set), where suffix_set may contain nested (str, frozenset) tuples

    Returns:
        Refined list of paradigms with redundant derived stems removed
    """
    refined_paradigms = paradigms[:]
    all_suffix_sets = {frozenset(suffixes) for _, suffixes in paradigms}

    # Build a mapping from nested suffix sets to their parent prefixes
    derived_stems = set()
    for stems, suffixes in paradigms:
        for sfx in suffixes:
            if isinstance(sfx, tuple):
                base, nested_suffixes = sfx
                # Only trust nestings whose inner suffix set is attested.
                if frozenset(nested_suffixes) in all_suffix_sets:
                    for stem in stems:
                        # stem + base is derivable, e.g. "walk" + "ing".
                        derived_stems.add(stem + base)

    # Remove derived stems from paradigms with simple suffix sets (like ['', 's'])
    updated_paradigms = []
    for stems, suffixes in refined_paradigms:
        cleaned_stems = stems - derived_stems
        updated_paradigms.append((cleaned_stems, suffixes))

    print(f"✅ Removed {len(derived_stems)} derived stems explained by nested paradigms.")
    return updated_paradigms
### -----------------------------
### 7. Segment word based on ranked paradigms
### -----------------------------
def recursive_fallback(word, suffix_set):
    """Greedily peel the longest matching suffix off *word*, recursing on
    the remaining stem; return the pieces left-to-right.

    Args:
        word: string to segment.
        suffix_set: iterable of suffix strings (the empty suffix is ignored).

    Returns:
        List of segments; [word] when no suffix matches.
    """
    for suffix in sorted(suffix_set, key=lambda s: -len(s)):
        if suffix and word.endswith(suffix):
            stem_candidate = word[:-len(suffix)]
            rest = recursive_fallback(stem_candidate, suffix_set)
            return rest + [suffix]
    return [word]  # fallback to whole word if nothing matches


### -----------------------------
### Main runner
### -----------------------------
def run_paradigm_extraction(vocab, min_shared_stems=2, min_suffixes=2, enrich_suffix_sets=True):
    """Run the full paradigm-extraction pipeline over *vocab*.

    Stages: pair extraction -> grouping -> stem expansion -> pruning ->
    suffix nesting -> nested-stem refinement, with an optional suffix-set
    enrichment pass, re-sorting between stages.

    Args:
        vocab: iterable of word strings.
        min_shared_stems: minimum stems per suffix group (stage 2).
        min_suffixes: minimum suffixes per group (stage 2).
        enrich_suffix_sets: whether to run suffix_set_expansion.

    Returns:
        Sorted list of (stem_set, suffix_set) paradigms.
    """
    start = time.time()
    stem_to_suffixes = extract_stem_suffix_pairs(vocab)
    paradigms = group_stems_by_suffixes(stem_to_suffixes, min_shared_stems, min_suffixes)
    paradigms = stem_set_expansion(paradigms, stem_to_suffixes)
    paradigms = sort_paradigms(paradigms)
    paradigms = prune_subsumed_stems(paradigms)
    paradigms = sort_paradigms(paradigms)
    paradigms = nest_suffixes_from_paradigms(paradigms)
    paradigms = refine_nested_stem_conflicts(paradigms)
    paradigms = sort_paradigms(paradigms)

    if enrich_suffix_sets:
        print("[4/7] Expanding suffix sets based on partial compatibility...")
        paradigms = suffix_set_expansion(paradigms)
        paradigms = sort_paradigms(paradigms)
        paradigms = prune_subsumed_stems(paradigms)
        paradigms = sort_paradigms(paradigms)

    # (A disabled "fallback paradigm for unassigned full words" block used to
    # live here as a bare triple-quoted string — dead code evaluated on every
    # call; removed. Recover it from version control if the feature returns.)

    print(f"\n✅ Extracted {len(paradigms)} paradigms.")
    print(f"⏱️ Finished in {time.time() - start:.2f} seconds.")
    return paradigms
def segment_word_from_nested_paradigms(word, paradigms, fallback=True, top_k=300):
    """
    Segment a word based on nested paradigms with optional fallback.

    Parameters:
        word (str): The word to segment.
        paradigms (list): A list of tuples (stems, suffixes) with optional nesting.
        fallback (bool): Whether to fall back on longest suffix match from top_k paradigms.
        top_k (int): Number of top paradigms to consider in fallback.

    Returns:
        List[str]: Segmented pieces of the word.
    """
    def match_suffixes(suffixes, remainder):
        """Recursive helper to match nested suffix structures.

        Returns the list of suffix pieces consuming *remainder* exactly,
        or None when nothing in *suffixes* matches.
        """
        for suffix in suffixes:
            if isinstance(suffix, tuple):
                base, nested = suffix
                if remainder.startswith(base):
                    sub = remainder[len(base):]
                    nested_result = match_suffixes(nested, sub)
                    if nested_result is not None:
                        return [base] + nested_result
            elif remainder == suffix:
                # The empty suffix matches an empty remainder but adds no piece.
                return [suffix] if suffix else []
        return None

    # First pass: exact stem + (possibly nested) suffix match.
    for stems, suffixes in paradigms:
        for stem in stems:
            if word.startswith(stem):
                remainder = word[len(stem):]
                matched_suffix = match_suffixes(suffixes, remainder)
                if matched_suffix is not None:
                    return [stem] + matched_suffix

    # Fallback: longest suffix collected from the top_k paradigms.
    if fallback:
        seen_suffixes = set()

        def collect_suffixes(suffixes):
            """Flatten nested suffix structures into seen_suffixes."""
            for s in suffixes:
                if isinstance(s, tuple):
                    seen_suffixes.add(s[0])
                    collect_suffixes(s[1])
                else:
                    seen_suffixes.add(s)

        for _, suffixes in paradigms[:top_k]:
            collect_suffixes(suffixes)

        # Try matching the longest suffix first.
        for suffix in sorted(seen_suffixes, key=lambda s: -len(s)):
            if suffix and word.endswith(suffix):
                stem = word[:-len(suffix)]
                return [stem, suffix]
        return [word]

    return [word]


def segment_word_from_paradigms(word, paradigms, top_k=20):
    """
    Simpler fallback-only version: match longest suffix among top_k paradigms.

    A suffix only counts when the corresponding stem is actually listed in
    that paradigm's stem set; among all such matches the longest suffix wins.

    Parameters:
        word (str): Word to segment.
        paradigms (list): Paradigm structures.
        top_k (int): How many paradigms to consider.

    Returns:
        List[str]: Segmentation result.
    """
    candidates = paradigms[:top_k]
    best_split = None
    # Track the winning suffix length separately: comparing against
    # best_split[-1] is wrong when the previous best used the empty suffix
    # (best_split[-1] is then the stem, not a suffix).
    best_suffix_len = -1
    for stems, suffixes in candidates:
        for suffix in sorted(suffixes, key=lambda s: -len(s) if isinstance(s, str) else -len(s[0])):
            if isinstance(suffix, tuple):
                suffix = suffix[0]  # ignore nested for fallback
            if word.endswith(suffix):
                stem_candidate = word[:-len(suffix)] if suffix else word
                if stem_candidate in stems and len(suffix) > best_suffix_len:
                    best_suffix_len = len(suffix)
                    best_split = [stem_candidate, suffix] if suffix else [stem_candidate]
    return best_split or [word]