import json
import os
import re
from collections import defaultdict

# Compiled once: tokenize() runs per document inside tight loops.
_WORD_RE = re.compile(r"\b\w+\b")

# The spaCy pipeline is loaded lazily (and exactly once) by _get_nlp() so the
# module can be imported — and its pure-stdlib helpers used — without spaCy or
# the en_core_web_sm model installed.
_nlp = None


def _get_nlp():
    """Return the shared spaCy pipeline, loading it on first use.

    parser and NER are disabled because only lemmatization is needed.
    """
    global _nlp
    if _nlp is None:
        import spacy  # deferred: heavy import + model load
        _nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    return _nlp


def tokenize(text):
    """Lowercase *text* and return its word tokens (runs of \\w characters)."""
    return _WORD_RE.findall(text.lower())


def has_bigram(tokens, bigram):
    """Return True if *bigram* occurs as a contiguous run in *tokens*.

    *bigram* is underscore-joined ("new_york"); any n-gram length works.
    """
    parts = bigram.split('_')
    n = len(parts)
    return any(tokens[i:i + n] == parts for i in range(len(tokens) - n + 1))


def build_inverse_lemma_map(docs_file_path, cache_path=None):
    """
    Build or load a mapping from lemma -> set of surface forms seen in corpus.

    If cache_path is provided and exists, loads from it. Else builds from
    scratch and saves to cache_path.
    """
    if cache_path and os.path.exists(cache_path):
        print(f"[INFO] Loading cached lemma_to_forms from {cache_path}")
        with open(cache_path, "r", encoding="utf-8") as f:
            raw_map = json.load(f)
        # JSON stores lists; restore sets for fast intersection downstream.
        return {lemma: set(forms) for lemma, forms in raw_map.items()}

    print(f"[INFO] Building inverse lemma map from {docs_file_path}...")
    nlp = _get_nlp()
    lemma_to_forms = defaultdict(set)
    with open(docs_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            doc = json.loads(line)
            tokens = tokenize(doc['text'])
            spacy_doc = nlp(" ".join(tokens))
            for token in spacy_doc:
                lemma_to_forms[token.lemma_].add(token.text.lower())

    if cache_path:
        print(f"[INFO] Saving lemma_to_forms to {cache_path}")
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:  # os.makedirs("") raises for bare filenames
            os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, "w", encoding="utf-8") as f:
            json.dump({k: list(v) for k, v in lemma_to_forms.items()}, f, indent=2)

    return lemma_to_forms


def build_inverted_index(docs_file_path, vocab_set, lemma_map_path=None):
    """Build a word -> timestamp -> [doc_id] inverted index over a JSONL corpus.

    Each corpus line is a JSON object with "text" and "timestamp" fields.
    Unigram vocab entries are matched through every surface form of the
    lemma (via build_inverse_lemma_map); entries containing '_' are matched
    as contiguous token runs (has_bigram).

    Returns (index, docs, lemma_to_forms) where docs is the list of
    {"text": ..., "timestamp": int} records in file order.
    """
    vocab_unigrams = {w for w in vocab_set if '_' not in w}
    vocab_bigrams = {w for w in vocab_set if '_' in w}

    # Load or build lemma map
    lemma_to_forms = build_inverse_lemma_map(docs_file_path, cache_path=lemma_map_path)

    index = defaultdict(lambda: defaultdict(list))
    docs = []
    global_seen_words = set()

    with open(docs_file_path, 'r', encoding='utf-8') as f:
        for doc_id, line in enumerate(f):
            doc = json.loads(line)
            text = doc['text']
            timestamp = int(doc['timestamp'])
            docs.append({"text": text, "timestamp": timestamp})

            tokens = tokenize(text)
            token_set = set(tokens)
            seen_words = set()

            # Match all lemma queries using surface forms
            for lemma in vocab_unigrams:
                surface_forms = lemma_to_forms.get(lemma, set())
                if token_set & surface_forms:
                    index[lemma][timestamp].append(doc_id)
                    seen_words.add(lemma)

            # Unigrams and bigrams are disjoint sets and each bigram is
            # visited once per doc, so no seen_words guard is needed here.
            for bigram in vocab_bigrams:
                if has_bigram(tokens, bigram):
                    index[bigram][timestamp].append(doc_id)
                    seen_words.add(bigram)

            global_seen_words.update(seen_words)

            if (doc_id + 1) % 500 == 0:
                missing = vocab_set - global_seen_words
                print(f"[INFO] After {doc_id+1} docs, {len(missing)} vocab words still not seen.")
                print("Example missing words:", list(missing)[:5])

    missing_final = vocab_set - global_seen_words
    if missing_final:
        print(f"[WARNING] {len(missing_final)} vocab words were never found in any document.")
        print("Examples:", list(missing_final)[:10])

    return index, docs, lemma_to_forms


def save_index_to_disk(index, index_path):
    """Serialize *index* to JSON at *index_path* (timestamp keys -> strings)."""
    index_clean = {
        word: {str(ts): doc_ids for ts, doc_ids in ts_dict.items()}
        for word, ts_dict in index.items()
    }
    out_dir = os.path.dirname(index_path)
    if out_dir:  # os.makedirs("") raises for bare filenames
        os.makedirs(out_dir, exist_ok=True)
    with open(index_path, "w", encoding='utf-8') as f:
        json.dump(index_clean, f, ensure_ascii=False)


def load_index_from_disk(index_path):
    """Inverse of save_index_to_disk: restore nested defaultdicts with
    int timestamp keys from the JSON file at *index_path*."""
    with open(index_path, 'r', encoding='utf-8') as f:
        raw_index = json.load(f)
    index = defaultdict(lambda: defaultdict(list))
    for word, ts_dict in raw_index.items():
        for ts, doc_ids in ts_dict.items():
            index[word][int(ts)] = doc_ids
    return index


def load_docs(docs_file_path):
    """Read a JSONL corpus into [{"text": str, "timestamp": int}, ...]."""
    docs = []
    with open(docs_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            doc = json.loads(line)
            docs.append({
                "text": doc["text"],
                "timestamp": int(doc["timestamp"]),
            })
    return docs


def load_index(docs_file_path, vocab, index_path=None, lemma_map_path=None):
    """Load the inverted index from *index_path* if it exists, otherwise
    build it from the corpus (and cache it when *index_path* is given).

    Returns (index, docs, lemma_to_forms).
    """
    if index_path and os.path.exists(index_path):
        index = load_index_from_disk(index_path)
        docs = load_docs(docs_file_path)
        lemma_to_forms = build_inverse_lemma_map(docs_file_path, cache_path=lemma_map_path)
        return index, docs, lemma_to_forms

    index, docs, lemma_to_forms = build_inverted_index(
        docs_file_path, set(vocab), lemma_map_path=lemma_map_path
    )
    if index_path:
        save_index_to_disk(index, index_path)
    return index, docs, lemma_to_forms