"""
Training of distributed representations: Word2Vec (CBOW/Skip-gram), FastText (cbow/skipgram), Doc2Vec (PV-DM/PV-DBOW) and, when the optional glove package is installed, GloVe.
Provides a single interface for training, saving, loading and basic evaluation.
"""

from __future__ import annotations

import os
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Tuple, Dict, Any

import numpy as np
import pandas as pd
from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess

# GloVe support is optional: try the `glove` package first, then fall back to
# `glove_python`. GLOVE_AVAILABLE records whether either import succeeded, and
# the names Glove / Corpus exist at module level only when it is True.
try:
    from glove import Glove, Corpus
    GLOVE_AVAILABLE = True
except ImportError:
    try:
        from glove_python import Glove, Corpus
        GLOVE_AVAILABLE = True
    except ImportError:
        GLOVE_AVAILABLE = False
        # Import-time notice printed to stdout so a missing backend is visible.
        print("⚠️ GloVe не установлен. Установите: pip install glove-python-binary")


@dataclass
class TrainConfig:
    """Training hyperparameters consumed by the trainers in this module.

    ``model_type`` selects the trainer in ``train_model``; each trainer reads
    the subset of the remaining fields that applies to its model.
    """

    model_type: str  # w2v | fasttext | doc2vec | glove
    vector_size: int = 300
    window: int = 8
    min_count: int = 2
    sg: int = 1  # 0=CBOW, 1=Skip-gram for w2v/fasttext; ignored for doc2vec
    dm: int = 1  # 1=PV-DM, 0=PV-DBOW for doc2vec
    epochs: int = 10
    workers: int = 4
    negative: int = 5
    hs: int = 0
    seed: int = 42
    # GloVe-specific parameters
    # NOTE(review): confirm that the GloVe trainer forwards these two values.
    alpha: float = 0.75  # for GloVe
    x_max: int = 100  # for GloVe


def _tokenize_corpus(texts: Iterable[str]) -> List[List[str]]:
    return [simple_preprocess(t, deacc=False, min_len=1) for t in texts]


def train_word2vec(texts: Iterable[str], cfg: TrainConfig) -> Word2Vec:
    """Train a Word2Vec model on ``texts`` using the settings in ``cfg``.

    The vocabulary is built first. If ``cfg.min_count`` filtered out every
    word, the threshold is lowered to 1 and the vocabulary is rebuilt once.
    When the vocabulary is still empty, the untrained model is returned as is
    (per the original author's note, the UI then reports that there are no
    neighbours).
    """
    corpus = _tokenize_corpus(texts)
    w2v = Word2Vec(
        seed=cfg.seed,
        workers=cfg.workers,
        vector_size=cfg.vector_size,
        window=cfg.window,
        min_count=cfg.min_count,
        sg=cfg.sg,
        hs=cfg.hs,
        negative=cfg.negative,
    )

    w2v.build_vocab(corpus)
    vocab_is_empty = len(w2v.wv) == 0

    # An empty vocabulary caused by min_count: relax the threshold and retry.
    if vocab_is_empty and cfg.min_count > 1:
        w2v.min_count = 1
        w2v.build_vocab(corpus, update=False)
        vocab_is_empty = len(w2v.wv) == 0

    if not vocab_is_empty:
        w2v.train(corpus, total_examples=len(corpus), epochs=cfg.epochs)
    return w2v


def train_fasttext(texts: Iterable[str], cfg: TrainConfig) -> FastText:
    """Train a FastText model on ``texts`` using the settings in ``cfg``.

    Follows the same steps as the Word2Vec trainer: build the vocabulary,
    retry once with ``min_count=1`` if nothing survived the threshold, and
    skip training (returning the untrained model) if the vocabulary is still
    empty.
    """
    token_lists = _tokenize_corpus(texts)

    hyperparams = dict(
        vector_size=cfg.vector_size,
        window=cfg.window,
        min_count=cfg.min_count,
        sg=cfg.sg,
        workers=cfg.workers,
        negative=cfg.negative,
        hs=cfg.hs,
        seed=cfg.seed,
    )
    ft = FastText(**hyperparams)
    ft.build_vocab(token_lists)

    # Every word was below min_count: rebuild with the lowest threshold.
    if len(ft.wv) == 0 and cfg.min_count > 1:
        ft.min_count = 1
        ft.build_vocab(token_lists, update=False)

    if len(ft.wv) != 0:
        ft.train(token_lists, total_examples=len(token_lists), epochs=cfg.epochs)
    return ft


def train_doc2vec(texts: Iterable[str], cfg: TrainConfig) -> Doc2Vec:
    """Train a Doc2Vec model on ``texts`` using the settings in ``cfg``.

    Each text becomes a ``TaggedDocument`` tagged with its position in
    ``texts``. Tokenization goes through ``_tokenize_corpus`` so that Doc2Vec
    sees exactly the same tokens as the Word2Vec/FastText trainers (in
    particular one-character words are kept instead of being dropped by the
    ``simple_preprocess`` default ``min_len=2``).

    If ``cfg.min_count`` filters out every word, the vocabulary is rebuilt
    once with ``min_count=1``; if it is still empty the untrained model is
    returned.
    """
    tagged = [
        TaggedDocument(tokens, [doc_id])
        for doc_id, tokens in enumerate(_tokenize_corpus(texts))
    ]
    model = Doc2Vec(
        vector_size=cfg.vector_size,
        window=cfg.window,
        min_count=cfg.min_count,
        dm=cfg.dm,
        workers=cfg.workers,
        negative=cfg.negative,
        hs=cfg.hs,
        seed=cfg.seed,
    )
    model.build_vocab(tagged)
    # An empty vocabulary caused by min_count: relax the threshold and retry.
    if len(model.wv) == 0 and cfg.min_count > 1:
        model.min_count = 1
        model.build_vocab(tagged, update=False)
    if len(model.wv) == 0:
        return model
    model.train(tagged, total_examples=len(tagged), epochs=cfg.epochs)
    return model


def train_glove(texts: Iterable[str], cfg: TrainConfig):
    """Train a GloVe model on ``texts`` using the settings in ``cfg``.

    A co-occurrence matrix is built with window ``cfg.window`` and a ``Glove``
    model with ``cfg.vector_size`` components is fitted on it for
    ``cfg.epochs`` epochs using ``cfg.workers`` threads. The GloVe-specific
    settings ``cfg.alpha`` and ``cfg.x_max`` (passed as ``max_count``) and the
    seed ``cfg.seed`` are forwarded to the model; ``cfg.min_count`` is not
    applied here.

    Returns:
        The fitted ``Glove`` model with the corpus dictionary attached.

    Raises:
        ImportError: if no GloVe package could be imported.
    """
    if not GLOVE_AVAILABLE:
        raise ImportError(
            "GloVe не установлен. Установите: pip install glove-python-binary\n"
            "Или используйте альтернативу: pip install glove-python"
        )

    sentences = _tokenize_corpus(texts)

    # Build the co-occurrence matrix and the word -> index dictionary.
    corpus = Corpus()
    corpus.fit(sentences, window=cfg.window)

    # Forward the configured GloVe hyperparameters instead of silently using
    # the library defaults; seeding keeps initialisation reproducible, in line
    # with the gensim trainers.
    model = Glove(
        no_components=cfg.vector_size,
        learning_rate=0.05,
        alpha=cfg.alpha,
        max_count=cfg.x_max,
        random_state=cfg.seed,
    )
    model.fit(corpus.matrix, epochs=cfg.epochs, no_threads=cfg.workers, verbose=True)
    model.add_dictionary(corpus.dictionary)

    return model


def train_model(texts: Iterable[str], cfg: TrainConfig):
    """Train the model selected by ``cfg.model_type`` and time the run.

    Returns:
        A ``(model, seconds)`` tuple, where ``seconds`` is the wall-clock
        duration measured with ``time.time()``.

    Raises:
        ValueError: if ``cfg.model_type`` is not one of the supported names.
    """
    started = time.time()

    # Thunks keep the lookup lazy: only the selected trainer is ever touched.
    trainers = (
        ("w2v", lambda: train_word2vec(texts, cfg)),
        ("fasttext", lambda: train_fasttext(texts, cfg)),
        ("doc2vec", lambda: train_doc2vec(texts, cfg)),
        ("glove", lambda: train_glove(texts, cfg)),
    )
    run = next((thunk for name, thunk in trainers if cfg.model_type == name), None)
    if run is None:
        raise ValueError("model_type должен быть 'w2v', 'fasttext', 'doc2vec' или 'glove'")

    trained = run()
    elapsed = time.time() - started
    return trained, elapsed


def save_model(model, out_path: str) -> None:
    """Save ``model`` to ``out_path``, creating the parent directory if needed.

    The model only has to expose ``save(path)``. The previous GloVe/gensim
    branches called exactly the same method, so a single call is enough and
    the function no longer depends on the GloVe availability flag.
    """
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    model.save(out_path)


def load_model(path: str):
    """Load a model saved by ``save_model``, probing the supported formats.

    The gensim loaders (Word2Vec, FastText, Doc2Vec) are tried in that order;
    if all of them fail and a GloVe package is available, ``Glove.load`` is
    tried last.

    NOTE: these loaders are pickle-based, so only load files from trusted
    sources.

    Raises:
        ValueError: if no loader could read the file. The error from the last
            attempted loader is attached as ``__cause__`` to aid debugging.
    """
    last_error: Optional[Exception] = None

    for loader in (Word2Vec, FastText, Doc2Vec):
        try:
            return loader.load(path)
        except Exception as exc:  # deliberate best-effort probing of formats
            last_error = exc

    if GLOVE_AVAILABLE:
        # Use the Glove class bound at module import time so the
        # `glove_python` fallback works too (re-importing `glove` would not).
        try:
            return Glove.load(path)
        except Exception as exc:  # deliberate best-effort probing of formats
            last_error = exc

    raise ValueError(f"Не удалось загрузить модель из {path}") from last_error


def evaluate_neighbors(model, test_words: List[str], topn: int = 10) -> Dict[str, List[Tuple[str, float]]]:
    """Return the ``topn`` nearest neighbours for each word in ``test_words``.

    GloVe-style models (objects exposing ``word_vectors`` and ``dictionary``)
    have no neighbour API here, so cosine similarity against the whole
    embedding matrix is computed with NumPy. Any other model is treated as a
    gensim model: its ``wv`` attribute (or the object itself when there is no
    ``wv``) is queried with ``most_similar``.

    Returns:
        A dict mapping each test word to a list of ``(word, similarity)``
        pairs sorted by decreasing similarity. Words missing from the
        vocabulary, and GloVe lookups that fail, map to an empty list.
    """
    results: Dict[str, List[Tuple[str, float]]] = {}

    if hasattr(model, "word_vectors") and hasattr(model, "dictionary"):
        try:
            vectors = np.asarray(model.word_vectors)
            # Row norms do not depend on the query word: compute them once.
            row_norms = np.linalg.norm(vectors, axis=1)
        except Exception:
            # Unusable embedding matrix (e.g. unfitted model): best effort.
            return {w: [] for w in test_words}

        for w in test_words:
            try:
                dictionary = model.dictionary
                if w not in dictionary:
                    results[w] = []
                    continue
                query = vectors[dictionary[w]]
                sims = (vectors @ query) / (row_norms * np.linalg.norm(query))
                ranked = sorted(
                    (
                        (word, float(sims[idx]))
                        for word, idx in dictionary.items()
                        if word != w
                    ),
                    key=lambda pair: pair[1],
                    reverse=True,
                )
                results[w] = ranked[:topn]
            except Exception:
                # Best effort per word, but no longer swallows
                # KeyboardInterrupt/SystemExit as the bare except did.
                results[w] = []
        return results

    # Gensim models (Word2Vec, FastText, Doc2Vec) or bare KeyedVectors.
    kv = model.wv if hasattr(model, "wv") else model
    for w in test_words:
        results[w] = kv.most_similar(w, topn=topn) if w in kv else []
    return results


def cosine_similarity(model, word_pairs: List[Tuple[str, str]]) -> List[Tuple[str, str, float]]:
    """Compute the cosine similarity for every ``(a, b)`` pair of words.

    GloVe-style models (objects exposing ``word_vectors`` and ``dictionary``)
    are handled by computing the cosine directly from the two vectors. Any
    other model is treated as a gensim model and queried through
    ``similarity`` on its ``wv`` attribute (or on the object itself when there
    is no ``wv``).

    Returns:
        A list of ``(a, b, similarity)`` tuples in input order. The
        similarity is ``nan`` when either word is out of vocabulary or a
        GloVe lookup fails.
    """
    out: List[Tuple[str, str, float]] = []

    if hasattr(model, "word_vectors") and hasattr(model, "dictionary"):
        for a, b in word_pairs:
            try:
                dictionary = model.dictionary
                if a in dictionary and b in dictionary:
                    vec_a = model.word_vectors[dictionary[a]]
                    vec_b = model.word_vectors[dictionary[b]]
                    sim = float(
                        np.dot(vec_a, vec_b)
                        / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
                    )
                else:
                    sim = np.nan
            except Exception:
                # Best effort per pair, but no longer swallows
                # KeyboardInterrupt/SystemExit as the bare except did.
                sim = np.nan
            out.append((a, b, sim))
        return out

    # Gensim models (Word2Vec, FastText, Doc2Vec) or bare KeyedVectors.
    kv = model.wv if hasattr(model, "wv") else model
    for a, b in word_pairs:
        sim = float(kv.similarity(a, b)) if a in kv and b in kv else np.nan
        out.append((a, b, sim))
    return out


def word_analogy(model, a: str, b: str, c: str, topn: int = 10) -> List[Tuple[str, float]]:
    """Solve the analogy "``a`` is to ``b`` as ``c`` is to ?".

    GloVe-style models (objects exposing ``word_vectors`` and ``dictionary``)
    have no analogy API here, so the target vector ``b - a + c`` is compared
    by cosine similarity with every other word, excluding ``a``, ``b`` and
    ``c`` themselves. Any other model is treated as a gensim model and
    queried with ``most_similar(positive=[b, c], negative=[a])``.

    Returns:
        Up to ``topn`` ``(word, similarity)`` pairs sorted by decreasing
        similarity, or an empty list when any of the three words is out of
        vocabulary or the GloVe computation fails.
    """
    if hasattr(model, "word_vectors") and hasattr(model, "dictionary"):
        try:
            dictionary = model.dictionary
            if not all(token in dictionary for token in (a, b, c)):
                return []
            vectors = np.asarray(model.word_vectors)
            target = vectors[dictionary[b]] - vectors[dictionary[a]] + vectors[dictionary[c]]
            # One vectorised pass over the vocabulary instead of a Python loop.
            sims = (vectors @ target) / (
                np.linalg.norm(vectors, axis=1) * np.linalg.norm(target)
            )
            excluded = {a, b, c}
            ranked = sorted(
                (
                    (word, float(sims[idx]))
                    for word, idx in dictionary.items()
                    if word not in excluded
                ),
                key=lambda pair: pair[1],
                reverse=True,
            )
            return ranked[:topn]
        except Exception:
            # Best effort, but no longer swallows KeyboardInterrupt/SystemExit
            # as the bare except did.
            return []

    # Gensim models (Word2Vec, FastText, Doc2Vec) or bare KeyedVectors.
    kv = model.wv if hasattr(model, "wv") else model
    if all(token in kv for token in (a, b, c)):
        return kv.most_similar(positive=[b, c], negative=[a], topn=topn)
    return []


def export_training_report(cfg: TrainConfig, train_time: float, model_path: str, extra: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Build a one-row DataFrame summarising a training run.

    The "Архитектура" column is derived from the model type:
    ``skipgram``/``cbow`` for w2v and fasttext (from ``cfg.sg``),
    ``pv-dm``/``pv-dbow`` for doc2vec (from ``cfg.dm``), and the model type
    itself for anything else (e.g. ``glove``), which previously was reported
    as a Doc2Vec architecture by mistake.

    Args:
        cfg: Training configuration used for the run.
        train_time: Training duration in seconds; rounded to 2 decimals.
        model_path: Path where the model was saved.
        extra: Optional additional columns; keys override the defaults.

    Returns:
        A DataFrame with a single row.
    """
    if cfg.model_type in {"w2v", "fasttext"}:
        architecture = "skipgram" if cfg.sg == 1 else "cbow"
    elif cfg.model_type == "doc2vec":
        architecture = "pv-dm" if cfg.dm == 1 else "pv-dbow"
    else:
        # sg/dm do not apply to other models (e.g. GloVe).
        architecture = cfg.model_type

    data = {
        "Модель": cfg.model_type,
        "Размерность": cfg.vector_size,
        "Окно": cfg.window,
        "Min count": cfg.min_count,
        "Архитектура": architecture,
        "Эпохи": cfg.epochs,
        "Время обучения (с)": round(train_time, 2),
        "Путь": model_path,
    }
    if extra:
        data.update(extra)
    return pd.DataFrame([data])


if __name__ == "__main__":
    # Smoke run: train a small skip-gram Word2Vec model on three sentences,
    # save it, and print the nearest neighbours of two query words.
    demo_cfg = TrainConfig(model_type="w2v", vector_size=100, window=5, epochs=5, sg=1)
    demo_texts = [
        "Москва является столицей России.",
        "Париж — столица Франции.",
        "Берлин — столица Германии.",
    ]
    demo_model, _elapsed = train_model(demo_texts, demo_cfg)
    save_model(demo_model, "models/sample_w2v.model")
    demo_neighbors = evaluate_neighbors(demo_model, ["россии", "франции"])
    print(demo_neighbors)