import json
import numpy as np
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from pathlib import Path

# Project root: the third ancestor directory of this script's own path.
PROJECT_ROOT = Path(__file__).parent.parent.parent

# Startup banner, emitted with a single call (one output line per argument).
print(
    "=" * 70,
    "VISUALIZING SEMANTIC SPACES",
    "Portuguese vs Nheengatu",
    "=" * 70,
    sep="\n",
)

# Load the trained Word2Vec models expected under experiments/01_word2vec/results.
pt_model_path = PROJECT_ROOT / "experiments/01_word2vec/results/pt_w2v_large.model"
nhe_model_path = PROJECT_ROOT / "experiments/01_word2vec/results/nhe_w2v_large.model"

# Fail fast, with a hint, when a model file is missing. SystemExit is raised
# directly because the `exit()` helper is injected by the `site` module for
# interactive use and is not available under `python -S` or in frozen builds.
for _label, _model_path in (("PT", pt_model_path), ("NHE", nhe_model_path)):
    if not _model_path.exists():
        print(f"❌ {_label} model not found: {_model_path}")
        print("Run 'make exp-word2vec' first")
        raise SystemExit(1)

# Word2Vec.load is given a plain string path rather than a Path object.
pt_model = Word2Vec.load(str(pt_model_path))
nhe_model = Word2Vec.load(str(nhe_model_path))

# Report the vocabulary size of each loaded model.
print(f"✅ PT model: {len(pt_model.wv)} words")
print(f"✅ NHE model: {len(nhe_model.wv)} words")

# Directory that receives every PNG written by this script; it is created
# together with any missing parents, and an existing directory is reused.
plots_dir = PROJECT_ROOT / "experiments" / "05_visualization" / "plots"
plots_dir.mkdir(exist_ok=True, parents=True)

# 1. Plot: Most frequent words in both languages
# Each language gets its own panel with an independent 2-component PCA of the
# word vectors. The word lists are hard-coded here (presumably taken from
# corpus frequency counts -- verify against the training data).
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Portuguese words
pt_words = ['de', 'e', 'a', 'o', 'do', 'da', 'os', 'ou', 'dos', 'em']
# Filter against the vocabulary once so that the vectors and the annotation
# labels are guaranteed to stay aligned.
pt_present = [w for w in pt_words if w in pt_model.wv]
pt_vectors = np.array([pt_model.wv[w] for w in pt_present])

pca_pt = PCA(n_components=2)
if len(pt_present) >= 2:
    pt_2d = pca_pt.fit_transform(pt_vectors)
else:
    # A 2-component PCA needs at least two samples; leave the panel empty
    # instead of crashing the whole script.
    print(f"⚠️ Only {len(pt_present)} PT word(s) in vocabulary; skipping PCA")
    pt_2d = np.empty((0, 2))

axes[0].scatter(pt_2d[:, 0], pt_2d[:, 1], alpha=0.7, color='blue')
for word, (x, y) in zip(pt_present, pt_2d):
    axes[0].annotate(word, (x, y), fontsize=10)
axes[0].set_title('Portuguese: Most Frequent Words')
axes[0].set_xlabel('PC1')
axes[0].set_ylabel('PC2')

# Nheengatu words
nhe_words = ['ta', 'kuá', 'asui', 'upé', 'mayê', 'ũbeu', 'u', 'rupí', 'waá', 'aárama']
nhe_present = [w for w in nhe_words if w in nhe_model.wv]
nhe_vectors = np.array([nhe_model.wv[w] for w in nhe_present])

pca_nhe = PCA(n_components=2)
if len(nhe_present) >= 2:
    nhe_2d = pca_nhe.fit_transform(nhe_vectors)
else:
    # Same guard as for Portuguese: too few in-vocabulary words for PCA.
    print(f"⚠️ Only {len(nhe_present)} NHE word(s) in vocabulary; skipping PCA")
    nhe_2d = np.empty((0, 2))

axes[1].scatter(nhe_2d[:, 0], nhe_2d[:, 1], alpha=0.7, color='green')
for word, (x, y) in zip(nhe_present, nhe_2d):
    axes[1].annotate(word, (x, y), fontsize=10)
axes[1].set_title('Nheengatu: Most Frequent Words')
axes[1].set_xlabel('PC1')
axes[1].set_ylabel('PC2')

plt.tight_layout()
plt.savefig(plots_dir / 'freq_words_pca.png', dpi=150)
# Release the figure once it is on disk so it does not linger in pyplot's
# global figure registry while the remaining plots are built.
plt.close(fig)
print("✅ Saved: freq_words_pca.png")

# 2. Plot: Semantic fields comparison
# Maps a field title to a pair of word lists: (Portuguese words, Nheengatu words).
semantic_fields = {
    'direito/direitu': (['direito', 'lei', 'justiça', 'poder', 'estado'], 
                        ['direitu', 'suãtisá', 'supῖtu', 'kirῖbawasá', 'tetãma']),
    'povo/miíra': (['povo', 'cidadão', 'nação', 'sociedade', 'comunidade'],
                   ['miíra', 'sidadãu', 'tetãma', 'payẽ', 'yumuatiri']),
}


def _project_field(model, words):
    """Project the in-vocabulary subset of *words* onto two principal components.

    Returns a ``(present, coords)`` tuple. ``present`` holds the words found in
    ``model.wv``, in input order. ``coords`` is the ``(len(present), 2)`` PCA
    projection of their vectors, or ``None`` when fewer than two words are
    present, because a 2-component PCA needs at least two samples.
    """
    present = [w for w in words if w in model.wv]
    if len(present) < 2:
        return present, None
    vectors = np.array([model.wv[w] for w in present])
    return present, PCA(n_components=2).fit_transform(vectors)


# One row of three panels (PT, NHE, combined) per semantic field.
# squeeze=False keeps `axes` two-dimensional even with a single field.
fig, axes = plt.subplots(
    len(semantic_fields), 3,
    figsize=(15, 5 * len(semantic_fields)),
    squeeze=False,
)

for idx, (title, (pt_field, nhe_field)) in enumerate(semantic_fields.items()):
    # Panel description per language: (title prefix, legend label, colour,
    # model, word list). The column index follows the tuple order.
    panels = (
        ('Portuguese', 'PT', 'blue', pt_model, pt_field),
        ('Nheengatu', 'NHE', 'green', nhe_model, nhe_field),
    )
    combined_ax = axes[idx, 2]

    for col, (lang_name, label, color, model, words) in enumerate(panels):
        ax = axes[idx, col]
        ax.set_title(f'{lang_name}: {title}')

        present, coords = _project_field(model, words)
        if coords is None:
            # Skip the language entirely so the combined panel never shows
            # coordinates left over from another field or another plot.
            print(f"⚠️ Skipping {label} '{title}': "
                  f"only {len(present)} word(s) in vocabulary")
            continue

        ax.scatter(coords[:, 0], coords[:, 1], color=color, alpha=0.7)
        for w, (x, y) in zip(present, coords):
            ax.annotate(w, (x, y), fontsize=9)

        # NOTE(review): the PT and NHE projections come from independent PCA
        # fits, so the combined panel overlays them on shared axes without a
        # common basis -- confirm this overlay is the intended comparison.
        combined_ax.scatter(coords[:, 0], coords[:, 1],
                            color=color, alpha=0.7, label=label)

    combined_ax.set_title(f'Combined: {title}')
    # Only draw a legend when at least one language was actually plotted.
    if combined_ax.get_legend_handles_labels()[0]:
        combined_ax.legend()

plt.tight_layout()
plt.savefig(plots_dir / 'semantic_fields.png', dpi=150)
# Release the figure once it has been written to disk.
plt.close(fig)
print("✅ Saved: semantic_fields.png")

# 3. Plot: Singular-plural pairs similarity
# Each entry is (singular, plural, cosine similarity). The similarity values
# are hard-coded here; they are not computed from the loaded model.
singular_plural_pairs = [
    ('aára', 'aáraita', 0.995),
    ('kutaárawá', 'kutaárawáita', 0.994),
    ('suraára', 'suraáraita', 0.989),
    ('uka', 'ukaita', 0.988),
    ('amũ', 'amũita', 0.988),
    ('mã', 'mãita', 0.986),
]

# Bar labels ("singular→plural") and bar lengths, in list order.
pairs = ["→".join(entry[:2]) for entry in singular_plural_pairs]
similarities = [entry[2] for entry in singular_plural_pairs]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(pairs, similarities, color='coral')
ax.set_title('Nheengatu Singular-Plural Pairs Similarity')
ax.set_xlabel('Cosine Similarity')
# The x-axis is restricted to 0.98-1.0, the range that contains every listed value.
ax.set_xlim(0.98, 1.0)

# Write each value just past the end of its bar, vertically centred on it.
for bar, sim in zip(bars, similarities):
    mid_y = bar.get_y() + bar.get_height()/2
    ax.text(sim + 0.001, mid_y, f'{sim:.3f}', va='center')

plt.tight_layout()
plt.savefig(plots_dir / 'singular_plural_similarity.png', dpi=150)
print("✅ Saved: singular_plural_similarity.png")

print("\n✅ All visualizations saved to experiments/05_visualization/plots/")