# nhengatu-experiments/05_visualization/plot_semantic_spaces.py
# Uploaded by rmacario with huggingface_hub (commit 7724c94, verified); 5.37 kB.
import json
import numpy as np
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from pathlib import Path
# Project root: the directory two levels above the one containing this script.
PROJECT_ROOT = Path(__file__).parent.parent.parent

# Start-up banner framed by two 70-character rules.
print(
    "=" * 70,
    "VISUALIZING SEMANTIC SPACES",
    "Portuguese vs Nheengatu",
    "=" * 70,
    sep="\n",
)
# Load the Word2Vec models saved under experiments/01_word2vec/results.
pt_model_path = PROJECT_ROOT / "experiments/01_word2vec/results/pt_w2v_large.model"
nhe_model_path = PROJECT_ROOT / "experiments/01_word2vec/results/nhe_w2v_large.model"

# Fail fast, with a hint on how to produce the file, when a model is missing.
for model_label, model_path in (("PT", pt_model_path), ("NHE", nhe_model_path)):
    if not model_path.exists():
        print(f"❌ {model_label} model not found: {model_path}")
        print("Run 'make exp-word2vec' first")
        # Raise SystemExit directly: the `exit` helper is injected by the
        # `site` module for interactive sessions and is not guaranteed to
        # exist in every interpreter configuration (e.g. `python -S`).
        raise SystemExit(1)

# Word2Vec.load is given a str path, as in the original call.
pt_model = Word2Vec.load(str(pt_model_path))
nhe_model = Word2Vec.load(str(nhe_model_path))

# Report the vocabulary size of each loaded model.
print(f"✅ PT model: {len(pt_model.wv)} words")
print(f"✅ NHE model: {len(nhe_model.wv)} words")
# Ensure the directory that receives the generated figures exists
# (missing parents are created; an existing directory is not an error).
plots_dir = PROJECT_ROOT.joinpath("experiments/05_visualization/plots")
plots_dir.mkdir(exist_ok=True, parents=True)
# 1. Plot: Most frequent words in both languages


def _plot_words_pca(ax, model, words, color, title):
    """Scatter the 2-D PCA projection of *words* on *ax* and label each point.

    Only words present in ``model.wv`` are used. A two-component PCA needs at
    least two samples, so when fewer words are available the panel keeps its
    title and axis labels but stays empty, and a warning is printed.

    Returns the projected coordinates (one row per plotted word), or ``None``
    when the panel was skipped.
    """
    ax.set_title(title)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')

    # Filter once so vectors and labels are guaranteed to stay aligned.
    present = [w for w in words if w in model.wv]
    if len(present) < 2:
        print(f"⚠️ Skipping '{title}': only {len(present)} of {len(words)} "
              "words found in the vocabulary (need at least 2 for PCA)")
        return None

    vectors = np.array([model.wv[w] for w in present])
    coords = PCA(n_components=2).fit_transform(vectors)
    ax.scatter(coords[:, 0], coords[:, 1], alpha=0.7, color=color)
    for word, (x, y) in zip(present, coords):
        ax.annotate(word, (x, y), fontsize=10)
    return coords


fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Portuguese words (hard-coded list, not derived from the model's counts)
pt_words = ['de', 'e', 'a', 'o', 'do', 'da', 'os', 'ou', 'dos', 'em']
pt_2d = _plot_words_pca(axes[0], pt_model, pt_words, 'blue',
                        'Portuguese: Most Frequent Words')

# Nheengatu words (hard-coded list, not derived from the model's counts)
nhe_words = ['ta', 'kuá', 'asui', 'upé', 'mayê', 'ũbeu', 'u', 'rupí', 'waá', 'aárama']
nhe_2d = _plot_words_pca(axes[1], nhe_model, nhe_words, 'green',
                         'Nheengatu: Most Frequent Words')

plt.tight_layout()
plt.savefig(plots_dir / 'freq_words_pca.png', dpi=150)
print("✅ Saved: freq_words_pca.png")
# 2. Plot: Semantic fields comparison
# Each entry maps a field title to (Portuguese words, Nheengatu words).
# NOTE(review): 'supῖtu' and 'kirῖbawasá' contain the Greek letter 'ῖ'
# (U+1FD6), not Latin 'ĩ' (U+0129) — confirm this matches the spelling used
# in the training corpus, otherwise these words are silently dropped below.
semantic_fields = {
    'direito/direitu': (['direito', 'lei', 'justiça', 'poder', 'estado'],
                        ['direitu', 'suãtisá', 'supῖtu', 'kirῖbawasá', 'tetãma']),
    'povo/miíra': (['povo', 'cidadão', 'nação', 'sociedade', 'comunidade'],
                   ['miíra', 'sidadãu', 'tetãma', 'payẽ', 'yumuatiri']),
}

# One row per semantic field; squeeze=False keeps `axes` two-dimensional even
# when there is a single field, so axes[idx, col] indexing always works.
n_fields = len(semantic_fields)
fig, axes = plt.subplots(n_fields, 3, figsize=(15, 5 * n_fields), squeeze=False)

# (panel title prefix, legend label, model, colour) per column, in the same
# order as the word lists inside each semantic_fields value.
languages = (
    ('Portuguese', 'PT', pt_model, 'blue'),
    ('Nheengatu', 'NHE', nhe_model, 'green'),
)

for idx, (title, field_words) in enumerate(semantic_fields.items()):
    combined_ax = axes[idx, 2]
    combined_ax.set_title(f'Combined: {title}')
    plotted_any = False

    for col, ((lang_name, short_label, model, color), words) in enumerate(
            zip(languages, field_words)):
        ax = axes[idx, col]
        ax.set_title(f'{lang_name}: {title}')

        # Filter once so vectors and labels stay aligned.
        present = [w for w in words if w in model.wv]
        # A two-component PCA needs at least two samples; skip otherwise
        # instead of crashing or reusing coordinates from an earlier plot.
        if len(present) < 2:
            print(f"⚠️ Skipping {lang_name} panel for '{title}': only "
                  f"{len(present)} of {len(words)} words found in the "
                  "vocabulary (need at least 2 for PCA)")
            continue

        vectors = np.array([model.wv[w] for w in present])
        coords = PCA(n_components=2).fit_transform(vectors)
        ax.scatter(coords[:, 0], coords[:, 1], color=color, alpha=0.7)
        for word, (x, y) in zip(present, coords):
            ax.annotate(word, (x, y), fontsize=9)

        # Combined panel: only overlay what was actually projected for THIS
        # field. NOTE(review): the two languages are projected by separately
        # fitted PCAs, so their positions relative to each other in this
        # panel are not directly comparable.
        combined_ax.scatter(coords[:, 0], coords[:, 1], color=color, alpha=0.7,
                            label=short_label)
        plotted_any = True

    # Avoid matplotlib's "no artists with labels" warning on empty panels.
    if plotted_any:
        combined_ax.legend()

plt.tight_layout()
plt.savefig(plots_dir / 'semantic_fields.png', dpi=150)
print("✅ Saved: semantic_fields.png")
# 3. Plot: Singular-plural pairs similarity
# Each entry is (singular, plural, cosine similarity).
# NOTE: the similarity scores are hard-coded here; this script does not
# recompute them from nhe_model.
singular_plural_pairs = [
    ('aára', 'aáraita', 0.995), ('kutaárawá', 'kutaárawáita', 0.994),
    ('suraára', 'suraáraita', 0.989), ('uka', 'ukaita', 0.988),
    ('amũ', 'amũita', 0.988), ('mã', 'mãita', 0.986),
]

fig, ax = plt.subplots(figsize=(10, 6))

# Separate singular and plural in the tick label; concatenating them directly
# (e.g. "aáraaáraita") made the two forms indistinguishable.
pairs = [f"{s} → {p}" for s, p, _ in singular_plural_pairs]
similarities = [sim for _, _, sim in singular_plural_pairs]

bars = ax.barh(pairs, similarities, color='coral')
ax.set_xlabel('Cosine Similarity')
ax.set_title('Nheengatu Singular-Plural Pairs Similarity')
# Every listed score lies between 0.98 and 1.0, so zoom the x-axis to that band.
ax.set_xlim(0.98, 1.0)

# Write the exact score just past the end of each bar, vertically centred.
for bar, sim in zip(bars, similarities):
    ax.text(sim + 0.001, bar.get_y() + bar.get_height() / 2, f'{sim:.3f}', va='center')

plt.tight_layout()
plt.savefig(plots_dir / 'singular_plural_similarity.png', dpi=150)
print("✅ Saved: singular_plural_similarity.png")
print("\n✅ All visualizations saved to experiments/05_visualization/plots/")