Upload folder using huggingface_hub

7724c94 verified about 1 month ago

5.37 kB

	import json
	import numpy as np
	from gensim.models import Word2Vec
	from sklearn.decomposition import PCA
	import matplotlib.pyplot as plt
	import matplotlib.patches as mpatches
	from pathlib import Path

	# Locate the project root: the third ancestor of this script file.
	# `resolve()` makes the path absolute first. When `__file__` is relative
	# (e.g. the script is launched as `python script.py` on interpreters that
	# keep `__file__` relative), chaining `.parent` bottoms out at "." and the
	# "root" would silently become the current working directory.
	PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent

	# Banner announcing what the script is about to do.
	print("="*70)
	print("VISUALIZING SEMANTIC SPACES")
	print("Portuguese vs Nheengatu")
	print("="*70)

	# Load the trained Word2Vec models for Portuguese (PT) and Nheengatu (NHE).
	pt_model_path = PROJECT_ROOT / "experiments/01_word2vec/results/pt_w2v_large.model"
	nhe_model_path = PROJECT_ROOT / "experiments/01_word2vec/results/nhe_w2v_large.model"

	# Fail fast, with a hint on how to produce the files, when a model is missing.
	for _label, _model_path in (("PT", pt_model_path), ("NHE", nhe_model_path)):
	    if not _model_path.exists():
	        print(f"❌ {_label} model not found: {_model_path}")
	        print("Run 'make exp-word2vec' first")
	        # The bare `exit()` helper is injected by the `site` module and is
	        # not guaranteed to exist (e.g. under `python -S`); raising
	        # SystemExit terminates with the same exit status everywhere.
	        raise SystemExit(1)

	pt_model = Word2Vec.load(str(pt_model_path))
	nhe_model = Word2Vec.load(str(nhe_model_path))

	# Report the vocabulary size of each loaded model.
	print(f"✅ PT model: {len(pt_model.wv)} words")
	print(f"✅ NHE model: {len(nhe_model.wv)} words")

	# Directory that receives every PNG written below; created (with any
	# missing parents) if it does not exist yet.
	plots_dir = PROJECT_ROOT.joinpath("experiments/05_visualization/plots")
	plots_dir.mkdir(exist_ok=True, parents=True)

	# 1. Plot: Most frequent words in both languages
	def _plot_frequent_words(ax, model, words, color, title):
	    """Scatter the 2-D PCA projection of ``words`` on ``ax``.

	    Words that are missing from ``model.wv`` are ignored.

	    Returns the projected coordinates as an ``(n, 2)`` array, or an empty
	    ``(0, 2)`` array when fewer than two words are in the vocabulary
	    (a 2-component PCA cannot be fitted on fewer than two samples).
	    """
	    ax.set_title(title)
	    ax.set_xlabel('PC1')
	    ax.set_ylabel('PC2')

	    # Filter once so that vectors and labels can never drift out of step.
	    known = [w for w in words if w in model.wv]
	    if len(known) < 2:
	        print(f"⚠️ Skipping '{title}': only {len(known)} word(s) found in the vocabulary")
	        return np.empty((0, 2))

	    coords = PCA(n_components=2).fit_transform(np.array([model.wv[w] for w in known]))
	    ax.scatter(coords[:, 0], coords[:, 1], alpha=0.7, color=color)
	    for word, (x, y) in zip(known, coords):
	        ax.annotate(word, (x, y), fontsize=10)
	    return coords

	fig, axes = plt.subplots(1, 2, figsize=(14, 6))

	# Portuguese words
	pt_words = ['de', 'e', 'a', 'o', 'do', 'da', 'os', 'ou', 'dos', 'em']
	pt_2d = _plot_frequent_words(axes[0], pt_model, pt_words, 'blue',
	                             'Portuguese: Most Frequent Words')

	# Nheengatu words
	nhe_words = ['ta', 'kuá', 'asui', 'upé', 'mayê', 'ũbeu', 'u', 'rupí', 'waá', 'aárama']
	nhe_2d = _plot_frequent_words(axes[1], nhe_model, nhe_words, 'green',
	                              'Nheengatu: Most Frequent Words')

	plt.tight_layout()
	plt.savefig(plots_dir / 'freq_words_pca.png', dpi=150)
	# Release the figure once it is on disk so it does not stay registered
	# with pyplot while the remaining plots are built.
	plt.close(fig)
	print("✅ Saved: freq_words_pca.png")

	# 2. Plot: Semantic fields comparison
	# Each entry maps a field title to (Portuguese words, Nheengatu words).
	semantic_fields = {
	    'direito/direitu': (['direito', 'lei', 'justiça', 'poder', 'estado'],
	                        ['direitu', 'suãtisá', 'supῖtu', 'kirῖbawasá', 'tetãma']),
	    'povo/miíra': (['povo', 'cidadão', 'nação', 'sociedade', 'comunidade'],
	                   ['miíra', 'sidadãu', 'tetãma', 'payẽ', 'yumuatiri']),
	}

	# One row per semantic field (5 inches tall each, i.e. 15x10 for two
	# fields). `squeeze=False` keeps `axes` two-dimensional even when there is
	# a single field, so `axes[idx, col]` indexing always works.
	fig, axes = plt.subplots(len(semantic_fields), 3,
	                         figsize=(15, 5 * len(semantic_fields)), squeeze=False)

	for idx, (title, (field_pt_words, field_nhe_words)) in enumerate(semantic_fields.items()):
	    combined_ax = axes[idx, 2]
	    combined_has_data = False

	    # (legend label, panel title prefix, model, words, colour, axis)
	    language_panels = (
	        ('PT', 'Portuguese', pt_model, field_pt_words, 'blue', axes[idx, 0]),
	        ('NHE', 'Nheengatu', nhe_model, field_nhe_words, 'green', axes[idx, 1]),
	    )
	    for label, language, model, words, color, lang_ax in language_panels:
	        lang_ax.set_title(f'{language}: {title}')

	        # Filter once so vectors and annotation labels stay aligned.
	        known = [w for w in words if w in model.wv]
	        if len(known) < 2:
	            # A 2-component PCA needs at least two samples. Skip this
	            # language entirely so the combined panel never shows
	            # coordinates left over from an earlier projection.
	            print(f"⚠️ Skipping {language} '{title}': only {len(known)} word(s) found in the vocabulary")
	            continue

	        coords = PCA(n_components=2).fit_transform(np.array([model.wv[w] for w in known]))
	        lang_ax.scatter(coords[:, 0], coords[:, 1], color=color, alpha=0.7)
	        for w, (x, y) in zip(known, coords):
	            lang_ax.annotate(w, (x, y), fontsize=9)

	        # Combined
	        # NOTE(review): each language is projected by its own PCA fit, so
	        # positions of PT and NHE points relative to each other on this
	        # shared axis are not directly comparable.
	        combined_ax.scatter(coords[:, 0], coords[:, 1], color=color, alpha=0.7, label=label)
	        combined_has_data = True

	    combined_ax.set_title(f'Combined: {title}')
	    if combined_has_data:
	        # Only build a legend when at least one labelled series was drawn.
	        combined_ax.legend()

	plt.tight_layout()
	plt.savefig(plots_dir / 'semantic_fields.png', dpi=150)
	# Release the figure once it has been written to disk.
	plt.close(fig)
	print("✅ Saved: semantic_fields.png")

	# 3. Plot: horizontal bar chart of singular/plural pair similarities.
	# Each tuple is (singular form, plural form, cosine similarity); the
	# similarity values are fixed numbers listed here, not computed below.
	singular_plural_pairs = [
	    ('aára', 'aáraita', 0.995), ('kutaárawá', 'kutaárawáita', 0.994),
	    ('suraára', 'suraáraita', 0.989), ('uka', 'ukaita', 0.988),
	    ('amũ', 'amũita', 0.988), ('mã', 'mãita', 0.986),
	]

	# Build the bar labels ("singular→plural") and their values in one pass.
	pairs = []
	similarities = []
	for singular, plural, score in singular_plural_pairs:
	    pairs.append(f"{singular}→{plural}")
	    similarities.append(score)

	fig, ax = plt.subplots(figsize=(10, 6))
	bars = ax.barh(pairs, similarities, color='coral')
	ax.set_title('Nheengatu Singular-Plural Pairs Similarity')
	ax.set_xlabel('Cosine Similarity')
	# The x-axis is limited to 0.98-1.0, the narrow band the listed values occupy.
	ax.set_xlim(0.98, 1.0)

	# Write each value just to the right of its bar, vertically centred.
	for bar, sim in zip(bars, similarities):
	    bar_middle = bar.get_y() + bar.get_height()/2
	    ax.text(sim + 0.001, bar_middle, f'{sim:.3f}', va='center')

	plt.tight_layout()
	plt.savefig(plots_dir / 'singular_plural_similarity.png', dpi=150)
	print("✅ Saved: singular_plural_similarity.png")

	print("\n✅ All visualizations saved to experiments/05_visualization/plots/")