| import pickle
|
| import json
|
| import numpy as np
|
| import pandas as pd
|
| import matplotlib.pyplot as plt
|
| import seaborn as sns
|
| from sklearn.metrics.pairwise import cosine_similarity
|
| from sklearn.decomposition import PCA
|
| from sklearn.manifold import TSNE
|
| from scipy.stats import pearsonr, spearmanr
|
| import warnings
|
# Suppress library warnings (sklearn/t-SNE chatter) for cleaner console output.
# NOTE(review): a blanket ignore also hides genuine numerical warnings such as
# divide-by-zero during normalization -- consider narrowing this filter.
warnings.filterwarnings('ignore')

# Matplotlib defaults: a broadly available font, and disable the Unicode
# minus sign so negative tick labels render with a plain ASCII hyphen.
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.unicode_minus'] = False
|
|
|
def load_label_data():
    """Load the label mapping and label embeddings from disk.

    Returns:
        tuple: ``(label_mapping, label_embeddings)`` where ``label_mapping``
        is the dict parsed from ``label_mapping.json`` and
        ``label_embeddings`` is whatever object was pickled into
        ``label_embeddings.pkl`` (dict, list, or ndarray -- the caller
        normalizes it to an ndarray).

    Raises:
        FileNotFoundError: if either data file is missing from the cwd.
    """
    with open('label_mapping.json', 'r', encoding='utf-8') as f:
        label_mapping = json.load(f)

    # NOTE(review): pickle.load on an untrusted file can execute arbitrary
    # code -- only load embeddings produced by this project.
    with open('label_embeddings.pkl', 'rb') as f:
        label_embeddings = pickle.load(f)

    return label_mapping, label_embeddings
|
|
|
def analyze_label_similarity(label_embeddings, label_mapping):
    """Analyze pairwise cosine similarity between label embeddings.

    Prints summary statistics of the full similarity matrix, then zeroes the
    diagonal and reports the most similar label pair.

    Args:
        label_embeddings: 2D array, one embedding per label (row).
        label_mapping: dict whose keys are label names; insertion order is
            assumed to match the embedding rows -- TODO confirm upstream.

    Returns:
        np.ndarray: cosine-similarity matrix with a zeroed diagonal (so the
        caller's averages ignore trivial self-similarity).
    """
    print("=== 라벨 간 유사도 분석 ===")

    similarity_matrix = cosine_similarity(label_embeddings)

    # Statistics below include the 1.0 diagonal on purpose (matches the
    # original reporting); the diagonal is zeroed only afterwards.
    print(f"라벨 개수: {len(label_embeddings)}")
    print(f"임베딩 차원: {label_embeddings.shape[1]}")
    print(f"평균 유사도: {np.mean(similarity_matrix):.4f}")
    print(f"유사도 표준편차: {np.std(similarity_matrix):.4f}")
    print(f"최대 유사도: {np.max(similarity_matrix):.4f}")
    print(f"최소 유사도: {np.min(similarity_matrix):.4f}")

    # Zero the diagonal so self-similarity does not win the argmax below.
    np.fill_diagonal(similarity_matrix, 0)
    max_sim_idx = np.unravel_index(np.argmax(similarity_matrix), similarity_matrix.shape)
    max_sim_value = similarity_matrix[max_sim_idx]

    labels = list(label_mapping.keys())
    print("\n가장 유사한 라벨 쌍:")
    print(f"  {labels[max_sim_idx[0]]} <-> {labels[max_sim_idx[1]]}: {max_sim_value:.4f}")

    return similarity_matrix
|
|
|
def analyze_label_correlation(label_embeddings, label_mapping):
    """Analyze correlations between embedding dimensions (features).

    Computes the feature-by-feature Pearson correlation matrix (i.e. between
    the *columns* of the embedding matrix), prints summary statistics, and
    reports how many feature pairs are highly correlated.

    Args:
        label_embeddings: 2D array, one embedding per label (row).
        label_mapping: unused here; kept for a signature consistent with the
            sibling analysis functions.

    Returns:
        np.ndarray: feature correlation matrix with a zeroed diagonal.
    """
    print("\n=== 라벨 임베딩 상관관계 분석 ===")

    # Transpose so rows are embedding dimensions: np.corrcoef correlates rows.
    corr_matrix = np.corrcoef(label_embeddings.T)

    print(f"평균 상관계수: {np.mean(corr_matrix):.4f}")
    print(f"상관계수 표준편차: {np.std(corr_matrix):.4f}")
    print(f"최대 상관계수: {np.max(corr_matrix):.4f}")
    print(f"최소 상관계수: {np.min(corr_matrix):.4f}")

    np.fill_diagonal(corr_matrix, 0)
    high_corr_threshold = 0.8
    # BUG FIX: the matrix is symmetric, so np.where over the full matrix
    # counted every pair twice ((i, j) and (j, i)); restrict the search to
    # the strict upper triangle to count each unordered pair once.
    upper_triangle = np.triu(np.abs(corr_matrix), k=1)
    high_corr_pairs = np.where(upper_triangle > high_corr_threshold)

    print(f"\n높은 상관관계 (|r| > {high_corr_threshold})를 가진 특성 쌍 수: {len(high_corr_pairs[0])}")

    return corr_matrix
|
|
|
def analyze_label_distribution(label_embeddings, label_mapping):
    """Analyze the distribution of label embedding magnitudes.

    Prints mean/std/max/min of the per-label L2 norms and names the labels
    with the largest and smallest embeddings.

    Args:
        label_embeddings: 2D array, one embedding per label (row).
        label_mapping: dict whose keys are label names; insertion order is
            assumed to match the embedding rows -- TODO confirm upstream.

    Returns:
        np.ndarray: 1D array of L2 norms, one per label.
    """
    print("\n=== 라벨 분포 분석 ===")

    # L2 norm of each embedding row.
    embedding_norms = np.linalg.norm(label_embeddings, axis=1)

    print("임베딩 크기 통계:")
    print(f"  평균: {np.mean(embedding_norms):.4f}")
    print(f"  표준편차: {np.std(embedding_norms):.4f}")
    print(f"  최대: {np.max(embedding_norms):.4f}")
    print(f"  최소: {np.min(embedding_norms):.4f}")

    labels = list(label_mapping.keys())
    max_norm_idx = np.argmax(embedding_norms)
    min_norm_idx = np.argmin(embedding_norms)

    print(f"\n가장 큰 임베딩 크기: {labels[max_norm_idx]} ({embedding_norms[max_norm_idx]:.4f})")
    print(f"가장 작은 임베딩 크기: {labels[min_norm_idx]} ({embedding_norms[min_norm_idx]:.4f})")

    return embedding_norms
|
|
|
def visualize_label_embeddings(label_embeddings, label_mapping):
    """Visualize the label embeddings with PCA and t-SNE side by side.

    Saves the figure to ``label_embeddings_visualization.png``, shows it,
    and prints how much variance the first two principal components explain.

    Args:
        label_embeddings: 2D array, one embedding per label (row).
        label_mapping: currently unused -- points are not annotated with
            label names.
    """
    print("\n=== 라벨 임베딩 시각화 ===")

    # Linear 2D projection.
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(label_embeddings)

    # Non-linear 2D projection; t-SNE requires perplexity < n_samples,
    # hence the cap at len - 1.
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(label_embeddings) - 1))
    embeddings_tsne = tsne.fit_transform(label_embeddings)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    ax1.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.7)
    ax1.set_title('PCA Visualization of Label Embeddings')
    ax1.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
    ax1.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')

    ax2.scatter(embeddings_tsne[:, 0], embeddings_tsne[:, 1], alpha=0.7)
    ax2.set_title('t-SNE Visualization of Label Embeddings')
    ax2.set_xlabel('t-SNE 1')
    ax2.set_ylabel('t-SNE 2')

    plt.tight_layout()
    plt.savefig('label_embeddings_visualization.png', dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    print(f"PCA 설명된 분산 비율: {pca.explained_variance_ratio_[:2]}")
    print(f"총 설명된 분산 비율: {np.sum(pca.explained_variance_ratio_[:2]):.4f}")
|
|
|
def analyze_independence_metrics(label_embeddings, label_mapping):
    """Compute pairwise-independence metrics over the label embeddings.

    Reports Euclidean distances between all label pairs, absolute dot
    products of unit-normalized embeddings (an orthogonality proxy), and
    the per-dimension variance of the embedding matrix.

    Args:
        label_embeddings: 2D array, one embedding per label (row).
        label_mapping: unused here; kept for a signature consistent with the
            sibling analysis functions.

    Returns:
        tuple: ``(distances, dot_products, embedding_variance)`` -- the
        first two are Python lists over unordered label pairs, the last is
        a 1D array of per-dimension variances.
    """
    print("\n=== 독립성 지표 분석 ===")

    # Pairwise Euclidean distances, each unordered pair counted once.
    n = len(label_embeddings)
    distances = []
    for i in range(n):
        for j in range(i + 1, n):
            distances.append(np.linalg.norm(label_embeddings[i] - label_embeddings[j]))

    print(f"라벨 간 평균 거리: {np.mean(distances):.4f}")
    print(f"라벨 간 거리 표준편차: {np.std(distances):.4f}")

    # NOTE(review): an all-zero embedding row would yield NaNs here, and the
    # divide-by-zero warning is silenced by the global warning filter --
    # confirm upstream embeddings are never all-zero.
    normalized_embeddings = label_embeddings / np.linalg.norm(label_embeddings, axis=1, keepdims=True)
    dot_products = []
    for i in range(n):
        for j in range(i + 1, n):
            dot_products.append(abs(np.dot(normalized_embeddings[i], normalized_embeddings[j])))

    print(f"평균 내적 크기: {np.mean(dot_products):.4f}")
    print(f"내적 크기 표준편차: {np.std(dot_products):.4f}")

    # Variance of each embedding dimension across labels.
    embedding_variance = np.var(label_embeddings, axis=0)
    print(f"임베딩 차원별 분산 평균: {np.mean(embedding_variance):.4f}")
    print(f"임베딩 차원별 분산 표준편차: {np.std(embedding_variance):.4f}")

    return distances, dot_products, embedding_variance
|
|
|
def main():
    """Run the full label-independence analysis pipeline.

    Loads the label data, runs the similarity / correlation / distribution /
    independence analyses, renders the visualization, and prints an overall
    verdict based on average cross-label similarity and orthogonality.
    """
    print("라벨과 입력 데이터 간의 독립성 분석을 시작합니다...")

    label_mapping, label_embeddings = load_label_data()

    # Normalize the pickled embeddings to a 2D ndarray.
    # NOTE(review): the dict branch relies on the dict's insertion order
    # matching label_mapping's key order -- confirm against the writer.
    if isinstance(label_embeddings, dict):
        label_embeddings = np.array(list(label_embeddings.values()))
    elif isinstance(label_embeddings, list):
        label_embeddings = np.array(label_embeddings)

    print(f"로드된 라벨 임베딩 형태: {label_embeddings.shape}")

    similarity_matrix = analyze_label_similarity(label_embeddings, label_mapping)
    corr_matrix = analyze_label_correlation(label_embeddings, label_mapping)
    embedding_norms = analyze_label_distribution(label_embeddings, label_mapping)
    distances, dot_products, embedding_variance = analyze_independence_metrics(label_embeddings, label_mapping)
    visualize_label_embeddings(label_embeddings, label_mapping)

    print("\n=== 종합 독립성 평가 ===")

    # similarity_matrix comes back with a zeroed diagonal, so this mean
    # reflects only cross-label similarity.
    avg_similarity = np.mean(similarity_matrix)
    if avg_similarity < 0.1:
        similarity_score = "매우 좋음"
    elif avg_similarity < 0.3:
        similarity_score = "좋음"
    elif avg_similarity < 0.5:
        similarity_score = "보통"
    else:
        similarity_score = "개선 필요"

    avg_dot_product = np.mean(dot_products)
    if avg_dot_product < 0.1:
        orthogonality_score = "매우 좋음"
    elif avg_dot_product < 0.3:
        orthogonality_score = "좋음"
    elif avg_dot_product < 0.5:
        orthogonality_score = "보통"
    else:
        orthogonality_score = "개선 필요"

    print(f"라벨 간 유사도 평가: {similarity_score} (평균 유사도: {avg_similarity:.4f})")
    print(f"라벨 직교성 평가: {orthogonality_score} (평균 내적: {avg_dot_product:.4f})")

    good_scores = ("매우 좋음", "좋음")
    if similarity_score in good_scores and orthogonality_score in good_scores:
        print("전체 평가: 라벨과 입력 데이터 간의 독립성이 양호합니다.")
    elif similarity_score != "개선 필요" and orthogonality_score != "개선 필요":
        # BUG FIX: the original middle branch required BOTH scores to be
        # exactly "보통", so a mix such as 좋음/보통 incorrectly fell through
        # to the "needs improvement" verdict.
        print("전체 평가: 라벨과 입력 데이터 간의 독립성이 보통 수준입니다.")
    else:
        print("전체 평가: 라벨과 입력 데이터 간의 독립성 개선이 필요합니다.")
|
|
|
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|