# model_v24/label_independence_analysis.py
# (Hugging Face viewer metadata removed: "Upload 11 files", commit 4c77a3a)
import pickle
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.stats import pearsonr, spearmanr
import warnings
# Silence library warnings so the console report stays readable.
warnings.filterwarnings('ignore')

# Plot font setup: DejaVu Sans, and render the minus sign as an ASCII hyphen
# (avoids missing-glyph boxes with some fonts).
plt.rcParams.update({
    'font.family': 'DejaVu Sans',
    'axes.unicode_minus': False,
})
def load_label_data():
    """Load the label mapping (JSON) and label embeddings (pickle) from disk.

    Returns:
        tuple: (label_mapping dict, label_embeddings as stored in the pickle).
    """
    with open('label_mapping.json', 'r', encoding='utf-8') as fp:
        mapping = json.load(fp)
    # NOTE(review): pickle.load is unsafe on untrusted files — this assumes the
    # artifact is a trusted local file produced by this project.
    with open('label_embeddings.pkl', 'rb') as fp:
        embeddings = pickle.load(fp)
    return mapping, embeddings
def analyze_label_similarity(label_embeddings, label_mapping):
    """Analyze pairwise cosine similarity between label embeddings.

    Prints similarity statistics over the OFF-diagonal entries only (the
    diagonal is the self-similarity, identically 1.0, and would inflate the
    mean/std/min/max — this was a bug in the original version), then reports
    the most similar label pair.

    Args:
        label_embeddings: (n_labels, dim) numpy array.
        label_mapping: dict whose key order is assumed to match the embedding
            row order — TODO confirm against the producer of both files.

    Returns:
        (n_labels, n_labels) cosine-similarity matrix with the diagonal
        zeroed (callers average it, so self-pairs are deliberately excluded).
    """
    print("=== ๋ผ๋ฒจ ๊ฐ„ ์œ ์‚ฌ๋„ ๋ถ„์„ ===")
    # Cosine similarity computed directly with numpy (row-normalize, then
    # dot) — numerically equivalent to sklearn's cosine_similarity but
    # without the extra dependency; zero-norm rows are guarded (treated as
    # similarity 0 instead of producing NaN).
    norms = np.linalg.norm(label_embeddings, axis=1, keepdims=True)
    norms = np.where(norms == 0, 1.0, norms)
    unit = label_embeddings / norms
    similarity_matrix = unit @ unit.T

    # Off-diagonal entries only: exclude self-similarity from the stats.
    off_diag = similarity_matrix[~np.eye(len(similarity_matrix), dtype=bool)]
    print(f"๋ผ๋ฒจ ๊ฐœ์ˆ˜: {len(label_embeddings)}")
    print(f"์ž„๋ฒ ๋”ฉ ์ฐจ์›: {label_embeddings.shape[1]}")
    print(f"ํ‰๊ท  ์œ ์‚ฌ๋„: {np.mean(off_diag):.4f}")
    print(f"์œ ์‚ฌ๋„ ํ‘œ์ค€ํŽธ์ฐจ: {np.std(off_diag):.4f}")
    print(f"์ตœ๋Œ€ ์œ ์‚ฌ๋„: {np.max(off_diag):.4f}")
    print(f"์ตœ์†Œ ์œ ์‚ฌ๋„: {np.min(off_diag):.4f}")

    # Zero the diagonal so argmax cannot pick a self-pair (and so the
    # returned matrix keeps the original contract expected by main()).
    np.fill_diagonal(similarity_matrix, 0)
    max_sim_idx = np.unravel_index(np.argmax(similarity_matrix), similarity_matrix.shape)
    max_sim_value = similarity_matrix[max_sim_idx]
    labels = list(label_mapping.keys())
    print(f"\n๊ฐ€์žฅ ์œ ์‚ฌํ•œ ๋ผ๋ฒจ ์Œ:")
    print(f" {labels[max_sim_idx[0]]} <-> {labels[max_sim_idx[1]]}: {max_sim_value:.4f}")
    return similarity_matrix
def analyze_label_correlation(label_embeddings, label_mapping):
    """Analyze Pearson correlation between embedding DIMENSIONS (features).

    `np.corrcoef` is applied to the transpose, so the result is a
    (dim, dim) feature-feature correlation matrix, not label-label.

    Fixes vs. the original:
      * summary stats exclude the diagonal (identically 1.0, which
        inflated the mean/max);
      * the high-correlation pair count uses the upper triangle only —
        `np.where` over the full symmetric matrix counted every pair
        twice, as (i, j) and (j, i).

    Args:
        label_embeddings: (n_labels, dim) numpy array.
        label_mapping: unused here; kept for a signature parallel to the
            sibling analysis functions.

    Returns:
        (dim, dim) correlation matrix with the diagonal zeroed.
    """
    print("\n=== ๋ผ๋ฒจ ์ž„๋ฒ ๋”ฉ ์ƒ๊ด€๊ด€๊ณ„ ๋ถ„์„ ===")
    # NOTE(review): a constant dimension yields NaN rows here (zero
    # variance) — assumed not to occur in real embeddings; verify upstream.
    corr_matrix = np.corrcoef(label_embeddings.T)

    off_diag = corr_matrix[~np.eye(len(corr_matrix), dtype=bool)]
    print(f"ํ‰๊ท  ์ƒ๊ด€๊ณ„์ˆ˜: {np.mean(off_diag):.4f}")
    print(f"์ƒ๊ด€๊ณ„์ˆ˜ ํ‘œ์ค€ํŽธ์ฐจ: {np.std(off_diag):.4f}")
    print(f"์ตœ๋Œ€ ์ƒ๊ด€๊ณ„์ˆ˜: {np.max(off_diag):.4f}")
    print(f"์ตœ์†Œ ์ƒ๊ด€๊ณ„์ˆ˜: {np.min(off_diag):.4f}")

    np.fill_diagonal(corr_matrix, 0)
    high_corr_threshold = 0.8
    # Upper triangle (k=1): each unordered feature pair counted exactly once.
    iu, ju = np.triu_indices_from(corr_matrix, k=1)
    n_high = int(np.sum(np.abs(corr_matrix[iu, ju]) > high_corr_threshold))
    print(f"\n๋†’์€ ์ƒ๊ด€๊ด€๊ณ„ (|r| > {high_corr_threshold})๋ฅผ ๊ฐ€์ง„ ํŠน์„ฑ ์Œ ์ˆ˜: {n_high}")
    return corr_matrix
def analyze_label_distribution(label_embeddings, label_mapping):
    """Report L2-norm (magnitude) statistics of the label embeddings.

    Args:
        label_embeddings: (n_labels, dim) numpy array.
        label_mapping: dict whose key order is assumed to match the
            embedding row order — TODO confirm.

    Returns:
        1-D array of per-label embedding norms.
    """
    print("\n=== ๋ผ๋ฒจ ๋ถ„ํฌ ๋ถ„์„ ===")
    norms = np.linalg.norm(label_embeddings, axis=1)
    print("์ž„๋ฒ ๋”ฉ ํฌ๊ธฐ ํ†ต๊ณ„:")
    print(f" ํ‰๊ท : {norms.mean():.4f}")
    print(f" ํ‘œ์ค€ํŽธ์ฐจ: {norms.std():.4f}")
    print(f" ์ตœ๋Œ€: {norms.max():.4f}")
    print(f" ์ตœ์†Œ: {norms.min():.4f}")

    # Labels with the largest / smallest embedding magnitude.
    names = list(label_mapping.keys())
    biggest = int(norms.argmax())
    smallest = int(norms.argmin())
    print(f"\n๊ฐ€์žฅ ํฐ ์ž„๋ฒ ๋”ฉ ํฌ๊ธฐ: {names[biggest]} ({norms[biggest]:.4f})")
    print(f"๊ฐ€์žฅ ์ž‘์€ ์ž„๋ฒ ๋”ฉ ํฌ๊ธฐ: {names[smallest]} ({norms[smallest]:.4f})")
    return norms
def visualize_label_embeddings(label_embeddings, label_mapping):
    """Project the embeddings to 2-D with PCA and t-SNE, plot side by side.

    Saves the figure as 'label_embeddings_visualization.png' and shows it.

    Args:
        label_embeddings: (n_labels, dim) numpy array.
        label_mapping: unused here; kept for signature parallelism with the
            sibling analysis functions.
    """
    print("\n=== ๋ผ๋ฒจ ์ž„๋ฒ ๋”ฉ ์‹œ๊ฐํ™” ===")
    # PCA projection.
    reducer = PCA(n_components=2)
    pca_xy = reducer.fit_transform(label_embeddings)
    # t-SNE projection; perplexity must be < n_samples, hence the cap.
    perplexity = min(30, len(label_embeddings) - 1)
    tsne_xy = TSNE(n_components=2, random_state=42,
                   perplexity=perplexity).fit_transform(label_embeddings)

    fig, (ax_pca, ax_tsne) = plt.subplots(1, 2, figsize=(15, 6))

    ax_pca.scatter(pca_xy[:, 0], pca_xy[:, 1], alpha=0.7)
    ax_pca.set_title('PCA Visualization of Label Embeddings')
    ax_pca.set_xlabel(f'PC1 ({reducer.explained_variance_ratio_[0]:.2%} variance)')
    ax_pca.set_ylabel(f'PC2 ({reducer.explained_variance_ratio_[1]:.2%} variance)')

    ax_tsne.scatter(tsne_xy[:, 0], tsne_xy[:, 1], alpha=0.7)
    ax_tsne.set_title('t-SNE Visualization of Label Embeddings')
    ax_tsne.set_xlabel('t-SNE 1')
    ax_tsne.set_ylabel('t-SNE 2')

    plt.tight_layout()
    plt.savefig('label_embeddings_visualization.png', dpi=300, bbox_inches='tight')
    plt.show()

    print(f"PCA ์„ค๋ช…๋œ ๋ถ„์‚ฐ ๋น„์œจ: {reducer.explained_variance_ratio_[:2]}")
    print(f"์ด ์„ค๋ช…๋œ ๋ถ„์‚ฐ ๋น„์œจ: {np.sum(reducer.explained_variance_ratio_[:2]):.4f}")
def analyze_independence_metrics(label_embeddings, label_mapping):
    """Compute independence indicators for the label embeddings.

    Metrics:
      1. pairwise Euclidean distances between labels;
      2. orthogonality — |cosine| between unit-normalized embeddings;
      3. per-dimension variance of the embeddings.

    Fixes vs. the original: the O(n^2) Python pair loops are vectorized
    with numpy (same pair order, i < j row-major), and an all-zero
    embedding row no longer causes a divide-by-zero / NaN during
    normalization (it contributes 0 to every dot product instead).

    Args:
        label_embeddings: (n_labels, dim) numpy array.
        label_mapping: unused here; kept for signature parallelism.

    Returns:
        (distances, dot_products, embedding_variance) where the first two
        are lists (matching the original interface) and the last is a
        1-D per-dimension variance array.
    """
    print("\n=== ๋…๋ฆฝ์„ฑ ์ง€ํ‘œ ๋ถ„์„ ===")
    n = len(label_embeddings)
    # Upper-triangle indices enumerate each unordered pair once, in the
    # same (i, then j > i) order the original nested loops produced.
    iu, ju = np.triu_indices(n, k=1)

    # 1. Pairwise Euclidean distances.
    distances = list(np.linalg.norm(label_embeddings[iu] - label_embeddings[ju], axis=1))
    print(f"๋ผ๋ฒจ ๊ฐ„ ํ‰๊ท  ๊ฑฐ๋ฆฌ: {np.mean(distances):.4f}")
    print(f"๋ผ๋ฒจ ๊ฐ„ ๊ฑฐ๋ฆฌ ํ‘œ์ค€ํŽธ์ฐจ: {np.std(distances):.4f}")

    # 2. Orthogonality: absolute cosine between normalized embeddings.
    norms = np.linalg.norm(label_embeddings, axis=1, keepdims=True)
    norms = np.where(norms == 0, 1.0, norms)  # guard zero-norm rows
    unit = label_embeddings / norms
    cosines = unit @ unit.T
    dot_products = list(np.abs(cosines[iu, ju]))
    print(f"ํ‰๊ท  ๋‚ด์  ํฌ๊ธฐ: {np.mean(dot_products):.4f}")
    print(f"๋‚ด์  ํฌ๊ธฐ ํ‘œ์ค€ํŽธ์ฐจ: {np.std(dot_products):.4f}")

    # 3. Per-dimension variance of the embeddings.
    embedding_variance = np.var(label_embeddings, axis=0)
    print(f"์ž„๋ฒ ๋”ฉ ์ฐจ์›๋ณ„ ๋ถ„์‚ฐ ํ‰๊ท : {np.mean(embedding_variance):.4f}")
    print(f"์ž„๋ฒ ๋”ฉ ์ฐจ์›๋ณ„ ๋ถ„์‚ฐ ํ‘œ์ค€ํŽธ์ฐจ: {np.std(embedding_variance):.4f}")
    return distances, dot_products, embedding_variance
def main():
    """Run the full label-independence analysis pipeline end to end."""
    print("๋ผ๋ฒจ๊ณผ ์ž…๋ ฅ ๋ฐ์ดํ„ฐ ๊ฐ„์˜ ๋…๋ฆฝ์„ฑ ๋ถ„์„์„ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค...")

    label_mapping, label_embeddings = load_label_data()

    # Normalize the embedding container to one numpy array: the pickle may
    # hold a dict (label -> vector) or a plain list of vectors.
    if isinstance(label_embeddings, dict):
        label_embeddings = np.array(list(label_embeddings.values()))
    elif isinstance(label_embeddings, list):
        label_embeddings = np.array(label_embeddings)
    print(f"๋กœ๋“œ๋œ ๋ผ๋ฒจ ์ž„๋ฒ ๋”ฉ ํ˜•ํƒœ: {label_embeddings.shape}")

    # Run each analysis stage.
    similarity_matrix = analyze_label_similarity(label_embeddings, label_mapping)
    corr_matrix = analyze_label_correlation(label_embeddings, label_mapping)
    embedding_norms = analyze_label_distribution(label_embeddings, label_mapping)
    distances, dot_products, embedding_variance = analyze_independence_metrics(
        label_embeddings, label_mapping)
    visualize_label_embeddings(label_embeddings, label_mapping)

    # Final combined verdict.
    print("\n=== ์ข…ํ•ฉ ๋…๋ฆฝ์„ฑ ํ‰๊ฐ€ ===")

    def grade(value):
        # Shared rating ladder for both metrics (lower is better).
        if value < 0.1:
            return "๋งค์šฐ ์ข‹์Œ"
        if value < 0.3:
            return "์ข‹์Œ"
        if value < 0.5:
            return "๋ณดํ†ต"
        return "๊ฐœ์„  ํ•„์š”"

    avg_similarity = np.mean(similarity_matrix)
    similarity_score = grade(avg_similarity)
    avg_dot_product = np.mean(dot_products)
    orthogonality_score = grade(avg_dot_product)

    print(f"๋ผ๋ฒจ ๊ฐ„ ์œ ์‚ฌ๋„ ํ‰๊ฐ€: {similarity_score} (ํ‰๊ท  ์œ ์‚ฌ๋„: {avg_similarity:.4f})")
    print(f"๋ผ๋ฒจ ์ง๊ต์„ฑ ํ‰๊ฐ€: {orthogonality_score} (ํ‰๊ท  ๋‚ด์ : {avg_dot_product:.4f})")

    good = ("๋งค์šฐ ์ข‹์Œ", "์ข‹์Œ")
    if similarity_score in good and orthogonality_score in good:
        print("์ „์ฒด ํ‰๊ฐ€: ๋ผ๋ฒจ๊ณผ ์ž…๋ ฅ ๋ฐ์ดํ„ฐ ๊ฐ„์˜ ๋…๋ฆฝ์„ฑ์ด ์–‘ํ˜ธํ•ฉ๋‹ˆ๋‹ค.")
    elif similarity_score == "๋ณดํ†ต" and orthogonality_score == "๋ณดํ†ต":
        print("์ „์ฒด ํ‰๊ฐ€: ๋ผ๋ฒจ๊ณผ ์ž…๋ ฅ ๋ฐ์ดํ„ฐ ๊ฐ„์˜ ๋…๋ฆฝ์„ฑ์ด ๋ณดํ†ต ์ˆ˜์ค€์ž…๋‹ˆ๋‹ค.")
    else:
        print("์ „์ฒด ํ‰๊ฐ€: ๋ผ๋ฒจ๊ณผ ์ž…๋ ฅ ๋ฐ์ดํ„ฐ ๊ฐ„์˜ ๋…๋ฆฝ์„ฑ ๊ฐœ์„ ์ด ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค.")


if __name__ == "__main__":
    main()