Spaces:
Running
Running
File size: 4,973 Bytes
23680f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import geomstats.backend as gs # type: ignore
from geomstats.geometry.poincare_ball import PoincareBall
def generate_data(n_samples, center_r, center_theta, spread, label, *, rng=None):
    """Sample a noisy 2-D cluster around a center given in polar coordinates.

    Gaussian noise with standard deviation ``spread`` is added independently
    to the radius and to the angle, and the result is converted to Cartesian
    coordinates.

    Args:
        n_samples: Number of points to draw.
        center_r: Radius of the cluster center.
        center_theta: Angle of the cluster center, in radians.
        spread: Standard deviation of the noise applied to both the radius
            and the angle.
        label: Returned unchanged as the second element of the result.
        rng: Optional object exposing ``normal(loc, scale, size)`` (for
            example a ``numpy.random.Generator``) for reproducible draws.
            When omitted, NumPy's global random state is used.

    Returns:
        A ``(points, label)`` tuple where ``points`` has shape
        ``(n_samples, 2)`` and every point lies within radius 0.99 of the
        origin.
    """
    sampler = np.random if rng is None else rng

    # Radius noise is drawn first, then angle noise.
    r_noise = sampler.normal(0, spread, n_samples)
    theta_noise = sampler.normal(0, spread, n_samples)

    # Clip the *magnitude* of the radius rather than flooring it at zero.
    # A negative radius is simply the point mirrored through the origin, so
    # it is still a valid position inside the disk.  Flooring at zero would
    # pile every negative sample onto the exact origin (about half of the
    # points for a cluster centered at r = 0).
    r = np.clip(center_r + r_noise, -0.99, 0.99)
    theta = center_theta + theta_noise

    # Convert to Cartesian for plotting.
    x = r * np.cos(theta)
    y = r * np.sin(theta)
    return np.column_stack([x, y]), label
def main():
    """Render the Euclidean-vs-hyperbolic comparison figure and save it.

    Three synthetic clusters (majority, minority, rare) are generated twice:
    once with the radii used for the "Euclidean" panel and once with the
    radii used for the "Poincaré disk" panel.  Both panels are drawn side by
    side, the hyperbolic distance between the minority and rare cluster
    centers is computed with geomstats and shown in an annotation, and the
    figure is written to ``assets/bias_collapse.png``.
    """
    import os  # Local import: only needed to prepare the output directory.

    print("Generating synthetic hierarchical data...")

    # 1. Define the Hierarchy
    # Majority: Center of the space
    # Minority: Periphery
    # Rare: Deep Periphery (Sub-group of Minority)

    # Parameters
    majority_n = 500
    minority_n = 50
    rare_n = 10

    # Angle for the minority group
    theta_minority = np.pi / 4  # 45 degrees
    cos_t = np.cos(theta_minority)
    sin_t = np.sin(theta_minority)

    # Cluster-center radii, named once so the scatter data, the geodesic
    # distance and the annotation arrows all refer to the same positions.
    euc_minority_r = 0.5
    euc_rare_r = 0.52
    hyp_minority_r = 0.8
    hyp_rare_r = 0.95

    # --- Euclidean Simulation ---
    # In Euclidean space, we simulate "crowding" by placing them close together
    # because the space doesn't expand.
    # Majority at (0,0)
    maj_euc, _ = generate_data(majority_n, 0.0, 0.0, 0.1, "Majority")
    # Minority at distance 0.5
    min_euc, _ = generate_data(minority_n, euc_minority_r, theta_minority, 0.08, "Minority")
    # Rare at distance 0.52 (Buried inside the Minority cluster in Euclidean terms)
    rare_euc, _ = generate_data(rare_n, euc_rare_r, theta_minority, 0.04, "Rare")

    # --- Hyperbolic Simulation ---
    # In Hyperbolic space, we map these same "conceptual" positions to the Poincaré disk.
    # The "Rare" group is pushed further out to the edge (r=0.95).
    # The "Minority" group is at r=0.8.
    # The "Majority" is at r=0.0.
    # Majority at center
    maj_hyp, _ = generate_data(majority_n, 0.0, 0.0, 0.1, "Majority")
    # Minority at r=0.8
    min_hyp, _ = generate_data(minority_n, hyp_minority_r, theta_minority, 0.05, "Minority")
    # Rare at r=0.95 (Deep in the hyperbolic tail)
    rare_hyp, _ = generate_data(rare_n, hyp_rare_r, theta_minority, 0.01, "Rare")

    # --- Visualization ---
    fig, axes = plt.subplots(1, 2, figsize=(16, 8))

    # Plot 1: Euclidean View
    ax = axes[0]
    ax.set_title("Euclidean Space (Standard AI)\n'Representation Collapse'", fontsize=14, fontweight='bold')
    # Draw Unit Circle for reference
    circle = mpatches.Circle((0, 0), 1, color='black', fill=False, linestyle='--', alpha=0.3)
    ax.add_artist(circle)
    ax.scatter(maj_euc[:, 0], maj_euc[:, 1], c='gray', alpha=0.3, label='Majority (Head)', s=20)
    ax.scatter(min_euc[:, 0], min_euc[:, 1], c='blue', alpha=0.6, label='Minority (Tail)', s=40)
    ax.scatter(rare_euc[:, 0], rare_euc[:, 1], c='red', alpha=0.9, label='Rare Subgroup (Long Tail)', s=40)
    # Annotate the "Crush".  The arrow tip is the Cartesian position of the
    # rare-cluster center (which sits inside the minority cluster); a tip
    # such as (0.55, 0.55) would be at radius ~0.78, well past both clusters.
    ax.annotate('Indistinguishable\nCluster',
                xy=(euc_rare_r * cos_t, euc_rare_r * sin_t), xytext=(0.2, 0.7),
                arrowprops=dict(facecolor='black', shrink=0.05), fontsize=12)
    ax.set_xlim(-1.1, 1.1)
    ax.set_ylim(-1.1, 1.1)
    ax.set_aspect('equal')
    ax.legend(loc='lower right')
    ax.grid(True, alpha=0.2)

    # Plot 2: Hyperbolic View
    ax = axes[1]
    ax.set_title("Hyperbolic Space (HyperView)\n'Hierarchical Expansion'", fontsize=14, fontweight='bold')
    # Draw Poincaré Disk Boundary
    circle = mpatches.Circle((0, 0), 1, color='black', fill=False, linewidth=2)
    ax.add_artist(circle)
    ax.scatter(maj_hyp[:, 0], maj_hyp[:, 1], c='gray', alpha=0.3, label='Majority', s=20)
    ax.scatter(min_hyp[:, 0], min_hyp[:, 1], c='blue', alpha=0.6, label='Minority', s=40)
    ax.scatter(rare_hyp[:, 0], rare_hyp[:, 1], c='red', alpha=0.9, label='Rare Subgroup', s=40)

    # Calculate Geodesic Distance (Visual representation)
    # We use geomstats to calculate the actual hyperbolic distance between the centers
    manifold = PoincareBall(2)
    p_min = gs.array([hyp_minority_r * cos_t, hyp_minority_r * sin_t])
    p_rare = gs.array([hyp_rare_r * cos_t, hyp_rare_r * sin_t])
    dist = manifold.metric.dist(p_min, p_rare)

    # Annotate the Expansion.  The arrow tip is the rare-cluster center; a
    # tip such as (0.85, 0.85) would be at radius ~1.2, outside the disk.
    ax.annotate(f'Hyperbolic Dist: {dist:.2f}\n(Distinct & Separable)',
                xy=(hyp_rare_r * cos_t, hyp_rare_r * sin_t), xytext=(0.2, 0.8),
                arrowprops=dict(facecolor='black', shrink=0.05), fontsize=12)
    ax.set_xlim(-1.1, 1.1)
    ax.set_ylim(-1.1, 1.1)
    ax.set_aspect('equal')
    ax.legend(loc='lower right')
    ax.axis('off')  # Hide grid for cleaner Poincaré look

    plt.tight_layout()
    output_path = 'assets/bias_collapse.png'
    # savefig does not create missing parent directories, so make sure the
    # target folder exists before writing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    plt.savefig(output_path, dpi=300)
    print(f"Visualization saved to {output_path}")
# Build and save the figure only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|