Spaces:
Sleeping
Sleeping
File size: 5,889 Bytes
cebf2e3 cf65513 bb68168 cf65513 bb68168 cf65513 bb68168 cf65513 bb68168 cf65513 bb68168 cf65513 bb68168 cf65513 bb68168 cf65513 bb68168 cf65513 bb68168 cf65513 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
# file location: semantix-api/services/visualization_service.py
import numpy as np
import umap # pip install umap-learn
from loguru import logger
class VisualizationService:
    """Builds 3D scatter-plot data for a word-guessing game.

    Projects word embeddings into 3D with UMAP (falling back to a simple
    random layout when there are too few points for a neighbor graph) and
    colors each guess by its similarity to the hidden target word.
    """

    def __init__(self, word_service):
        # word_service must provide get_vector(word) -> np.ndarray | None
        # and calculate_similarity(word_a, word_b) -> float.
        self.word_service = word_service

    def _compute_color(self, similarity: float) -> str:
        """
        Given a similarity in [0,1], return an RGB color from blue (0) to red (1).
        """
        # Clamp similarity to [0,1] just in case a caller passes values outside.
        sim = max(0.0, min(1.0, similarity))
        # Simple linear gradient from blue (0,0,255) to red (255,0,0); green stays 0.
        r = int(sim * 255)
        b = int((1.0 - sim) * 255)
        return f"rgb({r}, 0, {b})"

    def _make_node(self, word: str, coords, *, is_target: bool, similarity: float) -> dict:
        """Build one scatter-plot node dict for the front end.

        The target's word is masked as "???" so the answer is never leaked
        to the client, and the target is always drawn pure red.
        """
        return {
            # asarray(...).tolist() guarantees plain Python floats (JSON-safe)
            # whether coords is an ndarray row or a plain list.
            'word': "???" if is_target else word,
            'coordinates': np.asarray(coords, dtype=float).tolist(),
            'is_target': is_target,
            'similarity': similarity,
            'color': 'rgb(255, 0, 0)' if is_target else self._compute_color(similarity),
        }

    def prepare_3d_visualization(self, target_word: str, guessed_words: list[str]):
        """Return plot nodes for the target word and every guess with a usable vector.

        Each node is a dict with 'word', 'coordinates' (3 floats), 'is_target',
        'similarity' and 'color'. Uses UMAP when at least 5 points are
        available; otherwise (or on any failure) falls back to a simple layout.
        """
        try:
            target_embedding = self.word_service.get_vector(target_word)
            if target_embedding is None:
                # No vector for the target at all: degenerate single-node plot.
                return self._simple_fallback(target_word, [], [])

            embeddings = [target_embedding]
            valid_words = [target_word]
            for word in guessed_words:
                vec = self.word_service.get_vector(word)
                # Skip unknown words and all-zero vectors (no usable direction
                # under the cosine metric).
                if vec is not None and not np.all(vec == 0):
                    embeddings.append(vec)
                    valid_words.append(word)

            # UMAP needs a handful of points to build a neighbor graph;
            # below this threshold use the simple layout instead.
            if len(embeddings) < 5:
                return self._simple_fallback(target_word, valid_words, embeddings)

            embeddings_array = np.array(embeddings)
            reducer = umap.UMAP(
                # The >=5 guard above guarantees len - 1 >= 4, so a full 3
                # components and 3 neighbors are always valid here
                # (UMAP requires n_neighbors < n_samples).
                n_components=3,
                n_neighbors=min(3, len(embeddings) - 1),
                min_dist=0.1,
                metric='cosine',
                random_state=42,
                low_memory=True,
                n_epochs=None,   # let UMAP pick a sensible default
                init='random',   # spectral init can fail on tiny datasets
            )
            try:
                embedding_3d = reducer.fit_transform(embeddings_array)
            except (ValueError, TypeError) as e:
                logger.warning(f"UMAP failed: {str(e)}, falling back to simple visualization")
                return self._simple_fallback(target_word, valid_words, embeddings)

            # Center on the target. Out-of-place subtraction: the in-place
            # form (`-=`) would subtract a row view of the very array being
            # mutated, which only works thanks to numpy's overlap detection.
            embedding_3d = embedding_3d - embedding_3d[0]
            # Scale all coordinates into [-1, 1].
            max_dist = np.max(np.abs(embedding_3d))
            if max_dist > 0:
                embedding_3d = embedding_3d / max_dist

            result = []
            for i, word in enumerate(valid_words):
                sim = 1.0 if i == 0 else self.word_service.calculate_similarity(target_word, word)
                result.append(
                    self._make_node(word, embedding_3d[i], is_target=(i == 0), similarity=sim)
                )
            return result
        except Exception as e:
            # Last-resort boundary: a visualization error must never break the caller.
            logger.exception(f"Error in visualization: {str(e)}")
            return self._simple_fallback(target_word, [], [])

    def _simple_fallback(self, target_word: str, valid_words: list[str], embeddings: list[np.ndarray]):
        """
        Return a minimal 3D layout without UMAP
        when the dataset is too small to form a manifold.
        """
        # Only the target (or nothing usable): a single node at the origin.
        if len(embeddings) <= 1:
            return [self._make_node(target_word, [0, 0, 0], is_target=True, similarity=1.0)]

        # At least 2 points (target + 1 guess): target at the origin, guesses
        # jittered around it. Seeded local Generator so the layout is
        # reproducible across calls, consistent with UMAP's fixed random_state.
        rng = np.random.default_rng(42)
        coords = rng.standard_normal((len(embeddings), 3)) * 0.1
        coords[0] = [0, 0, 0]

        result = []
        for i, word in enumerate(valid_words):
            sim = 1.0 if i == 0 else self.word_service.calculate_similarity(target_word, word)
            result.append(self._make_node(word, coords[i], is_target=(i == 0), similarity=sim))
        return result
|