# semantix-api/services/visualization_service.py
# (commit bb68168: "modified viz to handle small set of data points for center")
# file location: semantix-api/services/visualization_service.py
import numpy as np
import umap # pip install umap-learn
from loguru import logger
class VisualizationService:
    """Build 3D scatter-plot data for a target word and its guesses.

    Each point is a dict with keys ``word``, ``coordinates`` (3 floats),
    ``is_target``, ``similarity`` and ``color``.  Coordinates come from a
    UMAP projection of the word embeddings when enough points are
    available, otherwise from a simple random fallback layout with the
    target at the origin.
    """

    def __init__(self, word_service):
        # word_service must expose get_vector(word) -> np.ndarray | None
        # and calculate_similarity(word_a, word_b) -> float.
        self.word_service = word_service

    def _compute_color(self, similarity: float) -> str:
        """Map a similarity in [0, 1] to an RGB gradient from blue to red."""
        # Clamp out-of-range inputs (cosine similarity can be negative).
        sim = max(0.0, min(1.0, similarity))
        r = int(sim * 255)
        b = int((1.0 - sim) * 255)
        return f"rgb({r}, 0, {b})"

    def _make_entry(self, target_word: str, word: str, coords, is_target: bool) -> dict:
        """Build one plot-point dict; the target word is masked as '???'."""
        coordinates = [float(c) for c in coords]
        if is_target:
            return {
                'word': "???",
                'coordinates': coordinates,
                'is_target': True,
                'similarity': 1.0,
                'color': 'rgb(255, 0, 0)',
            }
        sim = self.word_service.calculate_similarity(target_word, word)
        return {
            'word': word,
            'coordinates': coordinates,
            'is_target': False,
            'similarity': sim,
            'color': self._compute_color(sim),
        }

    def prepare_3d_visualization(self, target_word: str, guessed_words: list[str]):
        """Project the target word and guesses into 3D via UMAP.

        Falls back to :meth:`_simple_fallback` when the target has no
        vector, when fewer than 5 usable embeddings exist, or when UMAP
        itself fails.  Returns a list of plot-point dicts (target first).
        """
        try:
            target_embedding = self.word_service.get_vector(target_word)
            if target_embedding is None:
                return self._simple_fallback(target_word, [], [])

            embeddings = [target_embedding]
            valid_words = [target_word]
            for word in guessed_words:
                vec = self.word_service.get_vector(word)
                # Skip unknown words and degenerate all-zero vectors.
                if vec is not None and not np.all(vec == 0):
                    embeddings.append(vec)
                    valid_words.append(word)

            # UMAP needs a minimum number of points to form a manifold.
            if len(embeddings) < 5:
                return self._simple_fallback(target_word, valid_words, embeddings)

            embeddings_array = np.array(embeddings)
            # Keep the neighbourhood small so tiny datasets still work.
            n_neighbors = min(3, len(embeddings) - 1)
            n_components = min(3, len(embeddings) - 1)
            reducer = umap.UMAP(
                n_components=n_components,
                n_neighbors=n_neighbors,
                min_dist=0.1,
                metric='cosine',
                random_state=42,
                low_memory=True,     # helps on small/odd datasets
                n_epochs=None,       # let UMAP pick a sensible epoch count
                init='random',       # spectral init can fail on tiny datasets
            )
            try:
                embedding_3d = reducer.fit_transform(embeddings_array)
                # If UMAP returned fewer than 3 dims, pad with zero columns.
                if embedding_3d.shape[1] < 3:
                    padding = np.zeros((embedding_3d.shape[0], 3 - embedding_3d.shape[1]))
                    embedding_3d = np.hstack([embedding_3d, padding])
            except (ValueError, TypeError) as e:
                logger.warning(f"UMAP failed: {str(e)}, falling back to simple visualization")
                return self._simple_fallback(target_word, valid_words, embeddings)

            # Center on the target.  Use an out-of-place subtraction so the
            # result never depends on NumPy's overlapping-operand handling
            # (row 0 is a view into the array being modified).
            embedding_3d = embedding_3d - embedding_3d[0]
            max_dist = np.max(np.abs(embedding_3d))
            if max_dist > 0:
                embedding_3d /= max_dist  # scale into [-1, 1]

            return [
                self._make_entry(target_word, word, embedding_3d[i], i == 0)
                for i, word in enumerate(valid_words)
            ]
        except Exception as e:
            logger.exception(f"Error in visualization: {str(e)}")
            return self._simple_fallback(target_word, [], [])

    def _simple_fallback(self, target_word: str, valid_words: list[str], embeddings: list):
        """Minimal 3D layout (no UMAP) for datasets too small for a manifold.

        The target sits at the origin; guesses get a small random jitter.
        """
        # Only the target (or nothing at all): a single point at the origin.
        if len(embeddings) <= 1:
            return [self._make_entry(target_word, target_word, [0, 0, 0], True)]

        # At least 2 points (target + >=1 guess): small random scatter.
        coords = np.random.randn(len(embeddings), 3) * 0.1
        coords[0] = [0, 0, 0]  # target pinned at origin
        return [
            self._make_entry(target_word, word, coords[i], i == 0)
            for i, word in enumerate(valid_words)
        ]