# file location: semantix-api/services/visualization_service.py import numpy as np import umap # pip install umap-learn from loguru import logger class VisualizationService: def __init__(self, word_service): self.word_service = word_service def _compute_color(self, similarity: float) -> str: """ Given a similarity in [0,1], return an RGB color from blue (0) to red (1). """ # Clamp similarity to [0,1] just in case sim = max(0.0, min(1.0, similarity)) # Simple gradient from blue (0,0,255) to red (255,0,0) r = int(sim * 255) g = 0 b = int((1.0 - sim) * 255) return f"rgb({r}, {g}, {b})" def prepare_3d_visualization(self, target_word: str, guessed_words: list[str]): try: embeddings = [] valid_words = [] target_embedding = self.word_service.get_vector(target_word) if target_embedding is None: return self._simple_fallback(target_word, [], []) embeddings.append(target_embedding) valid_words.append(target_word) for word in guessed_words: vec = self.word_service.get_vector(word) if vec is not None and not np.all(vec == 0): embeddings.append(vec) valid_words.append(word) # Require more points for UMAP if len(embeddings) < 5: # Increased minimum points return self._simple_fallback(target_word, valid_words, embeddings) # Otherwise, do UMAP with adjusted parameters embeddings_array = np.array(embeddings) # Adjust n_neighbors based on dataset size n_neighbors = min(3, len(embeddings) - 1) n_components = min(3, len(embeddings) - 1) reducer = umap.UMAP( n_components=n_components, n_neighbors=n_neighbors, min_dist=0.1, metric='cosine', random_state=42, # Add these parameters to handle small datasets low_memory=True, n_epochs=None, # Let UMAP decide init='random' # Use random initialization instead of spectral ) try: embedding_3d = reducer.fit_transform(embeddings_array) # If we got fewer dimensions, pad with zeros if embedding_3d.shape[1] < 3: padding = np.zeros((embedding_3d.shape[0], 3 - embedding_3d.shape[1])) embedding_3d = np.hstack([embedding_3d, padding]) except (ValueError, TypeError) as e: logger.warning(f"UMAP failed: {str(e)}, falling back to simple visualization") return self._simple_fallback(target_word, valid_words, embeddings) # Center and scale the embeddings embedding_3d -= embedding_3d[0] # Center on target max_dist = np.max(np.abs(embedding_3d)) if max_dist > 0: embedding_3d *= (1.0 / max_dist) # Scale to [-1,1] result = [] for i, word in enumerate(valid_words): if i == 0: result.append({ 'word': "???", 'coordinates': embedding_3d[i].tolist(), 'is_target': True, 'similarity': 1.0, 'color': 'rgb(255, 0, 0)' }) else: sim = self.word_service.calculate_similarity(target_word, word) color = self._compute_color(sim) result.append({ 'word': word, 'coordinates': embedding_3d[i].tolist(), 'is_target': False, 'similarity': sim, 'color': color }) return result except Exception as e: logger.exception(f"Error in visualization: {str(e)}") return self._simple_fallback(target_word, [], []) def _simple_fallback(self, target_word: str, valid_words: list[str], embeddings: list[np.ndarray]): """ Return a minimal 3D layout without UMAP when the dataset is too small to form a manifold. """ # If there's only the target, just place it at the origin. if len(embeddings) <= 1: return [{ 'word': "???", 'coordinates': [0, 0, 0], 'is_target': True, 'similarity': 1.0, 'color': 'rgb(255, 0, 0)' }] # We have at least 2 points (target + 1 guess) coords = np.random.randn(len(embeddings), 3) * 0.1 coords[0] = [0, 0, 0] # target at origin result = [] for i, word in enumerate(valid_words): if i == 0: # target result.append({ 'word': "???", 'coordinates': coords[i].tolist(), 'is_target': True, 'similarity': 1.0, 'color': 'rgb(255, 0, 0)' }) else: sim = self.word_service.calculate_similarity(target_word, word) color = self._compute_color(sim) result.append({ 'word': word, 'coordinates': coords[i].tolist(), 'is_target': False, 'similarity': sim, 'color': color }) return result