File size: 5,889 Bytes
cebf2e3
cf65513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb68168
cf65513
 
 
 
 
 
 
bb68168
cf65513
 
 
 
 
 
 
 
 
 
bb68168
 
cf65513
 
bb68168
cf65513
bb68168
 
 
 
cf65513
 
bb68168
 
cf65513
 
bb68168
 
 
 
 
cf65513
 
bb68168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf65513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb68168
 
 
cf65513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# file location: semantix-api/services/visualization_service.py

import numpy as np
import umap  # pip install umap-learn
from loguru import logger

class VisualizationService:
    """Builds 3D point layouts for a target word and its guessed words.

    High-dimensional embeddings from ``word_service`` are projected to 3D
    with UMAP; when too few usable embeddings exist (or UMAP fails), a
    simple random fallback layout is produced instead.
    """

    def __init__(self, word_service):
        # word_service is expected to provide:
        #   get_vector(word) -> np.ndarray | None
        #   calculate_similarity(word_a, word_b) -> float
        # (duck-typed; confirmed by the calls below)
        self.word_service = word_service

    def _compute_color(self, similarity: float) -> str:
        """Map a similarity in [0, 1] to an RGB color string.

        0 maps to pure blue, 1 to pure red; out-of-range values are clamped.
        """
        sim = max(0.0, min(1.0, similarity))
        r = int(sim * 255)
        b = int((1.0 - sim) * 255)
        return f"rgb({r}, 0, {b})"

    def _build_point(self, word: str, coordinates: list, is_target: bool,
                     similarity: float) -> dict:
        """Build one point dict for the frontend.

        The target word is masked as "???" (so the answer is never leaked to
        the client) and always drawn red with similarity 1.0.
        """
        if is_target:
            return {
                'word': "???",
                'coordinates': coordinates,
                'is_target': True,
                'similarity': 1.0,
                'color': 'rgb(255, 0, 0)'
            }
        return {
            'word': word,
            'coordinates': coordinates,
            'is_target': False,
            'similarity': similarity,
            'color': self._compute_color(similarity)
        }

    def _points_for(self, target_word: str, valid_words: list[str], coords) -> list[dict]:
        """Turn an (N, 3) coordinate array into point dicts.

        Index 0 is always the target; similarities for guesses are computed
        against the (unmasked) target word.
        """
        result = []
        for i, word in enumerate(valid_words):
            if i == 0:
                result.append(self._build_point(word, coords[i].tolist(), True, 1.0))
            else:
                sim = self.word_service.calculate_similarity(target_word, word)
                result.append(self._build_point(word, coords[i].tolist(), False, sim))
        return result

    def _run_umap(self, embeddings_array):
        """Reduce embeddings to 3D with UMAP.

        Returns an (N, 3) array, zero-padding extra columns if UMAP produced
        fewer than 3 components, or None if the reduction failed.
        """
        n_points = embeddings_array.shape[0]
        # Both parameters are capped by dataset size; UMAP requires
        # n_neighbors < n_points.
        n_neighbors = min(3, n_points - 1)
        n_components = min(3, n_points - 1)

        reducer = umap.UMAP(
            n_components=n_components,
            n_neighbors=n_neighbors,
            min_dist=0.1,
            metric='cosine',
            random_state=42,
            # Parameters chosen to behave on small datasets:
            low_memory=True,
            n_epochs=None,  # let UMAP decide
            init='random'   # random init avoids spectral-init failures on tiny inputs
        )

        try:
            embedding_3d = reducer.fit_transform(embeddings_array)
        except (ValueError, TypeError) as e:
            logger.warning(f"UMAP failed: {str(e)}, falling back to simple visualization")
            return None

        # Pad with zero columns if we got fewer than 3 dimensions back.
        if embedding_3d.shape[1] < 3:
            padding = np.zeros((embedding_3d.shape[0], 3 - embedding_3d.shape[1]))
            embedding_3d = np.hstack([embedding_3d, padding])
        return embedding_3d

    def prepare_3d_visualization(self, target_word: str, guessed_words: list[str]) -> list[dict]:
        """Project the target word and its guesses into 3D for display.

        Returns a list of point dicts (word, coordinates, is_target,
        similarity, color); the target is always first and masked as "???".
        Falls back to a simple layout when the target has no embedding, when
        fewer than 5 usable embeddings exist, or when UMAP fails.
        """
        try:
            embeddings = []
            valid_words = []

            target_embedding = self.word_service.get_vector(target_word)
            if target_embedding is None:
                return self._simple_fallback(target_word, [], [])

            embeddings.append(target_embedding)
            valid_words.append(target_word)

            # Keep only guesses with a known, non-zero embedding (an all-zero
            # vector carries no positional signal).
            for word in guessed_words:
                vec = self.word_service.get_vector(word)
                if vec is not None and not np.all(vec == 0):
                    embeddings.append(vec)
                    valid_words.append(word)

            # UMAP needs a minimum number of points to form a manifold.
            if len(embeddings) < 5:
                return self._simple_fallback(target_word, valid_words, embeddings)

            embedding_3d = self._run_umap(np.array(embeddings))
            if embedding_3d is None:
                return self._simple_fallback(target_word, valid_words, embeddings)

            # Center on the target (index 0) and scale into [-1, 1].
            embedding_3d -= embedding_3d[0]
            max_dist = np.max(np.abs(embedding_3d))
            if max_dist > 0:
                embedding_3d *= (1.0 / max_dist)

            return self._points_for(target_word, valid_words, embedding_3d)

        except Exception as e:
            # Boundary handler: visualization must never break the caller.
            logger.exception(f"Error in visualization: {str(e)}")
            return self._simple_fallback(target_word, [], [])

    def _simple_fallback(self, target_word: str, valid_words: list[str], embeddings: list[np.ndarray]) -> list[dict]:
        """Return a minimal 3D layout without UMAP.

        Used when the dataset is too small to form a manifold. The target is
        placed at the origin; guesses get small random jitter so they remain
        visually distinct. NOTE: the jitter is unseeded, so repeated calls
        produce different layouts.
        """
        # Only the target (or nothing usable): place it at the origin.
        if len(embeddings) <= 1:
            return [self._build_point(target_word, [0, 0, 0], True, 1.0)]

        # At least 2 points (target + 1 guess): jitter everything, then pin
        # the target back to the origin.
        coords = np.random.randn(len(embeddings), 3) * 0.1
        coords[0] = [0, 0, 0]
        return self._points_for(target_word, valid_words, coords)