File size: 5,889 Bytes
cebf2e3
cf65513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb68168
cf65513
 
 
 
 
 
 
bb68168
cf65513
 
 
 
 
 
 
 
 
 
bb68168
 
cf65513
 
bb68168
cf65513
bb68168
 
 
 
cf65513
 
bb68168
 
cf65513
 
bb68168
 
 
 
 
cf65513
 
bb68168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf65513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb68168
 
 
cf65513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# file location: semantix-api/services/visualization_service.py

import numpy as np
import umap  # pip install umap-learn
from loguru import logger

class VisualizationService:
    """Builds 3D point layouts for a target word and its guessed words.

    High-dimensional embeddings from ``word_service`` are projected to 3D
    with UMAP; when too few usable embeddings exist (or UMAP fails), a
    simple random fallback layout is produced instead.
    """

    def __init__(self, word_service):
        # word_service is expected to provide:
        #   get_vector(word) -> np.ndarray | None
        #   calculate_similarity(word_a, word_b) -> float
        # (duck-typed; confirmed by the calls below)
        self.word_service = word_service

    def _compute_color(self, similarity: float) -> str:
        """Map a similarity in [0, 1] to an RGB color string.

        0 maps to pure blue, 1 to pure red; out-of-range values are clamped.
        """
        sim = max(0.0, min(1.0, similarity))
        r = int(sim * 255)
        b = int((1.0 - sim) * 255)
        return f"rgb({r}, 0, {b})"

    def _build_point(self, word: str, coordinates: list, is_target: bool,
                     similarity: float) -> dict:
        """Build one point dict for the frontend.

        The target word is masked as "???" (so the answer is never leaked to
        the client) and always drawn red with similarity 1.0.
        """
        if is_target:
            return {
                'word': "???",
                'coordinates': coordinates,
                'is_target': True,
                'similarity': 1.0,
                'color': 'rgb(255, 0, 0)'
            }
        return {
            'word': word,
            'coordinates': coordinates,
            'is_target': False,
            'similarity': similarity,
            'color': self._compute_color(similarity)
        }

    def _points_for(self, target_word: str, valid_words: list[str], coords) -> list[dict]:
        """Turn an (N, 3) coordinate array into point dicts.

        Index 0 is always the target; similarities for guesses are computed
        against the (unmasked) target word.
        """
        result = []
        for i, word in enumerate(valid_words):
            if i == 0:
                result.append(self._build_point(word, coords[i].tolist(), True, 1.0))
            else:
                sim = self.word_service.calculate_similarity(target_word, word)
                result.append(self._build_point(word, coords[i].tolist(), False, sim))
        return result

    def _run_umap(self, embeddings_array):
        """Reduce embeddings to 3D with UMAP.

        Returns an (N, 3) array, zero-padding extra columns if UMAP produced
        fewer than 3 components, or None if the reduction failed.
        """
        n_points = embeddings_array.shape[0]
        # Both parameters are capped by dataset size; UMAP requires
        # n_neighbors < n_points.
        n_neighbors = min(3, n_points - 1)
        n_components = min(3, n_points - 1)

        reducer = umap.UMAP(
            n_components=n_components,
            n_neighbors=n_neighbors,
            min_dist=0.1,
            metric='cosine',
            random_state=42,
            # Parameters chosen to behave on small datasets:
            low_memory=True,
            n_epochs=None,  # let UMAP decide
            init='random'   # random init avoids spectral-init failures on tiny inputs
        )

        try:
            embedding_3d = reducer.fit_transform(embeddings_array)
        except (ValueError, TypeError) as e:
            logger.warning(f"UMAP failed: {str(e)}, falling back to simple visualization")
            return None

        # Pad with zero columns if we got fewer than 3 dimensions back.
        if embedding_3d.shape[1] < 3:
            padding = np.zeros((embedding_3d.shape[0], 3 - embedding_3d.shape[1]))
            embedding_3d = np.hstack([embedding_3d, padding])
        return embedding_3d

    def prepare_3d_visualization(self, target_word: str, guessed_words: list[str]) -> list[dict]:
        """Project the target word and its guesses into 3D for display.

        Returns a list of point dicts (word, coordinates, is_target,
        similarity, color); the target is always first and masked as "???".
        Falls back to a simple layout when the target has no embedding, when
        fewer than 5 usable embeddings exist, or when UMAP fails.
        """
        try:
            embeddings = []
            valid_words = []

            target_embedding = self.word_service.get_vector(target_word)
            if target_embedding is None:
                return self._simple_fallback(target_word, [], [])

            embeddings.append(target_embedding)
            valid_words.append(target_word)

            # Keep only guesses with a known, non-zero embedding (an all-zero
            # vector carries no positional signal).
            for word in guessed_words:
                vec = self.word_service.get_vector(word)
                if vec is not None and not np.all(vec == 0):
                    embeddings.append(vec)
                    valid_words.append(word)

            # UMAP needs a minimum number of points to form a manifold.
            if len(embeddings) < 5:
                return self._simple_fallback(target_word, valid_words, embeddings)

            embedding_3d = self._run_umap(np.array(embeddings))
            if embedding_3d is None:
                return self._simple_fallback(target_word, valid_words, embeddings)

            # Center on the target (index 0) and scale into [-1, 1].
            embedding_3d -= embedding_3d[0]
            max_dist = np.max(np.abs(embedding_3d))
            if max_dist > 0:
                embedding_3d *= (1.0 / max_dist)

            return self._points_for(target_word, valid_words, embedding_3d)

        except Exception as e:
            # Boundary handler: visualization must never break the caller.
            logger.exception(f"Error in visualization: {str(e)}")
            return self._simple_fallback(target_word, [], [])

    def _simple_fallback(self, target_word: str, valid_words: list[str], embeddings: list[np.ndarray]) -> list[dict]:
        """Return a minimal 3D layout without UMAP.

        Used when the dataset is too small to form a manifold. The target is
        placed at the origin; guesses get small random jitter so they remain
        visually distinct. NOTE: the jitter is unseeded, so repeated calls
        produce different layouts.
        """
        # Only the target (or nothing usable): place it at the origin.
        if len(embeddings) <= 1:
            return [self._build_point(target_word, [0, 0, 0], True, 1.0)]

        # At least 2 points (target + 1 guess): jitter everything, then pin
        # the target back to the origin.
        coords = np.random.randn(len(embeddings), 3) * 0.1
        coords[0] = [0, 0, 0]
        return self._points_for(target_word, valid_words, coords)