Spaces:

Miroir
/

semantix-api

Sleeping

App Files Files Community

Miroir committed on Feb 1, 2025

Commit

bb68168

1 Parent(s): b9d40b6

modified viz to handle a small set of data points when centering

Browse files

Files changed (2) hide show

data/word_list.json +0 -6
services/visualization_service.py +36 -29

data/word_list.json CHANGED Viewed

@@ -42,15 +42,11 @@
         "poésie",
         "calligraphie",
         "urbanisme",
-        "chimie_quantique",
         "psychométrie",
         "cybersécurité",
         "e-learning",
-        "jeu_vidéo",
         "holographie",
         "biotechnologie",
-        "intelligence_artificielle",
-        "mécanique_céleste",
         "anthropocène",
         "écoféminisme",
         "cyberpunk",
@@ -61,7 +57,6 @@
         "océanographie",
         "neurosciences",
         "ludologie",
-        "ux_design",
         "cosmologie",
         "astrobiologie",
         "climatologie",
@@ -70,7 +65,6 @@
         "jardinage",
         "marionnette",
         "esport",
-        "philosophie_zén",
         "jazzologie",
         "mycologie",
         "origami",

         "poésie",
         "calligraphie",
         "urbanisme",
         "psychométrie",
         "cybersécurité",
         "e-learning",
         "holographie",
         "biotechnologie",
         "anthropocène",
         "écoféminisme",
         "cyberpunk",
         "océanographie",
         "neurosciences",
         "ludologie",
         "cosmologie",
         "astrobiologie",
         "climatologie",
         "jardinage",
         "marionnette",
         "esport",
         "jazzologie",
         "mycologie",
         "origami",

services/visualization_service.py CHANGED Viewed

@@ -20,6 +20,7 @@ class VisualizationService:
         b = int((1.0 - sim) * 255)
         return f"rgb({r}, {g}, {b})"
     def prepare_3d_visualization(self, target_word: str, guessed_words: list[str]):
         try:
             embeddings = []
@@ -27,13 +28,7 @@ class VisualizationService:
             target_embedding = self.word_service.get_vector(target_word)
             if target_embedding is None:
-                return [{
-                    'word': "???",
-                    'coordinates': [0, 0, 0],
-                    'is_target': True,
-                    'similarity': 1.0,
-                    'color': 'rgb(255, 0, 0)'
-                }]
             embeddings.append(target_embedding)
             valid_words.append(target_word)
@@ -44,32 +39,50 @@ class VisualizationService:
                     embeddings.append(vec)
                     valid_words.append(word)
-            # if there's only 1 or 2 embeddings total, no manifold can form
-            if len(embeddings) < 3:
                 return self._simple_fallback(target_word, valid_words, embeddings)
-            # Otherwise, do UMAP
             embeddings_array = np.array(embeddings)
-            neighbors = min(5, len(embeddings) - 1)
-            import umap
             reducer = umap.UMAP(
-                n_components=3,
-                n_neighbors=neighbors,
                 min_dist=0.1,
                 metric='cosine',
-                random_state=42
             )
-            embedding_3d = reducer.fit_transform(embeddings_array)
-            # Re-center target at (0,0,0)
-            target_coords = embedding_3d[0]
-            embedding_3d -= target_coords
             result = []
             for i, word in enumerate(valid_words):
                 if i == 0:
-                    # target
                     result.append({
                         'word': "???",
                         'coordinates': embedding_3d[i].tolist(),
@@ -89,15 +102,9 @@ class VisualizationService:
                     })
             return result
-        except Exception:
-            logger.exception("Error preparing 3D visualization with UMAP")
-            return [{
-                'word': "???",
-                'coordinates': [0, 0, 0],
-                'is_target': True,
-                'similarity': 1.0,
-                'color': 'rgb(255, 0, 0)'
-            }]
     def _simple_fallback(self, target_word: str, valid_words: list[str], embeddings: list[np.ndarray]):
         """

         b = int((1.0 - sim) * 255)
         return f"rgb({r}, {g}, {b})"
     def prepare_3d_visualization(self, target_word: str, guessed_words: list[str]):
         try:
             embeddings = []
             target_embedding = self.word_service.get_vector(target_word)
             if target_embedding is None:
+                return self._simple_fallback(target_word, [], [])
             embeddings.append(target_embedding)
             valid_words.append(target_word)
                     embeddings.append(vec)
                     valid_words.append(word)
+            # Require more points for UMAP
+            if len(embeddings) < 5:  # Increased minimum points
                 return self._simple_fallback(target_word, valid_words, embeddings)
+            # Otherwise, do UMAP with adjusted parameters
             embeddings_array = np.array(embeddings)
+            # Adjust n_neighbors based on dataset size
+            n_neighbors = min(3, len(embeddings) - 1)
+            n_components = min(3, len(embeddings) - 1)
             reducer = umap.UMAP(
+                n_components=n_components,
+                n_neighbors=n_neighbors,
                 min_dist=0.1,
                 metric='cosine',
+                random_state=42,
+                # Add these parameters to handle small datasets
+                low_memory=True,
+                n_epochs=None,  # Let UMAP decide
+                init='random'  # Use random initialization instead of spectral
             )
+            try:
+                embedding_3d = reducer.fit_transform(embeddings_array)
+                # If we got fewer dimensions, pad with zeros
+                if embedding_3d.shape[1] < 3:
+                    padding = np.zeros((embedding_3d.shape[0], 3 - embedding_3d.shape[1]))
+                    embedding_3d = np.hstack([embedding_3d, padding])
+            except (ValueError, TypeError) as e:
+                logger.warning(f"UMAP failed: {str(e)}, falling back to simple visualization")
+                return self._simple_fallback(target_word, valid_words, embeddings)
+            # Center and scale the embeddings
+            embedding_3d -= embedding_3d[0]  # Center on target
+            max_dist = np.max(np.abs(embedding_3d))
+            if max_dist > 0:
+                embedding_3d *= (1.0 / max_dist)  # Scale to [-1,1]
             result = []
             for i, word in enumerate(valid_words):
                 if i == 0:
                     result.append({
                         'word': "???",
                         'coordinates': embedding_3d[i].tolist(),
                     })
             return result
+        except Exception as e:
+            logger.exception(f"Error in visualization: {str(e)}")
+            return self._simple_fallback(target_word, [], [])
     def _simple_fallback(self, target_word: str, valid_words: list[str], embeddings: list[np.ndarray]):
         """