Miroir committed on
Commit
bb68168
·
1 Parent(s): b9d40b6

Modified the visualization to handle small sets of data points when centering on the target

Browse files
data/word_list.json CHANGED
@@ -42,15 +42,11 @@
42
  "poésie",
43
  "calligraphie",
44
  "urbanisme",
45
- "chimie_quantique",
46
  "psychométrie",
47
  "cybersécurité",
48
  "e-learning",
49
- "jeu_vidéo",
50
  "holographie",
51
  "biotechnologie",
52
- "intelligence_artificielle",
53
- "mécanique_céleste",
54
  "anthropocène",
55
  "écoféminisme",
56
  "cyberpunk",
@@ -61,7 +57,6 @@
61
  "océanographie",
62
  "neurosciences",
63
  "ludologie",
64
- "ux_design",
65
  "cosmologie",
66
  "astrobiologie",
67
  "climatologie",
@@ -70,7 +65,6 @@
70
  "jardinage",
71
  "marionnette",
72
  "esport",
73
- "philosophie_zén",
74
  "jazzologie",
75
  "mycologie",
76
  "origami",
 
42
  "poésie",
43
  "calligraphie",
44
  "urbanisme",
 
45
  "psychométrie",
46
  "cybersécurité",
47
  "e-learning",
 
48
  "holographie",
49
  "biotechnologie",
 
 
50
  "anthropocène",
51
  "écoféminisme",
52
  "cyberpunk",
 
57
  "océanographie",
58
  "neurosciences",
59
  "ludologie",
 
60
  "cosmologie",
61
  "astrobiologie",
62
  "climatologie",
 
65
  "jardinage",
66
  "marionnette",
67
  "esport",
 
68
  "jazzologie",
69
  "mycologie",
70
  "origami",
services/visualization_service.py CHANGED
@@ -20,6 +20,7 @@ class VisualizationService:
20
  b = int((1.0 - sim) * 255)
21
  return f"rgb({r}, {g}, {b})"
22
 
 
23
  def prepare_3d_visualization(self, target_word: str, guessed_words: list[str]):
24
  try:
25
  embeddings = []
@@ -27,13 +28,7 @@ class VisualizationService:
27
 
28
  target_embedding = self.word_service.get_vector(target_word)
29
  if target_embedding is None:
30
- return [{
31
- 'word': "???",
32
- 'coordinates': [0, 0, 0],
33
- 'is_target': True,
34
- 'similarity': 1.0,
35
- 'color': 'rgb(255, 0, 0)'
36
- }]
37
 
38
  embeddings.append(target_embedding)
39
  valid_words.append(target_word)
@@ -44,32 +39,50 @@ class VisualizationService:
44
  embeddings.append(vec)
45
  valid_words.append(word)
46
 
47
- # if there's only 1 or 2 embeddings total, no manifold can form
48
- if len(embeddings) < 3:
49
  return self._simple_fallback(target_word, valid_words, embeddings)
50
 
51
- # Otherwise, do UMAP
52
  embeddings_array = np.array(embeddings)
53
- neighbors = min(5, len(embeddings) - 1)
 
 
 
54
 
55
- import umap
56
  reducer = umap.UMAP(
57
- n_components=3,
58
- n_neighbors=neighbors,
59
  min_dist=0.1,
60
  metric='cosine',
61
- random_state=42
 
 
 
 
62
  )
63
- embedding_3d = reducer.fit_transform(embeddings_array)
64
 
65
- # Re-center target at (0,0,0)
66
- target_coords = embedding_3d[0]
67
- embedding_3d -= target_coords
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  result = []
70
  for i, word in enumerate(valid_words):
71
  if i == 0:
72
- # target
73
  result.append({
74
  'word': "???",
75
  'coordinates': embedding_3d[i].tolist(),
@@ -89,15 +102,9 @@ class VisualizationService:
89
  })
90
  return result
91
 
92
- except Exception:
93
- logger.exception("Error preparing 3D visualization with UMAP")
94
- return [{
95
- 'word': "???",
96
- 'coordinates': [0, 0, 0],
97
- 'is_target': True,
98
- 'similarity': 1.0,
99
- 'color': 'rgb(255, 0, 0)'
100
- }]
101
 
102
  def _simple_fallback(self, target_word: str, valid_words: list[str], embeddings: list[np.ndarray]):
103
  """
 
20
  b = int((1.0 - sim) * 255)
21
  return f"rgb({r}, {g}, {b})"
22
 
23
+
24
  def prepare_3d_visualization(self, target_word: str, guessed_words: list[str]):
25
  try:
26
  embeddings = []
 
28
 
29
  target_embedding = self.word_service.get_vector(target_word)
30
  if target_embedding is None:
31
+ return self._simple_fallback(target_word, [], [])
 
 
 
 
 
 
32
 
33
  embeddings.append(target_embedding)
34
  valid_words.append(target_word)
 
39
  embeddings.append(vec)
40
  valid_words.append(word)
41
 
42
+ # Require more points for UMAP
43
+ if len(embeddings) < 5: # Increased minimum points
44
  return self._simple_fallback(target_word, valid_words, embeddings)
45
 
46
+ # Otherwise, do UMAP with adjusted parameters
47
  embeddings_array = np.array(embeddings)
48
+
49
+ # Adjust n_neighbors based on dataset size
50
+ n_neighbors = min(3, len(embeddings) - 1)
51
+ n_components = min(3, len(embeddings) - 1)
52
 
 
53
  reducer = umap.UMAP(
54
+ n_components=n_components,
55
+ n_neighbors=n_neighbors,
56
  min_dist=0.1,
57
  metric='cosine',
58
+ random_state=42,
59
+ # Add these parameters to handle small datasets
60
+ low_memory=True,
61
+ n_epochs=None, # Let UMAP decide
62
+ init='random' # Use random initialization instead of spectral
63
  )
 
64
 
65
+ try:
66
+ embedding_3d = reducer.fit_transform(embeddings_array)
67
+
68
+ # If we got fewer dimensions, pad with zeros
69
+ if embedding_3d.shape[1] < 3:
70
+ padding = np.zeros((embedding_3d.shape[0], 3 - embedding_3d.shape[1]))
71
+ embedding_3d = np.hstack([embedding_3d, padding])
72
+
73
+ except (ValueError, TypeError) as e:
74
+ logger.warning(f"UMAP failed: {str(e)}, falling back to simple visualization")
75
+ return self._simple_fallback(target_word, valid_words, embeddings)
76
+
77
+ # Center and scale the embeddings
78
+ embedding_3d -= embedding_3d[0] # Center on target
79
+ max_dist = np.max(np.abs(embedding_3d))
80
+ if max_dist > 0:
81
+ embedding_3d *= (1.0 / max_dist) # Scale to [-1,1]
82
 
83
  result = []
84
  for i, word in enumerate(valid_words):
85
  if i == 0:
 
86
  result.append({
87
  'word': "???",
88
  'coordinates': embedding_3d[i].tolist(),
 
102
  })
103
  return result
104
 
105
+ except Exception as e:
106
+ logger.exception(f"Error in visualization: {str(e)}")
107
+ return self._simple_fallback(target_word, [], [])
 
 
 
 
 
 
108
 
109
  def _simple_fallback(self, target_word: str, valid_words: list[str], embeddings: list[np.ndarray]):
110
  """