Miroir committed on
Commit
73390f3
·
1 Parent(s): 788d0e8

added concept vector handling in study

Browse files
routes/study.py CHANGED
@@ -1,9 +1,9 @@
1
  # routes/study.py
 
2
  from fastapi import APIRouter, HTTPException
3
  from loguru import logger
4
  from typing import List
5
  from pydantic import BaseModel
6
- from services.study_service import StudyService
7
 
8
  router = APIRouter(prefix="/api/study")
9
 
@@ -11,40 +11,79 @@ class NeighborhoodRequest(BaseModel):
11
  word: str
12
  n_neighbors: int = 20
13
 
 
 
 
 
 
 
 
 
 
 
14
  class AnalogyRequest(BaseModel):
15
  word1: str
16
  word2: str
17
  word3: str
 
18
 
19
- class VisualizationRequest(BaseModel):
20
  words: List[str]
 
21
 
22
  def init_router(study_service: StudyService):
23
- @router.post("/neighborhood")
24
- async def analyze_neighborhood(request: NeighborhoodRequest):
25
- """Analyze word neighborhood with detailed semantic information"""
 
26
  try:
27
- return await study_service.analyze_word_neighborhood(
28
- request.word,
29
- request.n_neighbors
 
30
  )
31
  except Exception as e:
32
- logger.error(f"Error analyzing word neighborhood: {str(e)}")
33
  raise HTTPException(status_code=500, detail="Internal server error")
34
 
35
  @router.post("/analogy")
36
  async def analyze_analogy(request: AnalogyRequest):
37
- """Perform word analogy analysis"""
38
  try:
39
- return await study_service.analyze_semantic_analogy(
40
  request.word1,
41
  request.word2,
42
- request.word3
 
43
  )
44
  except Exception as e:
45
  logger.error(f"Error analyzing analogy: {str(e)}")
46
  raise HTTPException(status_code=500, detail="Internal server error")
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  @router.post("/visualization")
49
  async def create_visualization(request: VisualizationRequest):
50
  """Create visualization data for a set of words"""
 
1
  # routes/study.py
2
+ from services.study_service import StudyService
3
  from fastapi import APIRouter, HTTPException
4
  from loguru import logger
5
  from typing import List
6
  from pydantic import BaseModel
 
7
 
8
  router = APIRouter(prefix="/api/study")
9
 
 
11
  word: str
12
  n_neighbors: int = 20
13
 
14
# Request body for the /visualization endpoint: the set of words to project.
class VisualizationRequest(BaseModel):
    words: List[str]  # words whose embeddings will be visualized
16
+
17
+
18
+
19
# Request body for /concept: a concept vector is built by adding the vectors
# of positive_words and subtracting those of negative_words.
class ConceptRequest(BaseModel):
    positive_words: List[str]
    # Words whose vectors are subtracted; empty means a purely additive concept.
    # (Mutable default is safe here: pydantic copies field defaults per instance.)
    negative_words: List[str] = []
    n_results: int = 10  # number of nearest words to return
23
+
24
# Request body for /analogy: solves word1:word2 :: word3:? in embedding space.
class AnalogyRequest(BaseModel):
    word1: str
    word2: str
    word3: str
    n_results: int = 10  # number of candidate answers to return
29
 
30
# Request body for /semantic-field: a group of words whose shared semantic
# neighborhood (and centroid) will be analyzed.
class SemanticFieldRequest(BaseModel):
    words: List[str]
    n_neighbors: int = 5  # neighbors fetched per word and for the field center
33
 
34
def init_router(study_service: StudyService):
    """Register the study endpoints on the module-level router.

    Bug fix: the previous version re-created a local
    ``router = APIRouter(prefix="/api/study")`` here, shadowing the
    module-level router. The endpoints were attached to that throwaway
    local instance, which was never exposed, so none of the study routes
    were actually served. Routes now register on the shared module router.

    Args:
        study_service: service implementing the embedding analyses.
    """

    @router.post("/concept")
    async def analyze_concept(request: ConceptRequest):
        """Find words closest to sum(positive_words) - sum(negative_words)."""
        try:
            return await study_service.analyze_concept(
                request.positive_words,
                request.negative_words,
                request.n_results
            )
        except Exception as e:
            logger.error(f"Error analyzing concept: {str(e)}")
            raise HTTPException(status_code=500, detail="Internal server error")

    @router.post("/analogy")
    async def analyze_analogy(request: AnalogyRequest):
        """Solve the analogy word1:word2 :: word3:?."""
        try:
            return await study_service.analyze_analogy(
                request.word1,
                request.word2,
                request.word3,
                request.n_results
            )
        except Exception as e:
            logger.error(f"Error analyzing analogy: {str(e)}")
            raise HTTPException(status_code=500, detail="Internal server error")

    @router.post("/semantic-field")
    async def analyze_semantic_field(request: SemanticFieldRequest):
        """Analyze the semantic field spanned by a group of words."""
        try:
            return await study_service.analyze_semantic_field(
                request.words,
                request.n_neighbors
            )
        except Exception as e:
            logger.error(f"Error analyzing semantic field: {str(e)}")
            raise HTTPException(status_code=500, detail="Internal server error")

    @router.post("/neighborhood")
    async def analyze_neighborhood(request: NeighborhoodRequest):
        """Analyze word neighborhood with detailed semantic information"""
        try:
            return await study_service.analyze_word_neighborhood(
                request.word,
                request.n_neighbors
            )
        except Exception as e:
            logger.error(f"Error analyzing word neighborhood: {str(e)}")
            raise HTTPException(status_code=500, detail="Internal server error")
87
  @router.post("/visualization")
88
  async def create_visualization(request: VisualizationRequest):
89
  """Create visualization data for a set of words"""
routes/word.py CHANGED
@@ -1,51 +1,112 @@
1
- # routes/word.py
2
- from fastapi import APIRouter, HTTPException
3
  from loguru import logger
4
  from typing import List, Optional
 
5
  from services.word_service import WordEmbeddingService
6
 
7
  router = APIRouter(prefix="/api")
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  def init_router(word_service: WordEmbeddingService):
10
  @router.post("/similarity")
11
- async def calculate_similarity(word1: str, word2: str):
12
- """Calculate semantic similarity between two words"""
 
 
 
13
  try:
14
- similarity = await word_service.calculate_similarity(word1, word2)
 
 
 
15
  return {"similarity": similarity}
16
  except Exception as e:
17
  logger.error(f"Error calculating similarity: {str(e)}")
18
  raise HTTPException(status_code=500, detail="Internal server error")
19
 
20
  @router.get("/similar-words/{word}")
21
- async def get_similar_words(word: str, n: int = 20):
22
- """Get n most similar words to the input word"""
 
 
 
 
 
 
23
  try:
24
  words = await word_service.get_most_similar_words(word, n)
25
- return {"words": words}
 
 
 
26
  except Exception as e:
27
  logger.error(f"Error getting similar words: {str(e)}")
28
  raise HTTPException(status_code=500, detail="Internal server error")
29
 
30
  @router.post("/words-in-range")
31
- async def get_words_in_range(word: str, min_sim: float, max_sim: float, n: int = 5):
32
- """Get words within a specific similarity range"""
 
 
 
33
  try:
34
- words = await word_service.get_words_in_range(word, min_sim, max_sim, n)
 
 
 
 
 
 
 
 
 
 
 
35
  return {
 
 
 
 
 
36
  "words": words,
37
  "count": len(words)
38
  }
 
 
39
  except Exception as e:
40
  logger.error(f"Error getting words in range: {str(e)}")
41
  raise HTTPException(status_code=500, detail="Internal server error")
42
 
43
  @router.get("/random-word")
44
  async def get_random_word():
45
- """Get a random word from the vocabulary"""
 
 
 
46
  try:
47
  word = await word_service.get_random_word()
 
 
 
 
 
48
  return {"word": word}
 
 
49
  except Exception as e:
50
  logger.error(f"Error getting random word: {str(e)}")
51
  raise HTTPException(status_code=500, detail="Internal server error")
 
1
+ from fastapi import APIRouter, HTTPException, Query, Path
 
2
  from loguru import logger
3
  from typing import List, Optional
4
+ from pydantic import BaseModel, Field, confloat, conint
5
  from services.word_service import WordEmbeddingService
6
 
7
  router = APIRouter(prefix="/api")
8
 
9
+ # Request/Response Models
10
# Request body for /similarity: the two words to compare.
class SimilarityRequest(BaseModel):
    word1: str = Field(..., min_length=1, description="First word to compare")
    word2: str = Field(..., min_length=1, description="Second word to compare")
13
+
14
# Request body for /words-in-range: a target word and a [min_sim, max_sim]
# similarity window. Cross-field ordering (min < max) is checked in the route.
class WordsInRangeRequest(BaseModel):
    word: str = Field(..., min_length=1, description="Target word")
    min_sim: float = Field(..., ge=0.0, le=1.0, description="Minimum similarity threshold")
    max_sim: float = Field(..., ge=0.0, le=1.0, description="Maximum similarity threshold")
    n: int = Field(5, ge=1, le=100, description="Number of words to return")
19
+
20
# Response item shape: one word with its similarity score.
# NOTE(review): not referenced as a response_model in the visible routes —
# confirm it is used elsewhere before relying on it.
class WordResponse(BaseModel):
    word: str
    similarity: float
23
+
24
def init_router(word_service: WordEmbeddingService):
    """Register the word-level endpoints on the module router."""

    @router.post("/similarity")
    async def calculate_similarity(request: SimilarityRequest):
        """
        Calculate semantic similarity between two words.
        Returns a similarity score between 0 and 1.
        """
        try:
            score = await word_service.calculate_similarity(
                request.word1,
                request.word2
            )
            return {"similarity": score}
        except Exception as e:
            logger.error(f"Error calculating similarity: {str(e)}")
            raise HTTPException(status_code=500, detail="Internal server error")

    @router.get("/similar-words/{word}")
    async def get_similar_words(
        word: str = Path(..., min_length=1, description="Word to find similar words for"),
        n: int = Query(20, ge=1, le=100, description="Number of similar words to return")
    ):
        """
        Get n most similar words to the input word.
        Returns a list of words with their similarity scores.
        """
        try:
            neighbors = await word_service.get_most_similar_words(word, n)
            return {"word": word, "similar_words": neighbors}
        except Exception as e:
            logger.error(f"Error getting similar words: {str(e)}")
            raise HTTPException(status_code=500, detail="Internal server error")

    @router.post("/words-in-range")
    async def get_words_in_range(request: WordsInRangeRequest):
        """
        Get words within a specific similarity range.
        Returns words with similarities between min_sim and max_sim.
        """
        try:
            # Cross-field check pydantic cannot express per-field.
            if request.min_sim >= request.max_sim:
                raise HTTPException(
                    status_code=400,
                    detail="min_sim must be less than max_sim"
                )

            matches = await word_service.get_words_in_range(
                request.word,
                request.min_sim,
                request.max_sim,
                request.n
            )
            return {
                "word": request.word,
                "range": {"min": request.min_sim, "max": request.max_sim},
                "words": matches,
                "count": len(matches)
            }
        except HTTPException:
            # Let deliberate HTTP errors (the 400 above) pass through untouched.
            raise
        except Exception as e:
            logger.error(f"Error getting words in range: {str(e)}")
            raise HTTPException(status_code=500, detail="Internal server error")

    @router.get("/random-word")
    async def get_random_word():
        """
        Get a random word from the vocabulary.
        Returns a single random word from the model's vocabulary.
        """
        try:
            word = await word_service.get_random_word()
            if not word:
                # Service signals failure with a falsy value; surface as 500.
                raise HTTPException(
                    status_code=500,
                    detail="Could not generate random word"
                )
            return {"word": word}
        except HTTPException:
            raise
        except Exception as e:
            logger.error(f"Error getting random word: {str(e)}")
            raise HTTPException(status_code=500, detail="Internal server error")
services/study_service.py CHANGED
@@ -1,5 +1,5 @@
1
  from loguru import logger
2
- from typing import List, Dict
3
  import numpy as np
4
  from services.word_service import WordEmbeddingService
5
 
@@ -10,7 +10,6 @@ class StudyService:
10
  async def analyze_word_neighborhood(self, word: str, n_neighbors: int = 20) -> Dict:
11
  """Get detailed analysis of a word's semantic neighborhood"""
12
  try:
13
- # Make sure to await the async calls
14
  vector = await self.word_service.get_vector(word)
15
  similar_words = await self.word_service.get_most_similar_words(word, n=n_neighbors)
16
 
@@ -29,61 +28,144 @@ class StudyService:
29
  "vector_norm": None
30
  }
31
 
32
- async def compare_words(self, words: List[str]) -> Dict:
33
- """Compare multiple words to understand their relationships"""
34
- results = []
35
- similarity_matrix = []
 
 
36
 
 
 
37
  try:
38
- for i, word1 in enumerate(words):
39
- # Get vector for current word
40
- vector = await self.word_service.get_vector(word1)
41
- results.append({
42
- "word": word1,
43
- "in_vocabulary": vector is not None,
44
- "vector_norm": float(np.linalg.norm(vector)) if vector is not None else None
45
- })
46
-
47
- # Calculate similarities
48
- row = []
49
- for j, word2 in enumerate(words):
50
- if i <= j: # Only calculate upper triangle
51
- sim = await self.word_service.calculate_similarity(word1, word2)
52
- row.append(sim)
53
- else: # Copy from upper triangle
54
- row.append(similarity_matrix[j][i])
55
- similarity_matrix.append(row)
56
-
 
 
 
 
57
  return {
58
- "words": results,
59
- "similarity_matrix": similarity_matrix
 
 
 
 
60
  }
61
  except Exception as e:
62
- logger.exception(f"Error comparing words: {e}")
63
  return {
64
- "words": [],
65
- "similarity_matrix": []
 
 
 
 
66
  }
67
 
68
- async def get_similarity_ranges(self, word: str) -> Dict:
69
- """Get words in different similarity ranges"""
70
- ranges = [
71
- (0.9, 1.0, "very_high"),
72
- (0.7, 0.9, "high"),
73
- (0.5, 0.7, "medium"),
74
- (0.3, 0.5, "low"),
75
- (0.1, 0.3, "very_low")
76
- ]
77
 
78
- results = {}
 
79
  try:
80
- for min_sim, max_sim, range_name in ranges:
81
- words = await self.word_service.get_words_in_range(
82
- word, min_sim, max_sim, n=5
83
- )
84
- results[range_name] = words
85
-
86
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  except Exception as e:
88
- logger.exception(f"Error getting similarity ranges: {e}")
89
- return {range_name: [] for _, _, range_name in ranges}
 
 
 
 
 
 
1
  from loguru import logger
2
+ from typing import List, Dict, Optional, Tuple
3
  import numpy as np
4
  from services.word_service import WordEmbeddingService
5
 
 
10
  async def analyze_word_neighborhood(self, word: str, n_neighbors: int = 20) -> Dict:
11
  """Get detailed analysis of a word's semantic neighborhood"""
12
  try:
 
13
  vector = await self.word_service.get_vector(word)
14
  similar_words = await self.word_service.get_most_similar_words(word, n=n_neighbors)
15
 
 
28
  "vector_norm": None
29
  }
30
 
31
async def analyze_concept(self,
                          positive_words: List[str],
                          negative_words: List[str] = None,
                          n_results: int = 10) -> Dict:
    """
    Analyze a concept defined by positive and negative words.

    The concept vector is the sum of the positive-word vectors minus the
    sum of the negative-word vectors, normalized to unit length; the most
    similar vocabulary words are then returned.

    Example: "roi - homme + femme = reine"

    Fixes over the previous version:
    - the embedding dimension is taken from the actual vectors instead of
      being hard-coded to 300 (which broke with any other model size);
    - an all-out-of-vocabulary input, or exactly cancelling vectors, no
      longer divides by zero — an empty result is returned instead.

    Returns a dict with "concept" (echo of the inputs), "similar_words"
    (list of {word, similarity}), and "vector_norm" (1.0 on success,
    None when no concept vector could be built).
    """
    try:
        negative_words = negative_words or []

        # Collect signed vectors; out-of-vocabulary words are skipped.
        signed_vectors = []
        for word in positive_words:
            vector = await self.word_service.get_vector(word)
            if vector is not None:
                signed_vectors.append(vector)
        for word in negative_words:
            vector = await self.word_service.get_vector(word)
            if vector is not None:
                signed_vectors.append(-vector)

        concept_desc = {
            "positive_words": positive_words,
            "negative_words": negative_words
        }

        if not signed_vectors:
            # Nothing in vocabulary: no concept vector can be formed.
            return {
                "concept": concept_desc,
                "similar_words": [],
                "vector_norm": None
            }

        concept_vec = np.sum(signed_vectors, axis=0)
        norm = np.linalg.norm(concept_vec)
        if norm == 0:
            # Positive and negative vectors cancelled out exactly.
            return {
                "concept": concept_desc,
                "similar_words": [],
                "vector_norm": None
            }
        concept_vec = concept_vec / norm

        # Find vocabulary words closest to the concept vector.
        similar_words = await self.word_service.get_similar_by_vector(concept_vec, n=n_results)

        return {
            "concept": concept_desc,
            "similar_words": similar_words,
            # Unit vector by construction; kept for API compatibility.
            "vector_norm": float(np.linalg.norm(concept_vec))
        }
    except Exception as e:
        logger.exception(f"Error analyzing concept: {e}")
        return {
            "concept": {
                "positive_words": positive_words,
                "negative_words": negative_words
            },
            "similar_words": [],
            "vector_norm": None
        }
82
 
83
async def analyze_analogy(self,
                          word1: str,
                          word2: str,
                          word3: str,
                          n_results: int = 10) -> Dict:
    """
    Analyze word analogies (a:b :: c:?).

    Example: paris:france :: berlin:? (should find "allemagne")

    Fix: the previous membership test ``None in (vec1, vec2, vec3)``
    invokes elementwise ``==`` on numpy arrays and raises "truth value of
    an array is ambiguous" whenever all three words are in vocabulary —
    i.e. on every successful request. Identity checks are used instead.
    A zero analogy vector is also guarded against division by zero.

    Returns a dict with "analogy" (the a:b :: c:? label) and
    "similar_words"; an "error" key is added on failure.
    """
    analogy_label = f"{word1}:{word2} :: {word3}:?"
    try:
        # Fetch the three word vectors.
        vec1 = await self.word_service.get_vector(word1)
        vec2 = await self.word_service.get_vector(word2)
        vec3 = await self.word_service.get_vector(word3)

        # Identity checks avoid numpy's ambiguous elementwise ==.
        if any(v is None for v in (vec1, vec2, vec3)):
            return {
                "analogy": analogy_label,
                "similar_words": [],
                "error": "One or more words not found in vocabulary"
            }

        # Classic word-analogy offset: vec2 - vec1 + vec3.
        analogy_vec = vec2 - vec1 + vec3

        norm = np.linalg.norm(analogy_vec)
        if norm == 0:
            # Degenerate case: the offset cancelled out exactly.
            return {
                "analogy": analogy_label,
                "similar_words": [],
                "error": "Analogy vector is zero"
            }
        analogy_vec = analogy_vec / norm

        similar_words = await self.word_service.get_similar_by_vector(analogy_vec, n=n_results)

        return {
            "analogy": analogy_label,
            "similar_words": similar_words
        }
    except Exception as e:
        logger.exception(f"Error analyzing analogy: {e}")
        return {
            "analogy": analogy_label,
            "similar_words": [],
            "error": str(e)
        }
126
+
127
async def analyze_semantic_field(self,
                                 words: List[str],
                                 n_neighbors: int = 5) -> Dict:
    """
    Analyze the semantic field created by a group of words.

    For each word: its nearest neighbors and vector norm. Additionally,
    the mean of the in-vocabulary vectors ("field center") is used to
    propose candidate center words.

    Fixes over the previous version:
    - an out-of-vocabulary word no longer crashes the per-word report
      (``np.linalg.norm(None)`` raised a TypeError); its vector_norm is
      reported as None instead;
    - the center is accumulated from the actual vectors, so the embedding
      dimension is no longer hard-coded to 300.
    """
    try:
        results = []
        field_vectors = []  # in-vocabulary vectors only

        for word in words:
            vector = await self.word_service.get_vector(word)
            if vector is not None:
                field_vectors.append(vector)

            similar = await self.word_service.get_most_similar_words(word, n=n_neighbors)
            results.append({
                "word": word,
                "similar_words": similar,
                # None marks an out-of-vocabulary word.
                "vector_norm": float(np.linalg.norm(vector)) if vector is not None else None
            })

        if field_vectors:
            # Center of the semantic field = mean of available vectors.
            center_vector = np.mean(field_vectors, axis=0)
            center_similar = await self.word_service.get_similar_by_vector(center_vector, n=n_neighbors)
        else:
            center_similar = []

        return {
            "words": results,
            "center_word_candidates": center_similar,
            "valid_words_count": len(field_vectors)
        }
    except Exception as e:
        logger.exception(f"Error analyzing semantic field: {e}")
        return {
            "words": [],
            "center_word_candidates": [],
            "valid_words_count": 0,
            "error": str(e)
        }
services/word_service.py CHANGED
@@ -172,4 +172,17 @@ class WordEmbeddingService:
172
  return random.choice(list(self.vocab_vectors.keys()))
173
  except Exception as e:
174
  logger.exception("Error getting random word")
175
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  return random.choice(list(self.vocab_vectors.keys()))
173
  except Exception as e:
174
  logger.exception("Error getting random word")
175
+ return None
176
+
177
async def get_similar_by_vector(self, vector: np.ndarray, n: int = 10) -> List[Dict[str, float]]:
    """Find words most similar to a given vector"""
    await self._ensure_model_loaded()
    try:
        model = WordEmbeddingService._model
        # similar_by_vector is CPU-bound native work — run it off the event
        # loop; asyncio.to_thread forwards args/kwargs, so no lambda needed.
        matches = await asyncio.to_thread(model.similar_by_vector, vector, topn=n)
        return [{'word': token, 'similarity': float(score)} for token, score in matches]
    except Exception as e:
        logger.exception(f"Error finding similar words by vector: {str(e)}")
        return []