Spaces:
Sleeping
Sleeping
changed word service to handle coroutine problem
Browse files
- services/study_service.py +15 -15
- services/word_service.py +10 -9
services/study_service.py
CHANGED
|
@@ -80,38 +80,38 @@ class StudyService:
|
|
| 80 |
"vector_norm": None
|
| 81 |
}
|
| 82 |
|
|
|
|
| 83 |
async def analyze_analogy(self,
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
"""
|
| 89 |
-
Analyze word analogies (a:b :: c:?)
|
| 90 |
-
|
| 91 |
Example: paris:france :: berlin:? (should find "allemagne")
|
| 92 |
"""
|
| 93 |
try:
|
| 94 |
-
# Get vectors
|
| 95 |
vec1 = await self.word_service.get_vector(word1)
|
| 96 |
vec2 = await self.word_service.get_vector(word2)
|
| 97 |
vec3 = await self.word_service.get_vector(word3)
|
| 98 |
-
|
| 99 |
-
if
|
|
|
|
| 100 |
return {
|
| 101 |
"analogy": f"{word1}:{word2} :: {word3}:?",
|
| 102 |
"similar_words": [],
|
| 103 |
"error": "One or more words not found in vocabulary"
|
| 104 |
}
|
| 105 |
-
|
| 106 |
# Calculate analogy vector (vec2 - vec1 + vec3)
|
| 107 |
analogy_vec = vec2 - vec1 + vec3
|
| 108 |
-
|
| 109 |
-
# Normalize the vector
|
| 110 |
analogy_vec = analogy_vec / np.linalg.norm(analogy_vec)
|
| 111 |
-
|
| 112 |
-
# Find similar words
|
| 113 |
similar_words = await self.word_service.get_similar_by_vector(analogy_vec, n=n_results)
|
| 114 |
-
|
| 115 |
return {
|
| 116 |
"analogy": f"{word1}:{word2} :: {word3}:?",
|
| 117 |
"similar_words": similar_words
|
|
|
|
| 80 |
"vector_norm": None
|
| 81 |
}
|
| 82 |
|
| 83 |
+
|
| 84 |
async def analyze_analogy(self,
|
| 85 |
+
word1: str,
|
| 86 |
+
word2: str,
|
| 87 |
+
word3: str,
|
| 88 |
+
n_results: int = 10) -> Dict:
|
| 89 |
"""
|
| 90 |
+
Analyze word analogies (a:b :: c:?).
|
|
|
|
| 91 |
Example: paris:france :: berlin:? (should find "allemagne")
|
| 92 |
"""
|
| 93 |
try:
|
| 94 |
+
# Get vectors for each word
|
| 95 |
vec1 = await self.word_service.get_vector(word1)
|
| 96 |
vec2 = await self.word_service.get_vector(word2)
|
| 97 |
vec3 = await self.word_service.get_vector(word3)
|
| 98 |
+
|
| 99 |
+
# Use explicit checks to see if any vector is missing
|
| 100 |
+
if vec1 is None or vec2 is None or vec3 is None:
|
| 101 |
return {
|
| 102 |
"analogy": f"{word1}:{word2} :: {word3}:?",
|
| 103 |
"similar_words": [],
|
| 104 |
"error": "One or more words not found in vocabulary"
|
| 105 |
}
|
| 106 |
+
|
| 107 |
# Calculate analogy vector (vec2 - vec1 + vec3)
|
| 108 |
analogy_vec = vec2 - vec1 + vec3
|
| 109 |
+
|
| 110 |
+
# Normalize the analogy vector
|
| 111 |
analogy_vec = analogy_vec / np.linalg.norm(analogy_vec)
|
| 112 |
+
|
| 113 |
+
# Find similar words using the analogy vector
|
| 114 |
similar_words = await self.word_service.get_similar_by_vector(analogy_vec, n=n_results)
|
|
|
|
| 115 |
return {
|
| 116 |
"analogy": f"{word1}:{word2} :: {word3}:?",
|
| 117 |
"similar_words": similar_words
|
services/word_service.py
CHANGED
|
@@ -121,8 +121,9 @@ class WordEmbeddingService:
|
|
| 121 |
logger.exception(f"Error finding similar words for: {target_word}")
|
| 122 |
return []
|
| 123 |
|
|
|
|
| 124 |
async def get_words_in_range(self, target_word: str, min_similarity: float,
|
| 125 |
-
|
| 126 |
"""Get words within a similarity range"""
|
| 127 |
try:
|
| 128 |
logger.info(f"Finding words for '{target_word}' in range [{min_similarity}, {max_similarity}]")
|
|
@@ -131,8 +132,8 @@ class WordEmbeddingService:
|
|
| 131 |
logger.warning(f"No vector for target word: {target_word}")
|
| 132 |
return []
|
| 133 |
|
| 134 |
-
#
|
| 135 |
-
|
| 136 |
similarities = []
|
| 137 |
norm_target = np.linalg.norm(target_vec)
|
| 138 |
sample_size = min(100000, len(self.vocab_vectors))
|
|
@@ -141,24 +142,24 @@ class WordEmbeddingService:
|
|
| 141 |
for vocab_word in sampled_words:
|
| 142 |
if vocab_word == target_word.lower():
|
| 143 |
continue
|
| 144 |
-
|
| 145 |
vector = self.vocab_vectors[vocab_word]
|
| 146 |
sim = float(np.dot(vector, target_vec) /
|
| 147 |
-
|
| 148 |
-
|
| 149 |
if min_similarity <= sim <= max_similarity:
|
| 150 |
similarities.append({'word': vocab_word, 'similarity': sim})
|
| 151 |
-
|
| 152 |
return similarities
|
| 153 |
|
|
|
|
| 154 |
similarities = await asyncio.to_thread(process_words)
|
| 155 |
-
|
| 156 |
if not similarities:
|
| 157 |
return []
|
| 158 |
|
| 159 |
similarities.sort(key=lambda x: x['similarity'], reverse=True)
|
| 160 |
selected_words = random.sample(similarities, min(n, len(similarities)))
|
| 161 |
-
|
| 162 |
return selected_words
|
| 163 |
|
| 164 |
except Exception as e:
|
|
|
|
| 121 |
logger.exception(f"Error finding similar words for: {target_word}")
|
| 122 |
return []
|
| 123 |
|
| 124 |
+
|
| 125 |
async def get_words_in_range(self, target_word: str, min_similarity: float,
|
| 126 |
+
max_similarity: float, n: int = 5) -> List[Dict[str, float]]:
|
| 127 |
"""Get words within a similarity range"""
|
| 128 |
try:
|
| 129 |
logger.info(f"Finding words for '{target_word}' in range [{min_similarity}, {max_similarity}]")
|
|
|
|
| 132 |
logger.warning(f"No vector for target word: {target_word}")
|
| 133 |
return []
|
| 134 |
|
| 135 |
+
# Define a synchronous function to process words
|
| 136 |
+
def process_words():
|
| 137 |
similarities = []
|
| 138 |
norm_target = np.linalg.norm(target_vec)
|
| 139 |
sample_size = min(100000, len(self.vocab_vectors))
|
|
|
|
| 142 |
for vocab_word in sampled_words:
|
| 143 |
if vocab_word == target_word.lower():
|
| 144 |
continue
|
| 145 |
+
|
| 146 |
vector = self.vocab_vectors[vocab_word]
|
| 147 |
sim = float(np.dot(vector, target_vec) /
|
| 148 |
+
(np.linalg.norm(vector) * norm_target))
|
| 149 |
+
|
| 150 |
if min_similarity <= sim <= max_similarity:
|
| 151 |
similarities.append({'word': vocab_word, 'similarity': sim})
|
| 152 |
+
|
| 153 |
return similarities
|
| 154 |
|
| 155 |
+
# Use to_thread to run the synchronous function in a thread
|
| 156 |
similarities = await asyncio.to_thread(process_words)
|
| 157 |
+
|
| 158 |
if not similarities:
|
| 159 |
return []
|
| 160 |
|
| 161 |
similarities.sort(key=lambda x: x['similarity'], reverse=True)
|
| 162 |
selected_words = random.sample(similarities, min(n, len(similarities)))
|
|
|
|
| 163 |
return selected_words
|
| 164 |
|
| 165 |
except Exception as e:
|