# semantix-api / services / study_service.py
# Commit c64867f (Miroir): improved study service and made it return vectors
# so analysis can be handled outside the API.
from loguru import logger
from typing import List, Dict, Optional, Tuple
import numpy as np
from services.word_service import WordEmbeddingService
class StudyService:
def __init__(self, word_service: WordEmbeddingService):
    """Keep a reference to the embedding service used by every analysis method."""
    self.word_service = word_service
async def analyze_word_neighborhood(self, word: str, n_neighbors: int = 20) -> Dict:
    """Get detailed analysis of a word's semantic neighborhood.

    Returns a dict with the word, whether it is in the vocabulary, its
    nearest neighbors, and the L2 norm of its embedding (None when OOV).
    Any failure is logged and reported as an empty, out-of-vocabulary result.
    """
    try:
        embedding = await self.word_service.get_vector(word)
        neighbors = await self.word_service.get_most_similar_words(word, n=n_neighbors)
        in_vocab = embedding is not None
        return {
            "word": word,
            "in_vocabulary": in_vocab,
            "similar_words": neighbors,
            "vector_norm": float(np.linalg.norm(embedding)) if in_vocab else None,
        }
    except Exception as e:
        logger.exception(f"Error analyzing word neighborhood: {e}")
        return {
            "word": word,
            "in_vocabulary": False,
            "similar_words": [],
            "vector_norm": None,
        }
async def analyze_concept(self,
                          positive_words: List[str],
                          negative_words: List[str] = None,
                          n_results: int = 10) -> Dict:
    """
    Analyze a concept defined by positive and negative words.
    Example: "roi - homme + femme = reine"

    Positive word vectors are added, negative ones subtracted; the result is
    L2-normalized and used to query the closest vocabulary words. Words that
    are out of vocabulary are skipped. When no usable vector remains (or the
    contributions cancel out exactly), an empty result is returned instead
    of dividing by zero.
    """
    negative_words = negative_words or []
    concept = {"positive_words": positive_words, "negative_words": negative_words}
    try:
        concept_vec = None
        for sign, group in ((1.0, positive_words), (-1.0, negative_words)):
            for word in group:
                vector = await self.word_service.get_vector(word)
                if vector is None:
                    continue
                if concept_vec is None:
                    # Size the accumulator from the first real vector instead
                    # of hard-coding the embedding dimension (was fixed at 300).
                    concept_vec = np.zeros_like(vector, dtype=float)
                concept_vec += sign * vector
        norm = np.linalg.norm(concept_vec) if concept_vec is not None else 0.0
        if norm == 0.0:
            # Normalizing a zero vector would produce NaNs; report an empty
            # neighborhood instead.
            return {"concept": concept, "similar_words": [], "vector_norm": None}
        concept_vec = concept_vec / norm
        similar_words = await self.word_service.get_similar_by_vector(concept_vec, n=n_results)
        return {
            "concept": concept,
            "similar_words": similar_words,
            # Norm of the normalized vector (1.0); kept for interface parity.
            "vector_norm": float(np.linalg.norm(concept_vec)),
        }
    except Exception as e:
        logger.exception(f"Error analyzing concept: {e}")
        return {"concept": concept, "similar_words": [], "vector_norm": None}
async def get_phrase_vector(self, words: List[str]) -> Optional[List[float]]:
    """Compute the averaged embedding for a phrase (list of words).

    Words without a vector are ignored; returns None when nothing resolves.
    """
    looked_up = [await self.word_service.get_vector(token) for token in words]
    embeddings = [vec for vec in looked_up if vec is not None]
    if not embeddings:
        return None
    return np.mean(embeddings, axis=0).tolist()
async def cluster_words(self, words: List[str], n_clusters: int = 3) -> Dict:
    """
    Cluster the embeddings of the given words using K-Means.

    Words missing from the vocabulary are skipped. The requested cluster
    count is clamped to the number of usable vectors, since KMeans requires
    n_clusters <= n_samples.

    Returns a dict with word assignments per cluster and the centroids,
    or an ``error`` dict when no vectors were found.
    """
    from sklearn.cluster import KMeans
    vectors = []
    valid_words = []
    for word in words:
        vector = await self.word_service.get_vector(word)
        if vector is not None:
            vectors.append(vector)
            valid_words.append(word)
    if not vectors:
        return {"error": "No valid vectors found."}
    # BUG FIX: KMeans raises ValueError when n_clusters > n_samples
    # (e.g. 2 valid words with the default n_clusters=3).
    effective_k = min(n_clusters, len(vectors))
    # n_init pinned explicitly: its default changed across sklearn versions.
    kmeans = KMeans(n_clusters=effective_k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(np.array(vectors))
    clusters = {}
    for word, label in zip(valid_words, labels):
        clusters.setdefault(int(label), []).append(word)
    return {"clusters": clusters, "centroids": kmeans.cluster_centers_.tolist()}
async def find_outlier(self, words: List[str]) -> Dict:
    """
    Identify the outlier in a list of words (the one least similar to the rest).

    Each word's average cosine similarity to every other word is computed;
    the word with the lowest average is the outlier. Words missing from the
    vocabulary are ignored.
    """
    vectors = []
    valid_words = []
    for word in words:
        vector = await self.word_service.get_vector(word)
        if vector is not None:
            vectors.append(vector)
            valid_words.append(word)
    if len(vectors) < 2:
        return {"error": "Not enough valid words to determine an outlier."}
    matrix = np.asarray(vectors, dtype=float)
    norms = np.linalg.norm(matrix, axis=1)
    # Guard against zero vectors so the division below cannot produce NaN.
    norms[norms == 0.0] = 1.0
    unit = matrix / norms[:, None]
    # All pairwise cosine similarities in one matrix product (replaces the
    # Python double loop that recomputed every norm per pair).
    sims = unit @ unit.T
    # Average similarity of each word to all *other* words (exclude self-sim).
    avg = (sims.sum(axis=1) - np.diag(sims)) / (len(valid_words) - 1)
    outlier_index = int(np.argmin(avg))
    return {"outlier": valid_words[outlier_index],
            "average_similarities": dict(zip(valid_words, avg.tolist()))}
async def distance_distribution(self, word: str, sample_size: int = 1000) -> Dict:
    """
    Compute summary statistics of cosine similarities between the target
    word and a random sample of vocabulary words.

    Returns min/max/mean/std of the sampled similarities, or an ``error``
    dict when the target word is missing or the vocabulary is empty.
    """
    target_vector = await self.word_service.get_vector(word)
    if target_vector is None:
        return {"error": "Target word not found in vocabulary."}
    all_words = list(self.word_service.vocab_vectors.keys())
    if not all_words:
        # An empty vocabulary would make np.min/np.max raise below.
        return {"error": "Vocabulary is empty."}
    sample_words = np.random.choice(all_words, size=min(sample_size, len(all_words)), replace=False)
    matrix = np.asarray([self.word_service.vocab_vectors[other] for other in sample_words], dtype=float)
    # Target norm hoisted out of the per-word loop; similarities computed in
    # one vectorized pass instead of one dot product per sampled word.
    target = np.asarray(target_vector, dtype=float)
    sims = matrix @ target / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(target))
    return {
        "word": word,
        "similarity_distribution": {
            "min": float(np.min(sims)),
            "max": float(np.max(sims)),
            "mean": float(np.mean(sims)),
            "std": float(np.std(sims)),
        },
    }
async def interpolate_words(self, word1: str, word2: str, steps: int = 5) -> Dict:
    """
    Generate intermediate vectors between two words and retrieve the closest
    word for each interpolation step.

    Returns ``steps + 1`` entries (both endpoints included), or an ``error``
    dict when a word is out of vocabulary or ``steps`` is not positive.
    """
    if steps < 1:
        # BUG FIX: steps == 0 used to raise ZeroDivisionError at ratio = i / steps.
        return {"error": "steps must be a positive integer."}
    vec1 = await self.word_service.get_vector(word1)
    vec2 = await self.word_service.get_vector(word2)
    if vec1 is None or vec2 is None:
        return {"error": "One or both words not found in vocabulary."}
    interpolations = []
    for i in range(steps + 1):
        ratio = i / steps
        interp_vec = (1 - ratio) * vec1 + ratio * vec2
        # Nearest vocabulary word to the interpolated point.
        similar = await self.word_service.get_similar_by_vector(interp_vec, n=1)
        interpolations.append({
            "step": i,
            "vector": interp_vec.tolist(),
            "closest_word": similar[0] if similar else None,
        })
    return {"interpolations": interpolations}
async def combine_word_vectors(self, positive: List[tuple], negative: List[tuple]) -> Optional[List[float]]:
    """
    Combine word vectors from weighted positive and negative contributions.

    Each input is a list of ``(word, weight)`` tuples. Returns the combined,
    L2-normalized vector as a list, or None when no word resolved to a
    vector or the contributions cancel out exactly.
    """
    combined_vec = None
    # Positive entries are added, negative entries subtracted.
    for sign, pairs in ((1.0, positive or []), (-1.0, negative or [])):
        for word, weight in pairs:
            vector = await self.word_service.get_vector(word)
            if vector is None:
                continue
            if combined_vec is None:
                # Size the accumulator from the first real vector instead of
                # hard-coding the embedding dimension (was fixed at 300).
                combined_vec = np.zeros_like(vector, dtype=float)
            combined_vec += sign * weight * vector
    if combined_vec is None:
        return None
    norm = np.linalg.norm(combined_vec)
    if norm == 0:
        return None
    return (combined_vec / norm).tolist()
async def analyze_analogy(self,
                          word1: str,
                          word2: str,
                          word3: str,
                          n_results: int = 10) -> Dict:
    """
    Analyze word analogies (a:b :: c:?).
    Example: paris:france :: berlin:? (should find "allemagne")

    Builds the direction vec2 - vec1 + vec3, normalizes it, and looks up the
    closest vocabulary words. Returns an ``error`` field when any input word
    is out of vocabulary or the lookup fails.
    """
    label = f"{word1}:{word2} :: {word3}:?"
    try:
        looked_up = [await self.word_service.get_vector(w) for w in (word1, word2, word3)]
        if any(v is None for v in looked_up):
            return {
                "analogy": label,
                "similar_words": [],
                "error": "One or more words not found in vocabulary"
            }
        vec1, vec2, vec3 = looked_up
        direction = vec2 - vec1 + vec3
        direction = direction / np.linalg.norm(direction)
        matches = await self.word_service.get_similar_by_vector(direction, n=n_results)
        return {
            "analogy": label,
            "similar_words": matches
        }
    except Exception as e:
        logger.exception(f"Error analyzing analogy: {e}")
        return {
            "analogy": label,
            "similar_words": [],
            "error": str(e)
        }
async def analyze_semantic_field(self,
                                 words: List[str],
                                 n_neighbors: int = 5) -> Dict:
    """
    Analyze the semantic field created by a group of words.

    For each word, reports its nearest neighbors and vector norm (None when
    the word is out of vocabulary). Also averages the in-vocabulary vectors
    and reports the words closest to that center.
    """
    try:
        results = []
        center_vector = None
        valid_vectors = 0
        for word in words:
            vector = await self.word_service.get_vector(word)
            if vector is not None:
                if center_vector is None:
                    # Size from the actual embedding instead of assuming 300.
                    center_vector = np.zeros_like(vector, dtype=float)
                center_vector += vector
                valid_vectors += 1
            similar = await self.word_service.get_most_similar_words(word, n=n_neighbors)
            results.append({
                "word": word,
                "similar_words": similar,
                # BUG FIX: np.linalg.norm(None) used to raise for any OOV
                # word, discarding the whole analysis via the except branch.
                "vector_norm": float(np.linalg.norm(vector)) if vector is not None else None,
            })
        if valid_vectors > 0:
            center_vector = center_vector / valid_vectors
            center_similar = await self.word_service.get_similar_by_vector(center_vector, n=n_neighbors)
        else:
            center_similar = []
        return {
            "words": results,
            "center_word_candidates": center_similar,
            "valid_words_count": valid_vectors
        }
    except Exception as e:
        logger.exception(f"Error analyzing semantic field: {e}")
        return {
            "words": [],
            "center_word_candidates": [],
            "valid_words_count": 0,
            "error": str(e)
        }
async def get_word_vectors(self, words: List[str]) -> Dict:
    """
    Retrieve the vector representations for a list of words.

    Returns ``{"data": [{"word": ..., "vector": [...] | None}, ...]}`` — one
    entry per input word, with ``vector`` set to None for OOV words — so the
    payload can be forwarded to an external visualization service.
    """
    try:
        entries = []
        for token in words:
            embedding = await self.word_service.get_vector(token)
            serialized = embedding.tolist() if embedding is not None else None
            entries.append({"word": token, "vector": serialized})
        return {"data": entries}
    except Exception as e:
        logger.exception(f"Error retrieving word vectors: {e}")
        return {"error": str(e)}