Miroir committed on
Commit
cf65513
·
1 Parent(s): b3af544

fastapi setup for huggingface

Browse files
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9-slim
2
+
3
+ RUN useradd -m -u 1000 user
4
+ USER user
5
+ ENV PATH="/home/user/.local/bin:$PATH"
6
+ ENV MODEL_URL="https://huggingface.co/Miroir/cc.fr.300.reduced/resolve/main/cc.fr.300.reduced.vec"
7
+
8
+ WORKDIR /app
9
+
10
+ COPY --chown=user requirements.txt requirements.txt
11
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
12
+
13
+ COPY --chown=user . .
14
+
15
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ from loguru import logger
5
+ import sys
6
+ import os
7
+
8
+ from services.word_service import WordEmbeddingService
9
+ from services.game_service import GameService
10
+ from services.visualization_service import VisualizationService
11
+
12
+ # Configure logger
13
+ logger.remove()
14
+ logger.add(sys.stdout, level="INFO")
15
+
16
+ app = FastAPI()
17
+
18
+ # Configure CORS
19
+ app.add_middleware(
20
+ CORSMiddleware,
21
+ allow_origins=["*"], # Adjust this in production
22
+ allow_credentials=True,
23
+ allow_methods=["*"],
24
+ allow_headers=["*"],
25
+ )
26
+
27
+ # Initialize services
28
+ try:
29
+ word_service = WordEmbeddingService()
30
+ game_service = GameService(word_service)
31
+ visualization_service = VisualizationService(word_service)
32
+ logger.info("Services initialized successfully")
33
+ except Exception as e:
34
+ logger.error(f"Failed to initialize services: {str(e)}")
35
+ raise e
36
+
37
+ # Pydantic models for request validation
38
+ class WordCheck(BaseModel):
39
+ word: str
40
+
41
+ class JokerUse(BaseModel):
42
+ joker_type: str
43
+
44
+ @app.get("/api/game-state")
45
+ async def get_game_state():
46
+ try:
47
+ return game_service.get_state()
48
+ except Exception as e:
49
+ logger.error(f"Error getting game state: {str(e)}")
50
+ raise HTTPException(status_code=500, detail="Internal server error")
51
+
52
+ @app.post("/api/check-word")
53
+ async def check_word(word_check: WordCheck):
54
+ try:
55
+ return game_service.check_word(word_check.word)
56
+ except Exception as e:
57
+ logger.error(f"Error checking word: {str(e)}")
58
+ raise HTTPException(status_code=500, detail="Internal server error")
59
+
60
+ @app.post("/api/use-joker")
61
+ async def use_joker(joker: JokerUse):
62
+ try:
63
+ return game_service.use_joker(joker.joker_type)
64
+ except Exception as e:
65
+ logger.error(f"Error using joker: {str(e)}")
66
+ raise HTTPException(status_code=500, detail="Internal server error")
67
+
68
+ @app.get("/api/visualization")
69
+ async def get_visualization():
70
+ try:
71
+ state = game_service.get_state()
72
+ return visualization_service.prepare_3d_visualization(
73
+ state["target_word"],
74
+ state["guessed_words"]
75
+ )
76
+ except Exception as e:
77
+ logger.error(f"Error getting visualization: {str(e)}")
78
+ raise HTTPException(status_code=500, detail="Internal server error")
79
+
80
+ @app.get("/")
81
+ async def root():
82
+ """Health check endpoint"""
83
+ return {"status": "ok", "message": "Semantix API is running"}
config/__pycache__/game_config.cpython-311.pyc ADDED
Binary file (2.77 kB). View file
 
config/game_config.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ WordVerse Game Configuration
3
+ This file contains all configurable parameters for the WordVerse game.
4
+ """
5
+ from typing import Dict, Any
6
+
7
+ # Main configuration dictionary
8
+ GAME_CONFIG: Dict[str, Any] = {
9
+ # Difficulty Settings
10
+ "difficulty": {
11
+ "easy": {
12
+ "jokers_high_similarity": 3,
13
+ "jokers_medium_similarity": 3,
14
+ "words_per_joker": 5,
15
+ "similarity_threshold": 0.99, # Threshold to find the word
16
+ "time_limit": 300, # in seconds
17
+ },
18
+ "medium": {
19
+ "jokers_high_similarity": 2,
20
+ "jokers_medium_similarity": 2,
21
+ "words_per_joker": 3,
22
+ "similarity_threshold": 0.995,
23
+ "time_limit": 240,
24
+ },
25
+ "hard": {
26
+ "jokers_high_similarity": 1,
27
+ "jokers_medium_similarity": 1,
28
+ "words_per_joker": 2,
29
+ "similarity_threshold": 0.998,
30
+ "time_limit": 180,
31
+ }
32
+ },
33
+
34
+ # Joker System
35
+ "jokers": {
36
+ "similarity_ranges": {
37
+ "high": {
38
+ "min": 0.7,
39
+ "max": 0.8
40
+ },
41
+ "medium": {
42
+ "min": 0.6,
43
+ "max": 0.7
44
+ }
45
+ },
46
+ "cooldown": 3, # Number of guesses required between joker uses
47
+ },
48
+
49
+ # Scoring System
50
+ "scoring": {
51
+ "base_points": 1000,
52
+ "time_bonus": {
53
+ "enabled": True,
54
+ "points_per_second": 10,
55
+ },
56
+ "joker_penalty": {
57
+ "high_similarity": -100,
58
+ "medium_similarity": -50,
59
+ },
60
+ "streak_bonus": {
61
+ "enabled": True,
62
+ "threshold": 0.8, # Similarity threshold for streak
63
+ "multiplier": 1.5,
64
+ }
65
+ },
66
+
67
+ # Game Rules
68
+ "rules": {
69
+ "max_attempts": 0, # 0 for unlimited
70
+ "min_word_length": 3,
71
+ "show_target_word": False, # If false, target word is hidden until found
72
+ "allow_partial_matches": True,
73
+ },
74
+
75
+ # UI/UX
76
+ "interface": {
77
+ "history_size": 50, # Number of words to show in history
78
+ "visualization_auto_toggle": True, # Auto show visualization on key moments
79
+ "visualization_moments": ["word_found", "joker_used"],
80
+ "feedback_levels": ["very_cold", "cold", "warm", "hot", "very_hot"],
81
+ },
82
+
83
+ # Word Selection
84
+ "word_selection": {
85
+ "categories": ["general", "science", "nature", "technology"],
86
+ "difficulty_weights": {
87
+ "easy": {"common": 0.8, "uncommon": 0.2},
88
+ "medium": {"common": 0.5, "uncommon": 0.5},
89
+ "hard": {"common": 0.2, "uncommon": 0.8}
90
+ },
91
+ },
92
+
93
+ # Player Progression
94
+ "progression": {
95
+ "levels_enabled": True,
96
+ "xp_per_game": 100,
97
+ "level_thresholds": [0, 1000, 2500, 5000, 10000],
98
+ "rewards": {
99
+ "level_2": {"bonus_joker": "high_similarity"},
100
+ "level_3": {"bonus_time": 60},
101
+ "level_4": {"bonus_joker": "medium_similarity"},
102
+ "level_5": {"unlimited_time": True}
103
+ }
104
+ }
105
+ }
106
+
107
+ # Current active difficulty level
108
+ CURRENT_DIFFICULTY = "medium"
109
+
110
+ def get_current_config() -> Dict[str, Any]:
111
+ """Get the current game configuration based on difficulty."""
112
+ base_config = GAME_CONFIG.copy()
113
+ difficulty_config = base_config["difficulty"][CURRENT_DIFFICULTY]
114
+
115
+ # Merge difficulty-specific settings into base config
116
+ for key, value in difficulty_config.items():
117
+ if key in base_config:
118
+ base_config[key] = value
119
+
120
+ return base_config
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn[standard]==0.24.0
3
+ numpy==1.24.3
4
+ gensim==4.3.2
5
+ python-dotenv==1.0.0
6
+ loguru==0.7.2
7
+ requests==2.31.0
8
+ scikit-learn==1.3.2
9
+ umap-learn==0.5.5
services/__pycache__/game_service.cpython-311.pyc ADDED
Binary file (10.8 kB). View file
 
services/__pycache__/model_downloader.cpython-311.pyc ADDED
Binary file (2.17 kB). View file
 
services/__pycache__/visualization_service.cpython-311.pyc ADDED
Binary file (5.49 kB). View file
 
services/__pycache__/word_service.cpython-311.pyc ADDED
Binary file (12.6 kB). View file
 
services/__pycache__/word_service.cpython-313.pyc ADDED
Binary file (1.82 kB). View file
 
services/game_service.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # file location: backend/services/game_service.py
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from loguru import logger
6
+ import random
7
+ from typing import Dict, List
8
+
9
+ class GameService:
10
+ def __init__(self, word_service):
11
+ self.data_file = Path('data/game_state.json')
12
+ self.words_file = Path('data/word_list.json')
13
+ self.word_service = word_service
14
+ self._ensure_data_file()
15
+
16
+ def _ensure_data_file(self):
17
+ """Initialize game state file if it doesn't exist."""
18
+ if not self.data_file.exists():
19
+ self.data_file.parent.mkdir(exist_ok=True)
20
+ self._save_state(self._create_initial_state())
21
+
22
+ def _create_initial_state(self) -> Dict:
23
+ """Create a new game state with default values from config."""
24
+ from config.game_config import GAME_CONFIG, CURRENT_DIFFICULTY
25
+ difficulty_config = GAME_CONFIG["difficulty"][CURRENT_DIFFICULTY]
26
+
27
+ return {
28
+ 'target_word': self._get_random_word(),
29
+ 'attempts': [],
30
+ 'word_found': False,
31
+ 'similar_words': [],
32
+ 'jokers': {
33
+ 'high_similarity': {
34
+ 'remaining': difficulty_config['jokers_high_similarity'],
35
+ 'words_per_use': difficulty_config['words_per_joker']
36
+ },
37
+ 'medium_similarity': {
38
+ 'remaining': difficulty_config['jokers_medium_similarity'],
39
+ 'words_per_use': difficulty_config['words_per_joker']
40
+ }
41
+ }
42
+ }
43
+
44
+ def reset_game(self) -> Dict:
45
+ """Reset the game with a new random word and fresh jokers."""
46
+ try:
47
+ new_state = self._create_initial_state()
48
+ self._save_state(new_state)
49
+ return new_state
50
+ except Exception:
51
+ logger.exception("Error resetting game")
52
+ raise
53
+
54
+ def use_joker(self, joker_type: str) -> Dict:
55
+ """Use a joker to get words within a specific similarity range."""
56
+ try:
57
+ logger.info(f"Using joker of type: {joker_type}")
58
+ state = self._load_state()
59
+
60
+ # Validate joker type and availability
61
+ if joker_type not in ['high_similarity', 'medium_similarity']:
62
+ logger.error(f"Invalid joker type: {joker_type}")
63
+ raise ValueError("Invalid joker type")
64
+
65
+ joker = state['jokers'][joker_type]
66
+ if joker['remaining'] <= 0:
67
+ logger.warning(f"No {joker_type} jokers remaining")
68
+ raise ValueError("No jokers remaining of this type")
69
+
70
+ # Similarity range
71
+ sim_range = {
72
+ 'high_similarity': (0.7, 0.8),
73
+ 'medium_similarity': (0.6, 0.7)
74
+ }[joker_type]
75
+
76
+ target = state['target_word']
77
+ logger.info(f"Target word: {target}, range: {sim_range}")
78
+
79
+ # Get words in range
80
+ similar_words = self.word_service.get_words_in_range(
81
+ target,
82
+ sim_range[0],
83
+ sim_range[1],
84
+ n=joker['words_per_use']
85
+ )
86
+
87
+ # Log the results
88
+ logger.info(f"Found {len(similar_words)} words using joker:")
89
+ for w in similar_words:
90
+ logger.info(f"- {w['word']} (similarity: {w['similarity']:.3f})")
91
+
92
+ # Update joker count
93
+ joker['remaining'] -= 1
94
+ self._save_state(state)
95
+
96
+ logger.info(f"Remaining {joker_type} jokers: {joker['remaining']}")
97
+
98
+ return {'joker_words': similar_words, 'jokers': state['jokers']}
99
+
100
+ except Exception:
101
+ logger.exception("Error using joker")
102
+ raise
103
+
104
+ def get_center_word_power(self, chosen_words: List[str]) -> Dict[str, float]:
105
+ """
106
+ Compute and return the “center word” based on the user’s chosen words
107
+ and the current target word.
108
+ """
109
+ try:
110
+ # Load current state to get the target word
111
+ state = self._load_state()
112
+ target_word = state['target_word']
113
+
114
+ result = self.word_service.get_center_word(chosen_words, target_word)
115
+ if not result:
116
+ logger.warning("Center word power returned no result.")
117
+ return {}
118
+
119
+ logger.info(f"Center word found: {result['word']} (sim={result['similarity']:.3f})")
120
+ return result
121
+
122
+ except Exception:
123
+ logger.exception("Error computing center word power")
124
+ return {}
125
+
126
+ def _get_random_word(self) -> str:
127
+ """Get a random word from the game's word list."""
128
+ try:
129
+ with open(self.words_file, 'r', encoding='utf-8') as f:
130
+ words_data = json.load(f)
131
+ return random.choice(words_data['words'])
132
+ except Exception:
133
+ logger.exception("Error loading word list")
134
+ return "mathématiques" # fallback word
135
+
136
+ def save_attempt(self, word: str, similarity: float) -> Dict:
137
+ """Save a word attempt and update game state."""
138
+ try:
139
+ if not word or similarity <= 0:
140
+ return self._load_state()
141
+
142
+ state = self._load_state()
143
+ state['attempts'].append({'word': word, 'similarity': similarity})
144
+
145
+ # Check if word is found (similarity > 0.99)
146
+ if similarity > 0.99:
147
+ state['word_found'] = True
148
+ # Get similar words when the target is found
149
+ state['similar_words'] = self.word_service.get_most_similar_words(
150
+ state['target_word'], n=100
151
+ )
152
+
153
+ self._save_state(state)
154
+ return state
155
+ except Exception:
156
+ logger.exception("Error saving attempt")
157
+ raise
158
+
159
+ def _save_state(self, state: Dict) -> None:
160
+ """Save game state to file."""
161
+ try:
162
+ self.data_file.parent.mkdir(exist_ok=True)
163
+ with open(self.data_file, 'w', encoding='utf-8') as f:
164
+ json.dump(state, f, ensure_ascii=False, indent=2)
165
+ except Exception:
166
+ logger.exception("Error saving game state")
167
+ raise
168
+
169
+ def _load_state(self) -> Dict:
170
+ """Load game state from file."""
171
+ try:
172
+ if not self.data_file.exists():
173
+ self._ensure_data_file()
174
+ with open(self.data_file, 'r', encoding='utf-8') as f:
175
+ return json.load(f)
176
+ except Exception:
177
+ logger.exception("Error loading game state")
178
+ raise
179
+
180
+ def get_state(self) -> Dict:
181
+ """Get current game state."""
182
+ try:
183
+ return self._load_state()
184
+ except Exception:
185
+ logger.exception("Error getting game state")
186
+ raise
187
+
188
+ def get_history(self) -> List[Dict]:
189
+ """Get history of attempts."""
190
+ try:
191
+ state = self._load_state()
192
+ return state['attempts']
193
+ except Exception:
194
+ logger.exception("Error getting history")
195
+ return []
196
+
services/model_downloader.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from loguru import logger
4
+ from pathlib import Path
5
+
6
+ def download_model(url: str, model_path: str):
7
+ """Download the model file if it doesn't exist."""
8
+ if os.path.exists(model_path):
9
+ logger.info(f"Model file already exists at {model_path}")
10
+ return
11
+
12
+ logger.info(f"Downloading model from {url}")
13
+ os.makedirs(os.path.dirname(model_path), exist_ok=True)
14
+
15
+ try:
16
+ response = requests.get(url, stream=True)
17
+ response.raise_for_status()
18
+
19
+ total_size = int(response.headers.get('content-length', 0))
20
+ block_size = 1024 # 1 KB
21
+
22
+ with open(model_path, 'wb') as f:
23
+ for data in response.iter_content(block_size):
24
+ f.write(data)
25
+
26
+ logger.info(f"Model downloaded successfully to {model_path}")
27
+ except Exception as e:
28
+ logger.error(f"Error downloading model: {str(e)}")
29
+ raise
services/visualization_service.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # file location: backend/services/visualization_service.py
2
+
3
+ import numpy as np
4
+ import umap # pip install umap-learn
5
+ from loguru import logger
6
+
7
+ class VisualizationService:
8
+ def __init__(self, word_service):
9
+ self.word_service = word_service
10
+
11
+ def _compute_color(self, similarity: float) -> str:
12
+ """
13
+ Given a similarity in [0,1], return an RGB color from blue (0) to red (1).
14
+ """
15
+ # Clamp similarity to [0,1] just in case
16
+ sim = max(0.0, min(1.0, similarity))
17
+ # Simple gradient from blue (0,0,255) to red (255,0,0)
18
+ r = int(sim * 255)
19
+ g = 0
20
+ b = int((1.0 - sim) * 255)
21
+ return f"rgb({r}, {g}, {b})"
22
+
23
+ def prepare_3d_visualization(self, target_word: str, guessed_words: list[str]):
24
+ try:
25
+ embeddings = []
26
+ valid_words = []
27
+
28
+ target_embedding = self.word_service.get_vector(target_word)
29
+ if target_embedding is None:
30
+ return [{
31
+ 'word': "???",
32
+ 'coordinates': [0, 0, 0],
33
+ 'is_target': True,
34
+ 'similarity': 1.0,
35
+ 'color': 'rgb(255, 0, 0)'
36
+ }]
37
+
38
+ embeddings.append(target_embedding)
39
+ valid_words.append(target_word)
40
+
41
+ for word in guessed_words:
42
+ vec = self.word_service.get_vector(word)
43
+ if vec is not None and not np.all(vec == 0):
44
+ embeddings.append(vec)
45
+ valid_words.append(word)
46
+
47
+ # if there's only 1 or 2 embeddings total, no manifold can form
48
+ if len(embeddings) < 3:
49
+ return self._simple_fallback(target_word, valid_words, embeddings)
50
+
51
+ # Otherwise, do UMAP
52
+ embeddings_array = np.array(embeddings)
53
+ neighbors = min(5, len(embeddings) - 1)
54
+
55
+ import umap
56
+ reducer = umap.UMAP(
57
+ n_components=3,
58
+ n_neighbors=neighbors,
59
+ min_dist=0.1,
60
+ metric='cosine',
61
+ random_state=42
62
+ )
63
+ embedding_3d = reducer.fit_transform(embeddings_array)
64
+
65
+ # Re-center target at (0,0,0)
66
+ target_coords = embedding_3d[0]
67
+ embedding_3d -= target_coords
68
+
69
+ result = []
70
+ for i, word in enumerate(valid_words):
71
+ if i == 0:
72
+ # target
73
+ result.append({
74
+ 'word': "???",
75
+ 'coordinates': embedding_3d[i].tolist(),
76
+ 'is_target': True,
77
+ 'similarity': 1.0,
78
+ 'color': 'rgb(255, 0, 0)'
79
+ })
80
+ else:
81
+ sim = self.word_service.calculate_similarity(target_word, word)
82
+ color = self._compute_color(sim)
83
+ result.append({
84
+ 'word': word,
85
+ 'coordinates': embedding_3d[i].tolist(),
86
+ 'is_target': False,
87
+ 'similarity': sim,
88
+ 'color': color
89
+ })
90
+ return result
91
+
92
+ except Exception:
93
+ logger.exception("Error preparing 3D visualization with UMAP")
94
+ return [{
95
+ 'word': "???",
96
+ 'coordinates': [0, 0, 0],
97
+ 'is_target': True,
98
+ 'similarity': 1.0,
99
+ 'color': 'rgb(255, 0, 0)'
100
+ }]
101
+
102
+ def _simple_fallback(self, target_word: str, valid_words: list[str], embeddings: list[np.ndarray]):
103
+ """
104
+ Return a minimal 3D layout without UMAP
105
+ when the dataset is too small to form a manifold.
106
+ """
107
+ # If there's only the target, just place it at the origin.
108
+ if len(embeddings) <= 1:
109
+ return [{
110
+ 'word': "???",
111
+ 'coordinates': [0, 0, 0],
112
+ 'is_target': True,
113
+ 'similarity': 1.0,
114
+ 'color': 'rgb(255, 0, 0)'
115
+ }]
116
+
117
+ # We have at least 2 points (target + 1 guess)
118
+ coords = np.random.randn(len(embeddings), 3) * 0.1
119
+ coords[0] = [0, 0, 0] # target at origin
120
+
121
+ result = []
122
+ for i, word in enumerate(valid_words):
123
+ if i == 0:
124
+ # target
125
+ result.append({
126
+ 'word': "???",
127
+ 'coordinates': coords[i].tolist(),
128
+ 'is_target': True,
129
+ 'similarity': 1.0,
130
+ 'color': 'rgb(255, 0, 0)'
131
+ })
132
+ else:
133
+ sim = self.word_service.calculate_similarity(target_word, word)
134
+ color = self._compute_color(sim)
135
+ result.append({
136
+ 'word': word,
137
+ 'coordinates': coords[i].tolist(),
138
+ 'is_target': False,
139
+ 'similarity': sim,
140
+ 'color': color
141
+ })
142
+
143
+ return result
services/word_service.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from loguru import logger
2
+ import numpy as np
3
+ from typing import List, Dict
4
+ import random
5
+ from gensim.models import KeyedVectors
6
+ import os
7
+ import tempfile
8
+ import requests
9
+
10
+ class WordEmbeddingService:
11
+ _instance = None
12
+ _model = None
13
+
14
+ def __new__(cls):
15
+ if cls._instance is None:
16
+ cls._instance = super(WordEmbeddingService, cls).__new__(cls)
17
+ return cls._instance
18
+
19
+ def __init__(self):
20
+ if not WordEmbeddingService._model:
21
+ self._initialize_model()
22
+
23
+ def _initialize_model(self):
24
+ """Initialize the model only when needed"""
25
+ try:
26
+ # Get model URL from environment variable
27
+ model_url = os.getenv('MODEL_URL', 'https://huggingface.co/Miroir/cc.fr.300.reduced/resolve/main/cc.fr.300.reduced.vec')
28
+
29
+ logger.info("Loading FastText embeddings from URL...")
30
+
31
+ # Create a temporary file to store the model
32
+ with tempfile.NamedTemporaryFile(delete=False) as temp_file:
33
+ # Download the file
34
+ response = requests.get(model_url, stream=True)
35
+ response.raise_for_status()
36
+
37
+ # Write the content to the temporary file
38
+ for chunk in response.iter_content(chunk_size=8192):
39
+ if chunk:
40
+ temp_file.write(chunk)
41
+
42
+ temp_file.flush()
43
+
44
+ # Load the model from the temporary file
45
+ WordEmbeddingService._model = KeyedVectors.load_word2vec_format(temp_file.name)
46
+
47
+ # Build vocabulary vectors
48
+ self.vocab_vectors = {
49
+ word: WordEmbeddingService._model[word]
50
+ for word in WordEmbeddingService._model.index_to_key
51
+ }
52
+
53
+ logger.info(f"FastText model loaded successfully with "
54
+ f"{len(self.vocab_vectors)} words in the vocabulary.")
55
+
56
+ except Exception as e:
57
+ logger.exception(f"Failed to load FastText model: {str(e)}")
58
+ raise
59
+
60
+ def _ensure_model_loaded(self):
61
+ """Ensure the model is loaded before any operation"""
62
+ if not WordEmbeddingService._model:
63
+ self._initialize_model()
64
+
65
+ def calculate_similarity(self, word1: str, word2: str) -> float:
66
+ self._ensure_model_loaded()
67
+ try:
68
+ w1, w2 = word1.lower(), word2.lower()
69
+ if w1 not in WordEmbeddingService._model or w2 not in WordEmbeddingService._model:
70
+ logger.warning(f"One or both words not in FastText vocab: '{word1}', '{word2}'")
71
+ return 0.0
72
+ return float(WordEmbeddingService._model.similarity(w1, w2))
73
+ except Exception:
74
+ logger.exception(f"Error calculating similarity between '{word1}' and '{word2}'")
75
+ return 0.0
76
+
77
+
78
+ def get_vector(self, word: str) -> np.ndarray:
79
+ """
80
+ Retrieve the vector representation of a word.
81
+ Returns None if the word is not found in the FastText vocabulary.
82
+ """
83
+ try:
84
+ w = word.lower()
85
+ if w not in self.model:
86
+ logger.warning(f"No vector found for word: {word}")
87
+ return None
88
+ return self.model[w]
89
+ except Exception:
90
+ logger.exception(f"Error getting vector for word: {word}")
91
+ return None
92
+
93
+ def get_most_similar_words(self, target_word: str, n: int = 100) -> List[Dict[str, float]]:
94
+ """
95
+ Return the `n` most similar words to `target_word`.
96
+ An empty list is returned if `target_word` is out of vocabulary.
97
+ """
98
+ try:
99
+ w = target_word.lower()
100
+ if w not in self.model:
101
+ logger.warning(f"Target word not found in vocab: {target_word}")
102
+ return []
103
+ similar = self.model.most_similar(w, topn=n)
104
+ return [{'word': word, 'similarity': float(sim)} for word, sim in similar]
105
+ except Exception:
106
+ logger.exception(f"Error finding similar words for: {target_word}")
107
+ return []
108
+
109
+ def get_words_in_range(self, target_word: str, min_similarity: float,
110
+ max_similarity: float, n: int = 5) -> List[Dict[str, float]]:
111
+ """
112
+ Retrieve up to `n` words whose similarity to `target_word`
113
+ lies within [min_similarity, max_similarity].
114
+ The results are randomly sampled from all words meeting the criterion.
115
+ """
116
+ try:
117
+ logger.info(f"Finding words for '{target_word}' in range "
118
+ f"[{min_similarity}, {max_similarity}]")
119
+ target_vec = self.get_vector(target_word)
120
+ if target_vec is None:
121
+ logger.warning(f"No vector for target word: {target_word}")
122
+ return []
123
+
124
+ similarities = []
125
+ norm_target = np.linalg.norm(target_vec)
126
+
127
+ # Sample from vocabulary to improve performance
128
+ sample_size = min(100000, len(self.vocab_vectors))
129
+ sampled_words = random.sample(list(self.vocab_vectors.keys()), sample_size)
130
+
131
+ for vocab_word in sampled_words:
132
+ if vocab_word == target_word.lower():
133
+ continue
134
+
135
+ vector = self.vocab_vectors[vocab_word]
136
+ sim = float(np.dot(vector, target_vec) /
137
+ (np.linalg.norm(vector) * norm_target))
138
+
139
+ if min_similarity <= sim <= max_similarity:
140
+ similarities.append({'word': vocab_word, 'similarity': sim})
141
+
142
+ logger.info(f"Found {len(similarities)} words in the range.")
143
+ if not similarities:
144
+ return []
145
+
146
+ similarities.sort(key=lambda x: x['similarity'], reverse=True)
147
+ selected_words = random.sample(similarities, min(n, len(similarities)))
148
+
149
+ for w in selected_words:
150
+ logger.debug(f"Selected: {w['word']} (sim={w['similarity']:.3f})")
151
+ return selected_words
152
+
153
+ except Exception:
154
+ logger.exception(f"Error finding words in range for: {target_word}")
155
+ return []
156
+
157
+ def get_center_word(self, chosen_words: List[str], target_word: str) -> Dict[str, float]:
158
+ """
159
+ Compute the centroid of (chosen_words + target_word) vectors,
160
+ then find the single word in the vocabulary whose vector is closest
161
+ to that centroid (in cosine similarity).
162
+ """
163
+ if not chosen_words:
164
+ logger.warning("No chosen words provided.")
165
+ return {}
166
+
167
+ vectors = []
168
+ for w in chosen_words:
169
+ vec = self.get_vector(w)
170
+ if vec is not None:
171
+ vectors.append(vec)
172
+
173
+ target_vec = self.get_vector(target_word)
174
+ if target_vec is not None:
175
+ vectors.append(target_vec)
176
+
177
+ if not vectors:
178
+ logger.warning("No valid vectors found among chosen or target words.")
179
+ return {}
180
+
181
+ centroid = np.mean(vectors, axis=0)
182
+ best_word = None
183
+ best_similarity = -1.0
184
+
185
+ # Sample from vocabulary to improve performance
186
+ sample_size = min(100000, len(self.vocab_vectors))
187
+ sampled_words = random.sample(list(self.vocab_vectors.keys()), sample_size)
188
+
189
+ for vocab_word in sampled_words:
190
+ if vocab_word == target_word.lower() or vocab_word in [cw.lower() for cw in chosen_words]:
191
+ continue
192
+
193
+ vector = self.vocab_vectors[vocab_word]
194
+ sim = float(np.dot(vector, centroid) /
195
+ (np.linalg.norm(vector) * np.linalg.norm(centroid)))
196
+
197
+ if sim > best_similarity:
198
+ best_similarity = sim
199
+ best_word = vocab_word
200
+
201
+ if best_word is None:
202
+ logger.warning("Could not find a center word.")
203
+ return {}
204
+
205
+ return {"word": best_word, "similarity": best_similarity}