vimalk78 committed on
Commit
9cd7541
·
1 Parent(s): 5676df3

fix: clean up repository after removing LFS cache files

Browse files
Dockerfile CHANGED
@@ -31,6 +31,7 @@ RUN pip install --no-cache-dir --upgrade pip && \
31
  # Copy all source code
32
  COPY crossword-app/frontend/ ./frontend/
33
  COPY crossword-app/backend-py/ ./backend-py/
 
34
 
35
  # Copy cache directory with pre-built models and NLTK data
36
  COPY cache-dir/ ./cache-dir/
@@ -84,6 +85,10 @@ ENV PIP_NO_CACHE_DIR=1
84
  ENV CACHE_DIR=/app/backend-py/cache
85
  ENV NLTK_DATA=/app/backend-py/cache/nltk_data
86
 
 
 
 
 
87
  # Health check
88
  # HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
89
  # CMD curl -f http://localhost:7860/health || exit 1
 
31
  # Copy all source code
32
  COPY crossword-app/frontend/ ./frontend/
33
  COPY crossword-app/backend-py/ ./backend-py/
34
+ COPY crossword-app/words/ ./backend-py/words/
35
 
36
  # Copy cache directory with pre-built models and NLTK data
37
  COPY cache-dir/ ./cache-dir/
 
85
  ENV CACHE_DIR=/app/backend-py/cache
86
  ENV NLTK_DATA=/app/backend-py/cache/nltk_data
87
 
88
+ # Set vocabulary source and path for Norvig vocabulary
89
+ ENV VOCAB_SOURCE=norvig
90
+ ENV NORVIG_VOCAB_PATH=/app/backend-py/words/norvig/count_1w100k.txt
91
+
92
  # Health check
93
  # HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
94
  # CMD curl -f http://localhost:7860/health || exit 1
crossword-app/backend-py/.env.example CHANGED
@@ -10,6 +10,31 @@ EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2
10
  WORD_SIMILARITY_THRESHOLD=0.65
11
  MAX_VOCAB_SIZE=30000
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # HuggingFace Configuration (if needed for cloud inference)
14
  HUGGINGFACE_API_KEY=your_huggingface_api_key_here
15
 
 
10
  WORD_SIMILARITY_THRESHOLD=0.65
11
  MAX_VOCAB_SIZE=30000
12
 
13
+ # Vocabulary Configuration
14
+ # Options: "norvig" (default, recommended), "wordfreq" (legacy)
15
+ VOCAB_SOURCE=norvig
16
+ NORVIG_VOCAB_PATH=words/norvig/count_1w100k.txt
17
+ THEMATIC_VOCAB_SIZE_LIMIT=100000
18
+ THEMATIC_MODEL_NAME=all-mpnet-base-v2
19
+
20
+ # Cache Configuration
21
+ CACHE_DIR=./cache-dir
22
+
23
+ # Debug and Development Options
24
+ ENABLE_DEBUG_TAB=true
25
+ ENABLE_DISTRIBUTION_NORMALIZATION=false
26
+
27
+ # Multi-topic Configuration
28
+ MULTI_TOPIC_METHOD=soft_minimum
29
+ SOFT_MIN_BETA=10.0
30
+ SOFT_MIN_ADAPTIVE=true
31
+ SOFT_MIN_MIN_WORDS=15
32
+ SOFT_MIN_MAX_RETRIES=5
33
+ SOFT_MIN_BETA_DECAY=0.7
34
+
35
+ # Normalization Configuration (when enabled)
36
+ NORMALIZATION_METHOD=similarity_range
37
+
38
  # HuggingFace Configuration (if needed for cloud inference)
39
  HUGGINGFACE_API_KEY=your_huggingface_api_key_here
40
 
crossword-app/backend-py/requirements.txt CHANGED
@@ -41,7 +41,7 @@ torch==2.5.1
41
  transformers==4.47.1
42
  scikit-learn==1.5.2
43
  huggingface-hub==0.26.2
44
- wordfreq==3.1.0
45
 
46
  # NLTK dependencies for WordNet clue generation
47
  nltk==3.8.1
 
41
  transformers==4.47.1
42
  scikit-learn==1.5.2
43
  huggingface-hub==0.26.2
44
+ # wordfreq==3.1.0 # Optional: fallback vocabulary source (use VOCAB_SOURCE=wordfreq)
45
 
46
  # NLTK dependencies for WordNet clue generation
47
  nltk==3.8.1
crossword-app/backend-py/src/services/norvig_vocabulary_manager.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Norvig Vocabulary Manager
4
+
5
+ Provides a WordFreq-compatible interface using Peter Norvig's curated word lists.
6
+ Replaces the WordFreq-based vocabulary system with clean, high-quality word data
7
+ from norvig.com/ngrams/count_1w100k.txt.
8
+
9
+ Features:
10
+ - Clean vocabulary without web-scraped junk or typos
11
+ - Google-quality curation by Peter Norvig (Director of Research)
12
+ - Maintains WordFreq compatibility for seamless integration
13
+ - Preserves all existing frequency tier and difficulty systems
14
+
15
+ Environment Variables:
16
+ - NORVIG_VOCAB_PATH: Path to Norvig word count file (default: words/norvig/count_1w100k.txt; relative paths are resolved against the backend-py directory)
17
+ - CACHE_DIR: Cache directory for processed vocabulary data
18
+ """
19
+
20
+ import os
21
+ import pickle
22
+ import logging
23
+ import numpy as np
24
+ from pathlib import Path
25
+ from typing import List, Tuple, Dict, Optional, Counter
26
+ from collections import Counter
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
class NorgivVocabularyManager:
    """
    Norvig vocabulary manager that provides a WordFreq-compatible interface.

    Loads Peter Norvig's word-count file (one ``WORD<TAB>COUNT`` entry per
    line), filters it down to crossword-suitable words, and caches the result
    as two pickle files inside ``cache_dir``.

    Attributes:
        cache_dir: Directory holding the pickle caches (created on init).
        vocab_size_limit: Maximum number of words kept after filtering.
        norvig_file_path: Resolved path of the Norvig word-count file.
        vocab_cache_path: Pickle file holding the filtered word list.
        frequency_cache_path: Pickle file holding the word -> count Counter.
        vocabulary: Sorted list of lower-case words (empty until loaded).
        word_frequencies: Counter mapping lower-case word -> raw count.
        is_loaded: True once ``load_vocabulary`` has succeeded.

    NOTE: the class name is misspelled ("Norgiv"); it is kept because other
    modules import it under this name. ``NorvigVocabularyManager`` below is a
    correctly spelled alias.
    """

    # Very common words that make dull crossword entries. Built once at class
    # creation instead of on every _is_crossword_suitable() call.
    _BORING_WORDS = frozenset({
        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'this', 'that',
        'with', 'from', 'they', 'were', 'been', 'have', 'their', 'said', 'each',
        'which', 'what', 'there', 'will', 'more', 'when', 'some', 'like', 'into',
        'time', 'very', 'only', 'has', 'had', 'who', 'its', 'now', 'find', 'long',
        'down', 'day', 'did', 'get', 'come', 'made', 'may', 'part',
    })

    def __init__(self, cache_dir: Optional[str] = None, vocab_size_limit: Optional[int] = None):
        """Initialize Norvig vocabulary manager.

        Args:
            cache_dir: Directory for caching vocabulary and frequency data.
                Falls back to the CACHE_DIR environment variable, then to a
                ``model_cache`` directory next to this file.
            vocab_size_limit: Maximum vocabulary size. When falsy, the value
                comes from THEMATIC_VOCAB_SIZE_LIMIT, then MAX_VOCABULARY_SIZE,
                then 100000.
        """
        if cache_dir is None:
            cache_dir = os.getenv("CACHE_DIR")
            if cache_dir is None:
                cache_dir = os.path.join(os.path.dirname(__file__), 'model_cache')

        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Vocabulary size configuration
        self.vocab_size_limit = vocab_size_limit or int(
            os.getenv("THEMATIC_VOCAB_SIZE_LIMIT",
                      os.getenv("MAX_VOCABULARY_SIZE", "100000")))

        # Norvig file configuration
        norvig_path = os.getenv("NORVIG_VOCAB_PATH", "words/norvig/count_1w100k.txt")
        if os.path.isabs(norvig_path):
            self.norvig_file_path = Path(norvig_path)
        else:
            # Relative paths are anchored three directory levels above this
            # file (services -> src -> backend-py).
            backend_root = Path(__file__).parent.parent.parent
            self.norvig_file_path = backend_root / norvig_path

        # Cache paths - use "norvig" prefix to distinguish from wordfreq cache
        self.vocab_cache_path = self.cache_dir / f"norvig_vocabulary_{self.vocab_size_limit}.pkl"
        self.frequency_cache_path = self.cache_dir / f"norvig_frequencies_{self.vocab_size_limit}.pkl"

        # Loaded data
        self.vocabulary: List[str] = []
        self.word_frequencies: Counter = Counter()
        self.is_loaded = False

        # Memoized maximum count used by get_word_frequency()
        self._max_count = 1
        self._max_count_key = None

        logger.info("📝 Norvig Vocabulary Manager initialized")
        logger.info(f" 📂 Cache dir: {self.cache_dir}")
        logger.info(f" 📊 Vocab limit: {self.vocab_size_limit:,}")
        logger.info(f" 📄 Norvig file: {self.norvig_file_path}")

    def load_vocabulary(self) -> Tuple[List[str], Counter]:
        """Load vocabulary and frequency data, with caching.

        Returns:
            ``(vocabulary, word_frequencies)``. Repeated calls return the
            already loaded data without touching the disk.

        Raises:
            FileNotFoundError: If no usable cache exists and the Norvig file
                is missing.
        """
        if self.is_loaded:
            return self.vocabulary, self.word_frequencies

        if self._load_from_cache():
            logger.info(f"✅ Loaded Norvig vocabulary from cache: {len(self.vocabulary):,} words")
        else:
            logger.info("🔄 Generating vocabulary from Norvig file...")
            self._generate_vocabulary_from_norvig()
            self._save_to_cache()

        self.is_loaded = True
        return self.vocabulary, self.word_frequencies

    def _load_from_cache(self) -> bool:
        """Load vocabulary and frequencies from the pickle caches.

        Instance state is only replaced after BOTH files were read and found
        non-empty, so a failed load never leaves half-loaded data behind.

        Returns:
            True when both caches were loaded, False otherwise (missing,
            unreadable, empty, or corrupt files). Never raises.
        """
        vocab_path = self.vocab_cache_path
        freq_path = self.frequency_cache_path

        try:
            missing = []
            if not vocab_path.exists():
                missing.append(f"vocabulary ({vocab_path})")
            if not freq_path.exists():
                missing.append(f"frequency ({freq_path})")
            if missing:
                logger.info(f"📂 Cache files missing: {', '.join(missing)}")
                return False

            logger.info("📦 Loading Norvig vocabulary from cache...")
            logger.info(f" Vocab cache: {vocab_path}")
            logger.info(f" Freq cache: {freq_path}")

            # Validate cache files are readable
            if not os.access(vocab_path, os.R_OK):
                logger.warning(f"⚠️ Vocabulary cache file not readable: {vocab_path}")
                return False
            if not os.access(freq_path, os.R_OK):
                logger.warning(f"⚠️ Frequency cache file not readable: {freq_path}")
                return False

            # SECURITY NOTE: pickle.load executes arbitrary code embedded in
            # the file. The cache directory must only contain files written
            # by this application.
            with open(vocab_path, 'rb') as f:
                vocabulary = pickle.load(f)
            with open(freq_path, 'rb') as f:
                word_frequencies = pickle.load(f)

            # Validate loaded data
            if not vocabulary or not word_frequencies:
                logger.warning("⚠️ Cache files contain empty data")
                return False

            self.vocabulary = vocabulary
            self.word_frequencies = word_frequencies
            self._max_count_key = None  # force recomputation of the max count

            logger.info(f"✅ Loaded {len(self.vocabulary):,} words and {len(self.word_frequencies):,} frequencies from cache")
            return True
        except Exception as e:
            # Best effort: any cache problem falls back to regeneration.
            logger.warning(f"⚠️ Cache loading failed: {e}")
            return False

    def _save_to_cache(self):
        """Save vocabulary and frequencies to the pickle caches (best effort).

        Failures are logged as warnings and swallowed; the in-memory data
        stays usable even when the cache directory is not writable.
        """
        try:
            logger.info("💾 Saving Norvig vocabulary to cache...")

            with open(self.vocab_cache_path, 'wb') as f:
                pickle.dump(self.vocabulary, f)

            with open(self.frequency_cache_path, 'wb') as f:
                pickle.dump(self.word_frequencies, f)

            logger.info("✅ Norvig vocabulary cached successfully")
        except Exception as e:
            logger.warning(f"⚠️ Cache saving failed: {e}")

    def _generate_vocabulary_from_norvig(self):
        """Generate the filtered vocabulary from the Norvig word count file.

        Populates ``self.vocabulary`` (sorted, lower-case, unique) and
        ``self.word_frequencies`` (lower-case word -> raw count), keeping at
        most ``vocab_size_limit`` words.

        Raises:
            FileNotFoundError: If the Norvig file does not exist.
        """
        if not self.norvig_file_path.exists():
            raise FileNotFoundError(f"Norvig vocabulary file not found: {self.norvig_file_path}")

        logger.info(f"📚 Loading words from Norvig file: {self.norvig_file_path}")

        raw_word_counts = self._load_norvig_file()
        logger.info(f"📥 Loaded {len(raw_word_counts):,} raw words from Norvig file")

        # Visit words from most to least frequent so the size limit always
        # keeps the most common ones, even if the file is not pre-sorted.
        # sorted() is stable, so an already sorted file keeps its order.
        by_frequency = sorted(raw_word_counts.items(), key=lambda item: item[1], reverse=True)

        # Apply crossword-suitable filtering
        filtered_words = []
        frequency_data = Counter()

        logger.info("🔍 Applying crossword filtering...")
        for word, count in by_frequency:
            if not self._is_crossword_suitable(word):
                continue

            word_lower = word.lower()
            filtered_words.append(word_lower)
            frequency_data[word_lower] = count

            if len(filtered_words) >= self.vocab_size_limit:
                break

        # Remove duplicates and sort
        self.vocabulary = sorted(set(filtered_words))
        self.word_frequencies = frequency_data
        self._max_count_key = None  # force recomputation of the max count

        logger.info(f"✅ Generated filtered Norvig vocabulary: {len(self.vocabulary):,} words")
        logger.info(f"📊 Frequency data coverage: {len(self.word_frequencies):,} words")

        # Log some stats about the filtered vocabulary
        if self.vocabulary:
            lengths = [len(word) for word in self.vocabulary]
            logger.info(f"📏 Word length range: {min(lengths)}-{max(lengths)} chars")
            logger.info(f"🔢 Average word length: {np.mean(lengths):.1f} chars")

        if self.word_frequencies:
            counts = list(self.word_frequencies.values())
            logger.info(f"📈 Frequency range: {min(counts):,} - {max(counts):,}")

    def _load_norvig_file(self) -> Dict[str, int]:
        """Load the Norvig word count file and return a WORD -> count mapping.

        Each non-blank line must be ``WORD<TAB>COUNT``. Malformed lines are
        logged and skipped. Keys are upper-cased.

        Raises:
            Exception: Any error opening or reading the file is logged and
                re-raised unchanged.
        """
        word_counts = {}

        try:
            with open(self.norvig_file_path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue

                    # Parse tab-separated format: WORD\tCOUNT
                    parts = line.split('\t')
                    if len(parts) != 2:
                        logger.warning(f"⚠️ Invalid format on line {line_num}: {line}")
                        continue

                    word, count_str = parts
                    try:
                        word_counts[word.upper()] = int(count_str)
                    except ValueError:
                        logger.warning(f"⚠️ Invalid count on line {line_num}: {line}")

            return word_counts

        except Exception as e:
            logger.error(f"❌ Failed to load Norvig file {self.norvig_file_path}: {e}")
            raise

    def _is_crossword_suitable(self, word: str) -> bool:
        """Check if a word is suitable for crosswords.

        A word is accepted when it is 3-12 alphabetic characters, is not in
        the boring-word list, does not look like a simple plural, and has at
        least 60% unique characters.
        """
        word = word.lower().strip()

        # Length check (3-12 characters for crosswords)
        if not 3 <= len(word) <= 12:
            return False

        # Must be alphabetic only
        if not word.isalpha():
            return False

        # Skip boring/common words
        if word in self._BORING_WORDS:
            return False

        # Skip obvious plurals (simple heuristic): longer than 4 characters,
        # ending in "s" but not in "ss", "us" or "is".
        if len(word) > 4 and word.endswith('s') and not word.endswith(('ss', 'us', 'is')):
            return False

        # Skip words with many repeated characters (less than 60% unique)
        if len(set(word)) < len(word) * 0.6:
            return False

        return True

    def _max_frequency_count(self) -> int:
        """Return the largest raw count in ``word_frequencies`` (1 if empty).

        The value is memoized and recomputed whenever the Counter object or
        its size changes, so get_word_frequency() is O(1) per lookup instead
        of scanning every count on each call.

        NOTE: an in-place change of an existing count (same object, same
        size) is not detected; set ``_max_count_key = None`` after such edits.
        """
        frequencies = self.word_frequencies
        key = (id(frequencies), len(frequencies))
        if getattr(self, "_max_count_key", None) != key:
            self._max_count = max(frequencies.values()) if frequencies else 1
            self._max_count_key = key
        return self._max_count

    def get_word_frequency(self, word: str) -> float:
        """Get word frequency as a normalized score (compatible with WordFreq API).

        Args:
            word: Word to look up (case-insensitive).

        Returns:
            ``log10(count + 1) / log10(max_count + 1)``, which is 1.0 for the
            most frequent word, or 0.0 when the word is unknown.
        """
        word_lower = word.lower()
        if word_lower not in self.word_frequencies:
            return 0.0

        count = self.word_frequencies[word_lower]
        max_count = self._max_frequency_count()

        # Normalize to 0-1 range with log scaling
        normalized_freq = np.log10(count + 1) / np.log10(max_count + 1)
        return float(normalized_freq)

    def get_vocabulary_stats(self) -> Dict:
        """Get statistics about the loaded vocabulary (loading it if needed).

        Returns:
            Dict with ``total_words``, ``vocabulary_source``, ``norvig_file``
            and ``vocab_size_limit``; plus min/max/avg word length when the
            vocabulary is non-empty, and min/max/total frequency when
            frequency data is present.
        """
        if not self.is_loaded:
            self.load_vocabulary()

        stats = {
            "total_words": len(self.vocabulary),
            "vocabulary_source": "norvig",
            "norvig_file": str(self.norvig_file_path),
            "vocab_size_limit": self.vocab_size_limit,
        }

        if self.vocabulary:
            lengths = [len(word) for word in self.vocabulary]
            stats.update({
                "min_word_length": min(lengths),
                "max_word_length": max(lengths),
                # Plain float keeps the dict free of NumPy scalar types.
                "avg_word_length": float(np.mean(lengths)),
            })

        if self.word_frequencies:
            counts = list(self.word_frequencies.values())
            stats.update({
                "min_frequency": min(counts),
                "max_frequency": max(counts),
                "total_frequency": sum(counts),
            })

        return stats


# Correctly spelled alias; the original (misspelled) name stays importable.
NorvigVocabularyManager = NorgivVocabularyManager
crossword-app/backend-py/src/services/thematic_word_service.py CHANGED
@@ -50,12 +50,21 @@ import time
50
  from collections import Counter
51
  from pathlib import Path
52
 
53
- # WordFreq imports (assumed to be available)
54
- from wordfreq import word_frequency, zipf_frequency, top_n_list
55
-
56
  # Use backend's logging configuration
57
  logger = logging.getLogger(__name__)
58
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def get_timestamp():
60
  return datetime.now().strftime("%H:%M:%S")
61
 
@@ -65,7 +74,7 @@ def get_datetimestamp():
65
 
66
  class VocabularyManager:
67
  """
68
- Centralized vocabulary management using WordFreq as the single source.
69
  Handles loading, filtering, caching, and frequency data generation.
70
  """
71
 
@@ -74,7 +83,7 @@ class VocabularyManager:
74
 
75
  Args:
76
  cache_dir: Directory for caching vocabulary and embeddings
77
- vocab_size_limit: Maximum vocabulary size (None for full WordFreq vocabulary)
78
  """
79
  if cache_dir is None:
80
  # Check environment variable for cache directory
@@ -89,9 +98,29 @@ class VocabularyManager:
89
  self.vocab_size_limit = vocab_size_limit or int(os.getenv("THEMATIC_VOCAB_SIZE_LIMIT",
90
  os.getenv("MAX_VOCABULARY_SIZE", "100000")))
91
 
92
- # Cache paths
93
- self.vocab_cache_path = self.cache_dir / f"vocabulary_{self.vocab_size_limit}.pkl"
94
- self.frequency_cache_path = self.cache_dir / f"frequencies_{self.vocab_size_limit}.pkl"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  # Loaded data
97
  self.vocabulary: List[str] = []
@@ -102,7 +131,14 @@ class VocabularyManager:
102
  """Load vocabulary and frequency data, with caching."""
103
  if self.is_loaded:
104
  return self.vocabulary, self.word_frequencies
 
 
 
 
 
 
105
 
 
106
  # Try loading from cache
107
  if self._load_from_cache():
108
  logger.info(f"✅ Loaded vocabulary from cache: {len(self.vocabulary):,} words")
@@ -179,6 +215,9 @@ class VocabularyManager:
179
 
180
  def _generate_vocabulary_from_wordfreq(self):
181
  """Generate filtered vocabulary from WordFreq database."""
 
 
 
182
  logger.info(f"📚 Fetching top {self.vocab_size_limit:,} words from WordFreq...")
183
 
184
  # Get comprehensive word list from WordFreq
@@ -282,6 +321,28 @@ class ThematicWordService:
282
  int(os.getenv("THEMATIC_VOCAB_SIZE_LIMIT",
283
  os.getenv("MAX_VOCABULARY_SIZE", "100000"))))
284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  # Configuration parameters for softmax weighted selection
286
  self.similarity_temperature = float(os.getenv("SIMILARITY_TEMPERATURE", "0.2"))
287
  self.use_softmax_selection = os.getenv("USE_SOFTMAX_SELECTION", "true").lower() == "true"
@@ -312,7 +373,7 @@ class ThematicWordService:
312
  self.enable_debug_tab = os.getenv("ENABLE_DEBUG_TAB", "false").lower() == "true"
313
 
314
  # Core components
315
- self.vocab_manager = VocabularyManager(str(self.cache_dir), self.vocab_size_limit)
316
  self.model: Optional[SentenceTransformer] = None
317
 
318
  # Loaded data
@@ -323,8 +384,8 @@ class ThematicWordService:
323
  self.tier_descriptions: Dict[str, str] = {}
324
  self.word_percentiles: Dict[str, float] = {}
325
 
326
- # Cache paths for embeddings
327
- vocab_hash = f"{self.model_name.replace('/', '_')}_{self.vocab_size_limit}"
328
  self.embeddings_cache_path = self.cache_dir / f"embeddings_{vocab_hash}.npy"
329
 
330
  self.is_initialized = False
@@ -1330,28 +1391,40 @@ class ThematicWordService:
1330
 
1331
  def get_cache_status(self) -> Dict[str, Any]:
1332
  """Get detailed cache status information."""
1333
- vocab_exists = self.vocab_manager.vocab_cache_path.exists()
1334
- freq_exists = self.vocab_manager.frequency_cache_path.exists()
 
 
 
 
 
 
 
 
 
 
 
 
1335
  embeddings_exists = self.embeddings_cache_path.exists()
1336
 
1337
  status = {
1338
  "cache_directory": str(self.cache_dir),
1339
  "vocabulary_cache": {
1340
- "path": str(self.vocab_manager.vocab_cache_path),
1341
  "exists": vocab_exists,
1342
- "readable": vocab_exists and os.access(self.vocab_manager.vocab_cache_path, os.R_OK)
1343
  },
1344
  "frequency_cache": {
1345
- "path": str(self.vocab_manager.frequency_cache_path),
1346
  "exists": freq_exists,
1347
- "readable": freq_exists and os.access(self.vocab_manager.frequency_cache_path, os.R_OK)
1348
  },
1349
  "embeddings_cache": {
1350
  "path": str(self.embeddings_cache_path),
1351
  "exists": embeddings_exists,
1352
  "readable": embeddings_exists and os.access(self.embeddings_cache_path, os.R_OK)
1353
  },
1354
- "complete": vocab_exists and freq_exists and embeddings_exists
1355
  }
1356
 
1357
  # Add size information if files exist
@@ -1519,7 +1592,9 @@ class ThematicWordService:
1519
  "custom_sentence": custom_sentence,
1520
  "multi_theme": multi_theme,
1521
  "thematic_pool_size": thematic_pool,
1522
- "min_similarity": min_similarity
 
 
1523
  },
1524
  "thematic_pool": [
1525
  {
 
50
  from collections import Counter
51
  from pathlib import Path
52
 
 
 
 
53
  # Use backend's logging configuration
54
  logger = logging.getLogger(__name__)
55
 
56
+ # WordFreq imports (for backward compatibility)
57
+ try:
58
+ from wordfreq import word_frequency, zipf_frequency, top_n_list
59
+ WORDFREQ_AVAILABLE = True
60
+ except ImportError:
61
+ logger.warning("WordFreq not available, using Norvig vocabulary only")
62
+ WORDFREQ_AVAILABLE = False
63
+
64
+ # Norvig vocabulary imports
65
+ from .norvig_vocabulary_manager import NorgivVocabularyManager
66
+
67
+
68
  def get_timestamp():
69
  return datetime.now().strftime("%H:%M:%S")
70
 
 
74
 
75
  class VocabularyManager:
76
  """
77
+ Centralized vocabulary management supporting both WordFreq and Norvig sources.
78
  Handles loading, filtering, caching, and frequency data generation.
79
  """
80
 
 
83
 
84
  Args:
85
  cache_dir: Directory for caching vocabulary and embeddings
86
+ vocab_size_limit: Maximum vocabulary size (None for full vocabulary)
87
  """
88
  if cache_dir is None:
89
  # Check environment variable for cache directory
 
98
  self.vocab_size_limit = vocab_size_limit or int(os.getenv("THEMATIC_VOCAB_SIZE_LIMIT",
99
  os.getenv("MAX_VOCABULARY_SIZE", "100000")))
100
 
101
+ # Vocabulary source configuration
102
+ self.vocab_source = os.getenv("VOCAB_SOURCE", "norvig").lower()
103
+ logger.info(f"📚 Vocabulary source: {self.vocab_source}")
104
+
105
+ # Initialize appropriate vocabulary manager
106
+ if self.vocab_source == "norvig":
107
+ self.vocab_manager = NorgivVocabularyManager(cache_dir, vocab_size_limit)
108
+ elif self.vocab_source == "wordfreq" and WORDFREQ_AVAILABLE:
109
+ self.vocab_manager = None # Use built-in WordFreq logic
110
+ else:
111
+ if not WORDFREQ_AVAILABLE:
112
+ logger.warning("⚠️ WordFreq not available, falling back to Norvig")
113
+ self.vocab_source = "norvig"
114
+ self.vocab_manager = NorgivVocabularyManager(cache_dir, vocab_size_limit)
115
+ else:
116
+ logger.warning(f"⚠️ Unknown vocab source '{self.vocab_source}', falling back to Norvig")
117
+ self.vocab_source = "norvig"
118
+ self.vocab_manager = NorgivVocabularyManager(cache_dir, vocab_size_limit)
119
+
120
+ # Cache paths (include source in filename)
121
+ source_suffix = f"_{self.vocab_source}" if self.vocab_source != "wordfreq" else ""
122
+ self.vocab_cache_path = self.cache_dir / f"vocabulary{source_suffix}_{self.vocab_size_limit}.pkl"
123
+ self.frequency_cache_path = self.cache_dir / f"frequencies{source_suffix}_{self.vocab_size_limit}.pkl"
124
 
125
  # Loaded data
126
  self.vocabulary: List[str] = []
 
131
  """Load vocabulary and frequency data, with caching."""
132
  if self.is_loaded:
133
  return self.vocabulary, self.word_frequencies
134
+
135
+ # Use Norvig vocabulary manager if configured
136
+ if self.vocab_manager is not None:
137
+ self.vocabulary, self.word_frequencies = self.vocab_manager.load_vocabulary()
138
+ self.is_loaded = True
139
+ return self.vocabulary, self.word_frequencies
140
 
141
+ # Fallback to WordFreq logic for backward compatibility
142
  # Try loading from cache
143
  if self._load_from_cache():
144
  logger.info(f"✅ Loaded vocabulary from cache: {len(self.vocabulary):,} words")
 
215
 
216
  def _generate_vocabulary_from_wordfreq(self):
217
  """Generate filtered vocabulary from WordFreq database."""
218
+ if not WORDFREQ_AVAILABLE:
219
+ raise ImportError("WordFreq is not available, cannot generate vocabulary")
220
+
221
  logger.info(f"📚 Fetching top {self.vocab_size_limit:,} words from WordFreq...")
222
 
223
  # Get comprehensive word list from WordFreq
 
321
  int(os.getenv("THEMATIC_VOCAB_SIZE_LIMIT",
322
  os.getenv("MAX_VOCABULARY_SIZE", "100000"))))
323
 
324
+ # Vocabulary source configuration
325
+ self.vocab_source = os.getenv("VOCAB_SOURCE", "norvig").lower()
326
+ logger.info(f"📚 Vocabulary source: {self.vocab_source}")
327
+
328
+ # Initialize appropriate vocabulary manager
329
+ if self.vocab_source == "norvig":
330
+ from .norvig_vocabulary_manager import NorgivVocabularyManager
331
+ self.vocab_manager = NorgivVocabularyManager(str(self.cache_dir), self.vocab_size_limit)
332
+ elif self.vocab_source == "wordfreq" and WORDFREQ_AVAILABLE:
333
+ self.vocab_manager = None # Use built-in WordFreq logic
334
+ else:
335
+ if not WORDFREQ_AVAILABLE:
336
+ logger.warning("⚠️ WordFreq not available, falling back to Norvig")
337
+ self.vocab_source = "norvig"
338
+ from .norvig_vocabulary_manager import NorgivVocabularyManager
339
+ self.vocab_manager = NorgivVocabularyManager(str(self.cache_dir), self.vocab_size_limit)
340
+ else:
341
+ logger.warning(f"⚠️ Unknown vocab source '{self.vocab_source}', falling back to Norvig")
342
+ self.vocab_source = "norvig"
343
+ from .norvig_vocabulary_manager import NorgivVocabularyManager
344
+ self.vocab_manager = NorgivVocabularyManager(str(self.cache_dir), self.vocab_size_limit)
345
+
346
  # Configuration parameters for softmax weighted selection
347
  self.similarity_temperature = float(os.getenv("SIMILARITY_TEMPERATURE", "0.2"))
348
  self.use_softmax_selection = os.getenv("USE_SOFTMAX_SELECTION", "true").lower() == "true"
 
373
  self.enable_debug_tab = os.getenv("ENABLE_DEBUG_TAB", "false").lower() == "true"
374
 
375
  # Core components
376
+ # Note: vocab_manager already initialized in constructor based on VOCAB_SOURCE
377
  self.model: Optional[SentenceTransformer] = None
378
 
379
  # Loaded data
 
384
  self.tier_descriptions: Dict[str, str] = {}
385
  self.word_percentiles: Dict[str, float] = {}
386
 
387
+ # Cache paths for embeddings (include vocabulary source for proper separation)
388
+ vocab_hash = f"{self.model_name.replace('/', '_')}_{self.vocab_source}_{self.vocab_size_limit}"
389
  self.embeddings_cache_path = self.cache_dir / f"embeddings_{vocab_hash}.npy"
390
 
391
  self.is_initialized = False
 
1391
 
1392
  def get_cache_status(self) -> Dict[str, Any]:
1393
  """Get detailed cache status information."""
1394
+ # Handle different vocabulary manager types
1395
+ if self.vocab_manager is not None:
1396
+ # Using Norvig or other vocab manager with cache paths
1397
+ vocab_exists = self.vocab_manager.vocab_cache_path.exists()
1398
+ freq_exists = self.vocab_manager.frequency_cache_path.exists()
1399
+ vocab_path = str(self.vocab_manager.vocab_cache_path)
1400
+ freq_path = str(self.vocab_manager.frequency_cache_path)
1401
+ else:
1402
+ # Using WordFreq (no separate cache files)
1403
+ vocab_exists = False
1404
+ freq_exists = False
1405
+ vocab_path = "N/A (using WordFreq)"
1406
+ freq_path = "N/A (using WordFreq)"
1407
+
1408
  embeddings_exists = self.embeddings_cache_path.exists()
1409
 
1410
  status = {
1411
  "cache_directory": str(self.cache_dir),
1412
  "vocabulary_cache": {
1413
+ "path": vocab_path,
1414
  "exists": vocab_exists,
1415
+ "readable": vocab_exists and os.access(vocab_path, os.R_OK) if vocab_exists else False
1416
  },
1417
  "frequency_cache": {
1418
+ "path": freq_path,
1419
  "exists": freq_exists,
1420
+ "readable": freq_exists and os.access(freq_path, os.R_OK) if freq_exists else False
1421
  },
1422
  "embeddings_cache": {
1423
  "path": str(self.embeddings_cache_path),
1424
  "exists": embeddings_exists,
1425
  "readable": embeddings_exists and os.access(self.embeddings_cache_path, os.R_OK)
1426
  },
1427
+ "complete": (vocab_exists or self.vocab_manager is None) and (freq_exists or self.vocab_manager is None) and embeddings_exists
1428
  }
1429
 
1430
  # Add size information if files exist
 
1592
  "custom_sentence": custom_sentence,
1593
  "multi_theme": multi_theme,
1594
  "thematic_pool_size": thematic_pool,
1595
+ "min_similarity": min_similarity,
1596
+ "multi_topic_method": self.multi_topic_method if len(topics) > 1 else None,
1597
+ "soft_min_beta": self.soft_min_beta if len(topics) > 1 and self.multi_topic_method == "soft_minimum" else None
1598
  },
1599
  "thematic_pool": [
1600
  {
crossword-app/frontend/src/components/DebugTab.jsx CHANGED
@@ -53,6 +53,12 @@ const DebugTab = ({ debugData }) => {
53
  <div><strong>Thematic Pool Size:</strong> {debugData.generation_params.thematic_pool_size}</div>
54
  <div><strong>Min Similarity:</strong> {debugData.generation_params.min_similarity}</div>
55
  <div><strong>Multi-theme:</strong> {debugData.generation_params.multi_theme ? 'Yes' : 'No'}</div>
 
 
 
 
 
 
56
  {debugData.generation_params.custom_sentence && (
57
  <div><strong>Custom Sentence:</strong> "{debugData.generation_params.custom_sentence}"</div>
58
  )}
@@ -71,6 +77,9 @@ const DebugTab = ({ debugData }) => {
71
  <li><strong>Composite Score</strong> = (1 - difficulty_weight) × similarity + difficulty_weight × frequency_alignment</li>
72
  <li><strong>Frequency Alignment</strong>: Gaussian distribution favoring target percentiles by difficulty</li>
73
  <li><strong>Softmax Selection</strong>: Probabilistic selection based on composite scores with temperature control</li>
 
 
 
74
  </ul>
75
 
76
  <h4>Difficulty Targets:</h4>
@@ -177,6 +186,10 @@ const DebugTab = ({ debugData }) => {
177
  onClick={() => handleSort('similarity')}
178
  style={{ cursor: 'pointer', userSelect: 'none' }}
179
  className={sortBy === 'similarity' ? 'sorted-column' : ''}
 
 
 
 
180
  >
181
  Similarity{getSortIcon('similarity')}
182
  </th>
@@ -299,6 +312,9 @@ const DebugTab = ({ debugData }) => {
299
  <li><strong>Composite Score</strong> = (1 - difficulty_weight) × similarity + difficulty_weight × frequency_alignment</li>
300
  <li><strong>Frequency Alignment</strong>: Gaussian distribution favoring target percentiles by difficulty</li>
301
  <li><strong>Softmax Selection</strong>: Probabilistic selection based on composite scores with temperature control</li>
 
 
 
302
  </ul>
303
 
304
  <h4>Difficulty Targets:</h4>
 
53
  <div><strong>Thematic Pool Size:</strong> {debugData.generation_params.thematic_pool_size}</div>
54
  <div><strong>Min Similarity:</strong> {debugData.generation_params.min_similarity}</div>
55
  <div><strong>Multi-theme:</strong> {debugData.generation_params.multi_theme ? 'Yes' : 'No'}</div>
56
+ {debugData.generation_params.multi_topic_method && (
57
+ <div><strong>Multi-Topic Method:</strong> {debugData.generation_params.multi_topic_method}</div>
58
+ )}
59
+ {debugData.generation_params.soft_min_beta && (
60
+ <div><strong>Soft Min Beta:</strong> {debugData.generation_params.soft_min_beta}</div>
61
+ )}
62
  {debugData.generation_params.custom_sentence && (
63
  <div><strong>Custom Sentence:</strong> "{debugData.generation_params.custom_sentence}"</div>
64
  )}
 
77
  <li><strong>Composite Score</strong> = (1 - difficulty_weight) × similarity + difficulty_weight × frequency_alignment</li>
78
  <li><strong>Frequency Alignment</strong>: Gaussian distribution favoring target percentiles by difficulty</li>
79
  <li><strong>Softmax Selection</strong>: Probabilistic selection based on composite scores with temperature control</li>
80
+ {debugData.generation_params.multi_topic_method && (
81
+ <li><strong>Multi-Topic Similarity:</strong> Uses {debugData.generation_params.multi_topic_method} method to find words relevant to ALL topics</li>
82
+ )}
83
  </ul>
84
 
85
  <h4>Difficulty Targets:</h4>
 
186
  onClick={() => handleSort('similarity')}
187
  style={{ cursor: 'pointer', userSelect: 'none' }}
188
  className={sortBy === 'similarity' ? 'sorted-column' : ''}
189
+ title={debugData.generation_params.multi_topic_method ?
190
+ `Multi-Topic Similarity (${debugData.generation_params.multi_topic_method}): Score representing relevance to ALL topics simultaneously. ${debugData.generation_params.multi_topic_method === 'soft_minimum' ? 'Uses soft minimum aggregation (β=' + debugData.generation_params.soft_min_beta + ') - high scores mean the word relates well to every selected topic.' : 'Aggregated across all topics.'}` :
191
+ 'Similarity: Semantic similarity score to the selected topic (0.0 to 1.0)'
192
+ }
193
  >
194
  Similarity{getSortIcon('similarity')}
195
  </th>
 
312
  <li><strong>Composite Score</strong> = (1 - difficulty_weight) × similarity + difficulty_weight × frequency_alignment</li>
313
  <li><strong>Frequency Alignment</strong>: Gaussian distribution favoring target percentiles by difficulty</li>
314
  <li><strong>Softmax Selection</strong>: Probabilistic selection based on composite scores with temperature control</li>
315
+ {debugData.generation_params.multi_topic_method && (
316
+ <li><strong>Multi-Topic Similarity:</strong> Uses {debugData.generation_params.multi_topic_method} method to find words relevant to ALL topics</li>
317
+ )}
318
  </ul>
319
 
320
  <h4>Difficulty Targets:</h4>
{hack → crossword-app/words}/norvig/count_1w.txt RENAMED
File without changes
{hack → crossword-app/words}/norvig/count_1w100k.txt RENAMED
File without changes