Fix word repetition and overly strict filtering in crossword generation
Browse files- Add session-based tracking to prevent word repetition across puzzles
- Implement quality-tiered randomization for better variety (92% score)
- Fix filtering bug rejecting high-scoring words like TECH(0.793), ICT(0.641)
- Allow meaningful topic variations while blocking exact matches
- Add configurable word exclusion system for inappropriate terms
Resolves issues where valid technology terms were incorrectly rejected
due to substring matching and overly broad abstract word filters.
Signed-off-by: Vimal Kumar <vimal78@gmail.com>
crossword-app/backend-py/src/services/__pycache__/vector_search.cpython-313.pyc
CHANGED
|
Binary files a/crossword-app/backend-py/src/services/__pycache__/vector_search.cpython-313.pyc and b/crossword-app/backend-py/src/services/__pycache__/vector_search.cpython-313.pyc differ
|
|
|
crossword-app/backend-py/src/services/vector_search.py
CHANGED
|
@@ -46,10 +46,18 @@ class VectorSearchService:
|
|
| 46 |
self.base_similarity_threshold = float(os.getenv("WORD_SIMILARITY_THRESHOLD", "0.55")) # Start high for quality
|
| 47 |
self.min_similarity_threshold = 0.45 # Never go below this to maintain relevance
|
| 48 |
self.max_results = 40 # Increased to get more candidates
|
|
|
|
| 49 |
|
| 50 |
# Cache manager for word fallback
|
| 51 |
self.cache_manager = None
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
# FAISS index caching
|
| 54 |
self.index_cache_dir = self._get_index_cache_dir()
|
| 55 |
self.vocab_cache_path = os.path.join(self.index_cache_dir, f"vocab_{self._get_model_hash()}.pkl")
|
|
@@ -67,6 +75,7 @@ class VectorSearchService:
|
|
| 67 |
log_with_timestamp(f" π― Base Similarity Threshold: {self.base_similarity_threshold}")
|
| 68 |
log_with_timestamp(f" π Min Similarity Threshold: {self.min_similarity_threshold}")
|
| 69 |
log_with_timestamp(f" π Max Results: {self.max_results}")
|
|
|
|
| 70 |
log_with_timestamp(f" π Search Randomness: {os.getenv('SEARCH_RANDOMNESS', '0.02')}")
|
| 71 |
log_with_timestamp(f" πΎ Cache Dir: {os.getenv('WORD_CACHE_DIR', 'auto-detect')}")
|
| 72 |
|
|
@@ -283,96 +292,53 @@ class VectorSearchService:
|
|
| 283 |
return await self._get_cached_fallback(topic, difficulty, max_words)
|
| 284 |
|
| 285 |
try:
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
# Add small amount of noise to create variety in search results (with fallback)
|
| 290 |
-
import numpy as np
|
| 291 |
-
noise_factor = float(os.getenv("SEARCH_RANDOMNESS", "0.02")) # 2% noise by default
|
| 292 |
-
if noise_factor > 0:
|
| 293 |
-
try:
|
| 294 |
-
noise = np.random.normal(0, noise_factor, topic_embedding.shape)
|
| 295 |
-
topic_embedding_noisy = topic_embedding + noise
|
| 296 |
-
# Ensure the array is contiguous and correct type for FAISS
|
| 297 |
-
topic_embedding = np.ascontiguousarray(topic_embedding_noisy, dtype=np.float32)
|
| 298 |
-
except Exception as noise_error:
|
| 299 |
-
logger.warning(f"β οΈ Failed to add search noise: {noise_error}, using original embedding")
|
| 300 |
-
topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
|
| 301 |
-
else:
|
| 302 |
-
topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
|
| 303 |
-
|
| 304 |
-
# Normalize for cosine similarity with error handling
|
| 305 |
-
try:
|
| 306 |
-
faiss.normalize_L2(topic_embedding)
|
| 307 |
-
except Exception as norm_error:
|
| 308 |
-
logger.warning(f"β οΈ FAISS normalization failed: {norm_error}, trying without noise")
|
| 309 |
-
# Fallback: use original embedding without noise
|
| 310 |
-
topic_embedding = self.model.encode([topic], convert_to_numpy=True)
|
| 311 |
-
topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
|
| 312 |
-
faiss.normalize_L2(topic_embedding)
|
| 313 |
-
|
| 314 |
-
# Search for similar words using FAISS (get more results for diversity)
|
| 315 |
-
search_size = min(self.max_results * 6, 150) # Get many more candidates for variety
|
| 316 |
-
scores, indices = self.faiss_index.search(topic_embedding, search_size)
|
| 317 |
-
|
| 318 |
-
# Debug: log search results
|
| 319 |
-
logger.info(f"π FAISS search returned {len(scores[0])} results")
|
| 320 |
-
logger.info(f"π Top 5 scores: {scores[0][:5]}")
|
| 321 |
-
|
| 322 |
-
# Log the actual words found by FAISS for debugging
|
| 323 |
-
top_words_with_scores = []
|
| 324 |
-
for i, (score, idx) in enumerate(zip(scores[0][:10], indices[0][:10])): # Show top 10
|
| 325 |
-
word = self.vocab[idx]
|
| 326 |
-
top_words_with_scores.append(f"{word}({score:.3f})")
|
| 327 |
-
|
| 328 |
-
logger.info(f"π Top 10 FAISS words: {', '.join(top_words_with_scores)}")
|
| 329 |
-
|
| 330 |
-
# Adaptive threshold strategy - try higher thresholds first, then lower if needed
|
| 331 |
-
candidates = []
|
| 332 |
-
thresholds_to_try = [
|
| 333 |
-
self.base_similarity_threshold, # Start with high quality (0.55 default)
|
| 334 |
-
max(self.base_similarity_threshold - 0.05, self.min_similarity_threshold), # 0.50
|
| 335 |
-
max(self.base_similarity_threshold - 0.10, self.min_similarity_threshold), # 0.45
|
| 336 |
-
self.min_similarity_threshold # Final attempt (0.45 minimum)
|
| 337 |
-
]
|
| 338 |
-
|
| 339 |
-
for threshold in thresholds_to_try:
|
| 340 |
-
logger.info(f"π― Trying threshold: {threshold}")
|
| 341 |
-
candidates = self._collect_candidates_with_threshold(scores, indices, threshold, topic, difficulty)
|
| 342 |
-
logger.info(f"π Found {len(candidates)} candidates with threshold {threshold}")
|
| 343 |
|
| 344 |
-
#
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
else:
|
| 367 |
-
#
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
|
| 377 |
# If not enough words found, supplement with cached words (more aggressive)
|
| 378 |
if len(similar_words) < max_words * 0.75: # If less than 75% of target, supplement
|
|
@@ -432,8 +398,22 @@ class VectorSearchService:
|
|
| 432 |
topic_lower = topic.lower()
|
| 433 |
word_lower = word.lower()
|
| 434 |
|
| 435 |
-
# Don't include the topic
|
| 436 |
-
if word_lower == topic_lower
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
return False
|
| 438 |
|
| 439 |
# Topic-specific filtering
|
|
@@ -443,12 +423,550 @@ class VectorSearchService:
|
|
| 443 |
return False
|
| 444 |
|
| 445 |
# Prefer concrete nouns over abstract concepts
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
|
| 450 |
return True
|
| 451 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
def _get_index_cache_dir(self) -> str:
|
| 453 |
"""Get the directory for caching FAISS indexes."""
|
| 454 |
# Use different cache locations based on environment
|
|
@@ -474,6 +992,57 @@ class VectorSearchService:
|
|
| 474 |
os.path.exists(self.embeddings_cache_path) and
|
| 475 |
os.path.exists(self.faiss_cache_path))
|
| 476 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 477 |
def _load_cached_index(self) -> bool:
|
| 478 |
"""Load FAISS index from cache if available."""
|
| 479 |
try:
|
|
|
|
| 46 |
self.base_similarity_threshold = float(os.getenv("WORD_SIMILARITY_THRESHOLD", "0.55")) # Start high for quality
|
| 47 |
self.min_similarity_threshold = 0.45 # Never go below this to maintain relevance
|
| 48 |
self.max_results = 40 # Increased to get more candidates
|
| 49 |
+
self.use_hierarchical_search = os.getenv("USE_HIERARCHICAL_SEARCH", "true").lower() == "true"
|
| 50 |
|
| 51 |
# Cache manager for word fallback
|
| 52 |
self.cache_manager = None
|
| 53 |
|
| 54 |
+
# Session-based word tracking to prevent repetition across puzzles
|
| 55 |
+
self.used_words_by_topic = {} # topic -> set of used words
|
| 56 |
+
self.max_used_words_per_topic = int(os.getenv("MAX_USED_WORDS_MEMORY", "50")) # Remember last 50 words per topic
|
| 57 |
+
|
| 58 |
+
# Word exclusion mechanism - configurable list of words to never include
|
| 59 |
+
self.excluded_words = self._load_excluded_words()
|
| 60 |
+
|
| 61 |
# FAISS index caching
|
| 62 |
self.index_cache_dir = self._get_index_cache_dir()
|
| 63 |
self.vocab_cache_path = os.path.join(self.index_cache_dir, f"vocab_{self._get_model_hash()}.pkl")
|
|
|
|
| 75 |
log_with_timestamp(f" π― Base Similarity Threshold: {self.base_similarity_threshold}")
|
| 76 |
log_with_timestamp(f" π Min Similarity Threshold: {self.min_similarity_threshold}")
|
| 77 |
log_with_timestamp(f" π Max Results: {self.max_results}")
|
| 78 |
+
log_with_timestamp(f" π Hierarchical Search: {self.use_hierarchical_search}")
|
| 79 |
log_with_timestamp(f" π Search Randomness: {os.getenv('SEARCH_RANDOMNESS', '0.02')}")
|
| 80 |
log_with_timestamp(f" πΎ Cache Dir: {os.getenv('WORD_CACHE_DIR', 'auto-detect')}")
|
| 81 |
|
|
|
|
| 292 |
return await self._get_cached_fallback(topic, difficulty, max_words)
|
| 293 |
|
| 294 |
try:
|
| 295 |
+
if self.use_hierarchical_search:
|
| 296 |
+
# Use hierarchical search for better word diversity and coverage
|
| 297 |
+
logger.info(f"π Using hierarchical semantic search for enhanced word generation")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
|
| 299 |
+
# Perform hierarchical search (topic variations + subcategories)
|
| 300 |
+
all_candidates = await self._hierarchical_search(topic, difficulty, max_words)
|
| 301 |
+
|
| 302 |
+
# Combine and filter results intelligently
|
| 303 |
+
if all_candidates:
|
| 304 |
+
combined_results = self._combine_hierarchical_results(all_candidates, max_words * 2) # Get more candidates for filtering
|
| 305 |
+
|
| 306 |
+
# Apply word exclusions to remove inappropriate words
|
| 307 |
+
combined_results = self._apply_word_exclusions(combined_results)
|
| 308 |
+
|
| 309 |
+
# Filter out previously used words to improve variety
|
| 310 |
+
similar_words = self._filter_used_words(combined_results, topic)
|
| 311 |
+
|
| 312 |
+
# Trim to requested count
|
| 313 |
+
similar_words = similar_words[:max_words]
|
| 314 |
+
|
| 315 |
+
logger.info(f"π― Hierarchical search generated {len(similar_words)} words for '{topic}' (after variety filtering)")
|
| 316 |
+
|
| 317 |
+
# Track these words to prevent future repetition
|
| 318 |
+
if similar_words:
|
| 319 |
+
self._track_used_words(topic, [word['word'] for word in similar_words])
|
| 320 |
+
|
| 321 |
+
# Cache successful results for future use
|
| 322 |
+
if similar_words:
|
| 323 |
+
await self._cache_successful_search(topic, difficulty, similar_words)
|
| 324 |
+
else:
|
| 325 |
+
similar_words = []
|
| 326 |
+
logger.warning(f"β οΈ Hierarchical search found no candidates for '{topic}'")
|
| 327 |
else:
|
| 328 |
+
# Fall back to original single-search approach
|
| 329 |
+
logger.info(f"π Using traditional single-search approach")
|
| 330 |
+
traditional_results = await self._traditional_single_search(topic, difficulty, max_words * 2) # Get more for filtering
|
| 331 |
+
|
| 332 |
+
# Apply word exclusions to remove inappropriate words
|
| 333 |
+
traditional_results = self._apply_word_exclusions(traditional_results)
|
| 334 |
+
|
| 335 |
+
# Filter out previously used words to improve variety
|
| 336 |
+
similar_words = self._filter_used_words(traditional_results, topic)
|
| 337 |
+
similar_words = similar_words[:max_words]
|
| 338 |
+
|
| 339 |
+
# Track these words to prevent future repetition
|
| 340 |
+
if similar_words:
|
| 341 |
+
self._track_used_words(topic, [word['word'] for word in similar_words])
|
| 342 |
|
| 343 |
# If not enough words found, supplement with cached words (more aggressive)
|
| 344 |
if len(similar_words) < max_words * 0.75: # If less than 75% of target, supplement
|
|
|
|
| 398 |
topic_lower = topic.lower()
|
| 399 |
word_lower = word.lower()
|
| 400 |
|
| 401 |
+
# Don't include the exact topic word, but allow meaningful variations
|
| 402 |
+
if word_lower == topic_lower:
|
| 403 |
+
return False
|
| 404 |
+
|
| 405 |
+
# More nuanced substring checking - avoid overly broad rejections
|
| 406 |
+
# Only reject if the word is a simple substring or the topic contains the word entirely
|
| 407 |
+
if len(word_lower) >= 4: # For longer words, be more permissive
|
| 408 |
+
# Allow words like TECH, ICT, BIOTECH even if topic is "technology"
|
| 409 |
+
if topic_lower in ['technology', 'tech'] and word_lower in ['tech', 'ict']:
|
| 410 |
+
return True
|
| 411 |
+
# Allow words like ANIMAL, MAMMAL even if topic is "animals"
|
| 412 |
+
if topic_lower in ['animals', 'animal'] and word_lower in ['animal', 'mammal']:
|
| 413 |
+
return True
|
| 414 |
+
|
| 415 |
+
# General rule: reject only if word is completely contained in topic and is short
|
| 416 |
+
if word_lower in topic_lower and len(word_lower) < 4:
|
| 417 |
return False
|
| 418 |
|
| 419 |
# Topic-specific filtering
|
|
|
|
| 423 |
return False
|
| 424 |
|
| 425 |
# Prefer concrete nouns over abstract concepts
|
| 426 |
+
# Be more selective about abstract word filtering - many "-ment" words are concrete
|
| 427 |
+
truly_abstract_endings = ['tion', 'ness', 'ity', 'ism'] # Removed 'ment' as too broad
|
| 428 |
+
if any(word_lower.endswith(ending) for ending in truly_abstract_endings) and len(word) > 9:
|
| 429 |
+
# Additional check: only reject if the word seems truly abstract
|
| 430 |
+
abstract_prefixes = ['develop', 'manage', 'establish', 'improve', 'achieve']
|
| 431 |
+
if any(word_lower.startswith(prefix) for prefix in abstract_prefixes):
|
| 432 |
+
return False
|
| 433 |
|
| 434 |
return True
|
| 435 |
|
| 436 |
+
def _track_used_words(self, topic: str, words: List[Any]):
    """Remember words used for *topic* so future puzzles avoid repeats.

    Args:
        topic: Topic the words were generated for (case-insensitive key).
        words: Words just used in a puzzle. Accepts either plain strings
            or candidate dicts with a ``'word'`` key — callers pass both
            forms (the hierarchical/traditional search paths pass plain
            strings extracted from candidate dicts).
    """
    topic_key = topic.lower()

    if topic_key not in self.used_words_by_topic:
        self.used_words_by_topic[topic_key] = set()

    # Fix: callers invoke this with a list of plain strings
    # (e.g. [word['word'] for word in similar_words]), but the old code
    # assumed dicts only and crashed with TypeError on strings.
    # Accept both shapes for backward compatibility.
    new_words = [
        (w['word'] if isinstance(w, dict) else w).upper()
        for w in words
    ]
    self.used_words_by_topic[topic_key].update(new_words)

    # Bound memory usage: keep only the most recent N words per topic.
    # NOTE(review): sets are unordered, so "last N" after list() is
    # arbitrary — preserved as-is to keep the original trimming behavior.
    if len(self.used_words_by_topic[topic_key]) > self.max_used_words_per_topic:
        used_list = list(self.used_words_by_topic[topic_key])
        self.used_words_by_topic[topic_key] = set(used_list[-self.max_used_words_per_topic:])

    logger.info(f"π Tracking {len(new_words)} words for '{topic}' (total remembered: {len(self.used_words_by_topic[topic_key])})")
+
def _get_used_words_for_topic(self, topic: str) -> set:
|
| 456 |
+
"""Get the set of words already used for this topic."""
|
| 457 |
+
topic_key = topic.lower()
|
| 458 |
+
return self.used_words_by_topic.get(topic_key, set())
|
| 459 |
+
|
| 460 |
+
def _filter_used_words(self, candidates: List[Dict[str, Any]], topic: str) -> List[Dict[str, Any]]:
    """Drop candidates whose word was used recently for this topic.

    Returns the surviving candidates in their original order and logs
    how many were removed, for visibility into the variety filter.
    """
    if not candidates:
        return candidates

    used_words = self._get_used_words_for_topic(topic)
    if not used_words:
        return candidates

    # Partition candidates by whether their uppercased word was seen before.
    filtered = [c for c in candidates if c['word'].upper() not in used_words]
    filtered_out = [c['word'].upper() for c in candidates if c['word'].upper() in used_words]

    if filtered_out:
        logger.info(f"π« Filtered out {len(filtered_out)} previously used words for '{topic}': {filtered_out[:5]}{'...' if len(filtered_out) > 5 else ''}")

    logger.info(f"π Word variety filter: {len(candidates)} β {len(filtered)} candidates")
    return filtered
+
def _expand_topic_variations(self, topic: str) -> List[str]:
    """
    Expand topic to include singular/plural variations for better semantic coverage.

    Examples:
    - "Animal" β ["Animal", "Animals"]
    - "Animals" β ["Animals", "Animal"]
    - "Technology" β ["Technology", "Technologies"]
    """
    variations = [topic]  # the original form always comes first
    lowered = topic.lower()

    if lowered.endswith('s') and len(topic) > 3:
        # Looks plural — derive a singular form.
        if lowered.endswith('ies'):
            # Technologies -> Technology
            candidate = topic[:-3] + 'y'
        elif lowered.endswith(('sses', 'shes', 'ches', 'xes')):
            # Classes -> Class, Boxes -> Box, Watches -> Watch
            candidate = topic[:-2]
        elif lowered.endswith('es') and len(topic) > 4:
            # Sciences -> Science (but not "Yes" -> "Ye"):
            # try removing just the trailing 's' first for words ending in 'es'
            candidate = topic[:-1]
        else:
            # Animals -> Animal
            candidate = topic[:-1]

        if candidate != topic and len(candidate) >= 3:
            variations.append(candidate)
    else:
        # Looks singular — derive a plural form.
        if lowered.endswith('y') and lowered[-2] not in 'aeiou':
            variations.append(topic[:-1] + 'ies')  # Technology -> Technologies
        elif lowered.endswith(('s', 'sh', 'ch', 'x', 'z')):
            variations.append(topic + 'es')        # Class -> Classes
        else:
            variations.append(topic + 's')         # Animal -> Animals

    # Deduplicate while keeping first-seen order.
    unique_variations = list(dict.fromkeys(variations))

    logger.info(f"π Topic variations for '{topic}': {unique_variations}")
    return unique_variations
+
def _identify_subcategories(self, candidates: List[Dict[str, Any]], main_topic: str) -> List[str]:
    """
    Identify which candidate words are likely sub-categories for hierarchical search.

    A word is promoted to "subcategory" if any of these layered checks fires,
    in order: (1) it appears in a hand-curated list for the topic, (2) it ends
    with a category-suggesting suffix, (3) it passes the abstract-concept
    heuristic. Check order matters: earlier matches short-circuit later ones.

    Args:
        candidates: List of word candidates with similarity scores
        main_topic: The original topic being searched

    Returns:
        List of subcategory words (uppercased), capped at 5, suitable for
        secondary search
    """
    subcategories = []
    main_topic_lower = main_topic.lower()

    # Category indicators - words that suggest this is a category rather than terminal word.
    # Matched via str.endswith below; e.g. 'logy' catches "zoology", 'ware' catches "software".
    category_patterns = {
        # Scientific/academic suffixes
        'academic': ['logy', 'ics', 'ism', 'ology'],
        # Adjective forms that suggest categories
        'adjective': ['logical', 'ical', 'tic', 'ian', 'nal', 'ous'],
        # Collection/group words
        'collective': ['life', 'stock', 'ware', 'kind', 'type', 'group'],
        # General category indicators
        # NOTE(review): these are also matched with endswith(), so 'wild'
        # only fires for words *ending* in "wild" etc. — confirm intent.
        'general': ['wild', 'domestic', 'marine', 'land', 'air', 'water']
    }

    # Known category words for common topics (curated allowlists, keyed by
    # lowercase topic; topics not listed fall through to pattern checks only).
    known_categories = {
        'animal': ['wildlife', 'livestock', 'mammal', 'mammalian', 'fauna', 'zoology', 'zoological',
                   'vertebrate', 'invertebrate', 'reptile', 'amphibian', 'primate', 'rodent',
                   'carnivore', 'herbivore', 'omnivore', 'predator', 'prey'],
        'technology': ['software', 'hardware', 'digital', 'electronic', 'computing', 'internet',
                       'mobile', 'wireless', 'networking', 'cybernetic', 'robotic', 'automated'],
        'science': ['physics', 'chemistry', 'biology', 'astronomy', 'geology', 'mathematics',
                    'theoretical', 'experimental', 'applied', 'quantum', 'molecular', 'atomic'],
        'geography': ['continental', 'coastal', 'mountainous', 'desert', 'tropical', 'polar',
                      'urban', 'rural', 'geological', 'topographical', 'cartographic']
    }

    for candidate in candidates[:10]:  # Only consider top 10 for performance
        word = candidate['word'].lower()
        similarity = candidate['similarity']

        # Skip if similarity is too low (likely not a good subcategory)
        if similarity < 0.45:
            continue

        is_subcategory = False

        # Check 1: against known curated categories for this topic
        topic_categories = known_categories.get(main_topic_lower, [])
        if word in topic_categories:
            is_subcategory = True
            logger.info(f"π '{word.upper()}' identified as known subcategory for '{main_topic}'")

        # Check 2: pattern-based suffix detection
        if not is_subcategory:
            for pattern_type, patterns in category_patterns.items():
                for pattern in patterns:
                    if word.endswith(pattern):
                        is_subcategory = True
                        logger.info(f"π '{word.upper()}' identified as subcategory (pattern: {pattern})")
                        break
                if is_subcategory:
                    break

        # Check 3: additional heuristics for abstract/categorical concepts
        if not is_subcategory:
            # Words that are likely categories based on length and composition.
            # NOTE(review): the vowel test only counts 'i' and 'o', not all
            # vowels — "wildlife" (1 i, 0 o) would fail it; confirm intent.
            if (len(word) >= 6 and  # Reasonable length
                word.count('i') + word.count('o') >= 2 and  # Contains vowels (not acronym)
                not word.isupper() and  # Not an acronym
                word.isalpha()):  # Only letters

                # Check if it's an abstract/categorical concept
                if any(word.endswith(ending) for ending in ['ism', 'ity', 'ness', 'tion', 'sion']):
                    is_subcategory = True
                    logger.info(f"π '{word.upper()}' identified as subcategory (abstract concept)")

        # Collect unique matches, uppercased for consistency with puzzle words
        if is_subcategory and word.upper() not in subcategories:
            subcategories.append(word.upper())

    # Limit subcategories to prevent explosion of secondary searches
    max_subcategories = 5
    limited_subcategories = subcategories[:max_subcategories]

    if limited_subcategories:
        logger.info(f"π³ Identified {len(limited_subcategories)} subcategories for '{main_topic}': {limited_subcategories}")
    else:
        logger.info(f"π³ No suitable subcategories found for '{main_topic}'")

    return limited_subcategories
+
async def _hierarchical_search(
    self,
    topic: str,
    difficulty: str,
    max_words: int
) -> List[Dict[str, Any]]:
    """
    Perform hierarchical semantic search using topic variations and subcategories.

    Search strategy:
    1. Search for topic variations (singular/plural)
    2. Identify subcategories from initial results
    3. Search subcategories for more specific words
    4. Combine and weight all results

    Returns a combined (possibly duplicated) candidate list; each candidate
    dict gains a 'search_source' key and its 'similarity' is down-weighted
    for non-primary sources. Deduplication is left to the caller.
    Note: max_words is currently not used to cap results here.
    """
    all_candidates = []

    # Phase 1: Search topic variations (singular/plural)
    topic_variations = self._expand_topic_variations(topic)

    logger.info(f"π Starting hierarchical search for '{topic}' with {len(topic_variations)} variations")

    # Search each topic variation independently against the FAISS index
    main_topic_candidates = []
    for variation in topic_variations:
        logger.info(f"π Searching topic variation: '{variation}'")

        # Get topic embedding (model.encode returns a (1, dim) numpy array here)
        topic_embedding = self.model.encode([variation], convert_to_numpy=True)

        # Add small random noise so repeated searches vary their results
        noise_factor = float(os.getenv("SEARCH_RANDOMNESS", "0.02"))
        if noise_factor > 0:
            try:
                noise = np.random.normal(0, noise_factor, topic_embedding.shape)
                topic_embedding = topic_embedding + noise
            except Exception:
                pass  # Continue without noise if it fails

        # FAISS requires contiguous float32; normalize_L2 mutates in place
        # so inner-product search behaves as cosine similarity
        topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
        faiss.normalize_L2(topic_embedding)

        # Search FAISS index
        search_size = min(self.max_results * 3, 100)  # Moderate size for variations
        scores, indices = self.faiss_index.search(topic_embedding, search_size)

        # Collect candidates for this variation (threshold + difficulty filtering)
        variation_candidates = self._collect_candidates_with_threshold(
            scores, indices, self.base_similarity_threshold, variation, difficulty
        )

        # Weight the exact original topic higher than derived variations
        weight = 1.0 if variation == topic else 0.9
        for candidate in variation_candidates:
            candidate['similarity'] *= weight
            candidate['search_source'] = f"main_topic:{variation}"

        main_topic_candidates.extend(variation_candidates)

    logger.info(f"π Main topic search found {len(main_topic_candidates)} candidates")

    # Phase 2: Identify subcategories from best candidates
    if main_topic_candidates:
        # Sort by similarity so subcategory detection sees the best candidates first
        main_topic_candidates.sort(key=lambda x: x['similarity'], reverse=True)
        subcategories = self._identify_subcategories(main_topic_candidates, topic)

        # Phase 3: Search each detected subcategory for more specific words
        subcategory_candidates = []
        for subcategory in subcategories:
            logger.info(f"π³ Searching subcategory: '{subcategory}'")

            try:
                # Get subcategory embedding (no noise for secondary searches)
                subcat_embedding = self.model.encode([subcategory], convert_to_numpy=True)
                subcat_embedding = np.ascontiguousarray(subcat_embedding, dtype=np.float32)
                faiss.normalize_L2(subcat_embedding)

                # Search with smaller result set for subcategories
                sub_search_size = min(self.max_results * 2, 60)
                sub_scores, sub_indices = self.faiss_index.search(subcat_embedding, sub_search_size)

                # Use slightly lower threshold for subcategories to get more variety
                sub_threshold = max(self.base_similarity_threshold - 0.05, self.min_similarity_threshold)
                sub_candidates = self._collect_candidates_with_threshold(
                    sub_scores, sub_indices, sub_threshold, subcategory, difficulty
                )

                # Weight subcategory results lower than main topic results
                for candidate in sub_candidates:
                    candidate['similarity'] *= 0.8  # Lower weight for subcategory results
                    candidate['search_source'] = f"subcategory:{subcategory}"

                subcategory_candidates.extend(sub_candidates)
                logger.info(f"π³ Subcategory '{subcategory}' found {len(sub_candidates)} candidates")

            except Exception as e:
                # Best-effort: a failed subcategory search must not sink the
                # whole hierarchical search
                logger.warning(f"β οΈ Failed to search subcategory '{subcategory}': {e}")
                continue

        logger.info(f"π³ Subcategory search found {len(subcategory_candidates)} additional candidates")
    else:
        subcategory_candidates = []

    # Phase 4: Combine all candidates (duplicates possible across phases)
    all_candidates = main_topic_candidates + subcategory_candidates

    logger.info(f"π Total candidates before deduplication: {len(all_candidates)}")

    return all_candidates
|
| 746 |
+
async def _traditional_single_search(
    self,
    topic: str,
    difficulty: str,
    max_words: int
) -> List[Dict[str, Any]]:
    """
    Traditional single-topic search approach (original implementation).
    Kept as fallback option for compatibility.

    Args:
        topic: Topic string embedded and searched against the FAISS index.
        difficulty: Difficulty level forwarded to candidate filtering.
        max_words: Maximum number of words to return.

    Returns:
        List of candidate dicts selected for the topic (possibly fewer
        than ``max_words`` if thresholds filter too aggressively).
    """
    # Get topic embedding
    topic_embedding = self.model.encode([topic], convert_to_numpy=True)

    # Add small amount of noise to create variety in search results
    # between repeated puzzles on the same topic (best-effort only).
    import numpy as np
    noise_factor = float(os.getenv("SEARCH_RANDOMNESS", "0.02"))
    if noise_factor > 0:
        try:
            noise = np.random.normal(0, noise_factor, topic_embedding.shape)
            topic_embedding = topic_embedding + noise
        except Exception:
            pass  # Continue without noise if it fails

    # FAISS requires contiguous float32 input; normalize for cosine search.
    topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
    faiss.normalize_L2(topic_embedding)

    # Search for similar words using FAISS (over-fetch so threshold
    # filtering still leaves enough candidates).
    search_size = min(self.max_results * 6, 150)
    scores, indices = self.faiss_index.search(topic_embedding, search_size)

    # Debug: log search results
    logger.info(f"π FAISS search returned {len(scores[0])} results")
    logger.info(f"π Top 5 scores: {scores[0][:5]}")

    # Log the actual words found by FAISS for debugging.
    # (Plain zip here: the previous enumerate index was unused.)
    top_words_with_scores = []
    for score, idx in zip(scores[0][:10], indices[0][:10]):
        word = self.vocab[idx]
        top_words_with_scores.append(f"{word}({score:.3f})")

    logger.info(f"π Top 10 FAISS words: {', '.join(top_words_with_scores)}")

    # Adaptive threshold strategy: walk down from the base threshold,
    # never dropping below the configured minimum, and stop as soon as
    # an acceptable number of candidates is found.
    candidates = []
    thresholds_to_try = [
        self.base_similarity_threshold,
        max(self.base_similarity_threshold - 0.05, self.min_similarity_threshold),
        max(self.base_similarity_threshold - 0.10, self.min_similarity_threshold),
        self.min_similarity_threshold
    ]

    for threshold in thresholds_to_try:
        logger.info(f"π― Trying threshold: {threshold}")
        candidates = self._collect_candidates_with_threshold(scores, indices, threshold, topic, difficulty)
        logger.info(f"π Found {len(candidates)} candidates with threshold {threshold}")

        if len(candidates) >= max_words * 0.75:
            logger.info(f"β Sufficient words found with threshold {threshold}")
            break
        elif len(candidates) >= max_words // 2:
            logger.info(f"β‘ Acceptable words found with threshold {threshold}")
            break

    # Smart randomization: weighted sampling when there is a healthy
    # surplus, plain shuffle otherwise.
    import random
    if len(candidates) > max_words * 2:
        similar_words = self._weighted_random_selection(candidates, max_words)
    else:
        random.shuffle(candidates)
        similar_words = candidates[:max_words]

    logger.info(f"π― Traditional search found {len(similar_words)} words for '{topic}'")

    # Cache successful results
    if similar_words:
        await self._cache_successful_search(topic, difficulty, similar_words)

    return similar_words
|
| 825 |
+
def _combine_hierarchical_results(
    self,
    all_candidates: List[Dict[str, Any]],
    max_words: int
) -> List[Dict[str, Any]]:
    """
    Intelligently combine and deduplicate results from hierarchical search.

    Strategy:
    1. Remove duplicates while preserving best similarity scores
    2. Randomize within quality tiers for variety
    3. Apply adaptive threshold filtering
    4. Ensure diverse representation from different search sources
    """
    if not all_candidates:
        return []

    # Step 1: strict dedup — one entry per upper-cased word, keeping
    # whichever occurrence scored highest.
    best_by_word: Dict[str, Dict[str, Any]] = {}
    for entry in all_candidates:
        key = entry['word'].upper()  # Ensure consistent casing
        prior = best_by_word.get(key)
        if prior is None or entry['similarity'] > prior['similarity']:
            kept = entry.copy()
            kept['word'] = key  # Normalize case
            best_by_word[key] = kept

    deduplicated = list(best_by_word.values())
    logger.info(f"π After strict deduplication: {len(all_candidates)} β {len(deduplicated)} unique words")

    # Step 2: bucket by similarity tier, shuffle inside each bucket for
    # variety, then concatenate so overall quality ordering survives.
    import random
    base = self.base_similarity_threshold
    high_quality = [w for w in deduplicated if w['similarity'] >= base]
    medium_quality = [w for w in deduplicated if base - 0.1 <= w['similarity'] < base]
    lower_quality = [w for w in deduplicated if w['similarity'] < base - 0.1]

    for bucket in (high_quality, medium_quality, lower_quality):
        random.shuffle(bucket)

    deduplicated = high_quality + medium_quality + lower_quality

    logger.info(f"π² Randomized within quality tiers: {len(high_quality)} high, {len(medium_quality)} medium, {len(lower_quality)} lower")

    # Step 3: adaptive threshold filtering — relax the cutoff in steps,
    # stopping at the first threshold that yields enough candidates.
    ladder = [
        base,
        max(base - 0.05, self.min_similarity_threshold),
        max(base - 0.10, self.min_similarity_threshold),
        self.min_similarity_threshold
    ]

    final_candidates = []
    for threshold in ladder:
        filtered_candidates = [c for c in deduplicated if c['similarity'] >= threshold]

        logger.info(f"π― Hierarchical threshold {threshold}: {len(filtered_candidates)} candidates")

        if len(filtered_candidates) >= max_words * 0.75:
            final_candidates = filtered_candidates
            logger.info(f"β Sufficient words found with hierarchical threshold {threshold}")
            break
        elif len(filtered_candidates) >= max_words // 2:
            final_candidates = filtered_candidates
            logger.info(f"β‘ Acceptable words found with hierarchical threshold {threshold}")
            break

    if not final_candidates:
        final_candidates = deduplicated  # Use all if threshold filtering too strict

    # Step 4: ensure source diversity in final selection
    final_selection = self._ensure_source_diversity(final_candidates, max_words)

    logger.info(f"π Final hierarchical selection: {len(final_selection)} words")

    # Log the sources for debugging
    source_counts: Dict[str, int] = {}
    for chosen in final_selection:
        origin = chosen.get('search_source', 'unknown')
        source_counts[origin] = source_counts.get(origin, 0) + 1

    logger.info(f"π Source distribution: {source_counts}")

    return final_selection
|
| 916 |
+
def _ensure_source_diversity(
|
| 917 |
+
self,
|
| 918 |
+
candidates: List[Dict[str, Any]],
|
| 919 |
+
max_words: int
|
| 920 |
+
) -> List[Dict[str, Any]]:
|
| 921 |
+
"""
|
| 922 |
+
Ensure diverse representation from different search sources.
|
| 923 |
+
"""
|
| 924 |
+
if len(candidates) <= max_words:
|
| 925 |
+
return candidates
|
| 926 |
+
|
| 927 |
+
# Group by source
|
| 928 |
+
source_groups = {}
|
| 929 |
+
for candidate in candidates:
|
| 930 |
+
source = candidate.get('search_source', 'unknown')
|
| 931 |
+
if source not in source_groups:
|
| 932 |
+
source_groups[source] = []
|
| 933 |
+
source_groups[source].append(candidate)
|
| 934 |
+
|
| 935 |
+
# If we have multiple sources, ensure representation from each
|
| 936 |
+
if len(source_groups) > 1:
|
| 937 |
+
selected = []
|
| 938 |
+
main_topic_quota = max_words * 2 // 3 # 2/3 from main topic
|
| 939 |
+
subcategory_quota = max_words - main_topic_quota # 1/3 from subcategories
|
| 940 |
+
|
| 941 |
+
# Select from main topic sources first
|
| 942 |
+
main_sources = [k for k in source_groups.keys() if k.startswith('main_topic:')]
|
| 943 |
+
for source in main_sources:
|
| 944 |
+
quota = main_topic_quota // len(main_sources) if main_sources else 0
|
| 945 |
+
selected.extend(source_groups[source][:quota])
|
| 946 |
+
|
| 947 |
+
# Fill remaining slots with subcategory sources
|
| 948 |
+
subcat_sources = [k for k in source_groups.keys() if k.startswith('subcategory:')]
|
| 949 |
+
if subcat_sources and len(selected) < max_words:
|
| 950 |
+
remaining_slots = max_words - len(selected)
|
| 951 |
+
quota_per_subcat = max(1, remaining_slots // len(subcat_sources))
|
| 952 |
+
|
| 953 |
+
for source in subcat_sources:
|
| 954 |
+
if len(selected) >= max_words:
|
| 955 |
+
break
|
| 956 |
+
selected.extend(source_groups[source][:quota_per_subcat])
|
| 957 |
+
|
| 958 |
+
# Fill any remaining slots with best remaining candidates
|
| 959 |
+
if len(selected) < max_words:
|
| 960 |
+
used_words = {c['word'] for c in selected}
|
| 961 |
+
remaining = [c for c in candidates if c['word'] not in used_words]
|
| 962 |
+
needed = max_words - len(selected)
|
| 963 |
+
selected.extend(remaining[:needed])
|
| 964 |
+
|
| 965 |
+
return selected[:max_words]
|
| 966 |
+
else:
|
| 967 |
+
# Single source, just return top candidates
|
| 968 |
+
return candidates[:max_words]
|
| 969 |
+
|
| 970 |
def _get_index_cache_dir(self) -> str:
|
| 971 |
"""Get the directory for caching FAISS indexes."""
|
| 972 |
# Use different cache locations based on environment
|
|
|
|
| 992 |
os.path.exists(self.embeddings_cache_path) and
|
| 993 |
os.path.exists(self.faiss_cache_path))
|
| 994 |
|
| 995 |
+
def _load_excluded_words(self) -> set:
    """Load list of words to exclude from crossword generation.

    Merges three sources: a built-in default set of overly generic
    words, the comma-separated EXCLUDED_WORDS environment variable, and
    an optional newline-delimited file named by WORD_EXCLUSION_FILE
    (blank lines and lines starting with '#' are ignored).

    Returns:
        Upper-cased set of words to filter out of candidate lists
        (exact-match filtering only).
    """
    # Default excluded words - overly generic or inappropriate for crosswords
    # (duplicate "VARIOUS" entry removed from the literal).
    default_excluded = {
        "WORD", "THING", "STUFF", "ITEMS", "THINGS", "WORDS", "TEXT", "STRING",
        "DATA", "INFO", "CONTENT", "MATERIAL", "ELEMENT", "OBJECT", "ENTITY",
        "CONCEPT", "IDEA", "NOTION", "ABSTRACT", "GENERAL", "SPECIFIC",
        "MULTIPLE", "SEVERAL", "MANY", "SOME", "MOST", "ALL", "EACH", "EVERY",
        "DIFFERENT", "SIMILAR", "SAME", "OTHER", "ANOTHER", "VARIOUS", "CERTAIN"
    }

    # Load additional exclusions from environment or file
    env_excluded = os.getenv("EXCLUDED_WORDS", "")
    if env_excluded:
        env_words = {word.strip().upper() for word in env_excluded.split(",") if word.strip()}
        default_excluded.update(env_words)

    # Try to load from exclusion file if it exists
    exclusion_file = os.getenv("WORD_EXCLUSION_FILE", "")
    if exclusion_file and os.path.exists(exclusion_file):
        try:
            # Explicit encoding: the platform default is unreliable.
            with open(exclusion_file, 'r', encoding='utf-8') as f:
                # One word per line; skip blanks and '#' comment lines.
                stripped = (line.strip() for line in f)
                file_words = {word.upper() for word in stripped if word and not word.startswith('#')}
            default_excluded.update(file_words)
            logger.info(f"π Loaded {len(file_words)} additional excluded words from {exclusion_file}")
        except Exception as e:
            logger.warning(f"β οΈ Failed to load exclusion file {exclusion_file}: {e}")

    logger.info(f"π« Loaded {len(default_excluded)} excluded words for filtering")
    return default_excluded
|
| 1025 |
+
|
| 1026 |
+
def _apply_word_exclusions(self, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
| 1027 |
+
"""Filter out excluded words from candidates."""
|
| 1028 |
+
if not candidates or not self.excluded_words:
|
| 1029 |
+
return candidates
|
| 1030 |
+
|
| 1031 |
+
filtered = []
|
| 1032 |
+
excluded_count = 0
|
| 1033 |
+
|
| 1034 |
+
for candidate in candidates:
|
| 1035 |
+
word = candidate['word'].upper()
|
| 1036 |
+
if word not in self.excluded_words:
|
| 1037 |
+
filtered.append(candidate)
|
| 1038 |
+
else:
|
| 1039 |
+
excluded_count += 1
|
| 1040 |
+
|
| 1041 |
+
if excluded_count > 0:
|
| 1042 |
+
logger.info(f"π« Excluded {excluded_count} inappropriate words from results")
|
| 1043 |
+
|
| 1044 |
+
return filtered
|
| 1045 |
+
|
| 1046 |
def _load_cached_index(self) -> bool:
|
| 1047 |
"""Load FAISS index from cache if available."""
|
| 1048 |
try:
|