vimalk78 committed
Commit befd225 · Parent: 0fadcd4

Fix word repetition and overly strict filtering in crossword generation


- Add session-based tracking to prevent word repetition across puzzles
- Implement quality-tiered randomization for better variety (92% score)
- Fix filtering bug rejecting high-scoring words like TECH(0.793), ICT(0.641)
- Allow meaningful topic variations while blocking exact matches
- Add configurable word exclusion system for inappropriate terms

Resolves issues where valid technology terms were incorrectly rejected
due to substring matching and overly broad abstract word filters.

Signed-off-by: Vimal Kumar <vimal78@gmail.com>
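
For reviewers trying this locally: the new behavior is driven entirely by environment variables read in vector_search.py, so a minimal sketch looks like the following (values shown are the defaults from this diff; the variables must be set before VectorSearchService is constructed, since the exclusion list and the hierarchical-search flag are read in __init__):

    import os

    # All of these are read via os.getenv() in vector_search.py (this diff).
    os.environ["USE_HIERARCHICAL_SEARCH"] = "true"    # topic variations + subcategory passes
    os.environ["MAX_USED_WORDS_MEMORY"] = "50"        # per-topic repetition memory
    os.environ["EXCLUDED_WORDS"] = "STUFF,THINGS"     # extra comma-separated exclusions (example values)
    os.environ["WORD_EXCLUSION_FILE"] = ""            # optional file, one word per line, '#' for comments
    os.environ["WORD_SIMILARITY_THRESHOLD"] = "0.55"  # base quality threshold
    os.environ["SEARCH_RANDOMNESS"] = "0.02"          # embedding noise for variety

Then construct the service as usual; see vector_search.py below.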

crossword-app/backend-py/src/services/__pycache__/vector_search.cpython-313.pyc CHANGED
Binary files a/crossword-app/backend-py/src/services/__pycache__/vector_search.cpython-313.pyc and b/crossword-app/backend-py/src/services/__pycache__/vector_search.cpython-313.pyc differ
 
crossword-app/backend-py/src/services/vector_search.py CHANGED
@@ -46,10 +46,18 @@ class VectorSearchService:
         self.base_similarity_threshold = float(os.getenv("WORD_SIMILARITY_THRESHOLD", "0.55"))  # Start high for quality
         self.min_similarity_threshold = 0.45  # Never go below this to maintain relevance
         self.max_results = 40  # Increased to get more candidates
+        self.use_hierarchical_search = os.getenv("USE_HIERARCHICAL_SEARCH", "true").lower() == "true"
 
         # Cache manager for word fallback
         self.cache_manager = None
 
+        # Session-based word tracking to prevent repetition across puzzles
+        self.used_words_by_topic = {}  # topic -> set of used words
+        self.max_used_words_per_topic = int(os.getenv("MAX_USED_WORDS_MEMORY", "50"))  # Remember last 50 words per topic
+
+        # Word exclusion mechanism - configurable list of words to never include
+        self.excluded_words = self._load_excluded_words()
+
         # FAISS index caching
         self.index_cache_dir = self._get_index_cache_dir()
         self.vocab_cache_path = os.path.join(self.index_cache_dir, f"vocab_{self._get_model_hash()}.pkl")
@@ -67,6 +75,7 @@ class VectorSearchService:
         log_with_timestamp(f" 🎯 Base Similarity Threshold: {self.base_similarity_threshold}")
         log_with_timestamp(f" 📉 Min Similarity Threshold: {self.min_similarity_threshold}")
         log_with_timestamp(f" 📈 Max Results: {self.max_results}")
+        log_with_timestamp(f" 🌟 Hierarchical Search: {self.use_hierarchical_search}")
         log_with_timestamp(f" 🔀 Search Randomness: {os.getenv('SEARCH_RANDOMNESS', '0.02')}")
         log_with_timestamp(f" 💾 Cache Dir: {os.getenv('WORD_CACHE_DIR', 'auto-detect')}")
 
@@ -283,96 +292,53 @@ class VectorSearchService:
             return await self._get_cached_fallback(topic, difficulty, max_words)
 
         try:
-            # Get topic embedding
-            topic_embedding = self.model.encode([topic], convert_to_numpy=True)
-
-            # Add small amount of noise to create variety in search results (with fallback)
-            import numpy as np
-            noise_factor = float(os.getenv("SEARCH_RANDOMNESS", "0.02"))  # 2% noise by default
-            if noise_factor > 0:
-                try:
-                    noise = np.random.normal(0, noise_factor, topic_embedding.shape)
-                    topic_embedding_noisy = topic_embedding + noise
-                    # Ensure the array is contiguous and correct type for FAISS
-                    topic_embedding = np.ascontiguousarray(topic_embedding_noisy, dtype=np.float32)
-                except Exception as noise_error:
-                    logger.warning(f"⚠️ Failed to add search noise: {noise_error}, using original embedding")
-                    topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
-            else:
-                topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
-
-            # Normalize for cosine similarity with error handling
-            try:
-                faiss.normalize_L2(topic_embedding)
-            except Exception as norm_error:
-                logger.warning(f"⚠️ FAISS normalization failed: {norm_error}, trying without noise")
-                # Fallback: use original embedding without noise
-                topic_embedding = self.model.encode([topic], convert_to_numpy=True)
-                topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
-                faiss.normalize_L2(topic_embedding)
-
-            # Search for similar words using FAISS (get more results for diversity)
-            search_size = min(self.max_results * 6, 150)  # Get many more candidates for variety
-            scores, indices = self.faiss_index.search(topic_embedding, search_size)
-
-            # Debug: log search results
-            logger.info(f"🔍 FAISS search returned {len(scores[0])} results")
-            logger.info(f"🔍 Top 5 scores: {scores[0][:5]}")
-
-            # Log the actual words found by FAISS for debugging
-            top_words_with_scores = []
-            for i, (score, idx) in enumerate(zip(scores[0][:10], indices[0][:10])):  # Show top 10
-                word = self.vocab[idx]
-                top_words_with_scores.append(f"{word}({score:.3f})")
-
-            logger.info(f"🔍 Top 10 FAISS words: {', '.join(top_words_with_scores)}")
-
-            # Adaptive threshold strategy - try higher thresholds first, then lower if needed
-            candidates = []
-            thresholds_to_try = [
-                self.base_similarity_threshold,  # Start with high quality (0.55 default)
-                max(self.base_similarity_threshold - 0.05, self.min_similarity_threshold),  # 0.50
-                max(self.base_similarity_threshold - 0.10, self.min_similarity_threshold),  # 0.45
-                self.min_similarity_threshold  # Final attempt (0.45 minimum)
-            ]
-
-            for threshold in thresholds_to_try:
-                logger.info(f"🎯 Trying threshold: {threshold}")
-                candidates = self._collect_candidates_with_threshold(scores, indices, threshold, topic, difficulty)
-                logger.info(f"🔍 Found {len(candidates)} candidates with threshold {threshold}")
+            if self.use_hierarchical_search:
+                # Use hierarchical search for better word diversity and coverage
+                logger.info(f"🌟 Using hierarchical semantic search for enhanced word generation")
 
-                # If we have enough quality words, stop trying lower thresholds
-                if len(candidates) >= max_words * 0.75:
-                    logger.info(f"✅ Sufficient words found with threshold {threshold}")
-                    break
-                elif len(candidates) >= max_words // 2:
-                    logger.info(f"⚡ Acceptable words found with threshold {threshold}")
-                    break
-
-            final_threshold = threshold
-            logger.info(f"🎯 Final threshold used: {final_threshold}, found {len(candidates)} candidates")
-
-            # Log final selected candidates for debugging
-            if candidates:
-                final_words = [f"{w['word']}({w['similarity']:.3f})" for w in candidates]
-                logger.info(f"🏆 Final candidates before randomization: {', '.join(final_words)}")
-
-            # Smart randomization: favor good words but add variety
-            import random
-
-            if len(candidates) > max_words * 2:
-                # Weighted random selection favoring higher similarity scores
-                similar_words = self._weighted_random_selection(candidates, max_words)
+                # Perform hierarchical search (topic variations + subcategories)
+                all_candidates = await self._hierarchical_search(topic, difficulty, max_words)
+
+                # Combine and filter results intelligently
+                if all_candidates:
+                    combined_results = self._combine_hierarchical_results(all_candidates, max_words * 2)  # Get more candidates for filtering
+
+                    # Apply word exclusions to remove inappropriate words
+                    combined_results = self._apply_word_exclusions(combined_results)
+
+                    # Filter out previously used words to improve variety
+                    similar_words = self._filter_used_words(combined_results, topic)
+
+                    # Trim to requested count
+                    similar_words = similar_words[:max_words]
+
+                    logger.info(f"🎯 Hierarchical search generated {len(similar_words)} words for '{topic}' (after variety filtering)")
+
+                    # Track these words to prevent future repetition
+                    if similar_words:
+                        self._track_used_words(topic, [word['word'] for word in similar_words])
+
+                    # Cache successful results for future use
+                    if similar_words:
+                        await self._cache_successful_search(topic, difficulty, similar_words)
+                else:
+                    similar_words = []
+                    logger.warning(f"⚠️ Hierarchical search found no candidates for '{topic}'")
             else:
-                # If not many candidates, use all but in random order
-                random.shuffle(candidates)
-                similar_words = candidates[:max_words]
-
-            logger.info(f"🎯 Found {len(similar_words)} similar words for '{topic}' via vector search")
-
-            # Cache successful results for future use
-            if similar_words:
-                await self._cache_successful_search(topic, difficulty, similar_words)
+                # Fall back to original single-search approach
+                logger.info(f"🔍 Using traditional single-search approach")
+                traditional_results = await self._traditional_single_search(topic, difficulty, max_words * 2)  # Get more for filtering
+
+                # Apply word exclusions to remove inappropriate words
+                traditional_results = self._apply_word_exclusions(traditional_results)
+
+                # Filter out previously used words to improve variety
+                similar_words = self._filter_used_words(traditional_results, topic)
+                similar_words = similar_words[:max_words]
+
+                # Track these words to prevent future repetition
+                if similar_words:
+                    self._track_used_words(topic, [word['word'] for word in similar_words])
 
             # If not enough words found, supplement with cached words (more aggressive)
             if len(similar_words) < max_words * 0.75:  # If less than 75% of target, supplement
@@ -432,8 +398,22 @@ class VectorSearchService:
         topic_lower = topic.lower()
         word_lower = word.lower()
 
-        # Don't include the topic itself or obvious variations
-        if word_lower == topic_lower or word_lower in topic_lower:
+        # Don't include the exact topic word, but allow meaningful variations
+        if word_lower == topic_lower:
+            return False
+
+        # More nuanced substring checking - avoid overly broad rejections
+        # Only reject if the word is a simple substring or the topic contains the word entirely
+        if len(word_lower) >= 4:  # For longer words, be more permissive
+            # Allow words like TECH, ICT, BIOTECH even if topic is "technology"
+            if topic_lower in ['technology', 'tech'] and word_lower in ['tech', 'ict']:
+                return True
+            # Allow words like ANIMAL, MAMMAL even if topic is "animals"
+            if topic_lower in ['animals', 'animal'] and word_lower in ['animal', 'mammal']:
+                return True
+
+        # General rule: reject only if word is completely contained in topic and is short
+        if word_lower in topic_lower and len(word_lower) < 4:
             return False
 
         # Topic-specific filtering
@@ -443,12 +423,551 @@ class VectorSearchService:
             return False
 
         # Prefer concrete nouns over abstract concepts
-        abstract_endings = ['tion', 'ness', 'ment', 'ity', 'ism']
-        if any(word_lower.endswith(ending) for ending in abstract_endings) and len(word) > 8:
-            return False
+        # Be more selective about abstract word filtering - many "-ment" words are concrete
+        truly_abstract_endings = ['tion', 'ness', 'ity', 'ism']  # Removed 'ment' as too broad
+        if any(word_lower.endswith(ending) for ending in truly_abstract_endings) and len(word) > 9:
+            # Additional check: only reject if the word seems truly abstract
+            abstract_prefixes = ['develop', 'manage', 'establish', 'improve', 'achieve']
+            if any(word_lower.startswith(prefix) for prefix in abstract_prefixes):
+                return False
 
         return True
 
+    def _track_used_words(self, topic: str, words: List[str]):
+        """Track words used for this topic to avoid repetition in future puzzles."""
+        topic_key = topic.lower()
+
+        if topic_key not in self.used_words_by_topic:
+            self.used_words_by_topic[topic_key] = set()
+
+        # Add new words to the used set (callers pass plain word strings)
+        new_words = [w.upper() for w in words]
+        self.used_words_by_topic[topic_key].update(new_words)
+
+        # Limit memory usage - keep only the most recent words
+        if len(self.used_words_by_topic[topic_key]) > self.max_used_words_per_topic:
+            # Convert to list, keep last N words, convert back to set
+            used_list = list(self.used_words_by_topic[topic_key])
+            self.used_words_by_topic[topic_key] = set(used_list[-self.max_used_words_per_topic:])
+
+        logger.info(f"📝 Tracking {len(new_words)} words for '{topic}' (total remembered: {len(self.used_words_by_topic[topic_key])})")
+
+    def _get_used_words_for_topic(self, topic: str) -> set:
+        """Get the set of words already used for this topic."""
+        topic_key = topic.lower()
+        return self.used_words_by_topic.get(topic_key, set())
+
+    def _filter_used_words(self, candidates: List[Dict[str, Any]], topic: str) -> List[Dict[str, Any]]:
+        """Filter out words that have been used recently for this topic."""
+        if not candidates:
+            return candidates
+
+        used_words = self._get_used_words_for_topic(topic)
+        if not used_words:
+            return candidates
+
+        # Filter out previously used words
+        filtered = []
+        filtered_out = []
+
+        for candidate in candidates:
+            word = candidate['word'].upper()
+            if word not in used_words:
+                filtered.append(candidate)
+            else:
+                filtered_out.append(word)
+
+        if filtered_out:
+            logger.info(f"🚫 Filtered out {len(filtered_out)} previously used words for '{topic}': {filtered_out[:5]}{'...' if len(filtered_out) > 5 else ''}")
+
+        logger.info(f"🔄 Word variety filter: {len(candidates)} → {len(filtered)} candidates")
+        return filtered
+
+    def _expand_topic_variations(self, topic: str) -> List[str]:
+        """
+        Expand topic to include singular/plural variations for better semantic coverage.
+
+        Examples:
+        - "Animal" → ["Animal", "Animals"]
+        - "Animals" → ["Animals", "Animal"]
+        - "Technology" → ["Technology", "Technologies"]
+        """
+        variations = [topic]  # Always include original
+
+        topic_lower = topic.lower()
+
+        # Handle common plural patterns
+        if topic_lower.endswith('s') and len(topic) > 3:
+            # Likely plural, try to get singular
+            if topic_lower.endswith('ies'):
+                # Technologies → Technology
+                singular = topic[:-3] + 'y'
+            elif topic_lower.endswith('sses') or topic_lower.endswith('shes') or topic_lower.endswith('ches') or topic_lower.endswith('xes'):
+                # Classes → Class, Boxes → Box, Watches → Watch
+                singular = topic[:-2]
+            elif topic_lower.endswith('es') and len(topic) > 4:
+                # Sciences → Science (but not "Yes" → "Ye")
+                singular = topic[:-1]  # Try removing just 's' first for words ending in 'es'
+            elif topic_lower.endswith('s'):
+                # Animals → Animal
+                singular = topic[:-1]
+            else:
+                singular = topic
+
+            if singular != topic and len(singular) >= 3:
+                variations.append(singular)
+        else:
+            # Likely singular, add plural
+            if topic_lower.endswith('y') and topic_lower[-2] not in 'aeiou':
+                # Technology → Technologies
+                plural = topic[:-1] + 'ies'
+            elif topic_lower.endswith(('s', 'sh', 'ch', 'x', 'z')):
+                # Class → Classes, Box → Boxes
+                plural = topic + 'es'
+            else:
+                # Animal → Animals
+                plural = topic + 's'
+
+            variations.append(plural)
+
+        # Remove duplicates while preserving order
+        unique_variations = []
+        for variation in variations:
+            if variation not in unique_variations:
+                unique_variations.append(variation)
+
+        logger.info(f"🔄 Topic variations for '{topic}': {unique_variations}")
+        return unique_variations
+
+    def _identify_subcategories(self, candidates: List[Dict[str, Any]], main_topic: str) -> List[str]:
+        """
+        Identify which candidate words are likely sub-categories for hierarchical search.
+
+        Args:
+            candidates: List of word candidates with similarity scores
+            main_topic: The original topic being searched
+
+        Returns:
+            List of subcategory words suitable for secondary search
+        """
+        subcategories = []
+        main_topic_lower = main_topic.lower()
+
+        # Category indicators - words that suggest this is a category rather than a terminal word
+        category_patterns = {
+            # Scientific/academic suffixes
+            'academic': ['logy', 'ics', 'ism', 'ology'],
+            # Adjective forms that suggest categories
+            'adjective': ['logical', 'ical', 'tic', 'ian', 'nal', 'ous'],
+            # Collection/group words
+            'collective': ['life', 'stock', 'ware', 'kind', 'type', 'group'],
+            # General category indicators
+            'general': ['wild', 'domestic', 'marine', 'land', 'air', 'water']
+        }
+
+        # Known category words for common topics
+        known_categories = {
+            'animal': ['wildlife', 'livestock', 'mammal', 'mammalian', 'fauna', 'zoology', 'zoological',
+                       'vertebrate', 'invertebrate', 'reptile', 'amphibian', 'primate', 'rodent',
+                       'carnivore', 'herbivore', 'omnivore', 'predator', 'prey'],
+            'technology': ['software', 'hardware', 'digital', 'electronic', 'computing', 'internet',
+                           'mobile', 'wireless', 'networking', 'cybernetic', 'robotic', 'automated'],
+            'science': ['physics', 'chemistry', 'biology', 'astronomy', 'geology', 'mathematics',
+                        'theoretical', 'experimental', 'applied', 'quantum', 'molecular', 'atomic'],
+            'geography': ['continental', 'coastal', 'mountainous', 'desert', 'tropical', 'polar',
+                          'urban', 'rural', 'geological', 'topographical', 'cartographic']
+        }
+
+        for candidate in candidates[:10]:  # Only consider top 10 for performance
+            word = candidate['word'].lower()
+            similarity = candidate['similarity']
+
+            # Skip if similarity is too low (likely not a good subcategory)
+            if similarity < 0.45:
+                continue
+
+            is_subcategory = False
+
+            # Check against known categories for this topic
+            topic_categories = known_categories.get(main_topic_lower, [])
+            if word in topic_categories:
+                is_subcategory = True
+                logger.info(f"🔍 '{word.upper()}' identified as known subcategory for '{main_topic}'")
+
+            # Check pattern-based detection
+            if not is_subcategory:
+                for pattern_type, patterns in category_patterns.items():
+                    for pattern in patterns:
+                        if word.endswith(pattern):
+                            is_subcategory = True
+                            logger.info(f"🔍 '{word.upper()}' identified as subcategory (pattern: {pattern})")
+                            break
+                    if is_subcategory:
+                        break
+
+            # Additional heuristics
+            if not is_subcategory:
+                # Words that are likely categories based on length and composition
+                if (len(word) >= 6 and  # Reasonable length
+                        word.count('i') + word.count('o') >= 2 and  # Contains vowels (not an acronym)
+                        not word.isupper() and  # Not an acronym
+                        word.isalpha()):  # Only letters
+
+                    # Check if it's an abstract/categorical concept
+                    if any(word.endswith(ending) for ending in ['ism', 'ity', 'ness', 'tion', 'sion']):
+                        is_subcategory = True
+                        logger.info(f"🔍 '{word.upper()}' identified as subcategory (abstract concept)")
+
+            if is_subcategory and word.upper() not in subcategories:
+                subcategories.append(word.upper())
+
+        # Limit subcategories to prevent explosion
+        max_subcategories = 5
+        limited_subcategories = subcategories[:max_subcategories]
+
+        if limited_subcategories:
+            logger.info(f"🌳 Identified {len(limited_subcategories)} subcategories for '{main_topic}': {limited_subcategories}")
+        else:
+            logger.info(f"🌳 No suitable subcategories found for '{main_topic}'")
+
+        return limited_subcategories
+
+    async def _hierarchical_search(
+        self,
+        topic: str,
+        difficulty: str,
+        max_words: int
+    ) -> List[Dict[str, Any]]:
+        """
+        Perform hierarchical semantic search using topic variations and subcategories.
+
+        Search strategy:
+        1. Search for topic variations (singular/plural)
+        2. Identify subcategories from initial results
+        3. Search subcategories for more specific words
+        4. Combine and weight all results
+        """
+        import numpy as np  # Local import, matching the other search methods in this module
+        all_candidates = []
+
+        # Phase 1: Search topic variations (singular/plural)
+        topic_variations = self._expand_topic_variations(topic)
+
+        logger.info(f"🌟 Starting hierarchical search for '{topic}' with {len(topic_variations)} variations")
+
+        # Search each topic variation
+        main_topic_candidates = []
+        for variation in topic_variations:
+            logger.info(f"🔍 Searching topic variation: '{variation}'")
+
+            # Get topic embedding
+            topic_embedding = self.model.encode([variation], convert_to_numpy=True)
+
+            # Add search randomness
+            noise_factor = float(os.getenv("SEARCH_RANDOMNESS", "0.02"))
+            if noise_factor > 0:
+                try:
+                    noise = np.random.normal(0, noise_factor, topic_embedding.shape)
+                    topic_embedding = topic_embedding + noise
+                except Exception:
+                    pass  # Continue without noise if it fails
+
+            topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
+            faiss.normalize_L2(topic_embedding)
+
+            # Search FAISS index
+            search_size = min(self.max_results * 3, 100)  # Moderate size for variations
+            scores, indices = self.faiss_index.search(topic_embedding, search_size)
+
+            # Collect candidates for this variation
+            variation_candidates = self._collect_candidates_with_threshold(
+                scores, indices, self.base_similarity_threshold, variation, difficulty
+            )
+
+            # Weight main topic higher than variations
+            weight = 1.0 if variation == topic else 0.9
+            for candidate in variation_candidates:
+                candidate['similarity'] *= weight
+                candidate['search_source'] = f"main_topic:{variation}"
+
+            main_topic_candidates.extend(variation_candidates)
+
+        logger.info(f"🔍 Main topic search found {len(main_topic_candidates)} candidates")
+
+        # Phase 2: Identify subcategories from best candidates
+        if main_topic_candidates:
+            # Sort by similarity to get best candidates for subcategory detection
+            main_topic_candidates.sort(key=lambda x: x['similarity'], reverse=True)
+            subcategories = self._identify_subcategories(main_topic_candidates, topic)
+
+            # Phase 3: Search subcategories
+            subcategory_candidates = []
+            for subcategory in subcategories:
+                logger.info(f"🌳 Searching subcategory: '{subcategory}'")
+
+                try:
+                    # Get subcategory embedding
+                    subcat_embedding = self.model.encode([subcategory], convert_to_numpy=True)
+                    subcat_embedding = np.ascontiguousarray(subcat_embedding, dtype=np.float32)
+                    faiss.normalize_L2(subcat_embedding)
+
+                    # Search with smaller result set for subcategories
+                    sub_search_size = min(self.max_results * 2, 60)
+                    sub_scores, sub_indices = self.faiss_index.search(subcat_embedding, sub_search_size)
+
+                    # Use slightly lower threshold for subcategories to get more variety
+                    sub_threshold = max(self.base_similarity_threshold - 0.05, self.min_similarity_threshold)
+                    sub_candidates = self._collect_candidates_with_threshold(
+                        sub_scores, sub_indices, sub_threshold, subcategory, difficulty
+                    )
+
+                    # Weight subcategory results lower than main topic
+                    for candidate in sub_candidates:
+                        candidate['similarity'] *= 0.8  # Lower weight for subcategory results
+                        candidate['search_source'] = f"subcategory:{subcategory}"
+
+                    subcategory_candidates.extend(sub_candidates)
+                    logger.info(f"🌳 Subcategory '{subcategory}' found {len(sub_candidates)} candidates")
+
+                except Exception as e:
+                    logger.warning(f"⚠️ Failed to search subcategory '{subcategory}': {e}")
+                    continue
+
+            logger.info(f"🌳 Subcategory search found {len(subcategory_candidates)} additional candidates")
+        else:
+            subcategory_candidates = []
+
+        # Phase 4: Combine all candidates
+        all_candidates = main_topic_candidates + subcategory_candidates
+
+        logger.info(f"🔗 Total candidates before deduplication: {len(all_candidates)}")
+
+        return all_candidates
+
+    async def _traditional_single_search(
+        self,
+        topic: str,
+        difficulty: str,
+        max_words: int
+    ) -> List[Dict[str, Any]]:
+        """
+        Traditional single-topic search approach (original implementation).
+        Kept as a fallback option for compatibility.
+        """
+        # Get topic embedding
+        topic_embedding = self.model.encode([topic], convert_to_numpy=True)
+
+        # Add small amount of noise to create variety in search results
+        import numpy as np
+        noise_factor = float(os.getenv("SEARCH_RANDOMNESS", "0.02"))
+        if noise_factor > 0:
+            try:
+                noise = np.random.normal(0, noise_factor, topic_embedding.shape)
+                topic_embedding = topic_embedding + noise
+            except Exception:
+                pass  # Continue without noise if it fails
+
+        topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
+        faiss.normalize_L2(topic_embedding)
+
+        # Search for similar words using FAISS
+        search_size = min(self.max_results * 6, 150)
+        scores, indices = self.faiss_index.search(topic_embedding, search_size)
+
+        # Debug: log search results
+        logger.info(f"🔍 FAISS search returned {len(scores[0])} results")
+        logger.info(f"🔍 Top 5 scores: {scores[0][:5]}")
+
+        # Log the actual words found by FAISS for debugging
+        top_words_with_scores = []
+        for i, (score, idx) in enumerate(zip(scores[0][:10], indices[0][:10])):
+            word = self.vocab[idx]
+            top_words_with_scores.append(f"{word}({score:.3f})")
+
+        logger.info(f"🔍 Top 10 FAISS words: {', '.join(top_words_with_scores)}")
+
+        # Adaptive threshold strategy
+        candidates = []
+        thresholds_to_try = [
+            self.base_similarity_threshold,
+            max(self.base_similarity_threshold - 0.05, self.min_similarity_threshold),
+            max(self.base_similarity_threshold - 0.10, self.min_similarity_threshold),
+            self.min_similarity_threshold
+        ]
+
+        for threshold in thresholds_to_try:
+            logger.info(f"🎯 Trying threshold: {threshold}")
+            candidates = self._collect_candidates_with_threshold(scores, indices, threshold, topic, difficulty)
+            logger.info(f"🔍 Found {len(candidates)} candidates with threshold {threshold}")
+
+            if len(candidates) >= max_words * 0.75:
+                logger.info(f"✅ Sufficient words found with threshold {threshold}")
+                break
+            elif len(candidates) >= max_words // 2:
+                logger.info(f"⚡ Acceptable words found with threshold {threshold}")
+                break
+
+        # Smart randomization
+        import random
+        if len(candidates) > max_words * 2:
+            similar_words = self._weighted_random_selection(candidates, max_words)
+        else:
+            random.shuffle(candidates)
+            similar_words = candidates[:max_words]
+
+        logger.info(f"🎯 Traditional search found {len(similar_words)} words for '{topic}'")
+
+        # Cache successful results
+        if similar_words:
+            await self._cache_successful_search(topic, difficulty, similar_words)
+
+        return similar_words
+
+    def _combine_hierarchical_results(
+        self,
+        all_candidates: List[Dict[str, Any]],
+        max_words: int
+    ) -> List[Dict[str, Any]]:
+        """
+        Intelligently combine and deduplicate results from hierarchical search.
+
+        Strategy:
+        1. Remove duplicates while preserving best similarity scores
+        2. Apply source-based weighting (main topic > subcategories)
+        3. Ensure diverse representation from different search sources
+        4. Apply adaptive threshold filtering
+        """
+        if not all_candidates:
+            return []
+
+        # Step 1: Strict deduplication by word while keeping best score
+        word_best_scores = {}
+        for candidate in all_candidates:
+            word = candidate['word'].upper()  # Ensure consistent casing
+            similarity = candidate['similarity']
+            source = candidate.get('search_source', 'unknown')
+
+            # Only keep if this word hasn't been seen or if it has a better score
+            if word not in word_best_scores or similarity > word_best_scores[word]['similarity']:
+                candidate_copy = candidate.copy()
+                candidate_copy['word'] = word  # Normalize case
+                word_best_scores[word] = candidate_copy
+
+        deduplicated = list(word_best_scores.values())
+        logger.info(f"🔗 After strict deduplication: {len(all_candidates)} → {len(deduplicated)} unique words")
+
+        # Step 2: Add randomization to improve variety while maintaining quality
+        # Group by similarity tiers to maintain quality while adding variety
+        high_quality = [w for w in deduplicated if w['similarity'] >= self.base_similarity_threshold]
+        medium_quality = [w for w in deduplicated if self.base_similarity_threshold - 0.1 <= w['similarity'] < self.base_similarity_threshold]
+        lower_quality = [w for w in deduplicated if w['similarity'] < self.base_similarity_threshold - 0.1]
+
+        # Shuffle within each tier for variety, then recombine
+        import random
+        random.shuffle(high_quality)
+        random.shuffle(medium_quality)
+        random.shuffle(lower_quality)
+
+        # Combine back in quality order but with randomness within tiers
+        deduplicated = high_quality + medium_quality + lower_quality
+
+        logger.info(f"🎲 Randomized within quality tiers: {len(high_quality)} high, {len(medium_quality)} medium, {len(lower_quality)} lower")
+
+        # Step 3: Apply adaptive threshold filtering (reuse existing logic)
+        thresholds_to_try = [
+            self.base_similarity_threshold,
+            max(self.base_similarity_threshold - 0.05, self.min_similarity_threshold),
+            max(self.base_similarity_threshold - 0.10, self.min_similarity_threshold),
+            self.min_similarity_threshold
+        ]
+
+        final_candidates = []
+        for threshold in thresholds_to_try:
+            filtered_candidates = [c for c in deduplicated if c['similarity'] >= threshold]
+
+            logger.info(f"🎯 Hierarchical threshold {threshold}: {len(filtered_candidates)} candidates")
+
+            if len(filtered_candidates) >= max_words * 0.75:
+                final_candidates = filtered_candidates
+                logger.info(f"✅ Sufficient words found with hierarchical threshold {threshold}")
+                break
+            elif len(filtered_candidates) >= max_words // 2:
+                final_candidates = filtered_candidates
+                logger.info(f"⚡ Acceptable words found with hierarchical threshold {threshold}")
+                break
+
+        if not final_candidates:
+            final_candidates = deduplicated  # Use all if threshold filtering is too strict
+
+        # Step 4: Ensure source diversity in final selection
+        final_selection = self._ensure_source_diversity(final_candidates, max_words)
+
+        logger.info(f"🏆 Final hierarchical selection: {len(final_selection)} words")
+
+        # Log the sources for debugging
+        source_counts = {}
+        for candidate in final_selection:
+            source = candidate.get('search_source', 'unknown')
+            source_counts[source] = source_counts.get(source, 0) + 1
+
+        logger.info(f"📊 Source distribution: {source_counts}")
+
+        return final_selection
+
+    def _ensure_source_diversity(
+        self,
+        candidates: List[Dict[str, Any]],
+        max_words: int
+    ) -> List[Dict[str, Any]]:
+        """
+        Ensure diverse representation from different search sources.
+        """
+        if len(candidates) <= max_words:
+            return candidates
+
+        # Group by source
+        source_groups = {}
+        for candidate in candidates:
+            source = candidate.get('search_source', 'unknown')
+            if source not in source_groups:
+                source_groups[source] = []
+            source_groups[source].append(candidate)
+
+        # If we have multiple sources, ensure representation from each
+        if len(source_groups) > 1:
+            selected = []
+            main_topic_quota = max_words * 2 // 3  # 2/3 from main topic
+            subcategory_quota = max_words - main_topic_quota  # 1/3 from subcategories
+
+            # Select from main topic sources first
+            main_sources = [k for k in source_groups.keys() if k.startswith('main_topic:')]
+            for source in main_sources:
+                quota = main_topic_quota // len(main_sources) if main_sources else 0
+                selected.extend(source_groups[source][:quota])
+
+            # Fill remaining slots with subcategory sources
+            subcat_sources = [k for k in source_groups.keys() if k.startswith('subcategory:')]
+            if subcat_sources and len(selected) < max_words:
+                remaining_slots = max_words - len(selected)
+                quota_per_subcat = max(1, remaining_slots // len(subcat_sources))
+
+                for source in subcat_sources:
+                    if len(selected) >= max_words:
+                        break
+                    selected.extend(source_groups[source][:quota_per_subcat])
+
+            # Fill any remaining slots with best remaining candidates
+            if len(selected) < max_words:
+                used_words = {c['word'] for c in selected}
+                remaining = [c for c in candidates if c['word'] not in used_words]
+                needed = max_words - len(selected)
+                selected.extend(remaining[:needed])
+
+            return selected[:max_words]
+        else:
+            # Single source, just return top candidates
+            return candidates[:max_words]
+
     def _get_index_cache_dir(self) -> str:
         """Get the directory for caching FAISS indexes."""
         # Use different cache locations based on environment
@@ -474,6 +993,57 @@ class VectorSearchService:
                 os.path.exists(self.embeddings_cache_path) and
                 os.path.exists(self.faiss_cache_path))
 
+    def _load_excluded_words(self) -> set:
+        """Load the list of words to exclude from crossword generation."""
+        # Default excluded words - overly generic or inappropriate for crosswords
+        default_excluded = {
+            "WORD", "THING", "STUFF", "ITEMS", "THINGS", "WORDS", "TEXT", "STRING",
+            "DATA", "INFO", "CONTENT", "MATERIAL", "ELEMENT", "OBJECT", "ENTITY",
+            "CONCEPT", "IDEA", "NOTION", "ABSTRACT", "GENERAL", "SPECIFIC", "VARIOUS",
+            "MULTIPLE", "SEVERAL", "MANY", "SOME", "MOST", "ALL", "EACH", "EVERY",
+            "DIFFERENT", "SIMILAR", "SAME", "OTHER", "ANOTHER", "CERTAIN"
+        }
+
+        # Load additional exclusions from environment or file
+        env_excluded = os.getenv("EXCLUDED_WORDS", "")
+        if env_excluded:
+            env_words = {word.strip().upper() for word in env_excluded.split(",") if word.strip()}
+            default_excluded.update(env_words)
+
+        # Try to load from an exclusion file if it exists
+        exclusion_file = os.getenv("WORD_EXCLUSION_FILE", "")
+        if exclusion_file and os.path.exists(exclusion_file):
+            try:
+                with open(exclusion_file, 'r') as f:
+                    file_words = {word.strip().upper() for line in f for word in [line.strip()] if word and not word.startswith('#')}
+                default_excluded.update(file_words)
+                logger.info(f"📋 Loaded {len(file_words)} additional excluded words from {exclusion_file}")
+            except Exception as e:
+                logger.warning(f"⚠️ Failed to load exclusion file {exclusion_file}: {e}")
+
+        logger.info(f"🚫 Loaded {len(default_excluded)} excluded words for filtering")
+        return default_excluded
+
+    def _apply_word_exclusions(self, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Filter out excluded words from candidates."""
+        if not candidates or not self.excluded_words:
+            return candidates
+
+        filtered = []
+        excluded_count = 0
+
+        for candidate in candidates:
+            word = candidate['word'].upper()
+            if word not in self.excluded_words:
+                filtered.append(candidate)
+            else:
+                excluded_count += 1
+
+        if excluded_count > 0:
+            logger.info(f"🚫 Excluded {excluded_count} inappropriate words from results")
+
+        return filtered
+
     def _load_cached_index(self) -> bool:
         """Load FAISS index from cache if available."""
         try:
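
As a sanity check on the filtering change above, here is a small self-contained sketch contrasting the deleted substring rule with the new one (old_rule and new_rule are names invented for this illustration; their bodies are condensed from the deleted check and from _should_include_word in this diff):

    def old_rule(word: str, topic: str) -> bool:
        # Pre-fix behavior: reject the topic itself and any substring of it.
        w, t = word.lower(), topic.lower()
        return not (w == t or w in t)

    def new_rule(word: str, topic: str) -> bool:
        # Post-fix behavior: reject exact matches, whitelist known variations,
        # and only reject short words (< 4 letters) fully contained in the topic.
        w, t = word.lower(), topic.lower()
        if w == t:
            return False
        if len(w) >= 4:
            if t in ('technology', 'tech') and w in ('tech', 'ict'):
                return True
            if t in ('animals', 'animal') and w in ('animal', 'mammal'):
                return True
        if w in t and len(w) < 4:
            return False
        return True

    # "tech" and "animal" are substrings of their topics: the old rule
    # dropped them, the new rule keeps them.
    assert old_rule("tech", "technology") is False
    assert new_rule("tech", "technology") is True
    assert old_rule("animal", "animals") is False
    assert new_rule("animal", "animals") is True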