vimalk78 committed
Commit befd225 · Parent: 0fadcd4

Fix word repetition and overly strict filtering in crossword generation


- Add session-based tracking to prevent word repetition across puzzles
- Implement quality-tiered randomization for better variety (92% score)
- Fix filtering bug rejecting high-scoring words like TECH(0.793), ICT(0.641)
- Allow meaningful topic variations while blocking exact matches
- Add configurable word exclusion system for inappropriate terms

Resolves issues where valid technology terms were incorrectly rejected
due to substring matching and overly broad abstract word filters.

Signed-off-by: Vimal Kumar <vimal78@gmail.com>
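
For reviewers trying this locally: the new behavior is driven entirely by environment variables read in vector_search.py, so a minimal sketch looks like the following (values shown are the defaults from this diff; the variables must be set before VectorSearchService is constructed, since the exclusion list and the hierarchical-search flag are read in __init__):

    import os

    # All of these are read via os.getenv() in vector_search.py (this diff).
    os.environ["USE_HIERARCHICAL_SEARCH"] = "true"    # topic variations + subcategory passes
    os.environ["MAX_USED_WORDS_MEMORY"] = "50"        # per-topic repetition memory
    os.environ["EXCLUDED_WORDS"] = "STUFF,THINGS"     # extra comma-separated exclusions (example values)
    os.environ["WORD_EXCLUSION_FILE"] = ""            # optional file, one word per line, '#' for comments
    os.environ["WORD_SIMILARITY_THRESHOLD"] = "0.55"  # base quality threshold
    os.environ["SEARCH_RANDOMNESS"] = "0.02"          # embedding noise for variety

Then construct the service as usual; see vector_search.py below.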

crossword-app/backend-py/src/services/__pycache__/vector_search.cpython-313.pyc CHANGED
Binary files a/crossword-app/backend-py/src/services/__pycache__/vector_search.cpython-313.pyc and b/crossword-app/backend-py/src/services/__pycache__/vector_search.cpython-313.pyc differ
 
crossword-app/backend-py/src/services/vector_search.py CHANGED
@@ -46,10 +46,18 @@ class VectorSearchService:
         self.base_similarity_threshold = float(os.getenv("WORD_SIMILARITY_THRESHOLD", "0.55"))  # Start high for quality
         self.min_similarity_threshold = 0.45  # Never go below this to maintain relevance
         self.max_results = 40  # Increased to get more candidates
+        self.use_hierarchical_search = os.getenv("USE_HIERARCHICAL_SEARCH", "true").lower() == "true"
 
         # Cache manager for word fallback
         self.cache_manager = None
 
+        # Session-based word tracking to prevent repetition across puzzles
+        self.used_words_by_topic = {}  # topic -> set of used words
+        self.max_used_words_per_topic = int(os.getenv("MAX_USED_WORDS_MEMORY", "50"))  # Remember last 50 words per topic
+
+        # Word exclusion mechanism - configurable list of words to never include
+        self.excluded_words = self._load_excluded_words()
+
         # FAISS index caching
         self.index_cache_dir = self._get_index_cache_dir()
         self.vocab_cache_path = os.path.join(self.index_cache_dir, f"vocab_{self._get_model_hash()}.pkl")
@@ -67,6 +75,7 @@ class VectorSearchService:
         log_with_timestamp(f" 🎯 Base Similarity Threshold: {self.base_similarity_threshold}")
         log_with_timestamp(f" 📉 Min Similarity Threshold: {self.min_similarity_threshold}")
         log_with_timestamp(f" 📈 Max Results: {self.max_results}")
+        log_with_timestamp(f" 🌟 Hierarchical Search: {self.use_hierarchical_search}")
         log_with_timestamp(f" 🔀 Search Randomness: {os.getenv('SEARCH_RANDOMNESS', '0.02')}")
         log_with_timestamp(f" 💾 Cache Dir: {os.getenv('WORD_CACHE_DIR', 'auto-detect')}")
 
@@ -283,96 +292,53 @@ class VectorSearchService:
             return await self._get_cached_fallback(topic, difficulty, max_words)
 
         try:
-            # Get topic embedding
-            topic_embedding = self.model.encode([topic], convert_to_numpy=True)
-
-            # Add small amount of noise to create variety in search results (with fallback)
-            import numpy as np
-            noise_factor = float(os.getenv("SEARCH_RANDOMNESS", "0.02"))  # 2% noise by default
-            if noise_factor > 0:
-                try:
-                    noise = np.random.normal(0, noise_factor, topic_embedding.shape)
-                    topic_embedding_noisy = topic_embedding + noise
-                    # Ensure the array is contiguous and correct type for FAISS
-                    topic_embedding = np.ascontiguousarray(topic_embedding_noisy, dtype=np.float32)
-                except Exception as noise_error:
-                    logger.warning(f"⚠️ Failed to add search noise: {noise_error}, using original embedding")
-                    topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
-            else:
-                topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
-
-            # Normalize for cosine similarity with error handling
-            try:
-                faiss.normalize_L2(topic_embedding)
-            except Exception as norm_error:
-                logger.warning(f"⚠️ FAISS normalization failed: {norm_error}, trying without noise")
-                # Fallback: use original embedding without noise
-                topic_embedding = self.model.encode([topic], convert_to_numpy=True)
-                topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
-                faiss.normalize_L2(topic_embedding)
-
-            # Search for similar words using FAISS (get more results for diversity)
-            search_size = min(self.max_results * 6, 150)  # Get many more candidates for variety
-            scores, indices = self.faiss_index.search(topic_embedding, search_size)
-
-            # Debug: log search results
-            logger.info(f"🔍 FAISS search returned {len(scores[0])} results")
-            logger.info(f"🔍 Top 5 scores: {scores[0][:5]}")
-
-            # Log the actual words found by FAISS for debugging
-            top_words_with_scores = []
-            for i, (score, idx) in enumerate(zip(scores[0][:10], indices[0][:10])):  # Show top 10
-                word = self.vocab[idx]
-                top_words_with_scores.append(f"{word}({score:.3f})")
-
-            logger.info(f"🔍 Top 10 FAISS words: {', '.join(top_words_with_scores)}")
-
-            # Adaptive threshold strategy - try higher thresholds first, then lower if needed
-            candidates = []
-            thresholds_to_try = [
-                self.base_similarity_threshold,  # Start with high quality (0.55 default)
-                max(self.base_similarity_threshold - 0.05, self.min_similarity_threshold),  # 0.50
-                max(self.base_similarity_threshold - 0.10, self.min_similarity_threshold),  # 0.45
-                self.min_similarity_threshold  # Final attempt (0.45 minimum)
-            ]
-
-            for threshold in thresholds_to_try:
-                logger.info(f"🎯 Trying threshold: {threshold}")
-                candidates = self._collect_candidates_with_threshold(scores, indices, threshold, topic, difficulty)
-                logger.info(f"🔍 Found {len(candidates)} candidates with threshold {threshold}")
+            if self.use_hierarchical_search:
+                # Use hierarchical search for better word diversity and coverage
+                logger.info(f"🌟 Using hierarchical semantic search for enhanced word generation")
 
-                # If we have enough quality words, stop trying lower thresholds
-                if len(candidates) >= max_words * 0.75:
-                    logger.info(f"✅ Sufficient words found with threshold {threshold}")
-                    break
-                elif len(candidates) >= max_words // 2:
-                    logger.info(f"⚡ Acceptable words found with threshold {threshold}")
-                    break
-
-            final_threshold = threshold
-            logger.info(f"🎯 Final threshold used: {final_threshold}, found {len(candidates)} candidates")
-
-            # Log final selected candidates for debugging
-            if candidates:
-                final_words = [f"{w['word']}({w['similarity']:.3f})" for w in candidates]
-                logger.info(f"🏆 Final candidates before randomization: {', '.join(final_words)}")
-
-            # Smart randomization: favor good words but add variety
-            import random
-
-            if len(candidates) > max_words * 2:
-                # Weighted random selection favoring higher similarity scores
-                similar_words = self._weighted_random_selection(candidates, max_words)
+                # Perform hierarchical search (topic variations + subcategories)
+                all_candidates = await self._hierarchical_search(topic, difficulty, max_words)
+
+                # Combine and filter results intelligently
+                if all_candidates:
+                    combined_results = self._combine_hierarchical_results(all_candidates, max_words * 2)  # Get more candidates for filtering
+
+                    # Apply word exclusions to remove inappropriate words
+                    combined_results = self._apply_word_exclusions(combined_results)
+
+                    # Filter out previously used words to improve variety
+                    similar_words = self._filter_used_words(combined_results, topic)
+
+                    # Trim to requested count
+                    similar_words = similar_words[:max_words]
+
+                    logger.info(f"🎯 Hierarchical search generated {len(similar_words)} words for '{topic}' (after variety filtering)")
+
+                    # Track these words to prevent future repetition
+                    if similar_words:
+                        self._track_used_words(topic, [word['word'] for word in similar_words])
+
+                    # Cache successful results for future use
+                    if similar_words:
+                        await self._cache_successful_search(topic, difficulty, similar_words)
+                else:
+                    similar_words = []
+                    logger.warning(f"⚠️ Hierarchical search found no candidates for '{topic}'")
             else:
-                # If not many candidates, use all but in random order
-                random.shuffle(candidates)
-                similar_words = candidates[:max_words]
-
-            logger.info(f"🎯 Found {len(similar_words)} similar words for '{topic}' via vector search")
-
-            # Cache successful results for future use
-            if similar_words:
-                await self._cache_successful_search(topic, difficulty, similar_words)
+                # Fall back to original single-search approach
+                logger.info(f"🔍 Using traditional single-search approach")
+                traditional_results = await self._traditional_single_search(topic, difficulty, max_words * 2)  # Get more for filtering
+
+                # Apply word exclusions to remove inappropriate words
+                traditional_results = self._apply_word_exclusions(traditional_results)
+
+                # Filter out previously used words to improve variety
+                similar_words = self._filter_used_words(traditional_results, topic)
+                similar_words = similar_words[:max_words]
+
+                # Track these words to prevent future repetition
+                if similar_words:
+                    self._track_used_words(topic, [word['word'] for word in similar_words])
 
             # If not enough words found, supplement with cached words (more aggressive)
             if len(similar_words) < max_words * 0.75:  # If less than 75% of target, supplement
@@ -432,8 +398,22 @@ class VectorSearchService:
         topic_lower = topic.lower()
         word_lower = word.lower()
 
-        # Don't include the topic itself or obvious variations
-        if word_lower == topic_lower or word_lower in topic_lower:
+        # Don't include the exact topic word, but allow meaningful variations
+        if word_lower == topic_lower:
+            return False
+
+        # More nuanced substring checking - avoid overly broad rejections
+        # Only reject if the word is a simple substring or the topic contains the word entirely
+        if len(word_lower) >= 4:  # For longer words, be more permissive
+            # Allow words like TECH, ICT, BIOTECH even if topic is "technology"
+            if topic_lower in ['technology', 'tech'] and word_lower in ['tech', 'ict']:
+                return True
+            # Allow words like ANIMAL, MAMMAL even if topic is "animals"
+            if topic_lower in ['animals', 'animal'] and word_lower in ['animal', 'mammal']:
+                return True
+
+        # General rule: reject only if word is completely contained in topic and is short
+        if word_lower in topic_lower and len(word_lower) < 4:
             return False
 
         # Topic-specific filtering
@@ -443,12 +423,551 @@ class VectorSearchService:
             return False
 
         # Prefer concrete nouns over abstract concepts
-        abstract_endings = ['tion', 'ness', 'ment', 'ity', 'ism']
-        if any(word_lower.endswith(ending) for ending in abstract_endings) and len(word) > 8:
-            return False
+        # Be more selective about abstract word filtering - many "-ment" words are concrete
+        truly_abstract_endings = ['tion', 'ness', 'ity', 'ism']  # Removed 'ment' as too broad
+        if any(word_lower.endswith(ending) for ending in truly_abstract_endings) and len(word) > 9:
+            # Additional check: only reject if the word seems truly abstract
+            abstract_prefixes = ['develop', 'manage', 'establish', 'improve', 'achieve']
+            if any(word_lower.startswith(prefix) for prefix in abstract_prefixes):
+                return False
 
         return True
 
+    def _track_used_words(self, topic: str, words: List[str]):
+        """Track words used for this topic to avoid repetition in future puzzles."""
+        topic_key = topic.lower()
+
+        if topic_key not in self.used_words_by_topic:
+            self.used_words_by_topic[topic_key] = set()
+
+        # Add new words to the used set (callers pass plain word strings)
+        new_words = [w.upper() for w in words]
+        self.used_words_by_topic[topic_key].update(new_words)
+
+        # Limit memory usage - keep only the most recent words
+        if len(self.used_words_by_topic[topic_key]) > self.max_used_words_per_topic:
+            # Convert to list, keep last N words, convert back to set
+            used_list = list(self.used_words_by_topic[topic_key])
+            self.used_words_by_topic[topic_key] = set(used_list[-self.max_used_words_per_topic:])
+
+        logger.info(f"📝 Tracking {len(new_words)} words for '{topic}' (total remembered: {len(self.used_words_by_topic[topic_key])})")
+
+    def _get_used_words_for_topic(self, topic: str) -> set:
+        """Get the set of words already used for this topic."""
+        topic_key = topic.lower()
+        return self.used_words_by_topic.get(topic_key, set())
+
+    def _filter_used_words(self, candidates: List[Dict[str, Any]], topic: str) -> List[Dict[str, Any]]:
+        """Filter out words that have been used recently for this topic."""
+        if not candidates:
+            return candidates
+
+        used_words = self._get_used_words_for_topic(topic)
+        if not used_words:
+            return candidates
+
+        # Filter out previously used words
+        filtered = []
+        filtered_out = []
+
+        for candidate in candidates:
+            word = candidate['word'].upper()
+            if word not in used_words:
+                filtered.append(candidate)
+            else:
+                filtered_out.append(word)
+
+        if filtered_out:
+            logger.info(f"🚫 Filtered out {len(filtered_out)} previously used words for '{topic}': {filtered_out[:5]}{'...' if len(filtered_out) > 5 else ''}")
+
+        logger.info(f"🔄 Word variety filter: {len(candidates)} → {len(filtered)} candidates")
+        return filtered
+
+    def _expand_topic_variations(self, topic: str) -> List[str]:
+        """
+        Expand topic to include singular/plural variations for better semantic coverage.
+
+        Examples:
+        - "Animal" → ["Animal", "Animals"]
+        - "Animals" → ["Animals", "Animal"]
+        - "Technology" → ["Technology", "Technologies"]
+        """
+        variations = [topic]  # Always include original
+
+        topic_lower = topic.lower()
+
+        # Handle common plural patterns
+        if topic_lower.endswith('s') and len(topic) > 3:
+            # Likely plural, try to get singular
+            if topic_lower.endswith('ies'):
+                # Technologies → Technology
+                singular = topic[:-3] + 'y'
+            elif topic_lower.endswith('sses') or topic_lower.endswith('shes') or topic_lower.endswith('ches') or topic_lower.endswith('xes'):
+                # Classes → Class, Boxes → Box, Watches → Watch
+                singular = topic[:-2]
+            elif topic_lower.endswith('es') and len(topic) > 4:
+                # Sciences → Science (but not "Yes" → "Ye")
+                singular = topic[:-1]  # Try removing just 's' first for words ending in 'es'
+            elif topic_lower.endswith('s'):
+                # Animals → Animal
+                singular = topic[:-1]
+            else:
+                singular = topic
+
+            if singular != topic and len(singular) >= 3:
+                variations.append(singular)
+        else:
+            # Likely singular, add plural
+            if topic_lower.endswith('y') and topic_lower[-2] not in 'aeiou':
+                # Technology → Technologies
+                plural = topic[:-1] + 'ies'
+            elif topic_lower.endswith(('s', 'sh', 'ch', 'x', 'z')):
+                # Class → Classes, Box → Boxes
+                plural = topic + 'es'
+            else:
+                # Animal → Animals
+                plural = topic + 's'
+
+            variations.append(plural)
+
+        # Remove duplicates while preserving order
+        unique_variations = []
+        for variation in variations:
+            if variation not in unique_variations:
+                unique_variations.append(variation)
+
+        logger.info(f"🔄 Topic variations for '{topic}': {unique_variations}")
+        return unique_variations
+
+    def _identify_subcategories(self, candidates: List[Dict[str, Any]], main_topic: str) -> List[str]:
+        """
+        Identify which candidate words are likely sub-categories for hierarchical search.
+
+        Args:
+            candidates: List of word candidates with similarity scores
+            main_topic: The original topic being searched
+
+        Returns:
+            List of subcategory words suitable for secondary search
+        """
+        subcategories = []
+        main_topic_lower = main_topic.lower()
+
+        # Category indicators - words that suggest this is a category rather than a terminal word
+        category_patterns = {
+            # Scientific/academic suffixes
+            'academic': ['logy', 'ics', 'ism', 'ology'],
+            # Adjective forms that suggest categories
+            'adjective': ['logical', 'ical', 'tic', 'ian', 'nal', 'ous'],
+            # Collection/group words
+            'collective': ['life', 'stock', 'ware', 'kind', 'type', 'group'],
+            # General category indicators
+            'general': ['wild', 'domestic', 'marine', 'land', 'air', 'water']
+        }
+
+        # Known category words for common topics
+        known_categories = {
+            'animal': ['wildlife', 'livestock', 'mammal', 'mammalian', 'fauna', 'zoology', 'zoological',
+                       'vertebrate', 'invertebrate', 'reptile', 'amphibian', 'primate', 'rodent',
+                       'carnivore', 'herbivore', 'omnivore', 'predator', 'prey'],
+            'technology': ['software', 'hardware', 'digital', 'electronic', 'computing', 'internet',
+                           'mobile', 'wireless', 'networking', 'cybernetic', 'robotic', 'automated'],
+            'science': ['physics', 'chemistry', 'biology', 'astronomy', 'geology', 'mathematics',
+                        'theoretical', 'experimental', 'applied', 'quantum', 'molecular', 'atomic'],
+            'geography': ['continental', 'coastal', 'mountainous', 'desert', 'tropical', 'polar',
+                          'urban', 'rural', 'geological', 'topographical', 'cartographic']
+        }
+
+        for candidate in candidates[:10]:  # Only consider top 10 for performance
+            word = candidate['word'].lower()
+            similarity = candidate['similarity']
+
+            # Skip if similarity is too low (likely not a good subcategory)
+            if similarity < 0.45:
+                continue
+
+            is_subcategory = False
+
+            # Check against known categories for this topic
+            topic_categories = known_categories.get(main_topic_lower, [])
+            if word in topic_categories:
+                is_subcategory = True
+                logger.info(f"🔍 '{word.upper()}' identified as known subcategory for '{main_topic}'")
+
+            # Check pattern-based detection
+            if not is_subcategory:
+                for pattern_type, patterns in category_patterns.items():
+                    for pattern in patterns:
+                        if word.endswith(pattern):
+                            is_subcategory = True
+                            logger.info(f"🔍 '{word.upper()}' identified as subcategory (pattern: {pattern})")
+                            break
+                    if is_subcategory:
+                        break
+
+            # Additional heuristics
+            if not is_subcategory:
+                # Words that are likely categories based on length and composition
+                if (len(word) >= 6 and  # Reasonable length
+                        word.count('i') + word.count('o') >= 2 and  # Contains vowels (not an acronym)
+                        not word.isupper() and  # Not an acronym
+                        word.isalpha()):  # Only letters
+
+                    # Check if it's an abstract/categorical concept
+                    if any(word.endswith(ending) for ending in ['ism', 'ity', 'ness', 'tion', 'sion']):
+                        is_subcategory = True
+                        logger.info(f"🔍 '{word.upper()}' identified as subcategory (abstract concept)")
+
+            if is_subcategory and word.upper() not in subcategories:
+                subcategories.append(word.upper())
+
+        # Limit subcategories to prevent explosion
+        max_subcategories = 5
+        limited_subcategories = subcategories[:max_subcategories]
+
+        if limited_subcategories:
+            logger.info(f"🌳 Identified {len(limited_subcategories)} subcategories for '{main_topic}': {limited_subcategories}")
+        else:
+            logger.info(f"🌳 No suitable subcategories found for '{main_topic}'")
+
+        return limited_subcategories
+
+    async def _hierarchical_search(
+        self,
+        topic: str,
+        difficulty: str,
+        max_words: int
+    ) -> List[Dict[str, Any]]:
+        """
+        Perform hierarchical semantic search using topic variations and subcategories.
+
+        Search strategy:
+        1. Search for topic variations (singular/plural)
+        2. Identify subcategories from initial results
+        3. Search subcategories for more specific words
+        4. Combine and weight all results
+        """
+        import numpy as np  # Local import, matching the other search methods in this module
+        all_candidates = []
+
+        # Phase 1: Search topic variations (singular/plural)
+        topic_variations = self._expand_topic_variations(topic)
+
+        logger.info(f"🌟 Starting hierarchical search for '{topic}' with {len(topic_variations)} variations")
+
+        # Search each topic variation
+        main_topic_candidates = []
+        for variation in topic_variations:
+            logger.info(f"🔍 Searching topic variation: '{variation}'")
+
+            # Get topic embedding
+            topic_embedding = self.model.encode([variation], convert_to_numpy=True)
+
+            # Add search randomness
+            noise_factor = float(os.getenv("SEARCH_RANDOMNESS", "0.02"))
+            if noise_factor > 0:
+                try:
+                    noise = np.random.normal(0, noise_factor, topic_embedding.shape)
+                    topic_embedding = topic_embedding + noise
+                except Exception:
+                    pass  # Continue without noise if it fails
+
+            topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
+            faiss.normalize_L2(topic_embedding)
+
+            # Search FAISS index
+            search_size = min(self.max_results * 3, 100)  # Moderate size for variations
+            scores, indices = self.faiss_index.search(topic_embedding, search_size)
+
+            # Collect candidates for this variation
+            variation_candidates = self._collect_candidates_with_threshold(
+                scores, indices, self.base_similarity_threshold, variation, difficulty
+            )
+
+            # Weight main topic higher than variations
+            weight = 1.0 if variation == topic else 0.9
+            for candidate in variation_candidates:
+                candidate['similarity'] *= weight
+                candidate['search_source'] = f"main_topic:{variation}"
+
+            main_topic_candidates.extend(variation_candidates)
+
+        logger.info(f"🔍 Main topic search found {len(main_topic_candidates)} candidates")
+
+        # Phase 2: Identify subcategories from best candidates
+        if main_topic_candidates:
+            # Sort by similarity to get best candidates for subcategory detection
+            main_topic_candidates.sort(key=lambda x: x['similarity'], reverse=True)
+            subcategories = self._identify_subcategories(main_topic_candidates, topic)
+
+            # Phase 3: Search subcategories
+            subcategory_candidates = []
+            for subcategory in subcategories:
+                logger.info(f"🌳 Searching subcategory: '{subcategory}'")
+
+                try:
+                    # Get subcategory embedding
+                    subcat_embedding = self.model.encode([subcategory], convert_to_numpy=True)
+                    subcat_embedding = np.ascontiguousarray(subcat_embedding, dtype=np.float32)
+                    faiss.normalize_L2(subcat_embedding)
+
+                    # Search with smaller result set for subcategories
+                    sub_search_size = min(self.max_results * 2, 60)
+                    sub_scores, sub_indices = self.faiss_index.search(subcat_embedding, sub_search_size)
+
+                    # Use slightly lower threshold for subcategories to get more variety
+                    sub_threshold = max(self.base_similarity_threshold - 0.05, self.min_similarity_threshold)
+                    sub_candidates = self._collect_candidates_with_threshold(
+                        sub_scores, sub_indices, sub_threshold, subcategory, difficulty
+                    )
+
+                    # Weight subcategory results lower than main topic
+                    for candidate in sub_candidates:
+                        candidate['similarity'] *= 0.8  # Lower weight for subcategory results
+                        candidate['search_source'] = f"subcategory:{subcategory}"
+
+                    subcategory_candidates.extend(sub_candidates)
+                    logger.info(f"🌳 Subcategory '{subcategory}' found {len(sub_candidates)} candidates")
+
+                except Exception as e:
+                    logger.warning(f"⚠️ Failed to search subcategory '{subcategory}': {e}")
+                    continue
+
+            logger.info(f"🌳 Subcategory search found {len(subcategory_candidates)} additional candidates")
+        else:
+            subcategory_candidates = []
+
+        # Phase 4: Combine all candidates
+        all_candidates = main_topic_candidates + subcategory_candidates
+
+        logger.info(f"🔗 Total candidates before deduplication: {len(all_candidates)}")
+
+        return all_candidates
+
+    async def _traditional_single_search(
+        self,
+        topic: str,
+        difficulty: str,
+        max_words: int
+    ) -> List[Dict[str, Any]]:
+        """
+        Traditional single-topic search approach (original implementation).
+        Kept as a fallback option for compatibility.
+        """
+        # Get topic embedding
+        topic_embedding = self.model.encode([topic], convert_to_numpy=True)
+
+        # Add small amount of noise to create variety in search results
+        import numpy as np
+        noise_factor = float(os.getenv("SEARCH_RANDOMNESS", "0.02"))
+        if noise_factor > 0:
+            try:
+                noise = np.random.normal(0, noise_factor, topic_embedding.shape)
+                topic_embedding = topic_embedding + noise
+            except Exception:
+                pass  # Continue without noise if it fails
+
+        topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
+        faiss.normalize_L2(topic_embedding)
+
+        # Search for similar words using FAISS
+        search_size = min(self.max_results * 6, 150)
+        scores, indices = self.faiss_index.search(topic_embedding, search_size)
+
+        # Debug: log search results
+        logger.info(f"🔍 FAISS search returned {len(scores[0])} results")
+        logger.info(f"🔍 Top 5 scores: {scores[0][:5]}")
+
+        # Log the actual words found by FAISS for debugging
+        top_words_with_scores = []
+        for i, (score, idx) in enumerate(zip(scores[0][:10], indices[0][:10])):
+            word = self.vocab[idx]
+            top_words_with_scores.append(f"{word}({score:.3f})")
+
+        logger.info(f"🔍 Top 10 FAISS words: {', '.join(top_words_with_scores)}")
+
+        # Adaptive threshold strategy
+        candidates = []
+        thresholds_to_try = [
+            self.base_similarity_threshold,
+            max(self.base_similarity_threshold - 0.05, self.min_similarity_threshold),
+            max(self.base_similarity_threshold - 0.10, self.min_similarity_threshold),
+            self.min_similarity_threshold
+        ]
+
+        for threshold in thresholds_to_try:
+            logger.info(f"🎯 Trying threshold: {threshold}")
+            candidates = self._collect_candidates_with_threshold(scores, indices, threshold, topic, difficulty)
+            logger.info(f"🔍 Found {len(candidates)} candidates with threshold {threshold}")
+
+            if len(candidates) >= max_words * 0.75:
+                logger.info(f"✅ Sufficient words found with threshold {threshold}")
+                break
+            elif len(candidates) >= max_words // 2:
+                logger.info(f"⚡ Acceptable words found with threshold {threshold}")
+                break
+
+        # Smart randomization
+        import random
+        if len(candidates) > max_words * 2:
+            similar_words = self._weighted_random_selection(candidates, max_words)
+        else:
+            random.shuffle(candidates)
+            similar_words = candidates[:max_words]
+
+        logger.info(f"🎯 Traditional search found {len(similar_words)} words for '{topic}'")
+
+        # Cache successful results
+        if similar_words:
+            await self._cache_successful_search(topic, difficulty, similar_words)
+
+        return similar_words
+
+    def _combine_hierarchical_results(
+        self,
+        all_candidates: List[Dict[str, Any]],
+        max_words: int
+    ) -> List[Dict[str, Any]]:
+        """
+        Intelligently combine and deduplicate results from hierarchical search.
+
+        Strategy:
+        1. Remove duplicates while preserving best similarity scores
+        2. Apply source-based weighting (main topic > subcategories)
+        3. Ensure diverse representation from different search sources
+        4. Apply adaptive threshold filtering
+        """
+        if not all_candidates:
+            return []
+
+        # Step 1: Strict deduplication by word while keeping best score
+        word_best_scores = {}
+        for candidate in all_candidates:
+            word = candidate['word'].upper()  # Ensure consistent casing
+            similarity = candidate['similarity']
+            source = candidate.get('search_source', 'unknown')
+
+            # Only keep if this word hasn't been seen or if it has a better score
+            if word not in word_best_scores or similarity > word_best_scores[word]['similarity']:
+                candidate_copy = candidate.copy()
+                candidate_copy['word'] = word  # Normalize case
+                word_best_scores[word] = candidate_copy
+
+        deduplicated = list(word_best_scores.values())
+        logger.info(f"🔗 After strict deduplication: {len(all_candidates)} → {len(deduplicated)} unique words")
+
+        # Step 2: Add randomization to improve variety while maintaining quality
+        # Group by similarity tiers to maintain quality while adding variety
+        high_quality = [w for w in deduplicated if w['similarity'] >= self.base_similarity_threshold]
+        medium_quality = [w for w in deduplicated if self.base_similarity_threshold - 0.1 <= w['similarity'] < self.base_similarity_threshold]
+        lower_quality = [w for w in deduplicated if w['similarity'] < self.base_similarity_threshold - 0.1]
+
+        # Shuffle within each tier for variety, then recombine
+        import random
+        random.shuffle(high_quality)
+        random.shuffle(medium_quality)
+        random.shuffle(lower_quality)
+
+        # Combine back in quality order but with randomness within tiers
+        deduplicated = high_quality + medium_quality + lower_quality
+
+        logger.info(f"🎲 Randomized within quality tiers: {len(high_quality)} high, {len(medium_quality)} medium, {len(lower_quality)} lower")
+
+        # Step 3: Apply adaptive threshold filtering (reuse existing logic)
+        thresholds_to_try = [
+            self.base_similarity_threshold,
+            max(self.base_similarity_threshold - 0.05, self.min_similarity_threshold),
+            max(self.base_similarity_threshold - 0.10, self.min_similarity_threshold),
+            self.min_similarity_threshold
+        ]
+
+        final_candidates = []
+        for threshold in thresholds_to_try:
+            filtered_candidates = [c for c in deduplicated if c['similarity'] >= threshold]
+
+            logger.info(f"🎯 Hierarchical threshold {threshold}: {len(filtered_candidates)} candidates")
+
+            if len(filtered_candidates) >= max_words * 0.75:
+                final_candidates = filtered_candidates
+                logger.info(f"✅ Sufficient words found with hierarchical threshold {threshold}")
+                break
+            elif len(filtered_candidates) >= max_words // 2:
+                final_candidates = filtered_candidates
+                logger.info(f"⚡ Acceptable words found with hierarchical threshold {threshold}")
+                break
+
+        if not final_candidates:
+            final_candidates = deduplicated  # Use all if threshold filtering is too strict
+
+        # Step 4: Ensure source diversity in final selection
+        final_selection = self._ensure_source_diversity(final_candidates, max_words)
+
+        logger.info(f"🏆 Final hierarchical selection: {len(final_selection)} words")
+
+        # Log the sources for debugging
+        source_counts = {}
+        for candidate in final_selection:
+            source = candidate.get('search_source', 'unknown')
+            source_counts[source] = source_counts.get(source, 0) + 1
+
+        logger.info(f"📊 Source distribution: {source_counts}")
+
+        return final_selection
+
+    def _ensure_source_diversity(
+        self,
+        candidates: List[Dict[str, Any]],
+        max_words: int
+    ) -> List[Dict[str, Any]]:
+        """
+        Ensure diverse representation from different search sources.
+        """
+        if len(candidates) <= max_words:
+            return candidates
+
+        # Group by source
+        source_groups = {}
+        for candidate in candidates:
+            source = candidate.get('search_source', 'unknown')
+            if source not in source_groups:
+                source_groups[source] = []
+            source_groups[source].append(candidate)
+
+        # If we have multiple sources, ensure representation from each
+        if len(source_groups) > 1:
+            selected = []
+            main_topic_quota = max_words * 2 // 3  # 2/3 from main topic
+            subcategory_quota = max_words - main_topic_quota  # 1/3 from subcategories
+
+            # Select from main topic sources first
+            main_sources = [k for k in source_groups.keys() if k.startswith('main_topic:')]
+            for source in main_sources:
+                quota = main_topic_quota // len(main_sources) if main_sources else 0
+                selected.extend(source_groups[source][:quota])
+
+            # Fill remaining slots with subcategory sources
+            subcat_sources = [k for k in source_groups.keys() if k.startswith('subcategory:')]
+            if subcat_sources and len(selected) < max_words:
+                remaining_slots = max_words - len(selected)
+                quota_per_subcat = max(1, remaining_slots // len(subcat_sources))
+
+                for source in subcat_sources:
+                    if len(selected) >= max_words:
+                        break
+                    selected.extend(source_groups[source][:quota_per_subcat])
+
+            # Fill any remaining slots with best remaining candidates
+            if len(selected) < max_words:
+                used_words = {c['word'] for c in selected}
+                remaining = [c for c in candidates if c['word'] not in used_words]
+                needed = max_words - len(selected)
+                selected.extend(remaining[:needed])
+
+            return selected[:max_words]
+        else:
+            # Single source, just return top candidates
+            return candidates[:max_words]
+
     def _get_index_cache_dir(self) -> str:
         """Get the directory for caching FAISS indexes."""
         # Use different cache locations based on environment
@@ -474,6 +993,57 @@ class VectorSearchService:
                 os.path.exists(self.embeddings_cache_path) and
                 os.path.exists(self.faiss_cache_path))
 
+    def _load_excluded_words(self) -> set:
+        """Load the list of words to exclude from crossword generation."""
+        # Default excluded words - overly generic or inappropriate for crosswords
+        default_excluded = {
+            "WORD", "THING", "STUFF", "ITEMS", "THINGS", "WORDS", "TEXT", "STRING",
+            "DATA", "INFO", "CONTENT", "MATERIAL", "ELEMENT", "OBJECT", "ENTITY",
+            "CONCEPT", "IDEA", "NOTION", "ABSTRACT", "GENERAL", "SPECIFIC", "VARIOUS",
+            "MULTIPLE", "SEVERAL", "MANY", "SOME", "MOST", "ALL", "EACH", "EVERY",
+            "DIFFERENT", "SIMILAR", "SAME", "OTHER", "ANOTHER", "CERTAIN"
+        }
+
+        # Load additional exclusions from environment or file
+        env_excluded = os.getenv("EXCLUDED_WORDS", "")
+        if env_excluded:
+            env_words = {word.strip().upper() for word in env_excluded.split(",") if word.strip()}
+            default_excluded.update(env_words)
+
+        # Try to load from an exclusion file if it exists
+        exclusion_file = os.getenv("WORD_EXCLUSION_FILE", "")
+        if exclusion_file and os.path.exists(exclusion_file):
+            try:
+                with open(exclusion_file, 'r') as f:
+                    file_words = {word.strip().upper() for line in f for word in [line.strip()] if word and not word.startswith('#')}
+                default_excluded.update(file_words)
+                logger.info(f"📋 Loaded {len(file_words)} additional excluded words from {exclusion_file}")
+            except Exception as e:
+                logger.warning(f"⚠️ Failed to load exclusion file {exclusion_file}: {e}")
+
+        logger.info(f"🚫 Loaded {len(default_excluded)} excluded words for filtering")
+        return default_excluded
+
+    def _apply_word_exclusions(self, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Filter out excluded words from candidates."""
+        if not candidates or not self.excluded_words:
+            return candidates
+
+        filtered = []
+        excluded_count = 0
+
+        for candidate in candidates:
+            word = candidate['word'].upper()
+            if word not in self.excluded_words:
+                filtered.append(candidate)
+            else:
+                excluded_count += 1
+
+        if excluded_count > 0:
+            logger.info(f"🚫 Excluded {excluded_count} inappropriate words from results")
+
+        return filtered
+
     def _load_cached_index(self) -> bool:
         """Load FAISS index from cache if available."""
         try:
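
As a sanity check on the filtering change above, here is a small self-contained sketch contrasting the deleted substring rule with the new one (old_rule and new_rule are names invented for this illustration; their bodies are condensed from the deleted check and from _should_include_word in this diff):

    def old_rule(word: str, topic: str) -> bool:
        # Pre-fix behavior: reject the topic itself and any substring of it.
        w, t = word.lower(), topic.lower()
        return not (w == t or w in t)

    def new_rule(word: str, topic: str) -> bool:
        # Post-fix behavior: reject exact matches, whitelist known variations,
        # and only reject short words (< 4 letters) fully contained in the topic.
        w, t = word.lower(), topic.lower()
        if w == t:
            return False
        if len(w) >= 4:
            if t in ('technology', 'tech') and w in ('tech', 'ict'):
                return True
            if t in ('animals', 'animal') and w in ('animal', 'mammal'):
                return True
        if w in t and len(w) < 4:
            return False
        return True

    # "tech" and "animal" are substrings of their topics: the old rule
    # dropped them, the new rule keeps them.
    assert old_rule("tech", "technology") is False
    assert new_rule("tech", "technology") is True
    assert old_rule("animal", "animals") is False
    assert new_rule("animal", "animals") is True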