Spaces:

vimalk78
/

abc123

Sleeping

App Files Files Community

vimalk78 committed on Sep 3

Commit

9cd7541

1 Parent(s): 5676df3

fix: clean up repository after removing LFS cache files

Browse files

Files changed (8) hide show

Dockerfile +5 -0
crossword-app/backend-py/.env.example +25 -0
crossword-app/backend-py/requirements.txt +1 -1
crossword-app/backend-py/src/services/norvig_vocabulary_manager.py +307 -0
crossword-app/backend-py/src/services/thematic_word_service.py +94 -19
crossword-app/frontend/src/components/DebugTab.jsx +16 -0
{hack → crossword-app/words}/norvig/count_1w.txt +0 -0
{hack → crossword-app/words}/norvig/count_1w100k.txt +0 -0

Dockerfile CHANGED Viewed

@@ -31,6 +31,7 @@ RUN pip install --no-cache-dir --upgrade pip && \
 # Copy all source code
 COPY crossword-app/frontend/ ./frontend/
 COPY crossword-app/backend-py/ ./backend-py/
 # Copy cache directory with pre-built models and NLTK data
 COPY cache-dir/ ./cache-dir/
@@ -84,6 +85,10 @@ ENV PIP_NO_CACHE_DIR=1
 ENV CACHE_DIR=/app/backend-py/cache
 ENV NLTK_DATA=/app/backend-py/cache/nltk_data
 # Health check
 # HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
 #     CMD curl -f http://localhost:7860/health || exit 1

 # Copy all source code
 COPY crossword-app/frontend/ ./frontend/
 COPY crossword-app/backend-py/ ./backend-py/
+COPY crossword-app/words/ ./backend-py/words/
 # Copy cache directory with pre-built models and NLTK data
 COPY cache-dir/ ./cache-dir/
 ENV CACHE_DIR=/app/backend-py/cache
 ENV NLTK_DATA=/app/backend-py/cache/nltk_data
+# Set vocabulary source and path for Norvig vocabulary
+ENV VOCAB_SOURCE=norvig
+ENV NORVIG_VOCAB_PATH=/app/backend-py/words/norvig/count_1w100k.txt
 # Health check
 # HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
 #     CMD curl -f http://localhost:7860/health || exit 1

crossword-app/backend-py/.env.example CHANGED Viewed

@@ -10,6 +10,31 @@ EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2
 WORD_SIMILARITY_THRESHOLD=0.65
 MAX_VOCAB_SIZE=30000
 # HuggingFace Configuration (if needed for cloud inference)
 HUGGINGFACE_API_KEY=your_huggingface_api_key_here

 WORD_SIMILARITY_THRESHOLD=0.65
 MAX_VOCAB_SIZE=30000
+# Vocabulary Configuration
+# Options: "norvig" (default, recommended), "wordfreq" (legacy)
+VOCAB_SOURCE=norvig
+NORVIG_VOCAB_PATH=words/norvig/count_1w100k.txt
+THEMATIC_VOCAB_SIZE_LIMIT=100000
+THEMATIC_MODEL_NAME=all-mpnet-base-v2
+# Cache Configuration
+CACHE_DIR=./cache-dir
+# Debug and Development Options
+ENABLE_DEBUG_TAB=true
+ENABLE_DISTRIBUTION_NORMALIZATION=false
+# Multi-topic Configuration
+MULTI_TOPIC_METHOD=soft_minimum
+SOFT_MIN_BETA=10.0
+SOFT_MIN_ADAPTIVE=true
+SOFT_MIN_MIN_WORDS=15
+SOFT_MIN_MAX_RETRIES=5
+SOFT_MIN_BETA_DECAY=0.7
+# Normalization Configuration (when enabled)
+NORMALIZATION_METHOD=similarity_range
 # HuggingFace Configuration (if needed for cloud inference)
 HUGGINGFACE_API_KEY=your_huggingface_api_key_here

crossword-app/backend-py/requirements.txt CHANGED Viewed

@@ -41,7 +41,7 @@ torch==2.5.1
 transformers==4.47.1
 scikit-learn==1.5.2
 huggingface-hub==0.26.2
-wordfreq==3.1.0
 # NLTK dependencies for WordNet clue generation
 nltk==3.8.1

 transformers==4.47.1
 scikit-learn==1.5.2
 huggingface-hub==0.26.2
+# wordfreq==3.1.0  # Optional: fallback vocabulary source (use VOCAB_SOURCE=wordfreq)
 # NLTK dependencies for WordNet clue generation
 nltk==3.8.1

crossword-app/backend-py/src/services/norvig_vocabulary_manager.py ADDED Viewed

	@@ -0,0 +1,307 @@

+#!/usr/bin/env python3
+"""
+Norvig Vocabulary Manager
+Provides a WordFreq-compatible interface using Peter Norvig's curated word lists.
+Replaces the WordFreq-based vocabulary system with clean, high-quality word data
+from norvig.com/ngrams/count_1w100k.txt.
+Features:
+- Clean vocabulary without web-scraped junk or typos
+- Google-quality curation by Peter Norvig (Director of Research)
+- Maintains WordFreq compatibility for seamless integration
+- Preserves all existing frequency tier and difficulty systems
+Environment Variables:
+- NORVIG_VOCAB_PATH: Path to Norvig word count file (default: words/norvig/count_1w100k.txt, resolved relative to the backend-py directory when not absolute)
+- CACHE_DIR: Cache directory for processed vocabulary data
+"""
+import os
+import pickle
+import logging
+import numpy as np
+from pathlib import Path
+from typing import List, Tuple, Dict, Optional, Counter
+from collections import Counter
+logger = logging.getLogger(__name__)
+class NorgivVocabularyManager:
+    """
+    Norvig vocabulary manager that provides a WordFreq-compatible interface.
+    Loads Peter Norvig's word-count list, filters it down to crossword-suitable
+    words, and caches the result as pickle files inside the cache directory.
+    NOTE: the misspelled class name ("Norgiv") is kept on purpose; other modules
+    import the class under this exact name.
+    """
+    # Very common words rejected outright by _is_crossword_suitable().
+    # Built once at class level instead of on every call.
+    _BORING_WORDS = frozenset({
+        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'this', 'that',
+        'with', 'from', 'they', 'were', 'been', 'have', 'their', 'said', 'each',
+        'which', 'what', 'there', 'will', 'more', 'when', 'some', 'like', 'into',
+        'time', 'very', 'only', 'has', 'had', 'who', 'its', 'now', 'find', 'long',
+        'down', 'day', 'did', 'get', 'come', 'made', 'may', 'part',
+    })
+    def __init__(self, cache_dir: Optional[str] = None, vocab_size_limit: Optional[int] = None):
+        """Initialize Norvig vocabulary manager.
+        Args:
+            cache_dir: Directory for caching vocabulary and frequency data.
+                Falls back to the CACHE_DIR environment variable, then to a
+                'model_cache' directory next to this file.
+            vocab_size_limit: Maximum vocabulary size. When None (or 0) the
+                THEMATIC_VOCAB_SIZE_LIMIT environment variable is used, then
+                MAX_VOCABULARY_SIZE, then 100000.
+        """
+        if cache_dir is None:
+            cache_dir = os.getenv("CACHE_DIR")
+            if cache_dir is None:
+                cache_dir = os.path.join(os.path.dirname(__file__), 'model_cache')
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+        # Vocabulary size configuration
+        self.vocab_size_limit = vocab_size_limit or int(os.getenv("THEMATIC_VOCAB_SIZE_LIMIT",
+                                                                 os.getenv("MAX_VOCABULARY_SIZE", "100000")))
+        # Norvig file configuration
+        norvig_path = os.getenv("NORVIG_VOCAB_PATH", "words/norvig/count_1w100k.txt")
+        if os.path.isabs(norvig_path):
+            self.norvig_file_path = Path(norvig_path)
+        else:
+            # Relative paths are resolved against the backend-py directory, which is
+            # three .parent steps from this file (file -> services -> src -> backend-py).
+            # Current: crossword-app/backend-py/src/services/norvig_vocabulary_manager.py
+            # Target: crossword-app/backend-py/words/norvig/count_1w100k.txt
+            backend_root = Path(__file__).parent.parent.parent
+            self.norvig_file_path = backend_root / norvig_path
+        # Cache paths - use "norvig" prefix to distinguish from wordfreq cache
+        self.vocab_cache_path = self.cache_dir / f"norvig_vocabulary_{self.vocab_size_limit}.pkl"
+        self.frequency_cache_path = self.cache_dir / f"norvig_frequencies_{self.vocab_size_limit}.pkl"
+        # Loaded data
+        self.vocabulary: List[str] = []
+        self.word_frequencies: Counter = Counter()
+        self.is_loaded = False
+        # Memoized maximum count used by get_word_frequency(); see _max_frequency_count().
+        self._max_count_cache = None
+        logger.info("📝 Norvig Vocabulary Manager initialized")
+        logger.info(f"   📂 Cache dir: {self.cache_dir}")
+        logger.info(f"   📊 Vocab limit: {self.vocab_size_limit:,}")
+        logger.info(f"   📄 Norvig file: {self.norvig_file_path}")
+    def load_vocabulary(self) -> Tuple[List[str], Counter]:
+        """Load vocabulary and frequency data, with caching.
+        Returns:
+            A (vocabulary, word_frequencies) tuple. The pickle cache is tried first;
+            on a cache miss the data is generated from the Norvig file and cached.
+        Raises:
+            FileNotFoundError: If the cache is unusable and the Norvig file is missing.
+        """
+        if self.is_loaded:
+            return self.vocabulary, self.word_frequencies
+        # Try loading from cache
+        if self._load_from_cache():
+            logger.info(f"✅ Loaded Norvig vocabulary from cache: {len(self.vocabulary):,} words")
+            self.is_loaded = True
+            return self.vocabulary, self.word_frequencies
+        # Generate from Norvig file
+        logger.info("🔄 Generating vocabulary from Norvig file...")
+        self._generate_vocabulary_from_norvig()
+        # Save to cache
+        self._save_to_cache()
+        self.is_loaded = True
+        return self.vocabulary, self.word_frequencies
+    def _load_from_cache(self) -> bool:
+        """Load vocabulary and frequencies from the pickle cache.
+        Returns:
+            True when both cache files were read and contained data; False when
+            they are missing, unreadable, empty, or failed to unpickle. Instance
+            state is only updated on success.
+        """
+        vocab_path = self.vocab_cache_path
+        freq_path = self.frequency_cache_path
+        try:
+            vocab_present = vocab_path.exists()
+            freq_present = freq_path.exists()
+            if not (vocab_present and freq_present):
+                missing = []
+                if not vocab_present:
+                    missing.append(f"vocabulary ({vocab_path})")
+                if not freq_present:
+                    missing.append(f"frequency ({freq_path})")
+                logger.info(f"📂 Cache files missing: {', '.join(missing)}")
+                return False
+            logger.info("📦 Loading Norvig vocabulary from cache...")
+            logger.info(f"  Vocab cache: {vocab_path}")
+            logger.info(f"  Freq cache: {freq_path}")
+            # Validate cache files are readable
+            if not os.access(vocab_path, os.R_OK):
+                logger.warning(f"⚠️ Vocabulary cache file not readable: {vocab_path}")
+                return False
+            if not os.access(freq_path, os.R_OK):
+                logger.warning(f"⚠️ Frequency cache file not readable: {freq_path}")
+                return False
+            # NOTE(security): pickle.load can execute arbitrary code embedded in the
+            # file, so this is only safe while the cache directory is trusted.
+            with open(vocab_path, 'rb') as f:
+                cached_vocabulary = pickle.load(f)
+            with open(freq_path, 'rb') as f:
+                cached_frequencies = pickle.load(f)
+            # Validate loaded data before touching instance state
+            if not cached_vocabulary or not cached_frequencies:
+                logger.warning("⚠️ Cache files contain empty data")
+                return False
+        except Exception as e:
+            logger.warning(f"⚠️ Cache loading failed: {e}")
+            return False
+        self.vocabulary = cached_vocabulary
+        self.word_frequencies = cached_frequencies
+        self._max_count_cache = None
+        logger.info(f"✅ Loaded {len(self.vocabulary):,} words and {len(self.word_frequencies):,} frequencies from cache")
+        return True
+    def _save_to_cache(self):
+        """Save vocabulary and frequencies to cache.
+        Best effort: a failure is logged as a warning and never raised.
+        """
+        try:
+            logger.info("💾 Saving Norvig vocabulary to cache...")
+            with open(self.vocab_cache_path, 'wb') as f:
+                pickle.dump(self.vocabulary, f)
+            with open(self.frequency_cache_path, 'wb') as f:
+                pickle.dump(self.word_frequencies, f)
+            logger.info("✅ Norvig vocabulary cached successfully")
+        except Exception as e:
+            logger.warning(f"⚠️ Cache saving failed: {e}")
+    def _generate_vocabulary_from_norvig(self):
+        """Generate filtered vocabulary from Norvig word count file.
+        Populates self.vocabulary (sorted, lower-case) and self.word_frequencies
+        (lower-case word -> raw count), keeping at most vocab_size_limit words.
+        Raises:
+            FileNotFoundError: If the Norvig file does not exist.
+        """
+        if not self.norvig_file_path.exists():
+            raise FileNotFoundError(f"Norvig vocabulary file not found: {self.norvig_file_path}")
+        logger.info(f"📚 Loading words from Norvig file: {self.norvig_file_path}")
+        raw_word_counts = self._load_norvig_file()
+        logger.info(f"📥 Loaded {len(raw_word_counts):,} raw words from Norvig file")
+        # Walk words from most to least frequent so the size limit keeps the most
+        # common words even if the input file is not already ordered by count.
+        # sorted() is stable, so a file already in descending order is unaffected.
+        ranked_counts = sorted(raw_word_counts.items(), key=lambda item: item[1], reverse=True)
+        # Apply crossword-suitable filtering
+        filtered_words = []
+        frequency_data = Counter()
+        logger.info("🔍 Applying crossword filtering...")
+        for word, count in ranked_counts:
+            if not self._is_crossword_suitable(word):
+                continue
+            word_lower = word.lower()
+            filtered_words.append(word_lower)
+            frequency_data[word_lower] = count
+            if len(filtered_words) >= self.vocab_size_limit:
+                break
+        # Remove duplicates and sort
+        self.vocabulary = sorted(set(filtered_words))
+        self.word_frequencies = frequency_data
+        self._max_count_cache = None
+        logger.info(f"✅ Generated filtered Norvig vocabulary: {len(self.vocabulary):,} words")
+        logger.info(f"📊 Frequency data coverage: {len(self.word_frequencies):,} words")
+        # Log some stats about the filtered vocabulary
+        if self.vocabulary:
+            lengths = [len(word) for word in self.vocabulary]
+            logger.info(f"📏 Word length range: {min(lengths)}-{max(lengths)} chars")
+            logger.info(f"🔢 Average word length: {np.mean(lengths):.1f} chars")
+        if self.word_frequencies:
+            counts = list(self.word_frequencies.values())
+            logger.info(f"📈 Frequency range: {min(counts):,} - {max(counts):,}")
+    def _load_norvig_file(self) -> Dict[str, int]:
+        """Load Norvig word count file and return word->count mapping.
+        Each non-empty line must be "WORD<TAB>COUNT"; malformed lines are logged
+        and skipped. Keys of the returned dict are upper-cased.
+        Raises:
+            Exception: Any error while opening or reading the file is logged and re-raised.
+        """
+        word_counts = {}
+        try:
+            with open(self.norvig_file_path, 'r', encoding='utf-8') as f:
+                for line_num, line in enumerate(f, 1):
+                    line = line.strip()
+                    if not line:
+                        continue
+                    # Parse tab-separated format: WORD\tCOUNT
+                    parts = line.split('\t')
+                    if len(parts) != 2:
+                        logger.warning(f"⚠️ Invalid format on line {line_num}: {line}")
+                        continue
+                    word, count_str = parts
+                    try:
+                        count = int(count_str)
+                    except ValueError:
+                        logger.warning(f"⚠️ Invalid count on line {line_num}: {line}")
+                        continue
+                    word_counts[word.upper()] = count
+            return word_counts
+        except Exception as e:
+            logger.error(f"❌ Failed to load Norvig file {self.norvig_file_path}: {e}")
+            raise
+    def _is_crossword_suitable(self, word: str) -> bool:
+        """Check if word is suitable for crosswords.
+        A word passes when it is 3-12 alphabetic characters, is not in the
+        boring-word list, does not look like a simple plural, and has at least
+        60% unique characters.
+        """
+        word = word.lower().strip()
+        # Length check (3-12 characters for crosswords)
+        if len(word) < 3 or len(word) > 12:
+            return False
+        # Must be alphabetic only
+        if not word.isalpha():
+            return False
+        # Skip boring/common words
+        if word in NorgivVocabularyManager._BORING_WORDS:
+            return False
+        # Skip obvious plurals (simple heuristic)
+        if len(word) > 4 and word.endswith('s') and not word.endswith(('ss', 'us', 'is')):
+            return False
+        # Skip words with repeated characters (often not real words)
+        if len(set(word)) < len(word) * 0.6:  # Less than 60% unique characters
+            return False
+        return True
+    def _max_frequency_count(self) -> int:
+        """Return the largest count in word_frequencies, memoized.
+        The cached value is keyed on the Counter's identity and size, so replacing
+        the Counter or adding/removing words refreshes it. Changing a count in
+        place without changing the size is not detected.
+        """
+        freqs = self.word_frequencies
+        key = (id(freqs), len(freqs))
+        cached = getattr(self, "_max_count_cache", None)
+        if cached is None or cached[0] != key:
+            cached = (key, max(freqs.values()) if freqs else 1)
+            self._max_count_cache = cached
+        return cached[1]
+    def get_word_frequency(self, word: str) -> float:
+        """Get word frequency as a normalized score (compatible with WordFreq API).
+        Args:
+            word: Word to look up (case-insensitive).
+        Returns:
+            log10(count + 1) / log10(max_count + 1), or 0.0 for words that are
+            not in the loaded frequency data. The most frequent word scores 1.0.
+        """
+        word_lower = word.lower()
+        if word_lower not in self.word_frequencies:
+            return 0.0
+        count = self.word_frequencies[word_lower]
+        # The maximum is memoized; recomputing it per lookup made scoring a whole
+        # vocabulary quadratic in vocabulary size.
+        max_count = self._max_frequency_count()
+        if max_count <= 0:
+            # All counts are zero: avoid dividing by log10(1) == 0.
+            return 0.0
+        # Normalize to 0-1 range with log scaling
+        normalized_freq = np.log10(count + 1) / np.log10(max_count + 1)
+        return float(normalized_freq)
+    def get_vocabulary_stats(self) -> Dict:
+        """Get statistics about the loaded vocabulary.
+        Loads the vocabulary first if it has not been loaded yet.
+        Returns:
+            Dict with total_words, vocabulary_source, norvig_file and
+            vocab_size_limit, plus word-length and frequency summaries when
+            the corresponding data is non-empty.
+        """
+        if not self.is_loaded:
+            self.load_vocabulary()
+        stats = {
+            "total_words": len(self.vocabulary),
+            "vocabulary_source": "norvig",
+            "norvig_file": str(self.norvig_file_path),
+            "vocab_size_limit": self.vocab_size_limit,
+        }
+        if self.vocabulary:
+            lengths = [len(word) for word in self.vocabulary]
+            stats.update({
+                "min_word_length": min(lengths),
+                "max_word_length": max(lengths),
+                # Plain float rather than a numpy scalar
+                "avg_word_length": float(np.mean(lengths)),
+            })
+        if self.word_frequencies:
+            counts = list(self.word_frequencies.values())
+            stats.update({
+                "min_frequency": min(counts),
+                "max_frequency": max(counts),
+                "total_frequency": sum(counts),
+            })
+        return stats

crossword-app/backend-py/src/services/thematic_word_service.py CHANGED Viewed

@@ -50,12 +50,21 @@ import time
 from collections import Counter
 from pathlib import Path
-# WordFreq imports (assumed to be available)
-from wordfreq import word_frequency, zipf_frequency, top_n_list
 # Use backend's logging configuration
 logger = logging.getLogger(__name__)
 def get_timestamp():
     return datetime.now().strftime("%H:%M:%S")
@@ -65,7 +74,7 @@ def get_datetimestamp():
 class VocabularyManager:
     """
-    Centralized vocabulary management using WordFreq as the single source.
     Handles loading, filtering, caching, and frequency data generation.
     """
@@ -74,7 +83,7 @@ class VocabularyManager:
         Args:
             cache_dir: Directory for caching vocabulary and embeddings
-            vocab_size_limit: Maximum vocabulary size (None for full WordFreq vocabulary)
         """
         if cache_dir is None:
             # Check environment variable for cache directory
@@ -89,9 +98,29 @@ class VocabularyManager:
         self.vocab_size_limit = vocab_size_limit or int(os.getenv("THEMATIC_VOCAB_SIZE_LIMIT",
                                                                  os.getenv("MAX_VOCABULARY_SIZE", "100000")))
-        # Cache paths
-        self.vocab_cache_path = self.cache_dir / f"vocabulary_{self.vocab_size_limit}.pkl"
-        self.frequency_cache_path = self.cache_dir / f"frequencies_{self.vocab_size_limit}.pkl"
         # Loaded data
         self.vocabulary: List[str] = []
@@ -102,7 +131,14 @@ class VocabularyManager:
         """Load vocabulary and frequency data, with caching."""
         if self.is_loaded:
             return self.vocabulary, self.word_frequencies
         # Try loading from cache
         if self._load_from_cache():
             logger.info(f"✅ Loaded vocabulary from cache: {len(self.vocabulary):,} words")
@@ -179,6 +215,9 @@ class VocabularyManager:
     def _generate_vocabulary_from_wordfreq(self):
         """Generate filtered vocabulary from WordFreq database."""
         logger.info(f"📚 Fetching top {self.vocab_size_limit:,} words from WordFreq...")
         # Get comprehensive word list from WordFreq
@@ -282,6 +321,28 @@ class ThematicWordService:
                                 int(os.getenv("THEMATIC_VOCAB_SIZE_LIMIT",
                                              os.getenv("MAX_VOCABULARY_SIZE", "100000"))))
         # Configuration parameters for softmax weighted selection
         self.similarity_temperature = float(os.getenv("SIMILARITY_TEMPERATURE", "0.2"))
         self.use_softmax_selection = os.getenv("USE_SOFTMAX_SELECTION", "true").lower() == "true"
@@ -312,7 +373,7 @@ class ThematicWordService:
         self.enable_debug_tab = os.getenv("ENABLE_DEBUG_TAB", "false").lower() == "true"
         # Core components
-        self.vocab_manager = VocabularyManager(str(self.cache_dir), self.vocab_size_limit)
         self.model: Optional[SentenceTransformer] = None
         # Loaded data
@@ -323,8 +384,8 @@ class ThematicWordService:
         self.tier_descriptions: Dict[str, str] = {}
         self.word_percentiles: Dict[str, float] = {}
-        # Cache paths for embeddings
-        vocab_hash = f"{self.model_name.replace('/', '_')}_{self.vocab_size_limit}"
         self.embeddings_cache_path = self.cache_dir / f"embeddings_{vocab_hash}.npy"
         self.is_initialized = False
@@ -1330,28 +1391,40 @@ class ThematicWordService:
     def get_cache_status(self) -> Dict[str, Any]:
         """Get detailed cache status information."""
-        vocab_exists = self.vocab_manager.vocab_cache_path.exists()
-        freq_exists = self.vocab_manager.frequency_cache_path.exists()
         embeddings_exists = self.embeddings_cache_path.exists()
         status = {
             "cache_directory": str(self.cache_dir),
             "vocabulary_cache": {
-                "path": str(self.vocab_manager.vocab_cache_path),
                 "exists": vocab_exists,
-                "readable": vocab_exists and os.access(self.vocab_manager.vocab_cache_path, os.R_OK)
             },
             "frequency_cache": {
-                "path": str(self.vocab_manager.frequency_cache_path),
                 "exists": freq_exists,
-                "readable": freq_exists and os.access(self.vocab_manager.frequency_cache_path, os.R_OK)
             },
             "embeddings_cache": {
                 "path": str(self.embeddings_cache_path),
                 "exists": embeddings_exists,
                 "readable": embeddings_exists and os.access(self.embeddings_cache_path, os.R_OK)
             },
-            "complete": vocab_exists and freq_exists and embeddings_exists
         }
         # Add size information if files exist
@@ -1519,7 +1592,9 @@ class ThematicWordService:
                     "custom_sentence": custom_sentence,
                     "multi_theme": multi_theme,
                     "thematic_pool_size": thematic_pool,
-                    "min_similarity": min_similarity
                 },
                 "thematic_pool": [
                     {

 from collections import Counter
 from pathlib import Path
 # Use backend's logging configuration
 logger = logging.getLogger(__name__)
+# WordFreq imports (for backward compatibility)
+try:
+    from wordfreq import word_frequency, zipf_frequency, top_n_list
+    WORDFREQ_AVAILABLE = True
+except ImportError:
+    logger.warning("WordFreq not available, using Norvig vocabulary only")
+    WORDFREQ_AVAILABLE = False
+# Norvig vocabulary imports
+from .norvig_vocabulary_manager import NorgivVocabularyManager
 def get_timestamp():
     return datetime.now().strftime("%H:%M:%S")
 class VocabularyManager:
     """
+    Centralized vocabulary management supporting both WordFreq and Norvig sources.
     Handles loading, filtering, caching, and frequency data generation.
     """
         Args:
             cache_dir: Directory for caching vocabulary and embeddings
+            vocab_size_limit: Maximum vocabulary size (None for full vocabulary)
         """
         if cache_dir is None:
             # Check environment variable for cache directory
         self.vocab_size_limit = vocab_size_limit or int(os.getenv("THEMATIC_VOCAB_SIZE_LIMIT",
                                                                  os.getenv("MAX_VOCABULARY_SIZE", "100000")))
+        # Vocabulary source configuration
+        self.vocab_source = os.getenv("VOCAB_SOURCE", "norvig").lower()
+        logger.info(f"📚 Vocabulary source: {self.vocab_source}")
+        # Initialize appropriate vocabulary manager
+        if self.vocab_source == "norvig":
+            self.vocab_manager = NorgivVocabularyManager(cache_dir, vocab_size_limit)
+        elif self.vocab_source == "wordfreq" and WORDFREQ_AVAILABLE:
+            self.vocab_manager = None  # Use built-in WordFreq logic
+        else:
+            if not WORDFREQ_AVAILABLE:
+                logger.warning("⚠️ WordFreq not available, falling back to Norvig")
+                self.vocab_source = "norvig"
+                self.vocab_manager = NorgivVocabularyManager(cache_dir, vocab_size_limit)
+            else:
+                logger.warning(f"⚠️ Unknown vocab source '{self.vocab_source}', falling back to Norvig")
+                self.vocab_source = "norvig"
+                self.vocab_manager = NorgivVocabularyManager(cache_dir, vocab_size_limit)
+        # Cache paths (include source in filename)
+        source_suffix = f"_{self.vocab_source}" if self.vocab_source != "wordfreq" else ""
+        self.vocab_cache_path = self.cache_dir / f"vocabulary{source_suffix}_{self.vocab_size_limit}.pkl"
+        self.frequency_cache_path = self.cache_dir / f"frequencies{source_suffix}_{self.vocab_size_limit}.pkl"
         # Loaded data
         self.vocabulary: List[str] = []
         """Load vocabulary and frequency data, with caching."""
         if self.is_loaded:
             return self.vocabulary, self.word_frequencies
+        # Use Norvig vocabulary manager if configured
+        if self.vocab_manager is not None:
+            self.vocabulary, self.word_frequencies = self.vocab_manager.load_vocabulary()
+            self.is_loaded = True
+            return self.vocabulary, self.word_frequencies
+        # Fallback to WordFreq logic for backward compatibility
         # Try loading from cache
         if self._load_from_cache():
             logger.info(f"✅ Loaded vocabulary from cache: {len(self.vocabulary):,} words")
     def _generate_vocabulary_from_wordfreq(self):
         """Generate filtered vocabulary from WordFreq database."""
+        if not WORDFREQ_AVAILABLE:
+            raise ImportError("WordFreq is not available, cannot generate vocabulary")
         logger.info(f"📚 Fetching top {self.vocab_size_limit:,} words from WordFreq...")
         # Get comprehensive word list from WordFreq
                                 int(os.getenv("THEMATIC_VOCAB_SIZE_LIMIT",
                                              os.getenv("MAX_VOCABULARY_SIZE", "100000"))))
+        # Vocabulary source configuration
+        self.vocab_source = os.getenv("VOCAB_SOURCE", "norvig").lower()
+        logger.info(f"📚 Vocabulary source: {self.vocab_source}")
+        # Initialize appropriate vocabulary manager
+        if self.vocab_source == "norvig":
+            from .norvig_vocabulary_manager import NorgivVocabularyManager
+            self.vocab_manager = NorgivVocabularyManager(str(self.cache_dir), self.vocab_size_limit)
+        elif self.vocab_source == "wordfreq" and WORDFREQ_AVAILABLE:
+            self.vocab_manager = None  # Use built-in WordFreq logic
+        else:
+            if not WORDFREQ_AVAILABLE:
+                logger.warning("⚠️ WordFreq not available, falling back to Norvig")
+                self.vocab_source = "norvig"
+                from .norvig_vocabulary_manager import NorgivVocabularyManager
+                self.vocab_manager = NorgivVocabularyManager(str(self.cache_dir), self.vocab_size_limit)
+            else:
+                logger.warning(f"⚠️ Unknown vocab source '{self.vocab_source}', falling back to Norvig")
+                self.vocab_source = "norvig"
+                from .norvig_vocabulary_manager import NorgivVocabularyManager
+                self.vocab_manager = NorgivVocabularyManager(str(self.cache_dir), self.vocab_size_limit)
         # Configuration parameters for softmax weighted selection
         self.similarity_temperature = float(os.getenv("SIMILARITY_TEMPERATURE", "0.2"))
         self.use_softmax_selection = os.getenv("USE_SOFTMAX_SELECTION", "true").lower() == "true"
         self.enable_debug_tab = os.getenv("ENABLE_DEBUG_TAB", "false").lower() == "true"
         # Core components
+        # Note: vocab_manager already initialized in constructor based on VOCAB_SOURCE
         self.model: Optional[SentenceTransformer] = None
         # Loaded data
         self.tier_descriptions: Dict[str, str] = {}
         self.word_percentiles: Dict[str, float] = {}
+        # Cache paths for embeddings (include vocabulary source for proper separation)
+        vocab_hash = f"{self.model_name.replace('/', '_')}_{self.vocab_source}_{self.vocab_size_limit}"
         self.embeddings_cache_path = self.cache_dir / f"embeddings_{vocab_hash}.npy"
         self.is_initialized = False
     def get_cache_status(self) -> Dict[str, Any]:
         """Get detailed cache status information."""
+        # Handle different vocabulary manager types
+        if self.vocab_manager is not None:
+            # Using Norvig or other vocab manager with cache paths
+            vocab_exists = self.vocab_manager.vocab_cache_path.exists()
+            freq_exists = self.vocab_manager.frequency_cache_path.exists()
+            vocab_path = str(self.vocab_manager.vocab_cache_path)
+            freq_path = str(self.vocab_manager.frequency_cache_path)
+        else:
+            # Using WordFreq (no separate cache files)
+            vocab_exists = False
+            freq_exists = False
+            vocab_path = "N/A (using WordFreq)"
+            freq_path = "N/A (using WordFreq)"
         embeddings_exists = self.embeddings_cache_path.exists()
         status = {
             "cache_directory": str(self.cache_dir),
             "vocabulary_cache": {
+                "path": vocab_path,
                 "exists": vocab_exists,
+                "readable": vocab_exists and os.access(vocab_path, os.R_OK) if vocab_exists else False
             },
             "frequency_cache": {
+                "path": freq_path,
                 "exists": freq_exists,
+                "readable": freq_exists and os.access(freq_path, os.R_OK) if freq_exists else False
             },
             "embeddings_cache": {
                 "path": str(self.embeddings_cache_path),
                 "exists": embeddings_exists,
                 "readable": embeddings_exists and os.access(self.embeddings_cache_path, os.R_OK)
             },
+            "complete": (vocab_exists or self.vocab_manager is None) and (freq_exists or self.vocab_manager is None) and embeddings_exists
         }
         # Add size information if files exist
                     "custom_sentence": custom_sentence,
                     "multi_theme": multi_theme,
                     "thematic_pool_size": thematic_pool,
+                    "min_similarity": min_similarity,
+                    "multi_topic_method": self.multi_topic_method if len(topics) > 1 else None,
+                    "soft_min_beta": self.soft_min_beta if len(topics) > 1 and self.multi_topic_method == "soft_minimum" else None
                 },
                 "thematic_pool": [
                     {

crossword-app/frontend/src/components/DebugTab.jsx CHANGED Viewed

@@ -53,6 +53,12 @@ const DebugTab = ({ debugData }) => {
         <div><strong>Thematic Pool Size:</strong> {debugData.generation_params.thematic_pool_size}</div>
         <div><strong>Min Similarity:</strong> {debugData.generation_params.min_similarity}</div>
         <div><strong>Multi-theme:</strong> {debugData.generation_params.multi_theme ? 'Yes' : 'No'}</div>
         {debugData.generation_params.custom_sentence && (
           <div><strong>Custom Sentence:</strong> "{debugData.generation_params.custom_sentence}"</div>
         )}
@@ -71,6 +77,9 @@ const DebugTab = ({ debugData }) => {
         <li><strong>Composite Score</strong> = (1 - difficulty_weight) × similarity + difficulty_weight × frequency_alignment</li>
         <li><strong>Frequency Alignment</strong>: Gaussian distribution favoring target percentiles by difficulty</li>
         <li><strong>Softmax Selection</strong>: Probabilistic selection based on composite scores with temperature control</li>
       </ul>
       <h4>Difficulty Targets:</h4>
@@ -177,6 +186,10 @@ const DebugTab = ({ debugData }) => {
                 onClick={() => handleSort('similarity')}
                 style={{ cursor: 'pointer', userSelect: 'none' }}
                 className={sortBy === 'similarity' ? 'sorted-column' : ''}
               >
                 Similarity{getSortIcon('similarity')}
               </th>
@@ -299,6 +312,9 @@ const DebugTab = ({ debugData }) => {
         <li><strong>Composite Score</strong> = (1 - difficulty_weight) × similarity + difficulty_weight × frequency_alignment</li>
         <li><strong>Frequency Alignment</strong>: Gaussian distribution favoring target percentiles by difficulty</li>
         <li><strong>Softmax Selection</strong>: Probabilistic selection based on composite scores with temperature control</li>
       </ul>
       <h4>Difficulty Targets:</h4>

         <div><strong>Thematic Pool Size:</strong> {debugData.generation_params.thematic_pool_size}</div>
         <div><strong>Min Similarity:</strong> {debugData.generation_params.min_similarity}</div>
         <div><strong>Multi-theme:</strong> {debugData.generation_params.multi_theme ? 'Yes' : 'No'}</div>
+        {debugData.generation_params.multi_topic_method && (
+          <div><strong>Multi-Topic Method:</strong> {debugData.generation_params.multi_topic_method}</div>
+        )}
+        {debugData.generation_params.soft_min_beta && (
+          <div><strong>Soft Min Beta:</strong> {debugData.generation_params.soft_min_beta}</div>
+        )}
         {debugData.generation_params.custom_sentence && (
           <div><strong>Custom Sentence:</strong> "{debugData.generation_params.custom_sentence}"</div>
         )}
         <li><strong>Composite Score</strong> = (1 - difficulty_weight) × similarity + difficulty_weight × frequency_alignment</li>
         <li><strong>Frequency Alignment</strong>: Gaussian distribution favoring target percentiles by difficulty</li>
         <li><strong>Softmax Selection</strong>: Probabilistic selection based on composite scores with temperature control</li>
+        {debugData.generation_params.multi_topic_method && (
+          <li><strong>Multi-Topic Similarity:</strong> Uses {debugData.generation_params.multi_topic_method} method to find words relevant to ALL topics</li>
+        )}
       </ul>
       <h4>Difficulty Targets:</h4>
                 onClick={() => handleSort('similarity')}
                 style={{ cursor: 'pointer', userSelect: 'none' }}
                 className={sortBy === 'similarity' ? 'sorted-column' : ''}
+                title={debugData.generation_params.multi_topic_method ?
+                  `Multi-Topic Similarity (${debugData.generation_params.multi_topic_method}): Score representing relevance to ALL topics simultaneously. ${debugData.generation_params.multi_topic_method === 'soft_minimum' ? 'Uses soft minimum aggregation (β=' + debugData.generation_params.soft_min_beta + ') - high scores mean the word relates well to every selected topic.' : 'Aggregated across all topics.'}` :
+                  'Similarity: Semantic similarity score to the selected topic (0.0 to 1.0)'
+                }
               >
                 Similarity{getSortIcon('similarity')}
               </th>
         <li><strong>Composite Score</strong> = (1 - difficulty_weight) × similarity + difficulty_weight × frequency_alignment</li>
         <li><strong>Frequency Alignment</strong>: Gaussian distribution favoring target percentiles by difficulty</li>
         <li><strong>Softmax Selection</strong>: Probabilistic selection based on composite scores with temperature control</li>
+        {debugData.generation_params.multi_topic_method && (
+          <li><strong>Multi-Topic Similarity:</strong> Uses {debugData.generation_params.multi_topic_method} method to find words relevant to ALL topics</li>
+        )}
       </ul>
       <h4>Difficulty Targets:</h4>

{hack → crossword-app/words}/norvig/count_1w.txt RENAMED Viewed

File without changes

{hack → crossword-app/words}/norvig/count_1w100k.txt RENAMED Viewed

File without changes