File size: 13,190 Bytes
9cd7541
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
#!/usr/bin/env python3
"""
Norvig Vocabulary Manager

Provides a WordFreq-compatible interface using Peter Norvig's curated word lists.
Replaces the WordFreq-based vocabulary system with clean, high-quality word data
from norvig.com/ngrams/count_1w100k.txt.

Features:
- Clean vocabulary without web-scraped junk or typos
- Google-quality curation by Peter Norvig (Director of Research)
- Maintains WordFreq compatibility for seamless integration
- Preserves all existing frequency tier and difficulty systems

Environment Variables:
- NORVIG_VOCAB_PATH: Path to Norvig word count file (default: words/norvig/count_1w100k.txt)
- CACHE_DIR: Cache directory for processed vocabulary data
"""

import os
import pickle
import logging
import numpy as np
from pathlib import Path
from typing import List, Tuple, Dict, Optional, Counter
from collections import Counter

logger = logging.getLogger(__name__)


class NorgivVocabularyManager:
    """
    Norvig vocabulary manager that provides a WordFreq-compatible interface.

    Loads and processes Peter Norvig's curated word lists (tab-separated
    ``WORD\\tCOUNT`` format) for crossword generation, filters them for
    crossword suitability, and caches the result as pickle files so
    subsequent startups are fast.

    NOTE(review): the class name misspells "Norvig"; it is kept unchanged
    for backward compatibility with existing callers.
    """

    # Boring/common words excluded from the vocabulary (same list as the
    # WordFreq version). Class-level constant so it is built once, not on
    # every _is_crossword_suitable() call.
    _BORING_WORDS = frozenset({
        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'this', 'that',
        'with', 'from', 'they', 'were', 'been', 'have', 'their', 'said', 'each',
        'which', 'what', 'there', 'will', 'more', 'when', 'some', 'like', 'into',
        'time', 'very', 'only', 'has', 'had', 'who', 'its', 'now', 'find', 'long',
        'down', 'day', 'did', 'get', 'come', 'made', 'may', 'part'
    })

    def __init__(self, cache_dir: Optional[str] = None, vocab_size_limit: Optional[int] = None):
        """Initialize Norvig vocabulary manager.

        Args:
            cache_dir: Directory for caching vocabulary and frequency data.
                Defaults to $CACHE_DIR, then to a 'model_cache' directory
                next to this file.
            vocab_size_limit: Maximum vocabulary size. Falsy values (None or
                0) fall back to the THEMATIC_VOCAB_SIZE_LIMIT or
                MAX_VOCABULARY_SIZE environment variables, then to 100,000.
        """
        if cache_dir is None:
            cache_dir = os.getenv("CACHE_DIR")
            if cache_dir is None:
                cache_dir = os.path.join(os.path.dirname(__file__), 'model_cache')

        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Vocabulary size configuration (explicit argument wins over env vars).
        self.vocab_size_limit = vocab_size_limit or int(
            os.getenv("THEMATIC_VOCAB_SIZE_LIMIT",
                      os.getenv("MAX_VOCABULARY_SIZE", "100000")))

        # Norvig file configuration
        norvig_path = os.getenv("NORVIG_VOCAB_PATH", "words/norvig/count_1w100k.txt")
        if not os.path.isabs(norvig_path):
            # Resolve relative paths against the backend-py directory, which is
            # three .parent hops up from this file:
            #   .../backend-py/src/services/norvig_vocabulary_manager.py
            #   -> .../backend-py/words/norvig/count_1w100k.txt
            backend_root = Path(__file__).parent.parent.parent
            self.norvig_file_path = backend_root / norvig_path
        else:
            self.norvig_file_path = Path(norvig_path)

        # Cache paths - "norvig" prefix distinguishes these from wordfreq caches.
        self.vocab_cache_path = self.cache_dir / f"norvig_vocabulary_{self.vocab_size_limit}.pkl"
        self.frequency_cache_path = self.cache_dir / f"norvig_frequencies_{self.vocab_size_limit}.pkl"

        # Loaded data (populated lazily by load_vocabulary()).
        self.vocabulary: List[str] = []
        self.word_frequencies: Counter = Counter()
        self.is_loaded = False

        logger.info("πŸ“ Norvig Vocabulary Manager initialized")
        logger.info(f"   πŸ“‚ Cache dir: {self.cache_dir}")
        logger.info(f"   πŸ“Š Vocab limit: {self.vocab_size_limit:,}")
        logger.info(f"   πŸ“„ Norvig file: {self.norvig_file_path}")

    def load_vocabulary(self) -> Tuple[List[str], Counter]:
        """Load vocabulary and frequency data, with caching.

        Returns:
            Tuple of (sorted word list, Counter mapping word -> raw count).
            Subsequent calls return the already-loaded data.
        """
        if self.is_loaded:
            return self.vocabulary, self.word_frequencies

        # Fast path: load previously processed data from the pickle cache.
        if self._load_from_cache():
            logger.info(f"βœ… Loaded Norvig vocabulary from cache: {len(self.vocabulary):,} words")
            self.is_loaded = True
            return self.vocabulary, self.word_frequencies

        # Slow path: parse and filter the raw Norvig file.
        logger.info("πŸ”„ Generating vocabulary from Norvig file...")
        self._generate_vocabulary_from_norvig()

        # Persist for the next startup (best-effort; failures only warn).
        self._save_to_cache()

        self.is_loaded = True
        return self.vocabulary, self.word_frequencies

    def _load_from_cache(self) -> bool:
        """Load vocabulary and frequencies from cache.

        Returns:
            True if both cache files were loaded and contain non-empty data,
            False otherwise (caller should regenerate from the source file).
        """
        try:
            if self.vocab_cache_path.exists() and self.frequency_cache_path.exists():
                logger.info(f"πŸ“¦ Loading Norvig vocabulary from cache...")
                logger.info(f"  Vocab cache: {self.vocab_cache_path}")
                logger.info(f"  Freq cache: {self.frequency_cache_path}")

                # Validate cache files are readable before attempting pickle.
                if not os.access(self.vocab_cache_path, os.R_OK):
                    logger.warning(f"⚠️ Vocabulary cache file not readable: {self.vocab_cache_path}")
                    return False

                if not os.access(self.frequency_cache_path, os.R_OK):
                    logger.warning(f"⚠️ Frequency cache file not readable: {self.frequency_cache_path}")
                    return False

                # NOTE(review): pickle is only safe because these caches are
                # written by this process; never point the cache dir at
                # untrusted data.
                with open(self.vocab_cache_path, 'rb') as f:
                    self.vocabulary = pickle.load(f)

                with open(self.frequency_cache_path, 'rb') as f:
                    self.word_frequencies = pickle.load(f)

                # Guard against truncated/empty cache payloads.
                if not self.vocabulary or not self.word_frequencies:
                    logger.warning("⚠️ Cache files contain empty data")
                    return False

                logger.info(f"βœ… Loaded {len(self.vocabulary):,} words and {len(self.word_frequencies):,} frequencies from cache")
                return True
            else:
                missing = []
                if not self.vocab_cache_path.exists():
                    missing.append(f"vocabulary ({self.vocab_cache_path})")
                if not self.frequency_cache_path.exists():
                    missing.append(f"frequency ({self.frequency_cache_path})")
                logger.info(f"πŸ“‚ Cache files missing: {', '.join(missing)}")
                return False
        except Exception as e:
            # Any unpickling/IO error just means "no cache"; regenerate instead.
            logger.warning(f"⚠️ Cache loading failed: {e}")

        return False

    def _save_to_cache(self):
        """Save vocabulary and frequencies to cache (best-effort)."""
        try:
            logger.info("πŸ’Ύ Saving Norvig vocabulary to cache...")

            with open(self.vocab_cache_path, 'wb') as f:
                pickle.dump(self.vocabulary, f)

            with open(self.frequency_cache_path, 'wb') as f:
                pickle.dump(self.word_frequencies, f)

            logger.info("βœ… Norvig vocabulary cached successfully")
        except Exception as e:
            # Caching is an optimization only; a failure must not abort loading.
            logger.warning(f"⚠️ Cache saving failed: {e}")

    def _generate_vocabulary_from_norvig(self):
        """Generate filtered vocabulary from the Norvig word count file.

        Populates self.vocabulary (sorted, deduplicated, lowercase) and
        self.word_frequencies (word -> raw count).

        Raises:
            FileNotFoundError: If the configured Norvig file does not exist.
        """
        if not self.norvig_file_path.exists():
            raise FileNotFoundError(f"Norvig vocabulary file not found: {self.norvig_file_path}")

        logger.info(f"πŸ“š Loading words from Norvig file: {self.norvig_file_path}")

        raw_word_counts = self._load_norvig_file()
        logger.info(f"πŸ“₯ Loaded {len(raw_word_counts):,} raw words from Norvig file")

        # Apply crossword-suitable filtering, stopping at the size limit.
        # Iteration preserves file order, so the most frequent words (the
        # file is sorted by count) are kept when the limit truncates.
        filtered_words = []
        frequency_data = Counter()

        logger.info("πŸ” Applying crossword filtering...")
        for word, count in raw_word_counts.items():
            if self._is_crossword_suitable(word):
                word_lower = word.lower()
                filtered_words.append(word_lower)
                frequency_data[word_lower] = count

                if len(filtered_words) >= self.vocab_size_limit:
                    break

        # Deduplicate (case-folding could collide) and sort for stable output.
        self.vocabulary = sorted(set(filtered_words))
        self.word_frequencies = frequency_data

        logger.info(f"βœ… Generated filtered Norvig vocabulary: {len(self.vocabulary):,} words")
        logger.info(f"πŸ“Š Frequency data coverage: {len(self.word_frequencies):,} words")

        # Log some stats about the filtered vocabulary.
        if self.vocabulary:
            lengths = [len(word) for word in self.vocabulary]
            logger.info(f"πŸ“ Word length range: {min(lengths)}-{max(lengths)} chars")
            logger.info(f"πŸ”’ Average word length: {np.mean(lengths):.1f} chars")

        if self.word_frequencies:
            counts = list(self.word_frequencies.values())
            logger.info(f"πŸ“ˆ Frequency range: {min(counts):,} - {max(counts):,}")

    def _load_norvig_file(self) -> Dict[str, int]:
        """Load the Norvig word count file.

        Returns:
            Mapping of UPPERCASE word -> count, in file order. Malformed
            lines are logged and skipped.

        Raises:
            OSError/UnicodeDecodeError: If the file cannot be read.
        """
        word_counts = {}

        try:
            with open(self.norvig_file_path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue

                    # Parse tab-separated format: WORD\tCOUNT
                    parts = line.split('\t')
                    if len(parts) == 2:
                        word, count_str = parts
                        try:
                            count = int(count_str)
                            word_counts[word.upper()] = count
                        except ValueError:
                            logger.warning(f"⚠️ Invalid count on line {line_num}: {line}")
                    else:
                        logger.warning(f"⚠️ Invalid format on line {line_num}: {line}")

            return word_counts

        except Exception as e:
            logger.error(f"❌ Failed to load Norvig file {self.norvig_file_path}: {e}")
            raise

    def _is_crossword_suitable(self, word: str) -> bool:
        """Check if a word is suitable for crosswords (same logic as the
        WordFreq version): 3-12 alphabetic chars, not a boring stop word,
        not an obvious plural, not mostly repeated characters."""
        word = word.lower().strip()

        # Length check (3-12 characters for crosswords).
        if len(word) < 3 or len(word) > 12:
            return False

        # Must be alphabetic only.
        if not word.isalpha():
            return False

        if word in self._BORING_WORDS:
            return False

        # Skip obvious plurals (simple heuristic; 'ss'/'us'/'is' endings are
        # usually not plural forms, e.g. 'glass', 'virus', 'basis').
        if len(word) > 4 and word.endswith('s') and not word.endswith(('ss', 'us', 'is')):
            return False

        # Skip words with too many repeated characters (often not real words).
        if len(set(word)) < len(word) * 0.6:  # Less than 60% unique characters
            return False

        return True

    def get_word_frequency(self, word: str) -> float:
        """Get word frequency as a normalized 0-1 score (WordFreq-compatible).

        Returns:
            0.0 for unknown words; otherwise the word's count log-scaled
            relative to the most frequent word, mirroring WordFreq's approach.
        """
        word_lower = word.lower()
        if word_lower not in self.word_frequencies:
            return 0.0

        count = self.word_frequencies[word_lower]
        # The word is present, so the Counter is non-empty here.
        max_count = max(self.word_frequencies.values())
        if max_count <= 0:
            # Degenerate data (all-zero counts) would make the denominator
            # log10(0 + 1) == 0; avoid the ZeroDivisionError.
            return 0.0

        # Normalize to 0-1 range with log scaling (+1 guards log10(0)).
        return float(np.log10(count + 1) / np.log10(max_count + 1))

    def get_vocabulary_stats(self) -> Dict:
        """Get statistics about the loaded vocabulary, loading it if needed."""
        if not self.is_loaded:
            self.load_vocabulary()

        stats = {
            "total_words": len(self.vocabulary),
            "vocabulary_source": "norvig",
            "norvig_file": str(self.norvig_file_path),
            "vocab_size_limit": self.vocab_size_limit,
        }

        if self.vocabulary:
            lengths = [len(word) for word in self.vocabulary]
            stats.update({
                "min_word_length": min(lengths),
                "max_word_length": max(lengths),
                "avg_word_length": np.mean(lengths),
            })

        if self.word_frequencies:
            counts = list(self.word_frequencies.values())
            stats.update({
                "min_frequency": min(counts),
                "max_frequency": max(counts),
                "total_frequency": sum(counts),
            })

        return stats