Spaces:

vimalk78
/

abc123

Sleeping

File size: 13,190 Bytes

9cd7541

#!/usr/bin/env python3
"""
Norvig Vocabulary Manager

Provides a WordFreq-compatible interface using Peter Norvig's curated word lists.
Replaces the WordFreq-based vocabulary system with clean, high-quality word data
from norvig.com/ngrams/count_1w100k.txt.

Features:
- Clean vocabulary without web-scraped junk or typos
- Google-quality curation by Peter Norvig (Director of Research)
- Maintains WordFreq compatibility for seamless integration
- Preserves all existing frequency tier and difficulty systems

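Environment Variables:
- NORVIG_VOCAB_PATH: Path to Norvig word count file (default: words/norvig/count_1w100k.txt;
  relative paths are resolved against the backend root, two directories above this file's directory)
- CACHE_DIR: Cache directory for processed vocabulary data (default: model_cache next to this file)
- THEMATIC_VOCAB_SIZE_LIMIT / MAX_VOCABULARY_SIZE: Vocabulary size limit used when none is passed
  to the constructor (default: 100000)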
"""

import os
import pickle
import logging
import numpy as np
from pathlib import Path
from typing import List, Tuple, Dict, Optional, Counter
from collections import Counter

logger = logging.getLogger(__name__)


class NorgivVocabularyManager:
    """
    Norvig vocabulary manager that provides a WordFreq-compatible interface.
    Loads and processes Peter Norvig's curated word lists for crossword generation.
    """
    
    def __init__(self, cache_dir: Optional[str] = None, vocab_size_limit: Optional[int] = None):
        """Initialize Norvig vocabulary manager.
        
        Args:
            cache_dir: Directory for caching vocabulary and frequency data
            vocab_size_limit: Maximum vocabulary size (None for full Norvig list)
        """
        if cache_dir is None:
            cache_dir = os.getenv("CACHE_DIR")
            if cache_dir is None:
                cache_dir = os.path.join(os.path.dirname(__file__), 'model_cache')
        
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        
        # Vocabulary size configuration
        self.vocab_size_limit = vocab_size_limit or int(os.getenv("THEMATIC_VOCAB_SIZE_LIMIT", 
                                                                 os.getenv("MAX_VOCABULARY_SIZE", "100000")))
        
        # Norvig file configuration
        norvig_path = os.getenv("NORVIG_VOCAB_PATH", "words/norvig/count_1w100k.txt")
        if not os.path.isabs(norvig_path):
            # Make relative paths relative to backend-py directory (2 levels up from this file)
            # Current: crossword-app/backend-py/src/services/norvig_vocabulary_manager.py
            # Target: crossword-app/backend-py/words/norvig/count_1w100k.txt
            backend_root = Path(__file__).parent.parent.parent
            self.norvig_file_path = backend_root / norvig_path
        else:
            self.norvig_file_path = Path(norvig_path)
        
        # Cache paths - use "norvig" prefix to distinguish from wordfreq cache
        self.vocab_cache_path = self.cache_dir / f"norvig_vocabulary_{self.vocab_size_limit}.pkl"
        self.frequency_cache_path = self.cache_dir / f"norvig_frequencies_{self.vocab_size_limit}.pkl"
        
        # Loaded data
        self.vocabulary: List[str] = []
        self.word_frequencies: Counter = Counter()
        self.is_loaded = False
        
        logger.info(f"📝 Norvig Vocabulary Manager initialized")
        logger.info(f"   📂 Cache dir: {self.cache_dir}")
        logger.info(f"   📊 Vocab limit: {self.vocab_size_limit:,}")
        logger.info(f"   📄 Norvig file: {self.norvig_file_path}")

    def load_vocabulary(self) -> Tuple[List[str], Counter]:
        """Load vocabulary and frequency data, with caching."""
        if self.is_loaded:
            return self.vocabulary, self.word_frequencies
            
        # Try loading from cache
        if self._load_from_cache():
            logger.info(f"✅ Loaded Norvig vocabulary from cache: {len(self.vocabulary):,} words")
            self.is_loaded = True
            return self.vocabulary, self.word_frequencies
        
        # Generate from Norvig file
        logger.info("🔄 Generating vocabulary from Norvig file...")
        self._generate_vocabulary_from_norvig()
        
        # Save to cache
        self._save_to_cache()
        
        self.is_loaded = True
        return self.vocabulary, self.word_frequencies
    
    def _load_from_cache(self) -> bool:
        """Load vocabulary and frequencies from cache."""
        try:
            if self.vocab_cache_path.exists() and self.frequency_cache_path.exists():
                logger.info(f"📦 Loading Norvig vocabulary from cache...")
                logger.info(f"  Vocab cache: {self.vocab_cache_path}")
                logger.info(f"  Freq cache: {self.frequency_cache_path}")
                
                # Validate cache files are readable
                if not os.access(self.vocab_cache_path, os.R_OK):
                    logger.warning(f"⚠️ Vocabulary cache file not readable: {self.vocab_cache_path}")
                    return False
                    
                if not os.access(self.frequency_cache_path, os.R_OK):
                    logger.warning(f"⚠️ Frequency cache file not readable: {self.frequency_cache_path}")
                    return False
                
                with open(self.vocab_cache_path, 'rb') as f:
                    self.vocabulary = pickle.load(f)
                    
                with open(self.frequency_cache_path, 'rb') as f:
                    self.word_frequencies = pickle.load(f)
                
                # Validate loaded data
                if not self.vocabulary or not self.word_frequencies:
                    logger.warning("⚠️ Cache files contain empty data")
                    return False
                    
                logger.info(f"✅ Loaded {len(self.vocabulary):,} words and {len(self.word_frequencies):,} frequencies from cache")
                return True
            else:
                missing = []
                if not self.vocab_cache_path.exists():
                    missing.append(f"vocabulary ({self.vocab_cache_path})")
                if not self.frequency_cache_path.exists():
                    missing.append(f"frequency ({self.frequency_cache_path})")
                logger.info(f"📂 Cache files missing: {', '.join(missing)}")
                return False
        except Exception as e:
            logger.warning(f"⚠️ Cache loading failed: {e}")
            
        return False
    
    def _save_to_cache(self):
        """Save vocabulary and frequencies to cache."""
        try:
            logger.info("💾 Saving Norvig vocabulary to cache...")
            
            with open(self.vocab_cache_path, 'wb') as f:
                pickle.dump(self.vocabulary, f)
                
            with open(self.frequency_cache_path, 'wb') as f:
                pickle.dump(self.word_frequencies, f)
                
            logger.info("✅ Norvig vocabulary cached successfully")
        except Exception as e:
            logger.warning(f"⚠️ Cache saving failed: {e}")
    
    def _generate_vocabulary_from_norvig(self):
        """Generate filtered vocabulary from Norvig word count file."""
        if not self.norvig_file_path.exists():
            raise FileNotFoundError(f"Norvig vocabulary file not found: {self.norvig_file_path}")
        
        logger.info(f"📚 Loading words from Norvig file: {self.norvig_file_path}")
        
        raw_word_counts = self._load_norvig_file()
        logger.info(f"📥 Loaded {len(raw_word_counts):,} raw words from Norvig file")
        
        # Apply crossword-suitable filtering
        filtered_words = []
        frequency_data = Counter()
        
        logger.info("🔍 Applying crossword filtering...")
        for word, count in raw_word_counts.items():
            if self._is_crossword_suitable(word):
                word_lower = word.lower()
                filtered_words.append(word_lower)
                frequency_data[word_lower] = count
                
                if len(filtered_words) >= self.vocab_size_limit:
                    break
        
        # Remove duplicates and sort
        self.vocabulary = sorted(list(set(filtered_words)))
        self.word_frequencies = frequency_data
        
        logger.info(f"✅ Generated filtered Norvig vocabulary: {len(self.vocabulary):,} words")
        logger.info(f"📊 Frequency data coverage: {len(self.word_frequencies):,} words")
        
        # Log some stats about the filtered vocabulary
        if self.vocabulary:
            lengths = [len(word) for word in self.vocabulary]
            logger.info(f"📏 Word length range: {min(lengths)}-{max(lengths)} chars")
            logger.info(f"🔢 Average word length: {np.mean(lengths):.1f} chars")
        
        if self.word_frequencies:
            counts = list(self.word_frequencies.values())
            logger.info(f"📈 Frequency range: {min(counts):,} - {max(counts):,}")
    
    def _load_norvig_file(self) -> Dict[str, int]:
        """Load Norvig word count file and return word->count mapping."""
        word_counts = {}
        
        try:
            with open(self.norvig_file_path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue
                    
                    # Parse tab-separated format: WORD\tCOUNT
                    parts = line.split('\t')
                    if len(parts) == 2:
                        word, count_str = parts
                        try:
                            count = int(count_str)
                            word_counts[word.upper()] = count
                        except ValueError:
                            logger.warning(f"⚠️ Invalid count on line {line_num}: {line}")
                    else:
                        logger.warning(f"⚠️ Invalid format on line {line_num}: {line}")
            
            return word_counts
        
        except Exception as e:
            logger.error(f"❌ Failed to load Norvig file {self.norvig_file_path}: {e}")
            raise
    
    def _is_crossword_suitable(self, word: str) -> bool:
        """Check if word is suitable for crosswords (same logic as WordFreq version)."""
        word = word.lower().strip()
        
        # Length check (3-12 characters for crosswords)
        if len(word) < 3 or len(word) > 12:
            return False
            
        # Must be alphabetic only
        if not word.isalpha():
            return False
            
        # Skip boring/common words (same as WordFreq version)
        boring_words = {
            'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'this', 'that',
            'with', 'from', 'they', 'were', 'been', 'have', 'their', 'said', 'each',
            'which', 'what', 'there', 'will', 'more', 'when', 'some', 'like', 'into',
            'time', 'very', 'only', 'has', 'had', 'who', 'its', 'now', 'find', 'long',
            'down', 'day', 'did', 'get', 'come', 'made', 'may', 'part'
        }
        
        if word in boring_words:
            return False
            
        # Skip obvious plurals (simple heuristic)
        if len(word) > 4 and word.endswith('s') and not word.endswith(('ss', 'us', 'is')):
            return False
            
        # Skip words with repeated characters (often not real words)
        if len(set(word)) < len(word) * 0.6:  # Less than 60% unique characters
            return False
            
        return True
    
    def get_word_frequency(self, word: str) -> float:
        """Get word frequency as a normalized score (compatible with WordFreq API)."""
        word_lower = word.lower()
        if word_lower not in self.word_frequencies:
            return 0.0
        
        # Convert count to normalized frequency similar to WordFreq
        # Use log scale similar to WordFreq's approach
        count = self.word_frequencies[word_lower]
        max_count = max(self.word_frequencies.values()) if self.word_frequencies else 1
        
        # Normalize to 0-1 range with log scaling
        normalized_freq = np.log10(count + 1) / np.log10(max_count + 1)
        return float(normalized_freq)
    
    def get_vocabulary_stats(self) -> Dict:
        """Get statistics about the loaded vocabulary."""
        if not self.is_loaded:
            self.load_vocabulary()
        
        stats = {
            "total_words": len(self.vocabulary),
            "vocabulary_source": "norvig",
            "norvig_file": str(self.norvig_file_path),
            "vocab_size_limit": self.vocab_size_limit,
        }
        
        if self.vocabulary:
            lengths = [len(word) for word in self.vocabulary]
            stats.update({
                "min_word_length": min(lengths),
                "max_word_length": max(lengths),
                "avg_word_length": np.mean(lengths),
            })
        
        if self.word_frequencies:
            counts = list(self.word_frequencies.values())
            stats.update({
                "min_frequency": min(counts),
                "max_frequency": max(counts),
                "total_frequency": sum(counts),
            })
        
        return stats