|
|
""" |
|
|
Integration adapter for Unified Thematic Word Generator |
|
|
|
|
|
This service provides a bridge between the new unified word generator |
|
|
and the existing crossword backend, enabling the backend to use the |
|
|
comprehensive WordFreq vocabulary instead of the limited model vocabulary. |
|
|
""" |
|
|
|
|
|
import logging
import os
import sys
import time
from pathlib import Path
from typing import List, Dict, Any, Optional
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
class UnifiedWordService:
    """
    Service adapter for integrating UnifiedThematicWordGenerator with the crossword backend.

    Provides the same interface as VectorSearchService but uses the comprehensive
    WordFreq vocabulary instead of model-limited vocabulary.
    """

    def __init__(self, vocab_size_limit: Optional[int] = None):
        """Initialize the unified word service.

        Args:
            vocab_size_limit: Maximum vocabulary size. When None (or 0), falls
                back to the MAX_VOCABULARY_SIZE environment variable
                (default 100000).
        """
        # Stays None if the generator cannot be imported; every public method
        # guards against that state so the service degrades gracefully.
        self.generator = None
        self.vocab_size_limit = vocab_size_limit or int(os.getenv("MAX_VOCABULARY_SIZE", "100000"))
        self.is_initialized = False

        self._import_generator()

    def _import_generator(self):
        """Import the UnifiedThematicWordGenerator from the hack directory.

        On failure, logs the error and leaves ``self.generator`` as None
        instead of raising out of the constructor.
        """
        try:
            # The generator lives outside the backend package tree, so the
            # hack directory must be made importable first.
            hack_dir = Path(__file__).parent.parent.parent.parent.parent / "hack"
            if hack_dir.exists() and str(hack_dir) not in sys.path:
                # Guard prevents stacking duplicate sys.path entries when
                # several services are constructed in one process.
                sys.path.insert(0, str(hack_dir))
                logger.info(f"π Added hack directory to path: {hack_dir}")

            from thematic_word_generator import UnifiedThematicWordGenerator

            # Cache directory for the generator's precomputed artifacts.
            cache_dir = os.path.join(os.path.dirname(__file__), '..', '..', 'cache', 'unified_generator')

            self.generator = UnifiedThematicWordGenerator(
                cache_dir=cache_dir,
                vocab_size_limit=self.vocab_size_limit
            )

            logger.info(f"✅ Imported UnifiedThematicWordGenerator with vocab limit: {self.vocab_size_limit:,}")

        except ImportError as e:
            logger.error(f"β Failed to import UnifiedThematicWordGenerator: {e}")
            logger.error(" Make sure the hack directory contains thematic_word_generator.py")
            self.generator = None
        except Exception as e:
            logger.error(f"β Error setting up UnifiedThematicWordGenerator: {e}")
            self.generator = None

    async def initialize(self):
        """Initialize the unified word service.

        Returns:
            True when the underlying generator initialized successfully,
            False when the generator is unavailable or its async init raised.
        """
        if not self.generator:
            logger.error("β Cannot initialize: generator not available")
            return False

        try:
            logger.info("π Initializing Unified Word Service...")
            start_time = time.time()

            await self.generator.initialize_async()

            self.is_initialized = True
            init_time = time.time() - start_time

            logger.info(f"✅ Unified Word Service initialized in {init_time:.2f}s")
            logger.info(f"π Vocabulary size: {self.generator.get_vocabulary_size():,} words")
            logger.info(f"π― Tier distribution: {self.generator.get_tier_distribution()}")

            return True

        except Exception as e:
            logger.error(f"β Failed to initialize Unified Word Service: {e}")
            self.is_initialized = False
            return False

    async def find_similar_words(
        self,
        topic: str,
        difficulty: str = "medium",
        max_words: int = 15
    ) -> List[Dict[str, Any]]:
        """
        Find similar words using the unified generator.

        Compatible with VectorSearchService interface.

        Args:
            topic: Topic to find words for
            difficulty: Difficulty level (easy/medium/hard)
            max_words: Maximum number of words to return

        Returns:
            List of word dictionaries: [{"word": str, "clue": str}, ...];
            empty list when the service is not initialized or lookup fails.
        """
        if not self.is_initialized or not self.generator:
            logger.error("β Service not initialized or generator not available")
            return []

        try:
            results = await self.generator.find_similar_words(topic, difficulty, max_words)

            logger.info(f"π― Generated {len(results)} words for '{topic}' (difficulty: {difficulty})")
            return results

        except Exception as e:
            logger.error(f"β Error finding similar words for '{topic}': {e}")
            return []

    async def _get_cached_fallback(self, topic: str, difficulty: str, max_words: int) -> List[Dict[str, Any]]:
        """
        Fallback method for compatibility with existing backend code.

        Since our unified generator already has comprehensive vocabulary,
        this just calls find_similar_words with relaxed criteria.
        """
        if not self.is_initialized or not self.generator:
            return []

        try:
            # Relaxed similarity threshold so the fallback yields more results.
            results = self.generator.generate_thematic_words(
                topic,
                num_words=max_words,
                min_similarity=0.2
            )

            # Convert generator tuples into the backend's dict shape, keeping
            # only words that match the requested difficulty.
            backend_words = []
            for word, similarity, tier in results:
                if self.generator._matches_backend_difficulty(word, difficulty):
                    backend_word = {
                        "word": word.upper(),
                        "clue": self.generator._generate_simple_clue(word, topic),
                        "similarity": similarity,
                        "tier": tier
                    }
                    backend_words.append(backend_word)

            logger.info(f"π¦ Fallback generated {len(backend_words)} words for '{topic}'")
            return backend_words[:max_words]

        except Exception as e:
            logger.error(f"β Error in cached fallback for '{topic}': {e}")
            return []

    def get_vocabulary_size(self) -> int:
        """Get the vocabulary size (0 when the generator is unavailable)."""
        if self.generator:
            return self.generator.get_vocabulary_size()
        return 0

    def get_tier_info(self) -> Dict[str, Any]:
        """Get frequency tier information ({} when the generator is unavailable)."""
        if not self.generator:
            return {}

        return {
            "tier_distribution": self.generator.get_tier_distribution(),
            # tier_descriptions is optional on the generator — default to {}.
            "tier_descriptions": getattr(self.generator, 'tier_descriptions', {}),
            "vocabulary_size": self.generator.get_vocabulary_size()
        }
|
|
|
|
|
|
|
|
|
|
|
import time |
|
|
|
|
|
|
|
|
async def create_unified_word_service(vocab_size_limit: Optional[int] = None) -> Optional[UnifiedWordService]:
    """
    Factory function to create and initialize a UnifiedWordService.

    Args:
        vocab_size_limit: Maximum vocabulary size (None for default)

    Returns:
        Initialized UnifiedWordService or None if initialization failed
    """
    try:
        candidate = UnifiedWordService(vocab_size_limit)

        # Guard clause: bail out with None unless async init succeeds.
        if not await candidate.initialize():
            logger.error("β Failed to initialize UnifiedWordService")
            return None
        return candidate

    except Exception as exc:
        logger.error(f"β Error creating UnifiedWordService: {exc}")
        return None
|
|
|
|
|
|
|
|
|
|
|
async def main():
    """Command-line smoke test for the unified word service.

    Creates a service with a reduced vocabulary, queries a few topics at each
    difficulty, and prints the results plus service metadata.
    """
    print("π§ͺ Testing Unified Word Service")
    print("=" * 50)

    # Smaller vocabulary keeps the smoke test fast.
    service = await create_unified_word_service(vocab_size_limit=50000)

    if not service:
        print("β Failed to create service")
        return

    test_topics = ["animal", "science", "technology"]

    for topic in test_topics:
        print(f"\nπ― Testing topic: '{topic}'")
        print("-" * 30)

        for difficulty in ["easy", "medium", "hard"]:
            words = await service.find_similar_words(topic, difficulty, max_words=5)

            print(f" {difficulty.capitalize()}: {len(words)} words")
            for word_data in words:
                word = word_data['word']
                # Tier may be absent from backend-shaped dicts.
                tier = word_data.get('tier', 'unknown')
                print(f" {word:<12} ({tier})")

    print(f"\nπ Service Info:")
    print(f" Vocabulary size: {service.get_vocabulary_size():,}")
    print(f" Tier info: {service.get_tier_info()}")

    print("\n✅ Test completed!")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running this module directly as a standalone smoke test;
    # asyncio is imported lazily since it is only needed for the test entry point.
    import asyncio
    asyncio.run(main())