"""
Vector similarity search service using sentence-transformers and FAISS.

This implements true AI word generation via vector-space nearest-neighbor search.
"""

import hashlib
import logging
import os
import pickle
import random
import time
from typing import Any, Dict, List

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

logger = logging.getLogger(__name__)


class VectorSearchService:
    """
    Service for finding semantically similar words using vector similarity search.

    This replaces the old approach of filtering static word lists with true
    vector-space search through the model's full vocabulary.
    """

    def __init__(self):
        self.model = None
        self.vocab = None
        self.word_embeddings = None
        self.faiss_index = None
        self.is_initialized = False

        # Core search configuration, overridable via environment variables.
        self.model_name = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-mpnet-base-v2")
        self.base_similarity_threshold = float(os.getenv("WORD_SIMILARITY_THRESHOLD", "0.55"))
        self.min_similarity_threshold = 0.45
        self.max_results = 40
        self.use_hierarchical_search = os.getenv("USE_HIERARCHICAL_SEARCH", "true").lower() == "true"

        # Optional persistent word cache; wired up in initialize().
        self.cache_manager = None

        # Per-topic memory of recently served words, used to vary puzzles.
        self.used_words_by_topic = {}
        self.max_used_words_per_topic = int(os.getenv("MAX_USED_WORDS_MEMORY", "50"))

        # Words that should never appear as answers.
        self.excluded_words = self._load_excluded_words()

        # Cache paths are keyed by a short hash of the model configuration so
        # that switching models invalidates stale indexes.
        self.index_cache_dir = self._get_index_cache_dir()
        self.vocab_cache_path = os.path.join(self.index_cache_dir, f"vocab_{self._get_model_hash()}.pkl")
        self.embeddings_cache_path = os.path.join(self.index_cache_dir, f"embeddings_{self._get_model_hash()}.npy")
        self.faiss_cache_path = os.path.join(self.index_cache_dir, f"faiss_index_{self._get_model_hash()}.faiss")

    async def initialize(self):
        """Initialize the vector search service."""
        try:
            start_time = time.time()

            logger.info("🔧 Environment Configuration:")
            logger.info(f"   📊 Model: {self.model_name}")
            logger.info(f"   🎯 Base Similarity Threshold: {self.base_similarity_threshold}")
            logger.info(f"   📉 Min Similarity Threshold: {self.min_similarity_threshold}")
            logger.info(f"   📈 Max Results: {self.max_results}")
            logger.info(f"   🌟 Hierarchical Search: {self.use_hierarchical_search}")
            logger.info(f"   🔀 Search Randomness: {os.getenv('SEARCH_RANDOMNESS', '0.02')}")
            logger.info(f"   💾 Cache Dir: {os.getenv('WORD_CACHE_DIR', 'auto-detect')}")

            logger.info(f"🔧 Loading model: {self.model_name}")

            model_start = time.time()
            self.model = SentenceTransformer(self.model_name)
            model_time = time.time() - model_start
            logger.info(f"✅ Model loaded in {model_time:.2f}s: {self.model_name}")

            if self._load_cached_index():
                logger.info("🚀 Using cached FAISS index - startup accelerated!")
            else:
                logger.info("🔨 Building FAISS index from scratch...")

                # Pull the raw vocabulary from the model's tokenizer.
                vocab_start = time.time()
                tokenizer = self.model.tokenizer
                vocab_dict = tokenizer.get_vocab()

                self.vocab = self._filter_vocabulary(vocab_dict)
                vocab_time = time.time() - vocab_start
                logger.info(f"📚 Filtered vocabulary in {vocab_time:.2f}s: {len(self.vocab)} words")

                embedding_start = time.time()
                logger.info("🔄 Starting embedding generation...")
                await self._build_embeddings_index()
                embedding_time = time.time() - embedding_start
                logger.info(f"🔄 Embeddings built in {embedding_time:.2f}s")

                self._save_index_to_cache()

            cache_start = time.time()
            logger.info("📦 Initializing word cache manager...")
            try:
                from .word_cache import WordCacheManager
                self.cache_manager = WordCacheManager()
                await self.cache_manager.initialize()
                cache_time = time.time() - cache_start
                logger.info(f"📦 Cache manager initialized in {cache_time:.2f}s")
            except Exception as e:
                cache_time = time.time() - cache_start
                logger.info(f"⚠️ Cache manager initialization failed in {cache_time:.2f}s: {e}")
                logger.info("📝 Continuing without persistent caching (in-memory only)")
                self.cache_manager = None

            self.is_initialized = True
            total_time = time.time() - start_time
            logger.info(f"✅ Vector search service fully initialized in {total_time:.2f}s")

        except Exception as e:
            logger.error(f"❌ Failed to initialize vector search: {e}")
            self.is_initialized = False
            raise

    def _filter_vocabulary(self, vocab_dict: Dict[str, int]) -> List[str]:
        """Filter vocabulary to keep only crossword-suitable words."""
        logger.info(f"📚 Filtering {len(vocab_dict)} vocabulary words...")

        # Common stop words and overly generic terms that make poor answers.
        excluded_words = {
            'THE', 'AND', 'FOR', 'ARE', 'BUT', 'NOT', 'YOU', 'ALL', 'THIS', 'THAT',
            'WITH', 'FROM', 'THEY', 'WERE', 'BEEN', 'HAVE', 'THEIR', 'SAID', 'EACH',
            'WHICH', 'WHAT', 'THERE', 'WILL', 'MORE', 'WHEN', 'SOME', 'LIKE', 'INTO',
            'TIME', 'VERY', 'ONLY', 'HAS', 'HAD', 'WHO', 'OIL', 'ITS', 'NOW', 'FIND',
            'LONG', 'DOWN', 'DAY', 'DID', 'GET', 'COME', 'MADE', 'MAY', 'PART',
            'ANIMAL', 'ANIMALS', 'CREATURE', 'CREATURES', 'BEAST', 'BEASTS', 'THING', 'THINGS'
        }

        filtered = []
        processed = 0

        for word, _ in vocab_dict.items():
            processed += 1

            if processed % 10000 == 0:
                logger.info(f"📊 Vocabulary filtering progress: {processed}/{len(vocab_dict)}")

            # Strip WordPiece continuation markers and normalize case.
            if word.startswith('##'):
                clean_word = word[2:].upper()
            else:
                clean_word = word.upper()

            # Keep only words of crossword-friendly length.
            if len(clean_word) < 3 or len(clean_word) > 12:
                continue

            # Alphabetic words only (no digits or punctuation).
            if not clean_word.isalpha():
                continue

            # Skip special tokens such as [CLS] or <pad>.
            if clean_word.startswith(('[', '<')):
                continue

            if clean_word in excluded_words:
                continue

            if self._is_plural(clean_word) or self._is_boring_word(clean_word):
                continue

            filtered.append(clean_word)

        unique_filtered = sorted(set(filtered))
        logger.info(f"📚 Vocabulary filtered: {len(vocab_dict)} → {len(unique_filtered)} words")

        return unique_filtered

    def _is_plural(self, word: str) -> bool:
        """Check if word is likely a plural."""
        if len(word) < 4:
            return False
        return (
            word.endswith('S') and not word.endswith('SS') and
            not word.endswith('US') and not word.endswith('IS')
        )
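
    # Examples of the heuristic above: "CATS" -> True, while "GLASS" (SS),
    # "CACTUS" (US), "IRIS" (IS) and any word under four letters -> False.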

    def _is_boring_word(self, word: str) -> bool:
        """Check if word is boring or too generic for crosswords."""
        boring_patterns = [
            # Long derived forms tend to make weak crossword answers.
            word.endswith('ING') and len(word) > 6,
            word.endswith('TION') and len(word) > 7,
            word.endswith('NESS') and len(word) > 6,
            # Extremely common short words.
            word in ['GET', 'GOT', 'PUT', 'SET', 'LET', 'RUN', 'CUT', 'HIT', 'SIT', 'WIN',
                     'BIG', 'NEW', 'OLD', 'BAD', 'GOOD', 'BEST', 'LAST', 'NEXT', 'REAL']
        ]
        return any(boring_patterns)

    async def _build_embeddings_index(self):
        """Build FAISS index with pre-computed embeddings for all vocabulary."""
        logger.info("🔨 Building embeddings index...")

        cpu_count = os.cpu_count() or 1
        # Larger outer batches on multi-core machines, but never zero.
        batch_size = max(1, min(200 if cpu_count > 2 else 100, len(self.vocab) // 4))
        logger.info(f"⚡ Using batch size {batch_size} with {cpu_count} CPUs")

        embeddings_list = []
        total_batches = (len(self.vocab) + batch_size - 1) // batch_size

        for i in range(0, len(self.vocab), batch_size):
            batch = self.vocab[i:i + batch_size]
            batch_num = i // batch_size + 1

            batch_embeddings = self.model.encode(
                batch,
                convert_to_numpy=True,
                show_progress_bar=False,
                batch_size=min(32, len(batch)),
                normalize_embeddings=False
            )
            embeddings_list.append(batch_embeddings)

            # Log roughly every 10% of batches.
            if batch_num % max(1, total_batches // 10) == 0:
                progress = (batch_num / total_batches) * 100
                logger.info(f"📊 Embedding progress: {progress:.1f}% ({i}/{len(self.vocab)} words)")

        logger.info("🔗 Combining embeddings...")
        self.word_embeddings = np.vstack(embeddings_list)
        logger.info(f"📈 Generated embeddings shape: {self.word_embeddings.shape}")

        logger.info("🏗️ Building FAISS index...")
        dimension = self.word_embeddings.shape[1]
        # IndexFlatIP does exact inner-product search; on L2-normalized vectors
        # the inner product equals cosine similarity.
        self.faiss_index = faiss.IndexFlatIP(dimension)

        logger.info("📏 Normalizing embeddings for cosine similarity...")
        faiss.normalize_L2(self.word_embeddings)

        logger.info("📥 Adding embeddings to FAISS index...")
        self.faiss_index.add(self.word_embeddings)

        logger.info(f"🔍 FAISS index built with {self.faiss_index.ntotal} vectors")
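
    # Scale sketch (rough estimate, not measured): with ~20k kept words and the
    # 768-dimensional all-mpnet-base-v2 model, float32 embeddings occupy about
    # 20_000 * 768 * 4 bytes ≈ 59 MB, and IndexFlatIP holds a copy of the same size.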

    async def find_similar_words(
        self,
        topic: str,
        difficulty: str = "medium",
        max_words: int = 15
    ) -> List[Dict[str, Any]]:
        """
        Find words similar to the given topic using vector similarity search.

        This is the core function that replaces embedding filtering with true
        vector-space nearest-neighbor search.
        """
        logger.info(f"🔍 Starting word search for topic: '{topic}', difficulty: '{difficulty}', max_words: {max_words}")
        logger.info(f"🤖 Vector search initialized: {self.is_initialized}")

        if not self.is_initialized:
            logger.warning("🔄 Vector search not initialized, using cached fallback")
            return await self._get_cached_fallback(topic, difficulty, max_words)

        try:
            if self.use_hierarchical_search:
                logger.info("🌟 Using hierarchical semantic search for enhanced word generation")

                all_candidates = await self._hierarchical_search(topic, difficulty, max_words)

                if all_candidates:
                    # Over-fetch, then filter, so exclusions don't starve the result.
                    combined_results = self._combine_hierarchical_results(all_candidates, max_words * 2)

                    combined_results = self._apply_word_exclusions(combined_results)

                    # Drop words served recently for this topic.
                    similar_words = self._filter_used_words(combined_results, topic)

                    similar_words = similar_words[:max_words]

                    logger.info(f"🎯 Hierarchical search generated {len(similar_words)} words for '{topic}' (after variety filtering)")

                    if similar_words:
                        self._track_used_words(topic, similar_words)

                    if similar_words:
                        await self._cache_successful_search(topic, difficulty, similar_words)
                else:
                    similar_words = []
                    logger.warning(f"⚠️ Hierarchical search found no candidates for '{topic}'")
            else:
                logger.info("🔍 Using traditional single-search approach")
                traditional_results = await self._traditional_single_search(topic, difficulty, max_words * 2)

                traditional_results = self._apply_word_exclusions(traditional_results)

                similar_words = self._filter_used_words(traditional_results, topic)
                similar_words = similar_words[:max_words]

                if similar_words:
                    self._track_used_words(topic, similar_words)

            # Top up from the cache if the search came back short.
            if len(similar_words) < max_words * 0.75:
                cached_supplement = await self._get_cached_fallback(
                    topic, difficulty, max_words - len(similar_words)
                )
                similar_words.extend(cached_supplement)
                logger.info(f"🔄 Supplemented with {len(cached_supplement)} cached words")

            # Last resort: hard-coded bootstrap words.
            if len(similar_words) < max_words // 2:
                emergency_words = self._get_emergency_bootstrap(
                    topic, difficulty, max_words - len(similar_words)
                )
                similar_words.extend(emergency_words)
                logger.info(f"🆘 Added {len(emergency_words)} emergency bootstrap words")

            return similar_words[:max_words]

        except Exception as e:
            logger.error(f"❌ Vector search failed for '{topic}': {e}")

            cached_words = await self._get_cached_fallback(topic, difficulty, max_words)
            if cached_words:
                return cached_words

            logger.warning(f"⚠️ No cached words available, using emergency bootstrap for '{topic}'")
            return self._get_emergency_bootstrap(topic, difficulty, max_words)

    def _matches_difficulty(self, word: str, difficulty: str) -> bool:
        """Check if word matches difficulty criteria."""
        difficulty_map = {
            "easy": {"min_len": 3, "max_len": 8},
            "medium": {"min_len": 4, "max_len": 10},
            "hard": {"min_len": 5, "max_len": 15}
        }

        criteria = difficulty_map.get(difficulty, difficulty_map["medium"])
        return criteria["min_len"] <= len(word) <= criteria["max_len"]
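
    # Example: "TIGER" (5 letters) satisfies all three tiers, while "CAT"
    # (3 letters) passes only "easy"; unknown difficulty names fall back to
    # the "medium" bounds.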

    def _generate_clue(self, word: str, topic: str) -> str:
        """Generate a simple clue for the word."""
        clue_templates = {
            "Animals": f"{word.lower()} (animal)",
            "Technology": f"{word.lower()} (tech term)",
            "Science": f"{word.lower()} (scientific term)",
            "Geography": f"{word.lower()} (geographic feature)"
        }

        return clue_templates.get(topic, f"{word.lower()} (related to {topic.lower()})")

    def _is_interesting_word(self, word: str, topic: str) -> bool:
        """Check if word is interesting enough for crosswords."""
        topic_lower = topic.lower()
        word_lower = word.lower()

        # Never offer the topic itself as an answer.
        if word_lower == topic_lower:
            return False

        # Whitelist a few short domain terms that would otherwise be rejected.
        if len(word_lower) >= 4:
            if topic_lower in ['technology', 'tech'] and word_lower in ['tech', 'ict']:
                return True
            if topic_lower in ['animals', 'animal'] and word_lower in ['animal', 'mammal']:
                return True

        # Reject short fragments of the topic word.
        if word_lower in topic_lower and len(word_lower) < 4:
            return False

        # Reject near-synonyms of the topic that give the answer away.
        if topic_lower == 'animals':
            obvious_animals = ['mammal', 'mammals', 'wildlife', 'organism', 'organisms', 'livestock']
            if word_lower in obvious_animals:
                return False

        # Reject long abstract derivations of bland verbs.
        truly_abstract_endings = ['tion', 'ness', 'ity', 'ism']
        if any(word_lower.endswith(ending) for ending in truly_abstract_endings) and len(word) > 9:
            abstract_prefixes = ['develop', 'manage', 'establish', 'improve', 'achieve']
            if any(word_lower.startswith(prefix) for prefix in abstract_prefixes):
                return False

        return True

    def _track_used_words(self, topic: str, words: List[Dict[str, Any]]):
        """Track words used for this topic to avoid repetition in future puzzles."""
        topic_key = topic.lower()

        if topic_key not in self.used_words_by_topic:
            self.used_words_by_topic[topic_key] = set()

        new_words = [w['word'].upper() for w in words]
        self.used_words_by_topic[topic_key].update(new_words)

        # Cap the memory per topic; note that sets are unordered, so the kept
        # subset is arbitrary rather than strictly the most recent words.
        if len(self.used_words_by_topic[topic_key]) > self.max_used_words_per_topic:
            used_list = list(self.used_words_by_topic[topic_key])
            self.used_words_by_topic[topic_key] = set(used_list[-self.max_used_words_per_topic:])

        logger.info(f"📝 Tracking {len(new_words)} words for '{topic}' (total remembered: {len(self.used_words_by_topic[topic_key])})")

    def _get_used_words_for_topic(self, topic: str) -> set:
        """Get the set of words already used for this topic."""
        topic_key = topic.lower()
        return self.used_words_by_topic.get(topic_key, set())

    def _filter_used_words(self, candidates: List[Dict[str, Any]], topic: str) -> List[Dict[str, Any]]:
        """Filter out words that have been used recently for this topic."""
        if not candidates:
            return candidates

        used_words = self._get_used_words_for_topic(topic)
        if not used_words:
            return candidates

        filtered = []
        filtered_out = []

        for candidate in candidates:
            word = candidate['word'].upper()
            if word not in used_words:
                filtered.append(candidate)
            else:
                filtered_out.append(word)

        if filtered_out:
            logger.info(f"🚫 Filtered out {len(filtered_out)} previously used words for '{topic}': {filtered_out[:5]}{'...' if len(filtered_out) > 5 else ''}")

        logger.info(f"🔄 Word variety filter: {len(candidates)} → {len(filtered)} candidates")
        return filtered

    def _expand_topic_variations(self, topic: str) -> List[str]:
        """
        Expand topic to include singular/plural variations for better semantic coverage.

        Examples:
        - "Animal" → ["Animal", "Animals"]
        - "Animals" → ["Animals", "Animal"]
        - "Technology" → ["Technology", "Technologies"]
        """
        variations = [topic]

        topic_lower = topic.lower()

        if topic_lower.endswith('s') and len(topic) > 3:
            # Looks plural: derive the singular form.
            if topic_lower.endswith('ies'):
                # "Technologies" → "Technology"
                singular = topic[:-3] + 'y'
            elif topic_lower.endswith(('sses', 'shes', 'ches', 'xes')):
                # "Classes" → "Class", "Brushes" → "Brush"
                singular = topic[:-2]
            elif topic_lower.endswith('es') and len(topic) > 4:
                # "Planes" → "Plane"
                singular = topic[:-1]
            elif topic_lower.endswith('s'):
                # "Animals" → "Animal"
                singular = topic[:-1]
            else:
                singular = topic

            if singular != topic and len(singular) >= 3:
                variations.append(singular)
        else:
            # Looks singular: derive the plural form.
            if topic_lower.endswith('y') and topic_lower[-2] not in 'aeiou':
                # "Technology" → "Technologies"
                plural = topic[:-1] + 'ies'
            elif topic_lower.endswith(('s', 'sh', 'ch', 'x', 'z')):
                # "Brush" → "Brushes"
                plural = topic + 'es'
            else:
                # "Animal" → "Animals"
                plural = topic + 's'

            variations.append(plural)

        # Deduplicate while preserving order.
        unique_variations = []
        for variation in variations:
            if variation not in unique_variations:
                unique_variations.append(variation)

        logger.info(f"🔄 Topic variations for '{topic}': {unique_variations}")
        return unique_variations

    def _identify_subcategories(self, candidates: List[Dict[str, Any]], main_topic: str) -> List[str]:
        """
        Identify which candidate words are likely sub-categories for hierarchical search.

        Args:
            candidates: List of word candidates with similarity scores
            main_topic: The original topic being searched

        Returns:
            List of subcategory words suitable for secondary search
        """
        subcategories = []
        main_topic_lower = main_topic.lower()

        # Suffix patterns that often mark category-level words.
        category_patterns = {
            # Academic fields ("zoology", "physics", ...)
            'academic': ['logy', 'ics', 'ism', 'ology'],
            # Adjective forms ("zoological", "robotic", ...)
            'adjective': ['logical', 'ical', 'tic', 'ian', 'nal', 'ous'],
            # Collective nouns ("wildlife", "livestock", "software", ...)
            'collective': ['life', 'stock', 'ware', 'kind', 'type', 'group'],
            # General qualifiers.
            'general': ['wild', 'domestic', 'marine', 'land', 'air', 'water']
        }

        # Hand-curated subcategories for common topics.
        known_categories = {
            'animal': ['wildlife', 'livestock', 'mammal', 'mammalian', 'fauna', 'zoology', 'zoological',
                       'vertebrate', 'invertebrate', 'reptile', 'amphibian', 'primate', 'rodent',
                       'carnivore', 'herbivore', 'omnivore', 'predator', 'prey'],
            'technology': ['software', 'hardware', 'digital', 'electronic', 'computing', 'internet',
                           'mobile', 'wireless', 'networking', 'cybernetic', 'robotic', 'automated'],
            'science': ['physics', 'chemistry', 'biology', 'astronomy', 'geology', 'mathematics',
                        'theoretical', 'experimental', 'applied', 'quantum', 'molecular', 'atomic'],
            'geography': ['continental', 'coastal', 'mountainous', 'desert', 'tropical', 'polar',
                          'urban', 'rural', 'geological', 'topographical', 'cartographic']
        }

        for candidate in candidates:
            word = candidate['word'].lower()
            similarity = candidate['similarity']

            # Only reasonably related words can serve as subcategories.
            if similarity < 0.45:
                continue

            is_subcategory = False

            # Check the curated list first.
            topic_categories = known_categories.get(main_topic_lower, [])
            if word in topic_categories:
                is_subcategory = True
                logger.info(f"🔍 '{word.upper()}' identified as known subcategory for '{main_topic}'")

            # Fall back to suffix-pattern matching.
            if not is_subcategory:
                for pattern_type, patterns in category_patterns.items():
                    for pattern in patterns:
                        if word.endswith(pattern):
                            is_subcategory = True
                            logger.info(f"🔍 '{word.upper()}' identified as subcategory (pattern: {pattern})")
                            break
                    if is_subcategory:
                        break

            # Last heuristic: longer abstract nouns can act as category words.
            if not is_subcategory:
                if (len(word) >= 6 and
                        word.count('i') + word.count('o') >= 2 and
                        not word.isupper() and
                        word.isalpha()):

                    if any(word.endswith(ending) for ending in ['ism', 'ity', 'ness', 'tion', 'sion']):
                        is_subcategory = True
                        logger.info(f"🔍 '{word.upper()}' identified as subcategory (abstract concept)")

            if is_subcategory and word.upper() not in subcategories:
                subcategories.append(word.upper())

        # Cap the number of secondary searches.
        max_subcategories = 5
        limited_subcategories = subcategories[:max_subcategories]

        if limited_subcategories:
            logger.info(f"🌳 Identified {len(limited_subcategories)} subcategories for '{main_topic}': {limited_subcategories}")
        else:
            logger.info(f"🌳 No suitable subcategories found for '{main_topic}'")

        return limited_subcategories

    async def _hierarchical_search(
        self,
        topic: str,
        difficulty: str,
        max_words: int
    ) -> List[Dict[str, Any]]:
        """
        Perform hierarchical semantic search using topic variations and subcategories.

        Search strategy:
        1. Search for topic variations (singular/plural)
        2. Identify subcategories from initial results
        3. Search subcategories for more specific words
        4. Combine and weight all results
        """
        all_candidates = []

        topic_variations = self._expand_topic_variations(topic)

        logger.info(f"🌟 Starting hierarchical search for '{topic}' with {len(topic_variations)} variations")

        # Step 1: search each singular/plural variation of the main topic.
        main_topic_candidates = []
        for variation in topic_variations:
            logger.info(f"🔍 Searching topic variation: '{variation}'")

            topic_embedding = self.model.encode([variation], convert_to_numpy=True)

            # Optionally jitter the query vector so repeated searches vary.
            noise_factor = float(os.getenv("SEARCH_RANDOMNESS", "0.02"))
            if noise_factor > 0:
                try:
                    noise = np.random.normal(0, noise_factor, topic_embedding.shape)
                    topic_embedding = topic_embedding + noise
                except Exception:
                    pass

            # FAISS expects contiguous float32; normalize for cosine similarity.
            topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
            faiss.normalize_L2(topic_embedding)

            search_size = min(self.max_results * 3, 100)
            scores, indices = self.faiss_index.search(topic_embedding, search_size)

            variation_candidates = self._collect_candidates_with_threshold(
                scores, indices, self.base_similarity_threshold, variation, difficulty
            )

            # The exact topic keeps full weight; derived variations are damped slightly.
            weight = 1.0 if variation == topic else 0.9
            for candidate in variation_candidates:
                candidate['similarity'] *= weight
                candidate['search_source'] = f"main_topic:{variation}"

            main_topic_candidates.extend(variation_candidates)

        if len(main_topic_candidates) <= 10:
            logger.info(f"🔍 Main topic search found candidates: {main_topic_candidates}")
        logger.info(f"🔍 Main topic search found {len(main_topic_candidates)} candidates")

        # Steps 2-3: mine the main results for subcategories and search those too.
        if main_topic_candidates:
            main_topic_candidates.sort(key=lambda x: x['similarity'], reverse=True)
            subcategories = self._identify_subcategories(main_topic_candidates, topic)

            subcategory_candidates = []
            for subcategory in subcategories:
                logger.info(f"🌳 Searching subcategory: '{subcategory}'")

                try:
                    subcat_embedding = self.model.encode([subcategory], convert_to_numpy=True)
                    subcat_embedding = np.ascontiguousarray(subcat_embedding, dtype=np.float32)
                    faiss.normalize_L2(subcat_embedding)

                    sub_search_size = min(self.max_results * 2, 60)
                    sub_scores, sub_indices = self.faiss_index.search(subcat_embedding, sub_search_size)

                    # Subcategory searches use a slightly relaxed threshold.
                    sub_threshold = max(self.base_similarity_threshold - 0.05, self.min_similarity_threshold)
                    sub_candidates = self._collect_candidates_with_threshold(
                        sub_scores, sub_indices, sub_threshold, subcategory, difficulty
                    )

                    # Damp subcategory hits so they never outrank direct matches.
                    for candidate in sub_candidates:
                        candidate['similarity'] *= 0.8
                        candidate['search_source'] = f"subcategory:{subcategory}"

                    subcategory_candidates.extend(sub_candidates)
                    logger.info(f"🌳 Subcategory '{subcategory}' found {len(sub_candidates)} candidates")

                except Exception as e:
                    logger.warning(f"⚠️ Failed to search subcategory '{subcategory}': {e}")
                    continue

            logger.info(f"🌳 Subcategory search found {len(subcategory_candidates)} additional candidates")
        else:
            subcategory_candidates = []

        # Step 4: hand everything to the combiner for weighting and deduplication.
        all_candidates = main_topic_candidates + subcategory_candidates

        logger.info(f"🔗 Total candidates before deduplication: {len(all_candidates)}")

        return all_candidates
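
    # Weighting sketch: a word found via the exact topic keeps its raw cosine
    # score (×1.0), one found via a singular/plural variation is damped ×0.9,
    # and one found via a subcategory search ×0.8, so broader matches never
    # outrank direct matches of equal raw similarity.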

    async def _traditional_single_search(
        self,
        topic: str,
        difficulty: str,
        max_words: int
    ) -> List[Dict[str, Any]]:
        """
        Traditional single-topic search approach (original implementation).
        Kept as a fallback option for compatibility.
        """
        topic_embedding = self.model.encode([topic], convert_to_numpy=True)

        # Optionally jitter the query vector so repeated searches vary.
        noise_factor = float(os.getenv("SEARCH_RANDOMNESS", "0.02"))
        if noise_factor > 0:
            try:
                noise = np.random.normal(0, noise_factor, topic_embedding.shape)
                topic_embedding = topic_embedding + noise
            except Exception:
                pass

        topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
        faiss.normalize_L2(topic_embedding)

        # Over-fetch so the threshold ladder below has room to work.
        search_size = min(self.max_results * 6, 150)
        scores, indices = self.faiss_index.search(topic_embedding, search_size)

        logger.info(f"🔍 FAISS search returned {len(scores[0])} results")
        logger.info(f"🔍 Top 5 scores: {scores[0][:5]}")

        top_words_with_scores = []
        for score, idx in zip(scores[0][:10], indices[0][:10]):
            word = self.vocab[idx]
            top_words_with_scores.append(f"{word}({score:.3f})")

        logger.info(f"🔍 Top 10 FAISS words: {', '.join(top_words_with_scores)}")

        # Relax the threshold step by step until enough candidates pass.
        candidates = []
        thresholds_to_try = [
            self.base_similarity_threshold,
            max(self.base_similarity_threshold - 0.05, self.min_similarity_threshold),
            max(self.base_similarity_threshold - 0.10, self.min_similarity_threshold),
            self.min_similarity_threshold
        ]

        for threshold in thresholds_to_try:
            logger.info(f"🎯 Trying threshold: {threshold}")
            candidates = self._collect_candidates_with_threshold(scores, indices, threshold, topic, difficulty)
            logger.info(f"🔍 Found {len(candidates)} candidates with threshold {threshold}")

            if len(candidates) >= max_words * 0.75:
                logger.info(f"✅ Sufficient words found with threshold {threshold}")
                break
            elif len(candidates) >= max_words // 2:
                logger.info(f"⚡ Acceptable words found with threshold {threshold}")
                break

        # Add variety: weighted sampling when there is a surplus of candidates.
        if len(candidates) > max_words * 2:
            similar_words = self._weighted_random_selection(candidates, max_words)
        else:
            random.shuffle(candidates)
            similar_words = candidates[:max_words]

        logger.info(f"🎯 Traditional search found {len(similar_words)} words for '{topic}'")

        if similar_words:
            await self._cache_successful_search(topic, difficulty, similar_words)

        return similar_words

    def _combine_hierarchical_results(
        self,
        all_candidates: List[Dict[str, Any]],
        max_words: int
    ) -> List[Dict[str, Any]]:
        """
        Intelligently combine and deduplicate results from hierarchical search.

        Strategy:
        1. Remove duplicates while preserving best similarity scores
        2. Apply source-based weighting (main topic > subcategories)
        3. Ensure diverse representation from different search sources
        4. Apply adaptive threshold filtering
        """
        if not all_candidates:
            return []

        # Step 1: deduplicate, keeping each word's best-scoring occurrence.
        word_best_scores = {}
        for candidate in all_candidates:
            word = candidate['word'].upper()
            similarity = candidate['similarity']

            if word not in word_best_scores or similarity > word_best_scores[word]['similarity']:
                candidate_copy = candidate.copy()
                candidate_copy['word'] = word
                word_best_scores[word] = candidate_copy

        deduplicated = list(word_best_scores.values())
        logger.info(f"🔗 After strict deduplication: {len(all_candidates)} → {len(deduplicated)} unique words")

        # Step 2: bucket by quality, then shuffle within each tier so
        # equal-quality words appear in varying order across puzzles.
        high_quality = [w for w in deduplicated if w['similarity'] >= self.base_similarity_threshold]
        medium_quality = [w for w in deduplicated if self.base_similarity_threshold - 0.1 <= w['similarity'] < self.base_similarity_threshold]
        lower_quality = [w for w in deduplicated if w['similarity'] < self.base_similarity_threshold - 0.1]

        random.shuffle(high_quality)
        random.shuffle(medium_quality)
        random.shuffle(lower_quality)

        deduplicated = high_quality + medium_quality + lower_quality

        logger.info(f"🎲 Randomized within quality tiers: {len(high_quality)} high, {len(medium_quality)} medium, {len(lower_quality)} lower")

        # Step 3: relax the threshold step by step until enough candidates pass.
        thresholds_to_try = [
            self.base_similarity_threshold,
            max(self.base_similarity_threshold - 0.05, self.min_similarity_threshold),
            max(self.base_similarity_threshold - 0.10, self.min_similarity_threshold),
            self.min_similarity_threshold
        ]

        final_candidates = []
        for threshold in thresholds_to_try:
            filtered_candidates = [c for c in deduplicated if c['similarity'] >= threshold]

            logger.info(f"🎯 Hierarchical threshold {threshold}: {len(filtered_candidates)} candidates")

            if len(filtered_candidates) >= max_words * 0.75:
                final_candidates = filtered_candidates
                logger.info(f"✅ Sufficient words found with hierarchical threshold {threshold}")
                break
            elif len(filtered_candidates) >= max_words // 2:
                final_candidates = filtered_candidates
                logger.info(f"⚡ Acceptable words found with hierarchical threshold {threshold}")
                break

        if not final_candidates:
            final_candidates = deduplicated

        # Step 4: balance the final pick across search sources.
        final_selection = self._ensure_source_diversity(final_candidates, max_words)

        logger.info(f"🏆 Final hierarchical selection: {len(final_selection)} words")

        # Log where the selected words came from.
        source_counts = {}
        for candidate in final_selection:
            source = candidate.get('search_source', 'unknown')
            source_counts[source] = source_counts.get(source, 0) + 1

        logger.info(f"📊 Source distribution: {source_counts}")

        return final_selection

    def _ensure_source_diversity(
        self,
        candidates: List[Dict[str, Any]],
        max_words: int
    ) -> List[Dict[str, Any]]:
        """
        Balance word selection across different search sources for optimal variety.

        Allocates selection quotas to ensure representation from main-topic
        searches and subcategory searches, preventing over-concentration from
        any single source while maintaining quality standards.

        Args:
            candidates: Word candidates with search-source metadata
            max_words: Target number of words to select

        Returns:
            Balanced selection ensuring source diversity
        """
        if len(candidates) <= max_words:
            return candidates

        # Group candidates by the search that produced them.
        source_groups = {}
        for candidate in candidates:
            source = candidate.get('search_source', 'unknown')
            if source not in source_groups:
                source_groups[source] = []
            source_groups[source].append(candidate)

        if len(source_groups) > 1:
            selected = []
            # Main-topic results get roughly two thirds of the slots.
            main_topic_quota = max_words * 2 // 3
            subcategory_quota = max_words - main_topic_quota

            main_sources = [k for k in source_groups.keys() if k.startswith('main_topic:')]
            for source in main_sources:
                quota = main_topic_quota // len(main_sources) if main_sources else 0
                selected.extend(source_groups[source][:quota])

            # Spread the remaining slots across subcategory sources.
            subcat_sources = [k for k in source_groups.keys() if k.startswith('subcategory:')]
            if subcat_sources and len(selected) < max_words:
                remaining_slots = max_words - len(selected)
                quota_per_subcat = max(1, remaining_slots // len(subcat_sources))

                for source in subcat_sources:
                    if len(selected) >= max_words:
                        break
                    selected.extend(source_groups[source][:quota_per_subcat])

            # Backfill from any source if quotas left slots unused.
            if len(selected) < max_words:
                used_words = {c['word'] for c in selected}
                remaining = [c for c in candidates if c['word'] not in used_words]
                needed = max_words - len(selected)
                selected.extend(remaining[:needed])

            return selected[:max_words]
        else:
            # Only one source: nothing to balance.
            return candidates[:max_words]
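
    # Example allocation for max_words=12 with two main-topic sources and two
    # subcategory sources: main quota = 12 * 2 // 3 = 8 (4 per main source),
    # then the remaining 4 slots split 2 + 2 across the subcategories, with
    # backfill from any source if a group cannot fill its quota.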

    def _get_index_cache_dir(self) -> str:
        """Get the directory for caching FAISS indexes."""
        # Inside a container (Docker or Hugging Face Spaces), default to /tmp.
        if os.path.exists("/.dockerenv") or os.getenv("SPACE_ID"):
            cache_dir = os.getenv("FAISS_CACHE_DIR", "/tmp/faiss_cache")
        else:
            cache_dir = os.getenv("FAISS_CACHE_DIR", "faiss_cache")

        os.makedirs(cache_dir, exist_ok=True)
        return cache_dir

    def _get_model_hash(self) -> str:
        """Generate a hash for the model configuration to use in cache keys."""
        # md5 is fine here: the hash is a cache key, not a security boundary.
        config_str = f"{self.model_name}_v2"
        return hashlib.md5(config_str.encode()).hexdigest()[:8]

    def _cache_exists(self) -> bool:
        """Check if all cached files exist."""
        return (os.path.exists(self.vocab_cache_path) and
                os.path.exists(self.embeddings_cache_path) and
                os.path.exists(self.faiss_cache_path))

    def _load_excluded_words(self) -> set:
        """Load list of words to exclude from crossword generation."""
        # Baseline exclusions: filler and meta words.
        default_excluded = {
            "WORD", "THING", "STUFF", "ITEMS", "THINGS", "WORDS", "TEXT", "STRING",
            "DATA", "INFO", "CONTENT", "MATERIAL", "ELEMENT", "OBJECT", "ENTITY",
            "CONCEPT", "IDEA", "NOTION", "ABSTRACT", "GENERAL", "SPECIFIC", "VARIOUS",
            "MULTIPLE", "SEVERAL", "MANY", "SOME", "MOST", "ALL", "EACH", "EVERY",
            "DIFFERENT", "SIMILAR", "SAME", "OTHER", "ANOTHER", "CERTAIN"
        }

        # Comma-separated additions from the environment.
        env_excluded = os.getenv("EXCLUDED_WORDS", "")
        if env_excluded:
            env_words = {word.strip().upper() for word in env_excluded.split(",") if word.strip()}
            default_excluded.update(env_words)

        # Optional exclusion file: one word per line, '#' starts a comment.
        exclusion_file = os.getenv("WORD_EXCLUSION_FILE", "")
        if exclusion_file and os.path.exists(exclusion_file):
            try:
                with open(exclusion_file, 'r') as f:
                    file_words = {line.strip().upper() for line in f
                                  if line.strip() and not line.strip().startswith('#')}
                default_excluded.update(file_words)
                logger.info(f"📋 Loaded {len(file_words)} additional excluded words from {exclusion_file}")
            except Exception as e:
                logger.warning(f"⚠️ Failed to load exclusion file {exclusion_file}: {e}")

        logger.info(f"🚫 Loaded {len(default_excluded)} excluded words for filtering")
        return default_excluded

    def _apply_word_exclusions(self, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Filter out excluded words from candidates."""
        if not candidates or not self.excluded_words:
            return candidates

        filtered = []
        excluded_count = 0

        for candidate in candidates:
            word = candidate['word'].upper()
            if word not in self.excluded_words:
                filtered.append(candidate)
            else:
                excluded_count += 1

        if excluded_count > 0:
            logger.info(f"🚫 Excluded {excluded_count} inappropriate words from results")

        return filtered

    def _load_cached_index(self) -> bool:
        """Load FAISS index from cache if available."""
        try:
            if not self._cache_exists():
                logger.info("📁 No cached index found - will build new index")
                return False

            logger.info("📁 Loading cached FAISS index...")
            cache_start = time.time()

            with open(self.vocab_cache_path, 'rb') as f:
                self.vocab = pickle.load(f)
            logger.info(f"📚 Loaded {len(self.vocab)} vocabulary words from cache")

            self.word_embeddings = np.load(self.embeddings_cache_path)
            logger.info(f"📈 Loaded embeddings shape: {self.word_embeddings.shape}")

            self.faiss_index = faiss.read_index(self.faiss_cache_path)
            logger.info(f"🔍 Loaded FAISS index with {self.faiss_index.ntotal} vectors")

            cache_time = time.time() - cache_start
            logger.info(f"✅ Successfully loaded cached index in {cache_time:.2f}s")
            return True

        except Exception as e:
            logger.info(f"❌ Failed to load cached index: {e}")
            logger.info("🔄 Will rebuild index from scratch")
            return False

    def _save_index_to_cache(self):
        """Save the built FAISS index to cache for future use."""
        try:
            logger.info("💾 Saving FAISS index to cache...")
            save_start = time.time()

            with open(self.vocab_cache_path, 'wb') as f:
                pickle.dump(self.vocab, f)

            np.save(self.embeddings_cache_path, self.word_embeddings)

            faiss.write_index(self.faiss_index, self.faiss_cache_path)

            save_time = time.time() - save_start
            logger.info(f"✅ Index cached successfully in {save_time:.2f}s")
            logger.info(f"📁 Cache location: {self.index_cache_dir}")

        except Exception as e:
            logger.info(f"⚠️ Failed to cache index: {e}")
            logger.info("📝 Continuing without caching (the next startup will be slower)")

    def _is_topic_relevant(self, word: str, topic: str) -> bool:
        """
        Enhanced topic-relevance check to prevent unrelated words.
        This is an additional filter beyond similarity scores.
        """
        word_lower = word.lower()
        topic_lower = topic.lower()

        if topic_lower in ['animals', 'animal']:
            # Vocabulary that suggests a word really is animal-related.
            # (Currently informational only; the negative checks below do the filtering.)
            animal_indicators = [
                'bird', 'fish', 'mammal', 'reptile', 'insect', 'creature', 'wild', 'domestic',
                'hunt', 'prey', 'pack', 'herd', 'flock', 'swarm', 'nest', 'den', 'habitat',
                'fur', 'feather', 'scale', 'claw', 'tail', 'wing', 'beak', 'hoof',
                'zoo', 'farm', 'forest', 'ocean', 'jungle', 'safari'
            ]

            tech_indicators = ['computer', 'software', 'digital', 'internet', 'mobile', 'app', 'code', 'data']
            if any(indicator in word_lower for indicator in tech_indicators):
                logger.info(f"🚫 Rejected '{word}' for {topic}: contains tech indicators")
                return False

        elif topic_lower in ['technology', 'tech']:
            animal_indicators = ['bird', 'fish', 'mammal', 'animal', 'creature', 'wild', 'fur', 'feather',
                                 'elephant', 'tiger', 'lion', 'bear', 'wolf', 'cat', 'dog', 'horse']
            if any(indicator in word_lower for indicator in animal_indicators):
                logger.info(f"🚫 Rejected '{word}' for {topic}: contains animal indicators")
                return False

        elif topic_lower in ['science', 'scientific']:
            casual_indicators = ['phone', 'app', 'game', 'fun', 'cool', 'awesome']
            if any(indicator in word_lower for indicator in casual_indicators):
                logger.info(f"🚫 Rejected '{word}' for {topic}: too casual for science")
                return False

        elif topic_lower in ['geography', 'geographic']:
            tech_indicators = ['software', 'computer', 'digital', 'code', 'app']
            if any(indicator in word_lower for indicator in tech_indicators):
                logger.info(f"🚫 Rejected '{word}' for {topic}: tech term in geography")
                return False

        # Generic meta-words are never relevant answers.
        meta_words = ['word', 'term', 'name', 'thing', 'stuff', 'item', 'object']
        if word_lower in meta_words:
            logger.info(f"🚫 Rejected '{word}': too generic/meta")
            return False

        if len(word) < 3:
            return False

        return True

    def _collect_candidates_with_threshold(
        self,
        scores: np.ndarray,
        indices: np.ndarray,
        threshold: float,
        topic: str,
        difficulty: str
    ) -> List[Dict[str, Any]]:
        """Collect word candidates using a specific similarity threshold."""
        candidates = []
        above_threshold = 0
        difficulty_passed = 0
        interesting_passed = 0
        rejected_words = []

        for score, idx in zip(scores[0], indices[0]):
            if score < threshold:
                continue
            above_threshold += 1

            word = self.vocab[idx]

            if self._matches_difficulty(word, difficulty):
                difficulty_passed += 1

                # Keep only words that are interesting for the topic (e.g. not
                # the topic itself or an obvious giveaway); the original check
                # appears to have been dropped, which left the "relevant" count
                # below equal to the difficulty count.
                if not self._is_interesting_word(word, topic):
                    rejected_words.append(f"{word}({score:.3f})")
                    continue

                interesting_passed += 1
                candidates.append({
                    "word": word,
                    "clue": self._generate_clue(word, topic),
                    "similarity": float(score),
                    "source": "vector_search"
                })
            else:
                rejected_words.append(f"{word}({score:.3f})")

        if rejected_words and len(rejected_words) <= 10:
            logger.info(f"🚫 Rejected words at threshold {threshold}: {', '.join(rejected_words[:5])}")
        elif rejected_words:
            logger.info(f"🚫 Rejected {len(rejected_words)} words at threshold {threshold} (showing first 5): {', '.join(rejected_words[:5])}")

        logger.info(f"🔍 Threshold {threshold}: {len(scores[0])} total → {above_threshold} above threshold → {difficulty_passed} difficulty OK → {interesting_passed} relevant → {len(candidates)} final")

        if candidates:
            passed_words = [f"{w['word']}({w['similarity']:.3f})" for w in candidates[:8]]
            logger.info(f"✅ Words passing threshold {threshold}: {', '.join(passed_words)}")

        return candidates

    def _weighted_random_selection(self, candidates: List[Dict[str, Any]], max_words: int) -> List[Dict[str, Any]]:
        """
        Weighted random selection that favors higher similarity scores but adds variety.

        This ensures we don't always get the exact same words, while still
        preferring high-quality matches.
        """
        if len(candidates) <= max_words:
            return candidates

        # Rank candidates by similarity, then split into quality tiers.
        candidates_sorted = sorted(candidates, key=lambda w: w["similarity"], reverse=True)

        # Tier 1: top 25% of candidates.
        tier1_size = max(1, len(candidates_sorted) // 4)
        tier1 = candidates_sorted[:tier1_size]

        # Tier 2: next 25%.
        tier2_size = max(1, len(candidates_sorted) // 4)
        tier2 = candidates_sorted[tier1_size:tier1_size + tier2_size]

        # Tier 3: next 35%.
        tier3_size = max(1, len(candidates_sorted) * 35 // 100)
        tier3 = candidates_sorted[tier1_size + tier2_size:tier1_size + tier2_size + tier3_size]

        # Tier 4: the remainder.
        tier4 = candidates_sorted[tier1_size + tier2_size + tier3_size:]

        selected = []

        # Guarantee that about a third of the picks come from the top tier.
        tier1_count = min(max_words // 3, len(tier1))
        selected.extend(random.sample(tier1, tier1_count))

        remaining_slots = max_words - len(selected)

        if remaining_slots > 0:
            # Build a weighted pool: higher tiers get proportionally more tickets.
            weighted_pool = []
            weighted_pool.extend([(w, 3) for w in tier2])
            weighted_pool.extend([(w, 2) for w in tier3])
            weighted_pool.extend([(w, 1) for w in tier4])

            # Unpicked tier-1 words re-enter with the highest weight.
            remaining_tier1 = [w for w in tier1 if w not in selected]
            weighted_pool.extend([(w, 4) for w in remaining_tier1])

            # Draw without replacement, one word at a time.
            for _ in range(remaining_slots):
                if not weighted_pool:
                    break

                # Expand each word into `weight` tickets and draw one.
                weighted_words = []
                for word, weight in weighted_pool:
                    weighted_words.extend([word] * weight)

                if weighted_words:
                    chosen = random.choice(weighted_words)
                    selected.append(chosen)

                    # Remove the chosen word from the pool entirely.
                    weighted_pool = [(w, wt) for w, wt in weighted_pool if w != chosen]

        random.shuffle(selected)

        logger.info(f"🎲 Weighted selection: {len(selected)} words from {len(candidates)} candidates")
        return selected[:max_words]
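
    # Example tier split for 100 candidates: tier 1 = top 25, tier 2 = next 25,
    # tier 3 = next 35, tier 4 = the last 15. For max_words = 15, five picks
    # come straight from tier 1, and the other ten are drawn with 4/3/2/1
    # ticket weights across the remaining pool.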

    async def _get_cached_fallback(
        self,
        topic: str,
        difficulty: str,
        max_words: int
    ) -> List[Dict[str, Any]]:
        """Fall back to cached words when vector search fails."""
        if not self.cache_manager:
            logger.warning("📭 No cache manager available for fallback")
            return []

        logger.info(f"🔄 Looking for cached words for topic: '{topic}', difficulty: '{difficulty}'")

        try:
            cached_words = await self.cache_manager.get_cached_words(topic, difficulty, max_words)

            if cached_words:
                logger.info(f"📦 Found {len(cached_words)} cached words for '{topic}/{difficulty}'")
                return cached_words
            else:
                logger.info(f"📭 No cached words available for '{topic}/{difficulty}'")
                return []

        except Exception as e:
            logger.error(f"❌ Failed to get cached fallback for '{topic}': {e}")
            return []

    async def _cache_successful_search(
        self,
        topic: str,
        difficulty: str,
        words: List[Dict[str, Any]]
    ):
        """Cache successful vector search results for future use."""
        if not self.cache_manager:
            return

        try:
            # Only persist genuine vector-search results, not bootstrap words.
            vector_words = [w for w in words if w.get("source") == "vector_search"]

            if vector_words:
                success = await self.cache_manager.cache_words(topic, difficulty, vector_words)
                if success:
                    logger.info(f"💾 Successfully cached {len(vector_words)} words for {topic}/{difficulty}")

        except Exception as e:
            logger.error(f"❌ Failed to cache search results: {e}")

    def _get_emergency_bootstrap(self, topic: str, difficulty: str, max_words: int) -> List[Dict[str, Any]]:
        """
        Emergency bootstrap words when vector search and cache both fail.
        This prevents complete failure by providing basic topic-related words.
        """
        bootstrap_words = {
            "animals": [
                {"word": "DOG", "clue": "Man's best friend"},
                {"word": "CAT", "clue": "Feline pet"},
                {"word": "FISH", "clue": "Aquatic animal"},
            ],
            # The entries below are illustrative placeholders (assumption: the
            # original lists were lost); extend them as needed.
            "science": [
                {"word": "ATOM", "clue": "Basic unit of matter"},
                {"word": "CELL", "clue": "Basic unit of life"},
                {"word": "ENERGY", "clue": "Capacity to do work"},
            ],
            "technology": [
                {"word": "ROBOT", "clue": "Automated machine"},
                {"word": "LAPTOP", "clue": "Portable computer"},
                {"word": "SERVER", "clue": "Machine that hosts services"},
            ],
            "geography": [
                {"word": "RIVER", "clue": "Flowing body of water"},
                {"word": "ISLAND", "clue": "Land surrounded by water"},
                {"word": "DESERT", "clue": "Arid landscape"},
            ]
        }

        topic_lower = topic.lower()
        words = bootstrap_words.get(topic_lower, [])

        if not words:
            # Unknown topic: fall back to generic crossword-themed words.
            words = [
                {"word": "WORD", "clue": "Unit of language"},
                {"word": "PUZZLE", "clue": "Brain teaser"},
                {"word": "GAME", "clue": "Form of play"},
                {"word": "CROSS", "clue": "Intersecting lines"},
                {"word": "GRID", "clue": "Pattern of squares"},
                {"word": "CLUE", "clue": "Helpful hint"}
            ]

        # Keep only words that satisfy the difficulty's length bounds.
        filtered_words = []
        for word_obj in words:
            word = word_obj["word"]
            if self._matches_difficulty(word, difficulty):
                filtered_words.append({
                    "word": word,
                    "clue": word_obj["clue"],
                    "similarity": 0.7,
                    "source": "emergency_bootstrap"
                })

        random.shuffle(filtered_words)
        result = filtered_words[:max_words]

        logger.info(f"🆘 Emergency bootstrap provided {len(result)} words for '{topic}'")
        return result

    async def cleanup(self):
        """Cleanup resources."""
        logger.info("🧹 Cleaning up vector search service")
        if hasattr(self, 'model'):
            del self.model
        if hasattr(self, 'word_embeddings'):
            del self.word_embeddings
        if hasattr(self, 'faiss_index'):
            del self.faiss_index
        if self.cache_manager:
            await self.cache_manager.cleanup_expired_caches()
        self.is_initialized = False
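

# Minimal manual smoke test (illustrative sketch, not part of the service API;
# assumes the module can be run directly and the embedding model can be downloaded):
if __name__ == "__main__":
    import asyncio

    async def _demo():
        service = VectorSearchService()
        await service.initialize()
        words = await service.find_similar_words("Animals", difficulty="easy", max_words=5)
        for w in words:
            print(f"{w['word']:12s} {w['similarity']:.3f}  {w['clue']}")
        await service.cleanup()

    asyncio.run(_demo())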