#!/usr/bin/env python3
"""
Test adaptive beta fix with full vocabulary to see if it now correctly
uses the adjusted threshold for filtering.

This is a manual integration script: it prints its findings instead of
asserting on them, and any exception is reported with a traceback rather
than propagated.
"""

import os
import sys
import logging
import traceback

# Configure logging to see the debug messages
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Settings exported to os.environ before ThematicWordService is imported
# and constructed: soft minimum with adaptive beta, full vocabulary.
SOFT_MIN_ENV = {
    'MULTI_TOPIC_METHOD': 'soft_minimum',
    'SOFT_MIN_BETA': '10.0',
    'SOFT_MIN_ADAPTIVE': 'true',
    'SOFT_MIN_MIN_WORDS': '15',
    'SOFT_MIN_MAX_RETRIES': '5',
    'SOFT_MIN_BETA_DECAY': '0.7',
    'THEMATIC_VOCAB_SIZE_LIMIT': '100000',  # Full vocabulary
}


def setup_environment():
    """Point the model caches at ../cache-dir and add the backend to sys.path.

    Both locations are resolved relative to this file's directory. The
    backend source directory is inserted at the front of ``sys.path`` only
    if it is not already present, so calling this repeatedly is harmless.
    """
    here = os.path.dirname(__file__)

    # Set cache directory to root cache-dir folder
    cache_dir = os.path.abspath(os.path.join(here, '..', 'cache-dir'))
    for variable in ('HF_HOME', 'TRANSFORMERS_CACHE',
                     'SENTENCE_TRANSFORMERS_HOME'):
        os.environ[variable] = cache_dir

    # Add backend source to path
    backend_path = os.path.abspath(
        os.path.join(here, '..', 'crossword-app', 'backend-py', 'src')
    )
    if backend_path not in sys.path:
        sys.path.insert(0, backend_path)

    print(f"Using cache directory: {cache_dir}")


def _print_top_words(results, limit):
    """Print a 'Top <limit> words' heading and the first *limit* results.

    Each result is unpacked as a ``(word, similarity, tier)`` triple; the
    tier is not displayed.
    """
    print(f"Top {limit} words:")
    for rank, (word, similarity, _tier) in enumerate(results[:limit], 1):
        print(f" {rank}. {word}: {similarity:.4f}")


def test_adaptive_fix():
    """Test with full vocabulary to see the fix in action.

    Runs two multi-theme queries through ``ThematicWordService`` and prints
    how many words each produced together with the best-scoring ones.
    Failures (including a missing backend) are printed with a traceback and
    swallowed so the script always runs to completion.
    """
    setup_environment()

    print("🔧 Testing Adaptive Beta Fix")
    print("=" * 50)

    # Set environment variables for soft minimum with debug - USE FULL VOCABULARY
    os.environ.update(SOFT_MIN_ENV)

    try:
        # Imported here, not at module top: the backend directory is only
        # on sys.path after setup_environment() has run.
        from services.thematic_word_service import ThematicWordService

        print("Creating ThematicWordService...")
        service = ThematicWordService()
        service.initialize()

        # Test the original problematic case with full vocabulary
        inputs = ["universe", "movies", "languages"]
        print(f"\nTesting original case: {inputs} (with full vocabulary)")
        print("Expected: Should now get words using adjusted threshold")
        print("-" * 50)

        results = service.generate_thematic_words(
            inputs,
            num_words=50,
            min_similarity=0.25,  # Use 0.25 like the original log
            multi_theme=True
        )

        print(f"\n✅ Final result: {len(results)} words generated")
        if results:
            _print_top_words(results, 10)
        else:
            print(" ⚠️ Still no words generated!")

        print("\n🔬 Test another challenging case: ['science', 'art', 'music']")
        results2 = service.generate_thematic_words(
            ["science", "art", "music"],
            num_words=30,
            min_similarity=0.25,
            multi_theme=True
        )

        print(f"\n✅ Second result: {len(results2)} words generated")
        if results2:
            _print_top_words(results2, 5)

    except Exception as e:
        # Deliberate catch-all: this is a diagnostic script, so report the
        # problem in full and finish instead of aborting.
        print(f"❌ Test failed: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    test_adaptive_fix()