File size: 10,890 Bytes
53e35dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#!/usr/bin/env python3
"""
Test script for distribution normalization feature.

This script demonstrates how distribution normalization ensures consistent 
difficulty levels across different topics by normalizing similarity ranges
and standardizing distribution shapes.
"""

import os
import sys
import numpy as np
from collections import defaultdict

# Add src directory to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

def test_normalization_across_topics():
    """Test normalization consistency across different topics.

    Configures the environment for softmax-based selection with
    distribution normalization ENABLED, generates words for several
    topics at the same difficulty, and compares the per-topic
    similarity / percentile / composite-score statistics to judge how
    consistent difficulty is across topics.

    Returns:
        dict: Per-topic statistics dicts keyed by topic name. May be
        empty if word generation failed for every topic.
    """
    print("🧪 Testing distribution normalization across topics...")

    # Set up environment for testing normalization.
    os.environ['SIMILARITY_TEMPERATURE'] = '0.7'
    os.environ['USE_SOFTMAX_SELECTION'] = 'true'
    os.environ['DIFFICULTY_WEIGHT'] = '0.3'
    os.environ['ENABLE_DEBUG_TAB'] = 'true'

    # Test with normalization ENABLED.
    os.environ['ENABLE_DISTRIBUTION_NORMALIZATION'] = 'true'
    os.environ['NORMALIZATION_METHOD'] = 'similarity_range'

    # Imported here (not at module top) so the env vars above are set
    # before the service module reads its configuration.
    from services.thematic_word_service import ThematicWordService

    # Create service instance
    service = ThematicWordService()
    service.initialize()

    # Topics chosen to span a range of expected vocabulary coverage, so
    # normalization has visibly different raw distributions to work on.
    test_topics = [
        ("animals", "Expected high similarity range - many animals in vocabulary"),
        ("technology", "Expected medium similarity range - some tech words"),
        ("geology", "Expected low similarity range - fewer geology terms"),
        ("food", "Expected high similarity range - many food words"),
        ("philosophy", "Expected very low similarity range - abstract concepts")
    ]

    difficulty = "medium"  # Use medium difficulty for consistent comparison
    num_words = 15

    print(f"\n🎯 Testing normalization for difficulty: {difficulty.upper()}")
    print(f"📊 Requesting {num_words} words per topic")
    print(f"🔧 Normalization: {service.enable_distribution_normalization} ({service.normalization_method})")

    results = {}

    for topic, description in test_topics:
        print(f"\n📚 Topic: {topic.upper()}")
        print(f"   {description}")

        try:
            # Generate words using crossword-specific method to get debug data.
            result = service.find_words_for_crossword([topic], difficulty, num_words)
            words = result["words"]
            debug_data = result.get("debug", {})

            if debug_data and "probability_distribution" in debug_data:
                prob_data = debug_data["probability_distribution"]
                probabilities = prob_data["probabilities"]

                # Calculate distribution statistics from the debug payload.
                similarities = [p["similarity"] for p in probabilities]
                percentiles = [p["percentile"] for p in probabilities]
                composite_scores = [p["composite_score"] for p in probabilities]
                probs = [p["probability"] for p in probabilities]

                # Check whether the service actually applied normalization;
                # if so, pull the pre-normalization scores for comparison.
                has_normalization_data = any(p.get("normalization_applied", False) for p in probabilities)
                original_scores = []
                if has_normalization_data:
                    original_scores = [p.get("original_composite_score", p["composite_score"]) for p in probabilities]

                stats = {
                    "topic": topic,
                    "word_count": len(words),
                    "similarity_range": (min(similarities), max(similarities)),
                    "similarity_mean": np.mean(similarities),
                    "similarity_std": np.std(similarities),
                    "percentile_mean": np.mean(percentiles),
                    "percentile_std": np.std(percentiles),
                    "composite_mean": np.mean(composite_scores),
                    "composite_std": np.std(composite_scores),
                    # Shannon entropy of the selection distribution; the small
                    # epsilon guards log(0) for zero-probability entries.
                    "prob_entropy": -sum(p * np.log(p + 1e-10) for p in probs),
                    "selected_words": [w["word"] for w in words[:5]],  # First 5 words
                    "normalization_applied": has_normalization_data
                }

                if original_scores:
                    stats["original_composite_mean"] = np.mean(original_scores)
                    stats["original_composite_std"] = np.std(original_scores)
                    stats["normalization_effect"] = abs(stats["composite_mean"] - stats["original_composite_mean"])

                results[topic] = stats

                # Display key statistics
                print(f"   ✅ Generated {len(words)} words")
                print(f"   📊 Similarity range: {stats['similarity_range'][0]:.3f} - {stats['similarity_range'][1]:.3f}")
                print(f"   📈 Similarity mean±std: {stats['similarity_mean']:.3f}±{stats['similarity_std']:.3f}")
                print(f"   🎯 Percentile mean±std: {stats['percentile_mean']:.3f}±{stats['percentile_std']:.3f}")
                print(f"   🔢 Composite mean±std: {stats['composite_mean']:.3f}±{stats['composite_std']:.3f}")
                if has_normalization_data:
                    print(f"   🎯 Normalization applied: Original composite mean was {stats['original_composite_mean']:.3f}")
                    print(f"   📈 Normalization effect: {stats['normalization_effect']:.3f} change in mean")
                print(f"   📝 Selected words: {', '.join(stats['selected_words'])}")

            else:
                print(f"   ❌ No debug data available for {topic}")

        except Exception as e:
            # Best-effort suite: report the failing topic and keep going.
            print(f"   ❌ Error testing {topic}: {e}")
            continue

    # Analyze consistency across topics; needs at least 3 topics for a
    # meaningful spread comparison.
    if len(results) >= 3:
        print(f"\n📊 NORMALIZATION CONSISTENCY ANALYSIS")
        print(f"=" * 60)

        # Compare similarity ranges (should be more consistent after normalization).
        sim_ranges = [stats['similarity_range'][1] - stats['similarity_range'][0] for stats in results.values()]
        sim_means = [stats['similarity_mean'] for stats in results.values()]
        composite_stds = [stats['composite_std'] for stats in results.values()]
        percentile_means = [stats['percentile_mean'] for stats in results.values()]

        print(f"🎯 Similarity Range Consistency:")
        print(f"   Range spread: {np.std(sim_ranges):.4f} (lower = more consistent)")
        print(f"   Mean variation: {np.std(sim_means):.4f} (lower = more consistent)")

        print(f"\n🎲 Selection Distribution Consistency:")
        print(f"   Composite score std variation: {np.std(composite_stds):.4f} (lower = more consistent)")
        # This line reports the SPREAD of percentile means across topics
        # (a consistency metric); the 0.5 target for medium difficulty is
        # checked separately in the targeting-accuracy section below.
        print(f"   Percentile mean spread: {np.std(percentile_means):.4f} (lower = more consistent)")

        print(f"\n🏆 Normalization Effectiveness:")
        if any(stats.get('normalization_applied', False) for stats in results.values()):
            # Only topics that recorded a normalization effect contribute.
            normalization_effects = [stats['normalization_effect'] for stats in results.values() if 'normalization_effect' in stats]
            if normalization_effects:
                avg_effect = np.mean(normalization_effects)
                print(f"   Average normalization effect: {avg_effect:.4f}")
                print(f"   Normalization was {'SIGNIFICANT' if avg_effect > 0.05 else 'MINIMAL'}")
            print("   ✅ Normalization data found in debug output")
        else:
            print("   ⚠️ No normalization data found - check ENABLE_DISTRIBUTION_NORMALIZATION")

        # Ideal targets for medium difficulty: mean percentile should sit
        # near the middle of the distribution.
        target_percentile = 0.5
        percentile_deviation = np.mean([abs(pm - target_percentile) for pm in percentile_means])
        print(f"\n🎯 Difficulty Targeting Accuracy:")
        print(f"   Target percentile (medium): {target_percentile}")
        print(f"   Average deviation: {percentile_deviation:.4f}")
        print(f"   Targeting accuracy: {'EXCELLENT' if percentile_deviation < 0.05 else 'GOOD' if percentile_deviation < 0.1 else 'NEEDS IMPROVEMENT'}")

    print(f"\n✅ Distribution normalization test completed!")
    return results

def test_normalization_methods():
    """Test different normalization methods on a single fixed topic.

    Iterates over the supported NORMALIZATION_METHOD values, rebuilding
    the service for each so it picks up the env-var change, and prints
    the resulting similarity range, mean percentile, and selected words.

    Returns:
        None. Results are reported via stdout only.
    """
    print(f"\n🧪 Testing different normalization methods...")

    # Set the env vars this test depends on explicitly, so it behaves
    # the same whether run standalone or after the cross-topic test.
    os.environ['ENABLE_DISTRIBUTION_NORMALIZATION'] = 'true'
    os.environ['ENABLE_DEBUG_TAB'] = 'true'

    # Imported once here (not per iteration) — the module is cached
    # after the first import anyway; the per-method env var is read by
    # each fresh service instance below.
    from services.thematic_word_service import ThematicWordService

    methods = ["similarity_range", "composite_zscore", "percentile_recentering"]
    topic = "animals"  # Use consistent topic
    difficulty = "easy"  # Use easy difficulty to see clear effects

    for method in methods:
        print(f"\n🔧 Testing method: {method.upper()}")

        os.environ['NORMALIZATION_METHOD'] = method

        # Fresh service per method so it reads the updated env var.
        service = ThematicWordService()
        service.initialize()

        try:
            result = service.find_words_for_crossword([topic], difficulty, 10)
            words = result["words"]
            debug_data = result.get("debug", {})

            if debug_data and "probability_distribution" in debug_data:
                prob_data = debug_data["probability_distribution"]
                probabilities = prob_data["probabilities"]

                similarities = [p["similarity"] for p in probabilities]
                percentiles = [p["percentile"] for p in probabilities]

                print(f"   📊 Similarity range: {min(similarities):.3f} - {max(similarities):.3f}")
                print(f"   🎯 Mean percentile: {np.mean(percentiles):.3f} (target for easy: 0.9)")
                print(f"   📈 Selected words: {', '.join([w['word'] for w in words[:5]])}")

                if any(p.get("normalization_applied", False) for p in probabilities):
                    print(f"   ✅ Normalization applied successfully")
                else:
                    print(f"   ⚠️ Normalization not detected in debug data")
            else:
                print(f"   ❌ No debug data available")

        except Exception as e:
            # Best-effort suite: report the failing method and continue.
            print(f"   ❌ Error with method {method}: {e}")

if __name__ == "__main__":
    print("🎯 Distribution Normalization Test Suite")
    print("=" * 50)
    
    test_normalization_across_topics()
    test_normalization_methods()
    
    print(f"\n🎉 All tests completed!")
    print(f"\n💡 To see normalization effects in the UI:")
    print(f"   1. Set ENABLE_DISTRIBUTION_NORMALIZATION=true")
    print(f"   2. Set ENABLE_DEBUG_TAB=true") 
    print(f"   3. Generate crosswords with different topics at the same difficulty")
    print(f"   4. Check the Debug tab for normalization indicators and tooltips")