#!/usr/bin/env python3
"""
Test script for distribution normalization feature.
This script demonstrates how distribution normalization ensures consistent
difficulty levels across different topics by normalizing similarity ranges
and standardizing distribution shapes.
"""
import os
import sys
import numpy as np
from collections import defaultdict
# Add src directory to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
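
# Illustrative sketch only: the real logic lives in ThematicWordService and
# may differ. This documents the assumed intent of the default
# "similarity_range" method used below, namely min-max rescaling of raw
# similarity scores onto a shared [0, 1] scale, so topics with naturally
# narrow or low similarity bands (e.g. "philosophy") are scored on the same
# footing as vocabulary-rich topics (e.g. "animals").
def reference_similarity_range_normalization(similarities):
    """Min-max rescale similarity scores to [0, 1] (illustrative helper)."""
    lo, hi = min(similarities), max(similarities)
    if hi == lo:
        # Degenerate case: every candidate is equally similar
        return [0.5] * len(similarities)
    return [(s - lo) / (hi - lo) for s in similarities]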

def test_normalization_across_topics():
    """Test normalization consistency across different topics."""
    print("🧪 Testing distribution normalization across topics...")

    # Set up environment for testing normalization
    os.environ['SIMILARITY_TEMPERATURE'] = '0.7'
    os.environ['USE_SOFTMAX_SELECTION'] = 'true'
    os.environ['DIFFICULTY_WEIGHT'] = '0.3'
    os.environ['ENABLE_DEBUG_TAB'] = 'true'

    # Test with normalization ENABLED
    os.environ['ENABLE_DISTRIBUTION_NORMALIZATION'] = 'true'
    os.environ['NORMALIZATION_METHOD'] = 'similarity_range'

    from services.thematic_word_service import ThematicWordService

    # Create service instance
    service = ThematicWordService()
    service.initialize()

    # Test topics with expected different similarity ranges
    test_topics = [
        ("animals", "Expected high similarity range - many animals in vocabulary"),
        ("technology", "Expected medium similarity range - some tech words"),
        ("geology", "Expected low similarity range - fewer geology terms"),
        ("food", "Expected high similarity range - many food words"),
        ("philosophy", "Expected very low similarity range - abstract concepts"),
    ]

    difficulty = "medium"  # Use medium difficulty for consistent comparison
    num_words = 15

    print(f"\n🎯 Testing normalization for difficulty: {difficulty.upper()}")
    print(f"📊 Requesting {num_words} words per topic")
    print(f"🔧 Normalization: {service.enable_distribution_normalization} ({service.normalization_method})")
    results = {}
    for topic, description in test_topics:
        print(f"\n📚 Topic: {topic.upper()}")
        print(f"   {description}")
        try:
            # Generate words using the crossword-specific method to get debug data
            result = service.find_words_for_crossword([topic], difficulty, num_words)
            words = result["words"]
            debug_data = result.get("debug", {})

            if debug_data and "probability_distribution" in debug_data:
                prob_data = debug_data["probability_distribution"]
                probabilities = prob_data["probabilities"]

                # Calculate distribution statistics
                similarities = [p["similarity"] for p in probabilities]
                percentiles = [p["percentile"] for p in probabilities]
                composite_scores = [p["composite_score"] for p in probabilities]
                probs = [p["probability"] for p in probabilities]

                # Check for normalization data
                has_normalization_data = any(p.get("normalization_applied", False) for p in probabilities)
                original_scores = []
                if has_normalization_data:
                    original_scores = [p.get("original_composite_score", p["composite_score"]) for p in probabilities]

                stats = {
                    "topic": topic,
                    "word_count": len(words),
                    "similarity_range": (min(similarities), max(similarities)),
                    "similarity_mean": np.mean(similarities),
                    "similarity_std": np.std(similarities),
                    "percentile_mean": np.mean(percentiles),
                    "percentile_std": np.std(percentiles),
                    "composite_mean": np.mean(composite_scores),
                    "composite_std": np.std(composite_scores),
                    "prob_entropy": -sum(p * np.log(p + 1e-10) for p in probs),  # Selection (Shannon) entropy
                    "selected_words": [w["word"] for w in words[:5]],  # First 5 words
                    "normalization_applied": has_normalization_data,
                }
                if original_scores:
                    stats["original_composite_mean"] = np.mean(original_scores)
                    stats["original_composite_std"] = np.std(original_scores)
                    stats["normalization_effect"] = abs(stats["composite_mean"] - stats["original_composite_mean"])

                results[topic] = stats

                # Display key statistics
                print(f"   ✅ Generated {len(words)} words")
                print(f"   📊 Similarity range: {stats['similarity_range'][0]:.3f} - {stats['similarity_range'][1]:.3f}")
                print(f"   📈 Similarity mean±std: {stats['similarity_mean']:.3f}±{stats['similarity_std']:.3f}")
                print(f"   🎯 Percentile mean±std: {stats['percentile_mean']:.3f}±{stats['percentile_std']:.3f}")
                print(f"   🔢 Composite mean±std: {stats['composite_mean']:.3f}±{stats['composite_std']:.3f}")
                if has_normalization_data:
                    print(f"   🎯 Normalization applied: original composite mean was {stats['original_composite_mean']:.3f}")
                    print(f"   📈 Normalization effect: {stats['normalization_effect']:.3f} change in mean")
                print(f"   📝 Selected words: {', '.join(stats['selected_words'])}")
            else:
                print(f"   ❌ No debug data available for {topic}")
        except Exception as e:
            print(f"   ❌ Error testing {topic}: {e}")
            continue

    # Analyze consistency across topics
    if len(results) >= 3:
        print(f"\n📊 NORMALIZATION CONSISTENCY ANALYSIS")
        print("=" * 60)

        # Compare similarity ranges (should be more consistent after normalization)
        sim_ranges = [stats['similarity_range'][1] - stats['similarity_range'][0] for stats in results.values()]
        sim_means = [stats['similarity_mean'] for stats in results.values()]
        composite_stds = [stats['composite_std'] for stats in results.values()]
        percentile_means = [stats['percentile_mean'] for stats in results.values()]

        print(f"🎯 Similarity Range Consistency:")
        print(f"   Range spread: {np.std(sim_ranges):.4f} (lower = more consistent)")
        print(f"   Mean variation: {np.std(sim_means):.4f} (lower = more consistent)")

        print(f"\n🎲 Selection Distribution Consistency:")
        print(f"   Composite score std variation: {np.std(composite_stds):.4f} (lower = more consistent)")
print(f" Percentile targeting consistency: {np.std(percentile_means):.4f} (should be near 0.5 for medium)")
print(f"\n🏆 Normalization Effectiveness:")
if any(stats.get('normalization_applied', False) for stats in results.values()):
normalization_effects = [stats.get('normalization_effect', 0) for stats in results.values() if stats.get('normalization_effect') is not None]
if normalization_effects:
avg_effect = np.mean(normalization_effects)
print(f" Average normalization effect: {avg_effect:.4f}")
print(f" Normalization was {'SIGNIFICANT' if avg_effect > 0.05 else 'MINIMAL'}")
print(" ✅ Normalization data found in debug output")
else:
print(" ⚠️ No normalization data found - check ENABLE_DISTRIBUTION_NORMALIZATION")
# Ideal targets for medium difficulty
target_percentile = 0.5
percentile_deviation = np.mean([abs(pm - target_percentile) for pm in percentile_means])
print(f"\n🎯 Difficulty Targeting Accuracy:")
print(f" Target percentile (medium): {target_percentile}")
print(f" Average deviation: {percentile_deviation:.4f}")
print(f" Targeting accuracy: {'EXCELLENT' if percentile_deviation < 0.05 else 'GOOD' if percentile_deviation < 0.1 else 'NEEDS IMPROVEMENT'}")
print(f"\n✅ Distribution normalization test completed!")
return results
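
# Illustrative sketches of the other two methods exercised by
# test_normalization_methods() below. These are assumptions about what each
# NORMALIZATION_METHOD value is meant to do, not the service's actual code.
def reference_composite_zscore(scores):
    """Standardize composite scores to zero mean / unit variance (illustrative)."""
    mean, std = np.mean(scores), np.std(scores)
    if std == 0:
        # Degenerate case: no spread to standardize
        return [0.0] * len(scores)
    return [float((s - mean) / std) for s in scores]


def reference_percentile_recentering(percentiles, target=0.5):
    """Shift percentiles so their mean lands on the difficulty target, clamped to [0, 1] (illustrative)."""
    shift = target - np.mean(percentiles)
    return [min(1.0, max(0.0, p + shift)) for p in percentiles]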

def test_normalization_methods():
    """Test different normalization methods."""
    print(f"\n🧪 Testing different normalization methods...")

    methods = ["similarity_range", "composite_zscore", "percentile_recentering"]
    topic = "animals"      # Use a consistent topic
    difficulty = "easy"    # Use easy difficulty to see clear effects

    # Import once; re-importing inside the loop would be a cached no-op anyway
    from services.thematic_word_service import ThematicWordService

    for method in methods:
        print(f"\n🔧 Testing method: {method.upper()}")
        os.environ['NORMALIZATION_METHOD'] = method

        # Create a fresh service so it picks up the new method setting
        service = ThematicWordService()
        service.initialize()

        try:
            result = service.find_words_for_crossword([topic], difficulty, 10)
            words = result["words"]
            debug_data = result.get("debug", {})

            if debug_data and "probability_distribution" in debug_data:
                prob_data = debug_data["probability_distribution"]
                probabilities = prob_data["probabilities"]

                similarities = [p["similarity"] for p in probabilities]
                percentiles = [p["percentile"] for p in probabilities]

                print(f"   📊 Similarity range: {min(similarities):.3f} - {max(similarities):.3f}")
                print(f"   🎯 Mean percentile: {np.mean(percentiles):.3f} (target for easy: 0.9)")
                print(f"   📈 Selected words: {', '.join([w['word'] for w in words[:5]])}")

                if any(p.get("normalization_applied", False) for p in probabilities):
                    print(f"   ✅ Normalization applied successfully")
                else:
                    print(f"   ⚠️ Normalization not detected in debug data")
            else:
                print(f"   ❌ No debug data available")
        except Exception as e:
            print(f"   ❌ Error with method {method}: {e}")
if __name__ == "__main__":
print("🎯 Distribution Normalization Test Suite")
print("=" * 50)
test_normalization_across_topics()
test_normalization_methods()
print(f"\n🎉 All tests completed!")
print(f"\n💡 To see normalization effects in the UI:")
print(f" 1. Set ENABLE_DISTRIBUTION_NORMALIZATION=true")
print(f" 2. Set ENABLE_DEBUG_TAB=true")
print(f" 3. Generate crosswords with different topics at the same difficulty")
print(f" 4. Check the Debug tab for normalization indicators and tooltips") |