"""
Test script for the distribution normalization feature.

This script demonstrates how distribution normalization ensures consistent
difficulty levels across different topics by normalizing similarity ranges
and standardizing distribution shapes.
"""

import os
import sys

import numpy as np

# Make the project's src/ directory importable when running this file directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
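

def _min_max_normalize(similarities):
    """Illustration only: a sketch of what the 'similarity_range' method is
    assumed to do, based on its name. It rescales one topic's raw similarity
    scores into [0, 1] so percentile-based difficulty targeting behaves
    comparably across topics. The real logic lives in ThematicWordService and
    may differ; nothing below calls this helper.
    """
    lo, hi = min(similarities), max(similarities)
    if hi == lo:  # degenerate case: all candidates equally similar
        return [0.5 for _ in similarities]
    return [(s - lo) / (hi - lo) for s in similarities]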


def test_normalization_across_topics():
    """Test normalization consistency across different topics."""
    print("🧪 Testing distribution normalization across topics...")

    # Configure selection behavior; set before the service import so the
    # flags are visible however the module reads them.
    os.environ['SIMILARITY_TEMPERATURE'] = '0.7'
    os.environ['USE_SOFTMAX_SELECTION'] = 'true'
    os.environ['DIFFICULTY_WEIGHT'] = '0.3'
    os.environ['ENABLE_DEBUG_TAB'] = 'true'

    # The feature under test.
    os.environ['ENABLE_DISTRIBUTION_NORMALIZATION'] = 'true'
    os.environ['NORMALIZATION_METHOD'] = 'similarity_range'

    from services.thematic_word_service import ThematicWordService

    service = ThematicWordService()
    service.initialize()

    test_topics = [
        ("animals", "Expected high similarity range - many animals in vocabulary"),
        ("technology", "Expected medium similarity range - some tech words"),
        ("geology", "Expected low similarity range - fewer geology terms"),
        ("food", "Expected high similarity range - many food words"),
        ("philosophy", "Expected very low similarity range - abstract concepts"),
    ]

    difficulty = "medium"
    num_words = 15

    print(f"\n🎯 Testing normalization for difficulty: {difficulty.upper()}")
    print(f"📊 Requesting {num_words} words per topic")
    print(f"🔧 Normalization: {service.enable_distribution_normalization} ({service.normalization_method})")

    results = {}

    for topic, description in test_topics:
        print(f"\n📚 Topic: {topic.upper()}")
        print(f"   {description}")

        try:
            result = service.find_words_for_crossword([topic], difficulty, num_words)
            words = result["words"]
            debug_data = result.get("debug", {})

            if debug_data and "probability_distribution" in debug_data:
                prob_data = debug_data["probability_distribution"]
                probabilities = prob_data["probabilities"]

                similarities = [p["similarity"] for p in probabilities]
                percentiles = [p["percentile"] for p in probabilities]
                composite_scores = [p["composite_score"] for p in probabilities]
                probs = [p["probability"] for p in probabilities]

                # Normalization metadata is only present on entries where the
                # feature actually fired.
                has_normalization_data = any(p.get("normalization_applied", False) for p in probabilities)
                original_scores = []
                if has_normalization_data:
                    original_scores = [p.get("original_composite_score", p["composite_score"]) for p in probabilities]

                stats = {
                    "topic": topic,
                    "word_count": len(words),
                    "similarity_range": (min(similarities), max(similarities)),
                    "similarity_mean": np.mean(similarities),
                    "similarity_std": np.std(similarities),
                    "percentile_mean": np.mean(percentiles),
                    "percentile_std": np.std(percentiles),
                    "composite_mean": np.mean(composite_scores),
                    "composite_std": np.std(composite_scores),
                    # Shannon entropy of the selection distribution; the 1e-10
                    # term guards against log(0).
                    "prob_entropy": -sum(p * np.log(p + 1e-10) for p in probs),
                    "selected_words": [w["word"] for w in words[:5]],
                    "normalization_applied": has_normalization_data,
                }
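
                # When normalization ran, keep the pre-normalization scores so
                # the size of its effect (the shift in composite mean) can be
                # reported below.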
                if original_scores:
                    stats["original_composite_mean"] = np.mean(original_scores)
                    stats["original_composite_std"] = np.std(original_scores)
                    stats["normalization_effect"] = abs(stats["composite_mean"] - stats["original_composite_mean"])

                results[topic] = stats

                print(f"   ✅ Generated {len(words)} words")
                print(f"   📊 Similarity range: {stats['similarity_range'][0]:.3f} - {stats['similarity_range'][1]:.3f}")
                print(f"   📊 Similarity mean±std: {stats['similarity_mean']:.3f}±{stats['similarity_std']:.3f}")
                print(f"   🎯 Percentile mean±std: {stats['percentile_mean']:.3f}±{stats['percentile_std']:.3f}")
                print(f"   🔢 Composite mean±std: {stats['composite_mean']:.3f}±{stats['composite_std']:.3f}")
                if has_normalization_data:
                    print(f"   🎯 Normalization applied: original composite mean was {stats['original_composite_mean']:.3f}")
                    print(f"   📊 Normalization effect: {stats['normalization_effect']:.3f} change in mean")
                print(f"   📝 Selected words: {', '.join(stats['selected_words'])}")

            else:
                print(f"   ❌ No debug data available for {topic}")

        except Exception as e:
            print(f"   ❌ Error testing {topic}: {e}")
            continue
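
    # With normalization enabled, these per-topic statistics should be broadly
    # comparable across topics; large spreads suggest it is not doing its job.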
    if len(results) >= 3:
        print(f"\n📊 NORMALIZATION CONSISTENCY ANALYSIS")
        print("=" * 60)

        sim_ranges = [stats['similarity_range'][1] - stats['similarity_range'][0] for stats in results.values()]
        sim_means = [stats['similarity_mean'] for stats in results.values()]
        composite_stds = [stats['composite_std'] for stats in results.values()]
        percentile_means = [stats['percentile_mean'] for stats in results.values()]

        print(f"🎯 Similarity Range Consistency:")
        print(f"   Range spread: {np.std(sim_ranges):.4f} (lower = more consistent)")
        print(f"   Mean variation: {np.std(sim_means):.4f} (lower = more consistent)")

        print(f"\n🎲 Selection Distribution Consistency:")
        print(f"   Composite score std variation: {np.std(composite_stds):.4f} (lower = more consistent)")
        print(f"   Percentile targeting spread: {np.std(percentile_means):.4f} (lower = more consistent)")

        print(f"\n📈 Normalization Effectiveness:")
        if any(stats.get('normalization_applied', False) for stats in results.values()):
            normalization_effects = [stats.get('normalization_effect', 0) for stats in results.values() if stats.get('normalization_effect') is not None]
            if normalization_effects:
                avg_effect = np.mean(normalization_effects)
                print(f"   Average normalization effect: {avg_effect:.4f}")
                print(f"   Normalization was {'SIGNIFICANT' if avg_effect > 0.05 else 'MINIMAL'}")
            print("   ✅ Normalization data found in debug output")
        else:
            print("   ⚠️ No normalization data found - check ENABLE_DISTRIBUTION_NORMALIZATION")

        target_percentile = 0.5  # "medium" difficulty targets the middle of the distribution
        percentile_deviation = np.mean([abs(pm - target_percentile) for pm in percentile_means])
        print(f"\n🎯 Difficulty Targeting Accuracy:")
        print(f"   Target percentile (medium): {target_percentile}")
        print(f"   Average deviation: {percentile_deviation:.4f}")
        print(f"   Targeting accuracy: {'EXCELLENT' if percentile_deviation < 0.05 else 'GOOD' if percentile_deviation < 0.1 else 'NEEDS IMPROVEMENT'}")

    print(f"\n✅ Distribution normalization test completed!")
    return results


def test_normalization_methods():
    """Test different normalization methods."""
    print(f"\n🧪 Testing different normalization methods...")

    methods = ["similarity_range", "composite_zscore", "percentile_recentering"]
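    # Assumed semantics, inferred from the names alone (the service may
    # differ): similarity_range min-max rescales raw similarities,
    # composite_zscore standardizes composite scores, and
    # percentile_recentering re-centers scores on the target percentile.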
    topic = "animals"
    difficulty = "easy"

    from services.thematic_word_service import ThematicWordService

    for method in methods:
        print(f"\n🔧 Testing method: {method.upper()}")

        os.environ['NORMALIZATION_METHOD'] = method

        # Create a fresh instance per method so the changed env var can take
        # effect (assuming the service reads it at initialization time).
        service = ThematicWordService()
        service.initialize()
try: |
|
|
result = service.find_words_for_crossword([topic], difficulty, 10) |
|
|
words = result["words"] |
|
|
debug_data = result.get("debug", {}) |
|
|
|
|
|
if debug_data and "probability_distribution" in debug_data: |
|
|
prob_data = debug_data["probability_distribution"] |
|
|
probabilities = prob_data["probabilities"] |
|
|
|
|
|
similarities = [p["similarity"] for p in probabilities] |
|
|
percentiles = [p["percentile"] for p in probabilities] |
|
|
|

                print(f"   📊 Similarity range: {min(similarities):.3f} - {max(similarities):.3f}")
                print(f"   🎯 Mean percentile: {np.mean(percentiles):.3f} (target for easy: 0.9)")
                print(f"   📝 Selected words: {', '.join([w['word'] for w in words[:5]])}")

                if any(p.get("normalization_applied", False) for p in probabilities):
                    print(f"   ✅ Normalization applied successfully")
                else:
                    print(f"   ⚠️ Normalization not detected in debug data")
            else:
                print(f"   ❌ No debug data available")

        except Exception as e:
            print(f"   ❌ Error with method {method}: {e}")


if __name__ == "__main__":
    print("🎯 Distribution Normalization Test Suite")
    print("=" * 50)

    test_normalization_across_topics()
    test_normalization_methods()

    print(f"\n🎉 All tests completed!")
    print(f"\n💡 To see normalization effects in the UI:")
    print(f"   1. Set ENABLE_DISTRIBUTION_NORMALIZATION=true")
    print(f"   2. Set ENABLE_DEBUG_TAB=true")
    print(f"   3. Generate crosswords with different topics at the same difficulty")
    print(f"   4. Check the Debug tab for normalization indicators and tooltips")