# simple-text-analyzer / test/test_multi_index.py
# Author: egumasa — "Japanese language support" (commit dbc9105)
#!/usr/bin/env python3
import sys
import os
import tempfile
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
def test_multi_index_functionality():
    """Smoke-test loading multiple reference indices from one CSV file.

    Builds a small frequency table with several score columns, registers
    three custom indices (Freq, Range, NormFreq) that all read from the
    same file, loads them into a LexicalSophisticationAnalyzer, and runs
    one analysis that should report a score per index for each token.

    Results are printed rather than asserted; failures are reported with
    a traceback instead of raising, matching the original script's
    best-effort diagnostic style.
    """
    print("Testing multi-index functionality...")

    # Test data with multiple score columns so a single file can back
    # several independent indices.
    test_data = """Type,POS,Headword,Rank,Freq,Range,NormFreq,NormRange
the,,,1,60056,500,59001.119,1.000
of,,,2,30331,500,29798.237,1.000
and,,,3,28973,500,28464.091,1.000
to,,,4,26036,500,25578.679,1.000
a,,,5,23926,500,23505.741,1.000
in,,,6,19923,500,19573.053,1.000
that,,,7,12279,500,12063.320,1.000"""

    # Use a context manager so the temporary directory is always removed;
    # the original tempfile.mkdtemp() leaked the directory on every run.
    with tempfile.TemporaryDirectory() as temp_dir:
        test_file = os.path.join(temp_dir, "multi_freq.csv")
        with open(test_file, 'w') as f:
            f.write(test_data)
        print(f"βœ“ Created test file: {test_file}")

        # One index per score column, all drawn from the same word column.
        configs = [
            {
                'index_name': 'test_freq',
                'word_column': 'Type',
                'score_column': 'Freq'
            },
            {
                'index_name': 'test_range',
                'word_column': 'Type',
                'score_column': 'Range'
            },
            {
                'index_name': 'test_normfreq',
                'word_column': 'Type',
                'score_column': 'NormFreq'
            }
        ]

        analyzer = LexicalSophisticationAnalyzer()
        reference_lists = {}

        # Build one reference-list configuration per index, all pointing at
        # the same file but selecting a different score column.
        for config in configs:
            custom_config = {
                'file_path': test_file,
                'word_column': config['word_column'],
                'freq_column': config['score_column'],
                'delimiter': ',',
                'is_custom_config': True
            }
            reference_lists[config['index_name']] = {
                'token': custom_config
            }
        print(f"βœ“ Created {len(reference_lists)} reference configurations")

        # Load all indices; abort the test on failure since analysis below
        # would be meaningless without them.
        try:
            analyzer.load_reference_lists(reference_lists)
            print("βœ“ Successfully loaded all reference lists")

            # Verify each index was loaded and spot-check one known word.
            for config in configs:
                index_name = config['index_name']
                if index_name in analyzer.reference_lists:
                    token_data = analyzer.reference_lists[index_name].get('token', {})
                    print(f"βœ“ {index_name}: {len(token_data)} entries")
                    test_words = ['the', 'of', 'and']
                    for word in test_words[:1]:  # Just test first word
                        if word in token_data:
                            print(f" - '{word}': {token_data[word]}")
                else:
                    print(f"βœ— {index_name}: not found in loaded lists")
        except Exception as e:
            print(f"βœ— Error loading reference lists: {e}")
            import traceback
            traceback.print_exc()
            return

        # Run one analysis that consults all three indices at once.
        print("\nβœ“ Testing analysis with multiple indices...")
        test_text = "The quick brown fox jumps over the lazy dog and runs to the park."
        try:
            results = analyzer.analyze_text(
                test_text,
                ['test_freq', 'test_range', 'test_normfreq'],
                apply_log=False,
                word_type_filter=None
            )
            print("βœ“ Analysis completed successfully")

            # Summary statistics: one entry per index is expected.
            if results['summary']:
                print("βœ“ Summary results:")
                for key, stats in results['summary'].items():
                    print(f" - {key}: mean={stats['mean']:.3f}, count={stats['count']}")

            # Per-token scores: each token should carry a value (or 'N/A')
            # for every registered index.
            if results['token_details']:
                print("βœ“ Token details (first 3 tokens):")
                for token in results['token_details'][:3]:
                    token_word = token.get('token', 'N/A')
                    freq_score = token.get('test_freq', 'N/A')
                    range_score = token.get('test_range', 'N/A')
                    normfreq_score = token.get('test_normfreq', 'N/A')
                    print(f" - {token_word}: freq={freq_score}, range={range_score}, normfreq={normfreq_score}")
        except Exception as e:
            print(f"βœ— Error during analysis: {e}")
            import traceback
            traceback.print_exc()
# Script entry point: run the smoke test when executed directly.
if __name__ == "__main__":
    test_multi_index_functionality()