# simple-text-analyzer / test/test_multi_index.py
# Author: egumasa — "Japanese language support" (commit dbc9105)
#!/usr/bin/env python3
import sys
import os
import tempfile
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
def test_multi_index_functionality():
    """Smoke-test loading multiple reference indices from one CSV file.

    Builds a small frequency table with several score columns, registers
    three custom indices (Freq, Range, NormFreq) that all read from the
    same file, loads them into a LexicalSophisticationAnalyzer, and runs
    one analysis that should report a score per index for each token.

    Results are printed rather than asserted; failures are reported with
    a traceback instead of raising, matching the original script's
    best-effort diagnostic style.
    """
    print("Testing multi-index functionality...")

    # Test data with multiple score columns so a single file can back
    # several independent indices.
    test_data = """Type,POS,Headword,Rank,Freq,Range,NormFreq,NormRange
the,,,1,60056,500,59001.119,1.000
of,,,2,30331,500,29798.237,1.000
and,,,3,28973,500,28464.091,1.000
to,,,4,26036,500,25578.679,1.000
a,,,5,23926,500,23505.741,1.000
in,,,6,19923,500,19573.053,1.000
that,,,7,12279,500,12063.320,1.000"""

    # Use a context manager so the temporary directory is always removed;
    # the original tempfile.mkdtemp() leaked the directory on every run.
    with tempfile.TemporaryDirectory() as temp_dir:
        test_file = os.path.join(temp_dir, "multi_freq.csv")
        with open(test_file, 'w') as f:
            f.write(test_data)
        print(f"βœ“ Created test file: {test_file}")

        # One index per score column, all drawn from the same word column.
        configs = [
            {
                'index_name': 'test_freq',
                'word_column': 'Type',
                'score_column': 'Freq'
            },
            {
                'index_name': 'test_range',
                'word_column': 'Type',
                'score_column': 'Range'
            },
            {
                'index_name': 'test_normfreq',
                'word_column': 'Type',
                'score_column': 'NormFreq'
            }
        ]

        analyzer = LexicalSophisticationAnalyzer()
        reference_lists = {}

        # Build one reference-list configuration per index, all pointing at
        # the same file but selecting a different score column.
        for config in configs:
            custom_config = {
                'file_path': test_file,
                'word_column': config['word_column'],
                'freq_column': config['score_column'],
                'delimiter': ',',
                'is_custom_config': True
            }
            reference_lists[config['index_name']] = {
                'token': custom_config
            }
        print(f"βœ“ Created {len(reference_lists)} reference configurations")

        # Load all indices; abort the test on failure since analysis below
        # would be meaningless without them.
        try:
            analyzer.load_reference_lists(reference_lists)
            print("βœ“ Successfully loaded all reference lists")

            # Verify each index was loaded and spot-check one known word.
            for config in configs:
                index_name = config['index_name']
                if index_name in analyzer.reference_lists:
                    token_data = analyzer.reference_lists[index_name].get('token', {})
                    print(f"βœ“ {index_name}: {len(token_data)} entries")
                    test_words = ['the', 'of', 'and']
                    for word in test_words[:1]:  # Just test first word
                        if word in token_data:
                            print(f" - '{word}': {token_data[word]}")
                else:
                    print(f"βœ— {index_name}: not found in loaded lists")
        except Exception as e:
            print(f"βœ— Error loading reference lists: {e}")
            import traceback
            traceback.print_exc()
            return

        # Run one analysis that consults all three indices at once.
        print("\nβœ“ Testing analysis with multiple indices...")
        test_text = "The quick brown fox jumps over the lazy dog and runs to the park."
        try:
            results = analyzer.analyze_text(
                test_text,
                ['test_freq', 'test_range', 'test_normfreq'],
                apply_log=False,
                word_type_filter=None
            )
            print("βœ“ Analysis completed successfully")

            # Summary statistics: one entry per index is expected.
            if results['summary']:
                print("βœ“ Summary results:")
                for key, stats in results['summary'].items():
                    print(f" - {key}: mean={stats['mean']:.3f}, count={stats['count']}")

            # Per-token scores: each token should carry a value (or 'N/A')
            # for every registered index.
            if results['token_details']:
                print("βœ“ Token details (first 3 tokens):")
                for token in results['token_details'][:3]:
                    token_word = token.get('token', 'N/A')
                    freq_score = token.get('test_freq', 'N/A')
                    range_score = token.get('test_range', 'N/A')
                    normfreq_score = token.get('test_normfreq', 'N/A')
                    print(f" - {token_word}: freq={freq_score}, range={range_score}, normfreq={normfreq_score}")
        except Exception as e:
            print(f"βœ— Error during analysis: {e}")
            import traceback
            traceback.print_exc()
# Script entry point: run the smoke test when executed directly.
if __name__ == "__main__":
    test_multi_index_functionality()