Spaces:
Building
Building
| #!/usr/bin/env python3 | |
| """ | |
| Test script for Japanese lexical sophistication integration. | |
| Tests the BCCWJ and CSJ frequency analysis with composite key lookup. | |
| """ | |
| import os | |
| import sys | |
| sys.path.append('.') | |
| from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer | |
| from web_app.config_manager import ConfigManager | |
def _init_analyzer():
    """Step 1: build the Japanese analyzer.

    Returns the analyzer instance, or None when the SpaCy Japanese model
    cannot be loaded (the caller treats None as test failure).
    """
    print("1. Initializing Japanese analyzer...")
    try:
        analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")
        print("✓ Japanese SpaCy model loaded successfully")
    except Exception as e:
        print(f"✗ Failed to load Japanese model: {e}")
        print("Please install: python -m spacy download ja_core_news_md")
        return None
    # UniDic enrichment is optional; the analyzer falls back to legacy mode.
    if getattr(analyzer, 'unidic_enricher', None):
        print("✓ UniDic enricher initialized successfully")
    else:
        print("⚠ UniDic enricher not available - using legacy mode")
    return analyzer


def _report_corpus_stats(list_name, data):
    """Print composite/lemma/surface dictionary sizes for Japanese corpus entries."""
    for file_type, file_data in data.items():
        if isinstance(file_data, dict) and file_data.get('is_japanese_corpus'):
            composite_count = len(file_data.get('composite_dict', {}))
            lemma_count = len(file_data.get('lemma_dict', {}))
            surface_count = len(file_data.get('surface_dict', {}))
            print(f" ✓ {list_name}: {composite_count} composite keys, {lemma_count} lemmas, {surface_count} surface forms")


def _load_reference_data():
    """Steps 2-3: read the reference config and load every enabled Japanese list.

    Returns a dict mapping "unigrams_<list_name>" to the loaded list data,
    or None when no configuration / no data could be loaded.
    """
    print("\n2. Loading reference configuration...")
    config = ConfigManager.load_reference_config()
    japanese_config = config.get('japanese', {}).get('unigrams', {})
    if not japanese_config:
        print("✗ No Japanese configuration found")
        return None
    print(f"✓ Found {len(japanese_config)} Japanese reference lists")

    print("\n3. Testing data loading...")
    reference_data = {}
    for list_name, list_config in japanese_config.items():
        if not list_config.get('enabled', False):
            continue
        file_path = list_config.get('files', {}).get('token', '')
        # A missing 'token' entry yields '', which also fails this existence check.
        if not os.path.exists(file_path):
            print(f"⚠ File not found: {file_path}")
            continue
        print(f" Loading {list_name}...")
        try:
            data = ConfigManager.load_reference_list_data(list_config)
            if data:
                reference_data[f"unigrams_{list_name}"] = data
                _report_corpus_stats(list_name, data)
        except Exception as e:
            print(f" ✗ Error loading {list_name}: {e}")

    if not reference_data:
        print("✗ No reference data loaded successfully")
        return None
    return reference_data


def _run_analysis(analyzer, reference_data):
    """Steps 5-6: analyze sample Japanese text and print a short report.

    Returns True on success, False when the analyzer raises.
    """
    print("\n5. Testing Japanese text analysis...")
    japanese_text = """
私は毎日学校に行きます。
友達と一緒に勉強して、とても楽しいです。
日本語の文法は少し難しいですが、頑張って覚えています。
"""
    selected_indices = list(reference_data.keys())
    print(f" Using indices: {', '.join(selected_indices)}")
    try:
        results = analyzer.analyze_text(japanese_text, selected_indices)
        print(f"\n6. Analysis Results:")
        print(f" Total tokens: {results['text_stats']['total_tokens']}")
        print(f" Content words: {results['text_stats']['content_words']}")
        print(f" Function words: {results['text_stats']['function_words']}")

        print(f"\n Sample token analysis:")
        # Only the first five tokens, to keep console output readable.
        for i, token in enumerate(results['token_details'][:5]):
            print(f" {i+1}. {token['token']} (lemma: {token['lemma']}, pos: {token['pos']})")
            # Per-list lookup results live under *_token / *_lemma keys;
            # 'NA' marks a miss and is not worth printing.
            for key, value in token.items():
                if (key.endswith('_token') or key.endswith('_lemma')) and value != 'NA':
                    print(f" {key}: {value}")

        print(f"\n Summary statistics:")
        for key, stats in results['summary'].items():
            print(f" {key}: mean={stats['mean']:.2f}, count={stats['count']}")
        print(f"\n✓ Japanese text analysis completed successfully!")
        return True
    except Exception as e:
        print(f"✗ Error during analysis: {e}")
        import traceback
        traceback.print_exc()
        return False


def test_japanese_integration():
    """Test Japanese corpus integration with sample text.

    Runs the six integration steps (analyzer init, config load, data load,
    reference registration, text analysis, reporting) and returns True only
    when every step succeeds.
    """
    print("=== Japanese Lexical Sophistication Integration Test ===\n")

    analyzer = _init_analyzer()
    if analyzer is None:
        return False

    reference_data = _load_reference_data()
    if reference_data is None:
        return False

    # Step 4: register the loaded lists with the analyzer.
    print("\n4. Loading reference data into analyzer...")
    analyzer.load_reference_lists(reference_data)
    print(f"✓ Loaded {len(reference_data)} reference lists")

    return _run_analysis(analyzer, reference_data)
| if __name__ == "__main__": | |
| success = test_japanese_integration() | |
| if success: | |
| print("\n🎉 Japanese integration test PASSED!") | |
| else: | |
| print("\n❌ Japanese integration test FAILED!") | |
| sys.exit(0 if success else 1) | |