simple-text-analyzer / test /test_japanese_integration.py
egumasa's picture
removed temporary file writing
ca02ec3
#!/usr/bin/env python3
"""
Test script for Japanese lexical sophistication integration.
Tests the BCCWJ and CSJ frequency analysis with composite key lookup.
"""
import os
import sys
sys.path.append('.')
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
from web_app.config_manager import ConfigManager
def _load_enabled_reference_lists(unigram_config):
    """Load each enabled Japanese unigram list whose token file is on disk.

    Prints one progress line per list. For every loaded entry that is a dict
    flagged with ``is_japanese_corpus``, also prints the sizes of its
    composite, lemma and surface dictionaries. Lists that are disabled, whose
    token file is missing, that load as empty, or that raise while loading
    are left out of the result.

    Returns:
        dict mapping ``"unigrams_<list name>"`` to the loaded data.
    """
    loaded = {}
    for name, settings in unigram_config.items():
        if not settings.get('enabled', False):
            continue

        token_path = settings.get('files', {}).get('token', '')
        if not os.path.exists(token_path):
            print(f"⚠ File not found: {token_path}")
            continue

        print(f" Loading {name}...")
        try:
            payload = ConfigManager.load_reference_list_data(settings)
            if not payload:
                continue
            loaded[f"unigrams_{name}"] = payload

            # Report dictionary sizes for entries built as Japanese corpus data.
            for _file_type, entry in payload.items():
                is_corpus = isinstance(entry, dict) and entry.get('is_japanese_corpus')
                if not is_corpus:
                    continue
                n_composite = len(entry.get('composite_dict', {}))
                n_lemma = len(entry.get('lemma_dict', {}))
                n_surface = len(entry.get('surface_dict', {}))
                print(f" ✓ {name}: {n_composite} composite keys, {n_lemma} lemmas, {n_surface} surface forms")
        except Exception as e:
            # A single broken list is reported and skipped, not fatal.
            print(f" ✗ Error loading {name}: {e}")
    return loaded


def _print_analysis_results(results):
    """Print text statistics, the first five token rows and the summary table.

    Reads ``results['text_stats']``, ``results['token_details']`` and
    ``results['summary']``; a missing key propagates to the caller.
    """
    print("\n6. Analysis Results:")
    text_stats = results['text_stats']
    print(f" Total tokens: {text_stats['total_tokens']}")
    print(f" Content words: {text_stats['content_words']}")
    print(f" Function words: {text_stats['function_words']}")

    # Only the first five tokens are shown to keep the output short.
    print("\n Sample token analysis:")
    for position, token in enumerate(results['token_details'][:5], start=1):
        print(f" {position}. {token['token']} (lemma: {token['lemma']}, pos: {token['pos']})")
        for key, value in token.items():
            # Score columns end in _token/_lemma; 'NA' values are not shown.
            if key.endswith(('_token', '_lemma')) and value != 'NA':
                print(f" {key}: {value}")

    print("\n Summary statistics:")
    for key, stats in results['summary'].items():
        print(f" {key}: mean={stats['mean']:.2f}, count={stats['count']}")


def test_japanese_integration():
    """Run the Japanese lexical sophistication pipeline end to end.

    Steps: build the Japanese analyzer, read the Japanese unigram reference
    configuration, load the enabled reference lists, hand them to the
    analyzer, then analyze a short sample text and print the results.

    Returns:
        True when the analysis finishes; False when the analyzer cannot be
        created, no Japanese configuration or reference data is available,
        or the analysis raises.
    """
    print("=== Japanese Lexical Sophistication Integration Test ===\n")

    # Step 1: analyzer construction (any failure here aborts the test).
    print("1. Initializing Japanese analyzer...")
    try:
        analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")
        print("✓ Japanese SpaCy model loaded successfully")
        if getattr(analyzer, 'unidic_enricher', None):
            print("✓ UniDic enricher initialized successfully")
        else:
            print("⚠ UniDic enricher not available - using legacy mode")
    except Exception as e:
        print(f"✗ Failed to load Japanese model: {e}")
        print("Please install: python -m spacy download ja_core_news_md")
        return False

    # Step 2: reference configuration for Japanese unigram lists.
    print("\n2. Loading reference configuration...")
    full_config = ConfigManager.load_reference_config()
    unigram_lists = full_config.get('japanese', {}).get('unigrams', {})
    if not unigram_lists:
        print("✗ No Japanese configuration found")
        return False
    print(f"✓ Found {len(unigram_lists)} Japanese reference lists")

    # Step 3: load whichever enabled lists are actually present on disk.
    print("\n3. Testing data loading...")
    reference_data = _load_enabled_reference_lists(unigram_lists)
    if not reference_data:
        print("✗ No reference data loaded successfully")
        return False

    # Step 4: register the loaded lists with the analyzer.
    print("\n4. Loading reference data into analyzer...")
    analyzer.load_reference_lists(reference_data)
    print(f"✓ Loaded {len(reference_data)} reference lists")

    # Step 5: analyze a small sample text against every loaded list.
    print("\n5. Testing Japanese text analysis...")
    # The literal is kept flush-left so the sample text has no leading spaces.
    japanese_text = """
私は毎日学校に行きます。
友達と一緒に勉強して、とても楽しいです。
日本語の文法は少し難しいですが、頑張って覚えています。
"""
    selected_indices = list(reference_data)
    print(f" Using indices: {', '.join(selected_indices)}")

    try:
        results = analyzer.analyze_text(japanese_text, selected_indices)
        _print_analysis_results(results)
        print("\n✓ Japanese text analysis completed successfully!")
        return True
    except Exception as e:
        import traceback

        print(f"✗ Error during analysis: {e}")
        traceback.print_exc()
        return False
if __name__ == "__main__":
    # Run the integration check, announce the verdict, and mirror it in the
    # process exit status (0 on pass, 1 on failure).
    passed = test_japanese_integration()
    verdict = (
        "\n🎉 Japanese integration test PASSED!"
        if passed
        else "\n❌ Japanese integration test FAILED!"
    )
    print(verdict)
    sys.exit(0 if passed else 1)