Spaces:
Building
Building
| #!/usr/bin/env python3 | |
| """ | |
| Test script for Japanese lexical sophistication integration. | |
| Tests the BCCWJ and CSJ frequency analysis with composite key lookup. | |
| """ | |
| import os | |
| import sys | |
| sys.path.append('.') | |
| from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer | |
| from web_app.config_manager import ConfigManager | |
def _init_analyzer():
    """Step 1: build the Japanese analyzer.

    Returns the analyzer instance, or None when the SpaCy Japanese model
    cannot be loaded (the caller treats None as test failure).
    """
    print("1. Initializing Japanese analyzer...")
    try:
        analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")
        print("✓ Japanese SpaCy model loaded successfully")
    except Exception as e:
        print(f"✗ Failed to load Japanese model: {e}")
        print("Please install: python -m spacy download ja_core_news_md")
        return None
    # UniDic enrichment is optional; the analyzer falls back to legacy mode.
    if getattr(analyzer, 'unidic_enricher', None):
        print("✓ UniDic enricher initialized successfully")
    else:
        print("⚠ UniDic enricher not available - using legacy mode")
    return analyzer


def _report_corpus_stats(list_name, data):
    """Print composite/lemma/surface dictionary sizes for Japanese corpus entries."""
    for file_type, file_data in data.items():
        if isinstance(file_data, dict) and file_data.get('is_japanese_corpus'):
            composite_count = len(file_data.get('composite_dict', {}))
            lemma_count = len(file_data.get('lemma_dict', {}))
            surface_count = len(file_data.get('surface_dict', {}))
            print(f" ✓ {list_name}: {composite_count} composite keys, {lemma_count} lemmas, {surface_count} surface forms")


def _load_reference_data():
    """Steps 2-3: read the reference config and load every enabled Japanese list.

    Returns a dict mapping "unigrams_<list_name>" to the loaded list data,
    or None when no configuration / no data could be loaded.
    """
    print("\n2. Loading reference configuration...")
    config = ConfigManager.load_reference_config()
    japanese_config = config.get('japanese', {}).get('unigrams', {})
    if not japanese_config:
        print("✗ No Japanese configuration found")
        return None
    print(f"✓ Found {len(japanese_config)} Japanese reference lists")

    print("\n3. Testing data loading...")
    reference_data = {}
    for list_name, list_config in japanese_config.items():
        if not list_config.get('enabled', False):
            continue
        file_path = list_config.get('files', {}).get('token', '')
        # A missing 'token' entry yields '', which also fails this existence check.
        if not os.path.exists(file_path):
            print(f"⚠ File not found: {file_path}")
            continue
        print(f" Loading {list_name}...")
        try:
            data = ConfigManager.load_reference_list_data(list_config)
            if data:
                reference_data[f"unigrams_{list_name}"] = data
                _report_corpus_stats(list_name, data)
        except Exception as e:
            print(f" ✗ Error loading {list_name}: {e}")

    if not reference_data:
        print("✗ No reference data loaded successfully")
        return None
    return reference_data


def _run_analysis(analyzer, reference_data):
    """Steps 5-6: analyze sample Japanese text and print a short report.

    Returns True on success, False when the analyzer raises.
    """
    print("\n5. Testing Japanese text analysis...")
    japanese_text = """
私は毎日学校に行きます。
友達と一緒に勉強して、とても楽しいです。
日本語の文法は少し難しいですが、頑張って覚えています。
"""
    selected_indices = list(reference_data.keys())
    print(f" Using indices: {', '.join(selected_indices)}")
    try:
        results = analyzer.analyze_text(japanese_text, selected_indices)
        print(f"\n6. Analysis Results:")
        print(f" Total tokens: {results['text_stats']['total_tokens']}")
        print(f" Content words: {results['text_stats']['content_words']}")
        print(f" Function words: {results['text_stats']['function_words']}")

        print(f"\n Sample token analysis:")
        # Only the first five tokens, to keep console output readable.
        for i, token in enumerate(results['token_details'][:5]):
            print(f" {i+1}. {token['token']} (lemma: {token['lemma']}, pos: {token['pos']})")
            # Per-list lookup results live under *_token / *_lemma keys;
            # 'NA' marks a miss and is not worth printing.
            for key, value in token.items():
                if (key.endswith('_token') or key.endswith('_lemma')) and value != 'NA':
                    print(f" {key}: {value}")

        print(f"\n Summary statistics:")
        for key, stats in results['summary'].items():
            print(f" {key}: mean={stats['mean']:.2f}, count={stats['count']}")
        print(f"\n✓ Japanese text analysis completed successfully!")
        return True
    except Exception as e:
        print(f"✗ Error during analysis: {e}")
        import traceback
        traceback.print_exc()
        return False


def test_japanese_integration():
    """Test Japanese corpus integration with sample text.

    Runs the six integration steps (analyzer init, config load, data load,
    reference registration, text analysis, reporting) and returns True only
    when every step succeeds.
    """
    print("=== Japanese Lexical Sophistication Integration Test ===\n")

    analyzer = _init_analyzer()
    if analyzer is None:
        return False

    reference_data = _load_reference_data()
    if reference_data is None:
        return False

    # Step 4: register the loaded lists with the analyzer.
    print("\n4. Loading reference data into analyzer...")
    analyzer.load_reference_lists(reference_data)
    print(f"✓ Loaded {len(reference_data)} reference lists")

    return _run_analysis(analyzer, reference_data)
| if __name__ == "__main__": | |
| success = test_japanese_integration() | |
| if success: | |
| print("\n🎉 Japanese integration test PASSED!") | |
| else: | |
| print("\n❌ Japanese integration test FAILED!") | |
| sys.exit(0 if success else 1) | |