# Spaces:
# Building
# Building
#!/usr/bin/env python3
"""
Debug script to test bigram and trigram processing.

Prints the enabled English reference lists, loads one bigram reference list
through ``ConfigManager``, hands it to ``LexicalSophisticationAnalyzer`` and
prints the keys and n-gram detail counts of the analysis result.

The current working directory is put on ``sys.path`` so the project packages
can be imported; run the script from the project root.
"""
import os
import sys

# Add the project root to the path. This must happen before the project
# imports below, which is why they are not grouped with the stdlib imports.
sys.path.insert(0, os.getcwd())

from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer  # noqa: E402
from web_app.config_manager import ConfigManager  # noqa: E402

# Simple text fed to the analyzer.
TEST_TEXT = "The cat sat on the mat. The dog ran quickly."

# Name of the bigram reference list exercised by this script.
BIGRAM_LIST_NAME = 'COCA_spoken_bigram_frequency_token'


def print_available_lists(english_config):
    """Print every enabled reference list, grouped by n-gram type.

    Args:
        english_config: Mapping of n-gram type to a mapping of
            list name -> list config, as read from the ``'english'``
            section of the reference config.
    """
    print("=== Available Reference Lists ===")
    for ngram_type, lists in english_config.items():
        print(f"\n{ngram_type.upper()}:")
        for list_name, list_config in lists.items():
            # A list without an explicit 'enabled' key counts as enabled.
            if list_config.get('enabled', True):
                print(f" - {list_name}")


def print_results(results):
    """Print the key sets and n-gram detail counts of an analysis result."""
    print("\n=== Analysis Results ===")
    print(f"Summary keys: {list(results['summary'].keys())}")
    print(f"Raw scores keys: {list(results['raw_scores'].keys())}")
    print(f"Bigram details count: {len(results.get('bigram_details', []))}")
    print(f"Trigram details count: {len(results.get('trigram_details', []))}")

    if results.get('bigram_details'):
        print("\nFirst few bigram details:")
        for detail in results['bigram_details'][:3]:
            print(f" {detail}")


def main():
    """Run the bigram reference loading and analysis checks."""
    # Create analyzer
    analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')

    # Load config
    config = ConfigManager.load_reference_config()
    english_config = config.get('english', {})
    print_available_lists(english_config)

    # Test loading a bigram reference
    print("\n=== Testing Bigram Reference Loading ===")
    bigram_config = english_config.get('bigrams', {}).get(BIGRAM_LIST_NAME, {})
    if not bigram_config:
        # Without a config there is nothing to load or analyze; say so
        # instead of passing an empty config on to the loader.
        print(f"No config found for {BIGRAM_LIST_NAME}; nothing to test.")
        return

    print(f"Config: {bigram_config}")

    # Load the data once; the same object is reused for the analyzer below.
    bigram_data = ConfigManager.load_reference_list_data(bigram_config)
    print(f"Loaded data keys: {bigram_data.keys()}")

    if 'bigram' in bigram_data:
        bigram_df = bigram_data['bigram']
        print(f"Bigram DataFrame shape: {bigram_df.shape}")
        print(f"Bigram DataFrame columns: {list(bigram_df.columns)}")
        print("First 5 bigrams:")
        print(bigram_df.head())

    # Test with full reference list structure
    print("\n=== Testing Analyzer with Bigram References ===")
    reference_lists = {BIGRAM_LIST_NAME: bigram_data}
    print(f"Reference lists for analyzer: {list(reference_lists.keys())}")
    for name, list_data in reference_lists.items():
        print(f" {name}: {list(list_data.keys())}")

    # Load into analyzer
    analyzer.load_reference_lists(reference_lists)

    # Analyze text
    results = analyzer.analyze_text(
        TEST_TEXT,
        list(reference_lists.keys()),
        apply_log=False,
    )
    print_results(results)


if __name__ == "__main__":
    main()