#!/usr/bin/env python3
"""Debug script to examine column naming issues in bigram/trigram plots.

Loads one unigram, one bigram and one trigram reference list, analyzes a
short sample text, and prints the keys the analyzer produced next to the
detail-column names that the column-matching algorithm under test derives
from them.
"""

import sys
import os

# Make the project packages importable. This uses the current working
# directory, so the script has to be launched from the project root.
sys.path.insert(0, os.getcwd())

from web_app.config_manager import ConfigManager
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer

# (config section, reference list name) pairs loaded for the experiment.
_REFERENCE_LISTS = (
    ('unigrams', 'COCA_spoken_frequency_token'),
    ('bigrams', 'COCA_spoken_bigram_frequency_token'),
    ('trigrams', 'COCA_trigram_frequency_token'),
)

_TEST_TEXT = "The cat sat on the mat. The dog ran quickly."


def _expected_detail_column(score_key, ngram):
    """Derive the detail column the algorithm under test expects.

    The key is split on ``_``; everything after the first ``ngram`` part is
    the measure name, and the expected column is the first key part joined
    to that measure name.

    Args:
        score_key: A raw-score key containing ``_<ngram>_``.
        ngram: ``'bigram'`` or ``'trigram'``.

    Returns:
        A ``(measure_name, expected_column)`` tuple.
    """
    parts = score_key.split('_')
    measure_name = '_'.join(parts[parts.index(ngram) + 1:])
    return measure_name, f"{parts[0]}_{measure_name}"


def _print_detail_columns(results, label):
    """Print the keys of the first ``<label>_details`` entry, if any exist."""
    print(f"\n=== {label.capitalize()} Details Columns ===")
    details = results[f"{label}_details"]
    if details:
        print(f" Sample {label}: {list(details[0].keys())}")


def _report_ngram_columns(results, ngram):
    """Check every ``_<ngram>_`` raw-score key against ``<ngram>_details``.

    For each matching key, prints the column name the algorithm expects and
    whether the first detail entry contains it. When it does not, the
    available columns and any column containing the measure name are listed.
    """
    marker = f"_{ngram}_"
    details_key = f"{ngram}_details"

    for key in results['raw_scores']:
        if marker not in key:
            continue

        print(f"\nAnalyzing {ngram} key: {key}")
        # A key containing "_<ngram>_" always splits into at least three
        # parts with <ngram> among them, so no further guard is needed.
        measure_name, expected_col = _expected_detail_column(key, ngram)
        print(f" Algorithm expects column: '{expected_col}'")

        details = results[details_key]
        if not details:
            continue

        sample = details[0]
        if expected_col in sample:
            print(f" ✅ Column found in {details_key}")
            continue

        print(f" ❌ Column NOT found in {details_key}")
        print(f" Available columns: {list(sample.keys())}")
        # Suggest candidates: any column that mentions the measure name.
        for col in sample:
            if measure_name in col:
                print(f" Possible match: '{col}'")


def debug_plot_columns() -> None:
    """Run the analyzer on a sample text and report column-name matches.

    Prints the raw-score keys, the columns of the first token, bigram and
    trigram detail entries, and then the column-matching analysis for
    bigram and trigram keys.

    Raises:
        KeyError: If one of the expected reference lists is missing from
            the English section of the reference config.
    """
    print("=== Debugging Plot Column Names ==")

    # Load config and resolve every reference entry before loading any data.
    config = ConfigManager.load_reference_config()
    english_config = config.get('english', {})
    list_configs = {
        name: english_config[section][name]
        for section, name in _REFERENCE_LISTS
    }
    reference_lists = {
        name: ConfigManager.load_reference_list_data(list_config)
        for name, list_config in list_configs.items()
    }

    # Create analyzer and analyze text.
    analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
    analyzer.load_reference_lists(reference_lists)
    results = analyzer.analyze_text(
        _TEST_TEXT, list(reference_lists), apply_log=False
    )

    print("\n=== Raw Scores Keys ===")
    for key in results['raw_scores']:
        print(f" {key}")

    for label in ('token', 'bigram', 'trigram'):
        _print_detail_columns(results, label)

    print("\n=== Column Matching Analysis ===")
    # Test the current algorithm for bigrams, then for trigrams.
    for ngram in ('bigram', 'trigram'):
        _report_ngram_columns(results, ngram)


if __name__ == "__main__":
    debug_plot_columns()