File size: 4,813 Bytes
e7279e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python3
"""
Debug script to examine column naming issues in bigram/trigram plots
"""

import sys
import os

# Add the current working directory to the import path (run this script from the project root)
sys.path.insert(0, os.getcwd())

from web_app.config_manager import ConfigManager
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer

def _report_ngram_column_matches(results, ngram):
    """Print how each ``raw_scores`` key for *ngram* maps onto a details column.

    For every key in ``results['raw_scores']`` that contains ``_<ngram>_``,
    the expected details-column name is rebuilt the same way for bigrams and
    trigrams: the first underscore-separated part of the key, followed by
    everything after the ``<ngram>`` part. The function then reports whether
    that column is present in the first entry of ``results['<ngram>_details']``
    and, if not, lists the available columns plus any column whose name
    contains the measure name.

    Args:
        results: Mapping returned by the analyzer; must provide ``raw_scores``
            and ``<ngram>_details`` entries.
        ngram: Either ``'bigram'`` or ``'trigram'``.
    """
    details_key = f"{ngram}_details"
    marker = f"_{ngram}_"

    for key in results['raw_scores'].keys():
        if marker not in key:
            continue

        print(f"\nAnalyzing {ngram} key: {key}")
        key_parts = key.split('_')
        # Kept as a guard because it is part of the algorithm being examined,
        # even though the marker check above already implies it.
        if len(key_parts) < 3 or ngram not in key_parts:
            continue

        measure_name = '_'.join(key_parts[key_parts.index(ngram) + 1:])
        index_measure_col = f"{key_parts[0]}_{measure_name}"
        print(f"  Algorithm expects column: '{index_measure_col}'")

        # Only the first details entry is inspected as a sample.
        details = results[details_key]
        if not details:
            continue

        sample = details[0]
        if index_measure_col in sample:
            print(f"  ✅ Column found in {details_key}")
            continue

        print(f"  ❌ Column NOT found in {details_key}")
        print(f"  Available columns: {list(sample.keys())}")

        # Suggest columns that at least contain the measure name.
        for col in sample.keys():
            if measure_name in col:
                print(f"  Possible match: '{col}'")


def debug_plot_columns():
    """Print diagnostics comparing raw-score keys with n-gram detail columns.

    Loads one unigram, one bigram and one trigram reference list from the
    ``english`` section of the reference config, analyzes a short fixed text
    with them (``apply_log=False``), and prints:

    * every key in ``results['raw_scores']``;
    * the column names of the first token, bigram and trigram detail entry
      (when the corresponding details list is non-empty);
    * for each bigram and trigram raw-score key, the column name the matching
      algorithm expects and whether it exists in the details.

    Returns:
        None. All output goes to stdout.

    Raises:
        KeyError: If one of the hard-coded reference entries is missing from
            the loaded config.
    """
    print("=== Debugging Plot Column Names ==")

    # Load config and pick the reference entries to examine.
    config = ConfigManager.load_reference_config()
    english_config = config.get('english', {})

    selected_references = (
        ('unigrams', 'COCA_spoken_frequency_token'),
        ('bigrams', 'COCA_spoken_bigram_frequency_token'),
        ('trigrams', 'COCA_trigram_frequency_token'),
    )
    # Resolve every config entry before loading any data, so a missing entry
    # fails before any reference list is read.
    reference_configs = {
        name: english_config[category][name]
        for category, name in selected_references
    }
    reference_lists = {
        name: ConfigManager.load_reference_list_data(ref_config)
        for name, ref_config in reference_configs.items()
    }

    # Create analyzer and analyze text
    analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
    analyzer.load_reference_lists(reference_lists)

    test_text = "The cat sat on the mat. The dog ran quickly."
    results = analyzer.analyze_text(test_text, list(reference_lists.keys()), apply_log=False)

    print("\n=== Raw Scores Keys ===")
    for key in results['raw_scores'].keys():
        print(f"  {key}")

    # Show the columns of one sample entry per detail level.
    for level in ('token', 'bigram', 'trigram'):
        print(f"\n=== {level.capitalize()} Details Columns ===")
        details = results[f'{level}_details']
        if details:
            print(f"  Sample {level}: {list(details[0].keys())}")

    print("\n=== Column Matching Analysis ===")

    # Test the current algorithm for bigrams, then for trigrams.
    for ngram in ('bigram', 'trigram'):
        _report_ngram_column_matches(results, ngram)

# Run the diagnostics only when this file is executed directly as a script.
if __name__ == "__main__":
    debug_plot_columns()