Spaces:
Building
Building
File size: 4,813 Bytes
e7279e4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
#!/usr/bin/env python3
"""
Debug script to examine column naming issues in bigram/trigram plots
"""
import sys
import os
# Add the project root to the path
sys.path.insert(0, os.getcwd())
from web_app.config_manager import ConfigManager
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
def _print_detail_columns(results, kind):
    """Print the column names of the first ``<kind>_details`` record.

    Args:
        results: Mapping returned by the analyzer; must contain the key
            ``f"{kind}_details"``.
        kind: One of ``'token'``, ``'bigram'`` or ``'trigram'``.
    """
    print(f"\n=== {kind.capitalize()} Details Columns ===")
    details = results[f"{kind}_details"]
    # Only the first record is inspected; an empty list prints the header only.
    if details:
        print(f" Sample {kind}: {list(details[0].keys())}")


def _expected_detail_column(key, ngram):
    """Derive the detail column that the algorithm under test expects.

    The score key is split on ``'_'``; everything after the first ``ngram``
    part is the measure name, and the expected column is the first key part
    joined to that measure name.

    Args:
        key: A raw-score key containing ``f"_{ngram}_"``.
        ngram: ``'bigram'`` or ``'trigram'``.

    Returns:
        A ``(measure_name, expected_column)`` tuple of strings.
    """
    parts = key.split('_')
    # The caller guarantees '_<ngram>_' occurs in key, so `ngram` is always
    # one of the parts and there are always at least three parts.
    measure_name = '_'.join(parts[parts.index(ngram) + 1:])
    return measure_name, f"{parts[0]}_{measure_name}"


def _check_ngram_columns(results, ngram):
    """Report whether each ``ngram`` score key maps to a real detail column.

    For every key in ``results['raw_scores']`` containing ``f"_{ngram}_"``,
    prints the column name the algorithm expects and whether that column is
    present in the first ``<ngram>_details`` record. On a miss, the available
    columns and any column containing the measure name are listed.

    Args:
        results: Mapping returned by the analyzer.
        ngram: ``'bigram'`` or ``'trigram'``.
    """
    details_key = f"{ngram}_details"
    marker = f"_{ngram}_"
    for key in results['raw_scores'].keys():
        if marker not in key:
            continue
        print(f"\nAnalyzing {ngram} key: {key}")
        measure_name, expected_col = _expected_detail_column(key, ngram)
        print(f" Algorithm expects column: '{expected_col}'")

        details = results[details_key]
        if not details:
            continue
        sample = details[0]
        if expected_col in sample:
            print(f" ✅ Column found in {details_key}")
            continue

        print(f" ❌ Column NOT found in {details_key}")
        print(f" Available columns: {list(sample.keys())}")
        # Suggest candidates: any column whose name contains the measure name.
        for col in sample.keys():
            if measure_name in col:
                print(f" Possible match: '{col}'")


def debug_plot_columns():
    """Print diagnostics comparing score keys with n-gram detail columns.

    Loads one unigram, one bigram and one trigram reference list from the
    ``'english'`` section of the reference config, analyzes a short fixed
    sentence pair, and prints:

    * the raw score keys,
    * the columns of the first token / bigram / trigram detail record,
    * for every bigram and trigram score key, whether the column name
      derived by the algorithm under test exists in the detail records.

    NOTE(review): the derivation in ``_expected_detail_column`` presumably
    mirrors the lookup used by the plotting code; confirm against that module
    before changing it here.

    Returns:
        None. All output goes to stdout.

    Raises:
        KeyError: If the expected sections or list names are missing from the
            loaded config, or expected keys are missing from the results.
    """
    print("=== Debugging Plot Column Names ==")

    # Load config and resolve the three reference-list configs up front, so a
    # missing entry fails before any list data is loaded.
    config = ConfigManager.load_reference_config()
    english_config = config.get('english', {})
    reference_specs = (
        ('unigrams', 'COCA_spoken_frequency_token'),
        ('bigrams', 'COCA_spoken_bigram_frequency_token'),
        ('trigrams', 'COCA_trigram_frequency_token'),
    )
    list_configs = {
        name: english_config[section][name]
        for section, name in reference_specs
    }
    reference_lists = {
        name: ConfigManager.load_reference_list_data(list_config)
        for name, list_config in list_configs.items()
    }

    # Create analyzer and analyze text
    analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
    analyzer.load_reference_lists(reference_lists)
    test_text = "The cat sat on the mat. The dog ran quickly."
    results = analyzer.analyze_text(
        test_text, list(reference_lists.keys()), apply_log=False
    )

    print("\n=== Raw Scores Keys ===")
    for key in results['raw_scores'].keys():
        print(f" {key}")

    for kind in ('token', 'bigram', 'trigram'):
        _print_detail_columns(results, kind)

    print("\n=== Column Matching Analysis ===")
    # All bigram keys are reported before any trigram key.
    for ngram in ('bigram', 'trigram'):
        _check_ngram_columns(results, ngram)
# Run the diagnostic only when this file is executed as a script.
if __name__ == "__main__":
    debug_plot_columns()
|