simple-text-analyzer / debug_bigram_trigram.py
egumasa's picture
emuTAALES
e7279e4
#!/usr/bin/env python3
"""
Debug script to test bigram and trigram processing
"""
import sys
import os
# Add the project root to the path
sys.path.insert(0, os.getcwd())
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
from web_app.config_manager import ConfigManager
# Sample input: two short sentences used for every check in this script.
test_text = "The cat sat on the mat. The dog ran quickly."

# Build the analyzer under test.
analyzer = LexicalSophisticationAnalyzer(model_size='md', language='en')

# Fetch the reference-list configuration; an absent 'english' section is
# treated as an empty mapping so the listing below simply prints nothing.
config = ConfigManager.load_reference_config()
english_config = config.get('english', {})

# Print the name of every reference list that is not explicitly disabled,
# grouped under the upper-cased key of the section it belongs to.
print("=== Available Reference Lists ===")
for ngram_type, section_lists in english_config.items():
    print(f"\n{ngram_type.upper()}:")
    for list_name, entry in section_lists.items():
        # Entries without an 'enabled' key count as enabled.
        if not entry.get('enabled', True):
            continue
        print(f" - {list_name}")
# Look up one specific bigram reference list in the English config and show
# what the loader returns for it.
print("\n=== Testing Bigram Reference Loading ===")
bigram_section = english_config.get('bigrams', {})
bigram_config = bigram_section.get('COCA_spoken_bigram_frequency_token', {})

# An empty/missing entry leaves bigram_config falsy and skips the inspection.
if bigram_config:
    print(f"Config: {bigram_config}")

    # Load the data described by this config entry.
    data = ConfigManager.load_reference_list_data(bigram_config)
    print(f"Loaded data keys: {data.keys()}")

    # When a 'bigram' entry is present, dump its shape, columns and head()
    # (presumably a pandas DataFrame -- confirm against the loader).
    if 'bigram' in data:
        bigram_df = data['bigram']
        print(f"Bigram DataFrame shape: {bigram_df.shape}")
        print(f"Bigram DataFrame columns: {list(bigram_df.columns)}")
        print("First 5 bigrams:")
        print(bigram_df.head())
# Test with full reference list structure: load the bigram reference into the
# analyzer, analyze the sample text, and print what came back.
print("\n=== Testing Analyzer with Bigram References ===")

# bigram_config falls back to {} when the list is not configured. Guard on it
# so an empty config is never handed to the loader or the analyzer.
if bigram_config:
    reference_lists = {
        'COCA_spoken_bigram_frequency_token': ConfigManager.load_reference_list_data(bigram_config),
    }
    print(f"Reference lists for analyzer: {list(reference_lists.keys())}")
    # Use a dedicated loop name so the earlier module-level `data` is not clobbered.
    for name, list_data in reference_lists.items():
        print(f" {name}: {list(list_data.keys())}")

    # Load into analyzer
    analyzer.load_reference_lists(reference_lists)

    # Analyze text against every loaded reference list, without log transform.
    results = analyzer.analyze_text(
        test_text,
        list(reference_lists.keys()),
        apply_log=False,
    )

    print("\n=== Analysis Results ===")
    print(f"Summary keys: {list(results['summary'].keys())}")
    print(f"Raw scores keys: {list(results['raw_scores'].keys())}")

    # Fetch the bigram details once; a missing key is reported as zero entries.
    bigram_details = results.get('bigram_details', [])
    print(f"Bigram details count: {len(bigram_details)}")
    print(f"Trigram details count: {len(results.get('trigram_details', []))}")

    if bigram_details:
        print("\nFirst few bigram details:")
        for detail in bigram_details[:3]:
            print(f" {detail}")
else:
    print("Skipped: bigram reference 'COCA_spoken_bigram_frequency_token' is missing or empty in the config.")