#!/usr/bin/env python3
"""
Debug script that loads a bigram reference list and runs the analyzer on a
short sample text, printing the bigram (and trigram) details it reports.
"""

import sys
import os

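# Add the current working directory to the import path; run this script from
# the project root so the project packages below can be imported.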
sys.path.insert(0, os.getcwd())

from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
from web_app.config_manager import ConfigManager

# Name of the reference list exercised by this script. It is used both as the
# lookup key inside the 'bigrams' section of the reference config and as the
# name under which the loaded data is handed to the analyzer.
BIGRAM_LIST_NAME = 'COCA_spoken_bigram_frequency_token'

# Test simple text
test_text = "The cat sat on the mat. The dog ran quickly."

# Create analyzer
analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')

# Load config
config = ConfigManager.load_reference_config()
english_config = config.get('english', {})

# List every reference list that is not explicitly disabled (lists without an
# 'enabled' key are treated as enabled).
print("=== Available Reference Lists ===")
for ngram_type, lists in english_config.items():
    print(f"\n{ngram_type.upper()}:")
    for list_name, list_config in lists.items():
        if list_config.get('enabled', True):
            print(f"  - {list_name}")

# Test loading a bigram reference
print("\n=== Testing Bigram Reference Loading ===")
bigram_config = english_config.get('bigrams', {}).get(BIGRAM_LIST_NAME, {})
if not bigram_config:
    # Without a config entry there is nothing meaningful to load or analyze;
    # stop here instead of passing an empty config to the loader.
    sys.exit(
        f"No configuration found for bigram list '{BIGRAM_LIST_NAME}'; "
        "nothing to test."
    )

print(f"Config: {bigram_config}")

# Load the data once; the same object is reused for the analyzer below so the
# reference file is not read a second time.
bigram_data = ConfigManager.load_reference_list_data(bigram_config)
print(f"Loaded data keys: {bigram_data.keys()}")

if 'bigram' in bigram_data:
    bigram_df = bigram_data['bigram']
    print(f"Bigram DataFrame shape: {bigram_df.shape}")
    print(f"Bigram DataFrame columns: {list(bigram_df.columns)}")
    print("First 5 bigrams:")
    print(bigram_df.head())

# Test with full reference list structure
print("\n=== Testing Analyzer with Bigram References ===")
reference_lists = {BIGRAM_LIST_NAME: bigram_data}

print(f"Reference lists for analyzer: {list(reference_lists.keys())}")
for name, ref_data in reference_lists.items():
    print(f"  {name}: {list(ref_data.keys())}")

# Load into analyzer
analyzer.load_reference_lists(reference_lists)

# Analyze text
results = analyzer.analyze_text(
    test_text,
    list(reference_lists.keys()),
    apply_log=False,
)

print("\n=== Analysis Results ===")
print(f"Summary keys: {list(results['summary'].keys())}")
print(f"Raw scores keys: {list(results['raw_scores'].keys())}")
# The detail lists may be absent from the results, so fall back to empty.
print(f"Bigram details count: {len(results.get('bigram_details', []))}")
print(f"Trigram details count: {len(results.get('trigram_details', []))}")

if results.get('bigram_details'):
    print("\nFirst few bigram details:")
    for detail in results['bigram_details'][:3]:
        print(f"  {detail}")