#!/usr/bin/env python3
"""
Debug script to test bigram and trigram processing.

Run it from the project root: the current working directory is put on
``sys.path`` so that the project packages ``text_analyzer`` and ``web_app``
can be imported. The script

1. prints the enabled reference lists found in the English configuration,
2. loads one bigram reference list and prints what was loaded,
3. hands that list to a ``LexicalSophisticationAnalyzer`` and prints the
   keys and bigram/trigram detail counts of the analysis of a short text.
"""

import os
import sys

# Add the project root to the path. This must run before the project imports
# below, which is why they cannot be grouped with the stdlib imports.
sys.path.insert(0, os.getcwd())

from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer  # noqa: E402
from web_app.config_manager import ConfigManager  # noqa: E402

# Name of the bigram reference list exercised by this script; it is used both
# as the config lookup key and as the list name handed to the analyzer.
BIGRAM_LIST_NAME = 'COCA_spoken_bigram_frequency_token'

# Test simple text
test_text = "The cat sat on the mat. The dog ran quickly."

# Create analyzer
analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')

# Load config
config = ConfigManager.load_reference_config()
english_config = config.get('english', {})

print("=== Available Reference Lists ===")
for ngram_type, lists in english_config.items():
    print(f"\n{ngram_type.upper()}:")
    for list_name, list_config in lists.items():
        # A list without an explicit 'enabled' flag is treated as enabled.
        if list_config.get('enabled', True):
            print(f" - {list_name}")

# Test loading a bigram reference
print("\n=== Testing Bigram Reference Loading ===")
bigram_config = english_config.get('bigrams', {}).get(BIGRAM_LIST_NAME, {})

if not bigram_config:
    # Say so explicitly instead of silently doing nothing (or passing an
    # empty config to the loader) when the list is not configured.
    print(f"No configuration found for '{BIGRAM_LIST_NAME}'; skipping bigram tests.")
else:
    print(f"Config: {bigram_config}")

    # Load the data
    data = ConfigManager.load_reference_list_data(bigram_config)
    print(f"Loaded data keys: {data.keys()}")

    if 'bigram' in data:
        # Presumably a pandas DataFrame (it is used via .shape/.columns/.head()).
        bigram_df = data['bigram']
        print(f"Bigram DataFrame shape: {bigram_df.shape}")
        print(f"Bigram DataFrame columns: {list(bigram_df.columns)}")
        print("First 5 bigrams:")
        print(bigram_df.head())

    # Test with full reference list structure
    print("\n=== Testing Analyzer with Bigram References ===")
    # Reuse the data loaded above rather than loading the same list again.
    reference_lists = {BIGRAM_LIST_NAME: data}

    print(f"Reference lists for analyzer: {list(reference_lists.keys())}")
    for name, list_data in reference_lists.items():
        print(f" {name}: {list(list_data.keys())}")

    # Load into analyzer
    analyzer.load_reference_lists(reference_lists)

    # Analyze text
    results = analyzer.analyze_text(
        test_text,
        list(reference_lists.keys()),
        apply_log=False
    )

    print("\n=== Analysis Results ===")
    print(f"Summary keys: {list(results['summary'].keys())}")
    print(f"Raw scores keys: {list(results['raw_scores'].keys())}")
    # The detail entries are read with .get() so a result without them
    # reports a count of 0 instead of raising KeyError.
    print(f"Bigram details count: {len(results.get('bigram_details', []))}")
    print(f"Trigram details count: {len(results.get('trigram_details', []))}")

    if results.get('bigram_details'):
        print("\nFirst few bigram details:")
        for detail in results['bigram_details'][:3]:
            print(f" {detail}")