#!/usr/bin/env python3
"""
Test script to validate COCA integration.
"""
import sys
import os

sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))

from lexical_sophistication import LexicalSophisticationAnalyzer
import pandas as pd
from pathlib import Path


def _load_coca_frequencies(coca_path):
    """Load the COCA unigram list and return a {word: frequency} dict.

    Returns None (after printing a diagnostic) when the file is missing.
    The file is tab-separated despite its .csv extension and has no
    header row, so column names are supplied explicitly.
    """
    if not coca_path.exists():
        print(f"✗ COCA file not found at {coca_path}")
        return None
    print(f"✓ Found COCA file at {coca_path}")
    df = pd.read_csv(
        coca_path,
        sep='\t',
        header=None,
        names=['word', 'frequency', 'normalized_freq', 'range', 'dispersion'],
    )
    print(f"✓ Loaded COCA data with {len(df)} entries")
    # Lower-case the keys so lookups are case-insensitive.
    return dict(zip(df['word'].str.lower(), df['frequency']))


def _spot_check_words(word_freq_dict, test_words):
    """Print whether each probe word is present in the frequency dict."""
    for word in test_words:
        if word in word_freq_dict:
            print(f"✓ Found '{word}' with frequency {word_freq_dict[word]:,}")
        else:
            print(f"✗ Word '{word}' not found in COCA data")


def _report_results(results):
    """Pretty-print analyzer output: text stats, summary, token details."""
    print("✓ Analysis complete:")
    print(f"  - Processed {results['text_stats']['total_tokens']} tokens")
    print(f"  - Found {len(results['summary'])} summary statistics")
    print(f"  - Generated {len(results['token_details'])} token details")
    if results['summary']:
        print("\n✓ Summary statistics:")
        for key, stats in results['summary'].items():
            print(f"  - {key}: mean={stats['mean']:.2f}, count={stats['count']}")
    if results['token_details']:
        print("\n✓ Token details (first 5):")
        for token in results['token_details'][:5]:
            coca_score = token.get('COCA_spoken_token', 'NA')
            print(f"  - {token['token']}: {coca_score}")


def test_coca_integration():
    """Test loading and using COCA reference list.

    Returns:
        True when the COCA list loads and the analyzer processes the
        sample text; False on a missing data file or any exception.
    """
    print("Testing COCA Integration...")
    try:
        analyzer = LexicalSophisticationAnalyzer(language="en", model_size="trf")
        test_text = "The quick brown fox jumps over the lazy dog."

        coca_path = Path("resources/reference_lists/en/COCA_spoken_unigram_list.csv")
        word_freq_dict = _load_coca_frequencies(coca_path)
        if word_freq_dict is None:
            return False

        _spot_check_words(word_freq_dict, ['the', 'quick', 'brown', 'fox', 'jumps'])

        # The same word->frequency map serves as both the token- and
        # lemma-level reference; the analyzer looks them up separately.
        ref_lists = {
            "COCA_spoken": {
                "token": word_freq_dict,
                "lemma": word_freq_dict,
            }
        }
        analyzer.load_reference_lists(ref_lists)
        print("✓ Successfully loaded COCA reference lists into analyzer")

        results = analyzer.analyze_text(test_text, ["COCA_spoken"], apply_log=False)
        _report_results(results)
        return True
    except Exception as e:
        # Best-effort reporting for a test script: print, trace, and fail.
        print(f"✗ COCA integration test failed: {e}")
        import traceback
        traceback.print_exc()
        return False


def main():
    """Run COCA integration test and return True on success."""
    print("Running COCA Integration Test...")
    print("=" * 50)
    success = test_coca_integration()
    print("\n" + "=" * 50)
    if success:
        print("✓ COCA integration test passed!")
        return True
    else:
        print("✗ COCA integration test failed!")
        return False


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)