File size: 3,636 Bytes
a543e33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python3
"""
Test script to validate COCA integration.
"""

import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))

from lexical_sophistication import LexicalSophisticationAnalyzer
import pandas as pd
from pathlib import Path

def test_coca_integration():
    """Test loading and using the COCA spoken unigram reference list.

    Reads the tab-separated COCA list from ``resources/reference_lists/en``
    (a relative path, so it is resolved against the current working
    directory), builds a lowercase word -> frequency mapping, registers it
    with a ``LexicalSophisticationAnalyzer`` under the name ``"COCA_spoken"``
    for both the ``"token"`` and ``"lemma"`` entries, and analyzes one sample
    sentence. Progress is reported on stdout.

    Returns:
        bool: True when every step completed without raising; False when the
        COCA file is missing or any step raised (the traceback is printed).
    """
    # Written as escapes so the status glyphs survive any re-encoding of this
    # file; previously they were emitted as mojibake.
    ok = "\u2713"    # check mark
    fail = "\u2717"  # ballot X

    print("Testing COCA Integration...")

    try:
        # Look for the data file first: there is no point constructing the
        # analyzer when the reference list is not available.
        coca_path = Path("resources/reference_lists/en/COCA_spoken_unigram_list.csv")
        if not coca_path.exists():
            print(f"{fail} COCA file not found at {coca_path}")
            return False

        print(f"{ok} Found COCA file at {coca_path}")

        # Create analyzer
        analyzer = LexicalSophisticationAnalyzer(language="en", model_size="trf")

        # Test text
        test_text = "The quick brown fox jumps over the lazy dog."

        # The list is read as tab-separated with no header row; the column
        # names are assigned here.
        df = pd.read_csv(
            coca_path,
            sep='\t',
            header=None,
            names=['word', 'frequency', 'normalized_freq', 'range', 'dispersion'],
        )

        print(f"{ok} Loaded COCA data with {len(df)} entries")

        # Create word frequency dictionary keyed by the lowercased word
        word_freq_dict = dict(zip(df['word'].str.lower(), df['frequency']))

        # Spot-check some common words
        test_words = ['the', 'quick', 'brown', 'fox', 'jumps']
        for word in test_words:
            if word in word_freq_dict:
                print(f"{ok} Found '{word}' with frequency {word_freq_dict[word]:,}")
            else:
                print(f"{fail} Word '{word}' not found in COCA data")

        # The same mapping is supplied for both the token and lemma entries.
        ref_lists = {
            "COCA_spoken": {
                "token": word_freq_dict,
                "lemma": word_freq_dict,
            }
        }

        analyzer.load_reference_lists(ref_lists)
        print(f"{ok} Successfully loaded COCA reference lists into analyzer")

        # Analyze text
        results = analyzer.analyze_text(test_text, ["COCA_spoken"], apply_log=False)

        print(f"{ok} Analysis complete:")
        print(f"  - Processed {results['text_stats']['total_tokens']} tokens")
        print(f"  - Found {len(results['summary'])} summary statistics")
        print(f"  - Generated {len(results['token_details'])} token details")

        # Show some results
        if results['summary']:
            print(f"\n{ok} Summary statistics:")
            for key, stats in results['summary'].items():
                print(f"  - {key}: mean={stats['mean']:.2f}, count={stats['count']}")

        # Show token details for first few words
        if results['token_details']:
            print(f"\n{ok} Token details (first 5):")
            for token in results['token_details'][:5]:
                coca_score = token.get('COCA_spoken_token', 'NA')
                print(f"  - {token['token']}: {coca_score}")

        return True

    except Exception as e:
        # Top-level boundary of the smoke test: report, dump the traceback for
        # diagnosis, and signal failure to the caller instead of propagating.
        print(f"{fail} COCA integration test failed: {e}")
        import traceback
        traceback.print_exc()
        return False

def main():
    """Run the COCA integration test and print a pass/fail banner.

    Returns:
        bool: True if ``test_coca_integration()`` reported success,
        otherwise False.
    """
    rule = "=" * 50

    print("Running COCA Integration Test...")
    print(rule)

    success = test_coca_integration()

    print("\n" + rule)

    # Status glyphs are written as escapes so they survive re-encoding of this
    # file; previously they were emitted as mojibake.
    if success:
        print("\u2713 COCA integration test passed!")
        return True

    print("\u2717 COCA integration test failed!")
    return False

if __name__ == "__main__":
    # Map the boolean outcome of main() onto a process exit status:
    # 0 when the test passed, 1 when it failed.
    success = main()
    exit_status = 0 if success else 1
    sys.exit(exit_status)