#!/usr/bin/env python3
"""
Test script to validate COCA integration.
"""
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
from lexical_sophistication import LexicalSophisticationAnalyzer
import pandas as pd
from pathlib import Path
def test_coca_integration():
    """Test loading and using the COCA reference list.

    Loads the COCA spoken unigram frequency list from disk, wires it into a
    LexicalSophisticationAnalyzer, runs an analysis over a sample sentence,
    and prints progress/summary information along the way.

    Returns:
        bool: True if every step completed, False on any failure.
    """
    print("Testing COCA Integration...")
    try:
        # Create analyzer (English, transformer-sized spaCy model).
        analyzer = LexicalSophisticationAnalyzer(language="en", model_size="trf")

        # Sample text containing common English words.
        test_text = "The quick brown fox jumps over the lazy dog."

        # Load COCA data directly from the bundled resource file.
        # NOTE(review): the original status markers were mojibake ("β");
        # restored as ✓/✗ based on the success/failure branch each sits in.
        coca_path = Path("resources/reference_lists/en/COCA_spoken_unigram_list.csv")
        if not coca_path.exists():
            print(f"✗ COCA file not found at {coca_path}")
            return False
        print(f"✓ Found COCA file at {coca_path}")

        # The file is tab-separated with no header row.
        df = pd.read_csv(coca_path, sep='\t', header=None,
                         names=['word', 'frequency', 'normalized_freq', 'range', 'dispersion'])
        print(f"✓ Loaded COCA data with {len(df)} entries")

        # Build a lowercase word -> raw frequency lookup.
        word_freq_dict = dict(zip(df['word'].str.lower(), df['frequency']))

        # Spot-check a few common words from the sample sentence.
        test_words = ['the', 'quick', 'brown', 'fox', 'jumps']
        for word in test_words:
            if word in word_freq_dict:
                print(f"✓ Found '{word}' with frequency {word_freq_dict[word]:,}")
            else:
                print(f"✗ Word '{word}' not found in COCA data")

        # Register the same dict for both token- and lemma-level lookups.
        ref_lists = {
            "COCA_spoken": {
                "token": word_freq_dict,
                "lemma": word_freq_dict
            }
        }
        analyzer.load_reference_lists(ref_lists)
        print("✓ Successfully loaded COCA reference lists into analyzer")

        # Analyze the sample text against the COCA list (raw frequencies, no log transform).
        results = analyzer.analyze_text(test_text, ["COCA_spoken"], apply_log=False)
        print(f"✓ Analysis complete:")
        print(f"  - Processed {results['text_stats']['total_tokens']} tokens")
        print(f"  - Found {len(results['summary'])} summary statistics")
        print(f"  - Generated {len(results['token_details'])} token details")

        # Show aggregate statistics, if any were produced.
        if results['summary']:
            print("\n✓ Summary statistics:")
            for key, stats in results['summary'].items():
                print(f"  - {key}: mean={stats['mean']:.2f}, count={stats['count']}")

        # Show per-token scores for the first few tokens.
        if results['token_details']:
            print("\n✓ Token details (first 5):")
            for i, token in enumerate(results['token_details'][:5]):
                coca_score = token.get('COCA_spoken_token', 'NA')
                print(f"  - {token['token']}: {coca_score}")

        return True
    except Exception as e:
        # Best-effort test harness: report and signal failure instead of crashing.
        print(f"✗ COCA integration test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Run the COCA integration test and report the overall result.

    Returns:
        bool: True if the integration test passed, False otherwise.
    """
    print("Running COCA Integration Test...")
    print("=" * 50)

    success = test_coca_integration()

    print("\n" + "=" * 50)
    # NOTE(review): original status markers were mojibake ("β"); restored as ✓/✗.
    if success:
        print("✓ COCA integration test passed!")
        return True
    else:
        print("✗ COCA integration test failed!")
        return False
if __name__ == "__main__":
success = main()
sys.exit(0 if success else 1) |