#!/usr/bin/env python3
"""
Test script to validate COCA integration.
"""
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
from lexical_sophistication import LexicalSophisticationAnalyzer
import pandas as pd
from pathlib import Path
def test_coca_integration():
    """Test loading and using the COCA reference list.

    Loads the COCA spoken unigram frequency list from disk, wires it into a
    LexicalSophisticationAnalyzer, runs an analysis over a sample sentence,
    and prints progress/summary information along the way.

    Returns:
        bool: True if every step completed, False on any failure.
    """
    print("Testing COCA Integration...")
    try:
        # Create analyzer (English, transformer-sized spaCy model).
        analyzer = LexicalSophisticationAnalyzer(language="en", model_size="trf")

        # Sample text containing common English words.
        test_text = "The quick brown fox jumps over the lazy dog."

        # Load COCA data directly from the bundled resource file.
        # NOTE(review): the original status markers were mojibake ("β");
        # restored as ✓/✗ based on the success/failure branch each sits in.
        coca_path = Path("resources/reference_lists/en/COCA_spoken_unigram_list.csv")
        if not coca_path.exists():
            print(f"✗ COCA file not found at {coca_path}")
            return False
        print(f"✓ Found COCA file at {coca_path}")

        # The file is tab-separated with no header row.
        df = pd.read_csv(coca_path, sep='\t', header=None,
                         names=['word', 'frequency', 'normalized_freq', 'range', 'dispersion'])
        print(f"✓ Loaded COCA data with {len(df)} entries")

        # Build a lowercase word -> raw frequency lookup.
        word_freq_dict = dict(zip(df['word'].str.lower(), df['frequency']))

        # Spot-check a few common words from the sample sentence.
        test_words = ['the', 'quick', 'brown', 'fox', 'jumps']
        for word in test_words:
            if word in word_freq_dict:
                print(f"✓ Found '{word}' with frequency {word_freq_dict[word]:,}")
            else:
                print(f"✗ Word '{word}' not found in COCA data")

        # Register the same dict for both token- and lemma-level lookups.
        ref_lists = {
            "COCA_spoken": {
                "token": word_freq_dict,
                "lemma": word_freq_dict
            }
        }
        analyzer.load_reference_lists(ref_lists)
        print("✓ Successfully loaded COCA reference lists into analyzer")

        # Analyze the sample text against the COCA list (raw frequencies, no log transform).
        results = analyzer.analyze_text(test_text, ["COCA_spoken"], apply_log=False)
        print(f"✓ Analysis complete:")
        print(f"  - Processed {results['text_stats']['total_tokens']} tokens")
        print(f"  - Found {len(results['summary'])} summary statistics")
        print(f"  - Generated {len(results['token_details'])} token details")

        # Show aggregate statistics, if any were produced.
        if results['summary']:
            print("\n✓ Summary statistics:")
            for key, stats in results['summary'].items():
                print(f"  - {key}: mean={stats['mean']:.2f}, count={stats['count']}")

        # Show per-token scores for the first few tokens.
        if results['token_details']:
            print("\n✓ Token details (first 5):")
            for i, token in enumerate(results['token_details'][:5]):
                coca_score = token.get('COCA_spoken_token', 'NA')
                print(f"  - {token['token']}: {coca_score}")

        return True
    except Exception as e:
        # Best-effort test harness: report and signal failure instead of crashing.
        print(f"✗ COCA integration test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Run the COCA integration test and report the overall result.

    Returns:
        bool: True if the integration test passed, False otherwise.
    """
    print("Running COCA Integration Test...")
    print("=" * 50)

    success = test_coca_integration()

    print("\n" + "=" * 50)
    # NOTE(review): original status markers were mojibake ("β"); restored as ✓/✗.
    if success:
        print("✓ COCA integration test passed!")
        return True
    else:
        print("✗ COCA integration test failed!")
        return False
if __name__ == "__main__":
success = main()
sys.exit(0 if success else 1) |