Spaces:
Building
Building
| #!/usr/bin/env python3 | |
| """ | |
| Test script to validate COCA integration. | |
| """ | |
| import sys | |
| import os | |
| sys.path.append(os.path.join(os.path.dirname(__file__), 'backend')) | |
| from lexical_sophistication import LexicalSophisticationAnalyzer | |
| import pandas as pd | |
| from pathlib import Path | |
def test_coca_integration():
    """Test loading and using COCA reference list."""
    print("Testing COCA Integration...")
    try:
        # Build the analyzer under test.
        analyzer = LexicalSophisticationAnalyzer(language="en", model_size="trf")

        # Sample sentence fed through the full analysis pipeline below.
        test_text = "The quick brown fox jumps over the lazy dog."

        # Locate the COCA reference file before attempting to parse it.
        coca_path = Path("resources/reference_lists/en/COCA_spoken_unigram_list.csv")
        if not coca_path.exists():
            print(f"β COCA file not found at {coca_path}")
            return False
        print(f"β Found COCA file at {coca_path}")

        # NOTE(review): the file is tab-separated despite its .csv extension.
        coca_df = pd.read_csv(
            coca_path,
            sep='\t',
            header=None,
            names=['word', 'frequency', 'normalized_freq', 'range', 'dispersion'],
        )
        print(f"β Loaded COCA data with {len(coca_df)} entries")

        # Map lower-cased words to their raw corpus frequencies.
        freq_by_word = {
            word: freq
            for word, freq in zip(coca_df['word'].str.lower(), coca_df['frequency'])
        }

        # Spot-check a handful of common words against the lookup table.
        for probe in ['the', 'quick', 'brown', 'fox', 'jumps']:
            if probe in freq_by_word:
                print(f"β Found '{probe}' with frequency {freq_by_word[probe]:,}")
            else:
                print(f"β Word '{probe}' not found in COCA data")

        # Register the same table for both token- and lemma-level lookups.
        reference_lists = {
            "COCA_spoken": {
                "token": freq_by_word,
                "lemma": freq_by_word,
            }
        }
        analyzer.load_reference_lists(reference_lists)
        print("β Successfully loaded COCA reference lists into analyzer")

        # Run the analyzer end to end on the sample sentence.
        results = analyzer.analyze_text(test_text, ["COCA_spoken"], apply_log=False)
        print(f"β Analysis complete:")
        print(f" - Processed {results['text_stats']['total_tokens']} tokens")
        print(f" - Found {len(results['summary'])} summary statistics")
        print(f" - Generated {len(results['token_details'])} token details")

        # Report per-measure summary statistics, if any were produced.
        if results['summary']:
            print("\nβ Summary statistics:")
            for key, stats in results['summary'].items():
                print(f" - {key}: mean={stats['mean']:.2f}, count={stats['count']}")

        # Show token-level scores for the first few tokens.
        if results['token_details']:
            print("\nβ Token details (first 5):")
            for token in results['token_details'][:5]:
                coca_score = token.get('COCA_spoken_token', 'NA')
                print(f" - {token['token']}: {coca_score}")
        return True
    except Exception as e:
        print(f"β COCA integration test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Run COCA integration test."""
    banner = "=" * 50
    print("Running COCA Integration Test...")
    print(banner)
    passed = test_coca_integration()
    print("\n" + banner)
    if passed:
        print("β COCA integration test passed!")
    else:
        print("β COCA integration test failed!")
    # test_coca_integration already returns a plain bool; forward it.
    return passed
if __name__ == "__main__":
    # Exit status 0 on success, 1 on failure, so CI can consume the result.
    sys.exit(0 if main() else 1)