# simple-text-analyzer / test / test_coca_integration.py
# NOTE(review): the three lines below are repository-web-page residue
# (author thumbnail caption, commit message, commit hash) — commented out
# so the module parses as Python. The shebang on the next line is
# consequently not line 1 and is informational only.
# egumasa's picture / initialize app / a543e33
#!/usr/bin/env python3
"""
Test script to validate COCA integration.
"""
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
from lexical_sophistication import LexicalSophisticationAnalyzer
import pandas as pd
from pathlib import Path
def test_coca_integration():
    """Exercise the end-to-end COCA reference-list pipeline.

    Loads the COCA spoken unigram list from disk, wires it into a
    LexicalSophisticationAnalyzer, runs a short sample text through the
    analyzer, and prints the resulting statistics along the way.

    Returns:
        bool: True when every step succeeds, False on any failure.
    """
    print("Testing COCA Integration...")
    try:
        # Build the analyzer and a tiny sample text to push through it.
        analyzer = LexicalSophisticationAnalyzer(language="en", model_size="trf")
        sample_text = "The quick brown fox jumps over the lazy dog."

        # The reference list must exist on disk before anything else runs.
        coca_path = Path("resources/reference_lists/en/COCA_spoken_unigram_list.csv")
        if not coca_path.exists():
            print(f"βœ— COCA file not found at {coca_path}")
            return False
        print(f"βœ“ Found COCA file at {coca_path}")

        # Despite the .csv extension the file is tab-separated with no header row.
        frequency_table = pd.read_csv(
            coca_path,
            sep='\t',
            header=None,
            names=['word', 'frequency', 'normalized_freq', 'range', 'dispersion'],
        )
        print(f"βœ“ Loaded COCA data with {len(frequency_table)} entries")

        # Map lower-cased word -> raw corpus frequency.
        word_frequencies = dict(
            zip(frequency_table['word'].str.lower(), frequency_table['frequency'])
        )

        # Spot-check a handful of common words against the loaded table.
        for probe in ['the', 'quick', 'brown', 'fox', 'jumps']:
            if probe in word_frequencies:
                print(f"βœ“ Found '{probe}' with frequency {word_frequencies[probe]:,}")
            else:
                print(f"βœ— Word '{probe}' not found in COCA data")

        # Register the same mapping for both token- and lemma-level lookups.
        analyzer.load_reference_lists({
            "COCA_spoken": {
                "token": word_frequencies,
                "lemma": word_frequencies,
            }
        })
        print("βœ“ Successfully loaded COCA reference lists into analyzer")

        # Run the analysis and report the headline numbers.
        results = analyzer.analyze_text(sample_text, ["COCA_spoken"], apply_log=False)
        print(f"βœ“ Analysis complete:")
        print(f" - Processed {results['text_stats']['total_tokens']} tokens")
        print(f" - Found {len(results['summary'])} summary statistics")
        print(f" - Generated {len(results['token_details'])} token details")

        # Show per-metric summary statistics, if any were produced.
        if results['summary']:
            print("\nβœ“ Summary statistics:")
            for key, stats in results['summary'].items():
                print(f" - {key}: mean={stats['mean']:.2f}, count={stats['count']}")

        # Show the COCA score attached to the first few tokens.
        if results['token_details']:
            print("\nβœ“ Token details (first 5):")
            for token in results['token_details'][:5]:
                coca_score = token.get('COCA_spoken_token', 'NA')
                print(f" - {token['token']}: {coca_score}")
        return True
    except Exception as e:
        # Best-effort diagnostic: report the failure and its traceback,
        # then signal failure to the caller instead of crashing the script.
        print(f"βœ— COCA integration test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Run the COCA integration test and report the overall outcome.

    Returns:
        bool: True when the integration test passed, False otherwise.
    """
    print("Running COCA Integration Test...")
    print("=" * 50)
    passed = test_coca_integration()
    print("\n" + "=" * 50)
    if not passed:
        print("βœ— COCA integration test failed!")
        return False
    print("βœ“ COCA integration test passed!")
    return True
if __name__ == "__main__":
success = main()
sys.exit(0 if success else 1)