Spaces:

vimalk78
/

abc123

Sleeping

App Files Files Community

abc123 / hack /analyze_norvig_vocabulary.py

vimalk78

docs: add soft minimum visualization ideas and vocabulary alternatives analysis

bfd6ff4 3 months ago

raw

history blame contribute delete

16 kB

	#!/usr/bin/env python3
	"""
	Statistical Analysis of Norvig Word Count Files

	Analyzes a single Norvig word count file (count_1w.txt or count_1w100k.txt)
	from norvig.com/ngrams/ to understand vocabulary characteristics for crossword generation.

	Usage:
	python analyze_norvig_vocabulary.py <filename>
	python analyze_norvig_vocabulary.py --help

	Examples:
	python analyze_norvig_vocabulary.py norvig/count_1w100k.txt
	python analyze_norvig_vocabulary.py norvig/count_1w.txt
	"""

	import os
	import sys
	import argparse
	import numpy as np
	import matplotlib.pyplot as plt
	import pandas as pd
	from collections import Counter, defaultdict
	import seaborn as sns
	from pathlib import Path

	# Global plot styling, applied once at import time: selects the
	# 'seaborn-v0_8' matplotlib style sheet and seaborn's "husl" colour palette.
	# Both calls change library-wide defaults, so every figure created after this
	# module is imported is affected.
	# NOTE(review): the 'seaborn-v0_8' style name presumably exists only in newer
	# matplotlib releases -- confirm against the version this project pins.
	plt.style.use('seaborn-v0_8')
	sns.set_palette("husl")

	def parse_arguments(argv=None):
	    """Parse the command line of the Norvig vocabulary analysis script.

	    Args:
	        argv: Optional list of argument strings. ``None`` (the default)
	            lets argparse read ``sys.argv[1:]``, which is what the script
	            entry point relies on; passing a list makes the parser usable
	            from other code.

	    Returns:
	        argparse.Namespace with a single ``filename`` attribute.
	    """
	    # Assembled line by line so the help layout does not depend on how this
	    # source file happens to be indented.
	    epilog = "\n".join([
	        "Examples:",
	        "  python analyze_norvig_vocabulary.py norvig/count_1w100k.txt",
	        "  python analyze_norvig_vocabulary.py norvig/count_1w.txt",
	        "  python analyze_norvig_vocabulary.py --help",
	        "",
	        "File formats supported:",
	        "  - count_1w100k.txt: Top 100,000 most frequent words",
	        "  - count_1w.txt: Full word count dataset (1M+ words)",
	        "",
	        "Output:",
	        "  - Comprehensive statistical analysis",
	        "  - 6-panel visualization saved next to this script as",
	        "    norvig_analysis_100k.png, norvig_analysis_full.png or",
	        "    norvig_analysis_<name>.png, depending on the input file",
	        "  - Summary statistics printed to console",
	    ])

	    parser = argparse.ArgumentParser(
	        description='Analyze Norvig word count files for crossword generation',
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	        epilog=epilog,
	    )
	    parser.add_argument(
	        'filename',
	        help='Path to Norvig word count file (e.g., norvig/count_1w100k.txt)',
	    )

	    return parser.parse_args(argv)

	def load_word_counts(filepath):
	    """Load a word count file into a dict ordered by descending count.

	    Each line is expected to be ``word<TAB>count``. A line holding a single
	    field is accepted as a word with a count of 1. Lines whose count is not
	    an integer, or that have more than two tab-separated fields, are skipped
	    and reported instead of aborting the whole load. Words are upper-cased;
	    if the same upper-cased word appears again, the later count replaces the
	    earlier one.

	    Args:
	        filepath: Path (str or Path) of the file to read as UTF-8 text.

	    Returns:
	        dict of ``{WORD: count}`` whose iteration order is most frequent
	        first (ties keep file order), so a word's position is its frequency
	        rank. An empty dict is returned when the file cannot be read.
	    """
	    print(f"Loading {filepath}...")

	    word_counts = {}
	    skipped_lines = 0

	    try:
	        with open(filepath, 'r', encoding='utf-8') as f:
	            for line in f:
	                parts = line.strip().split('\t')
	                if len(parts) == 2:
	                    word, count = parts
	                    try:
	                        word_counts[word.upper()] = int(count)
	                    except ValueError:
	                        # One bad count must not discard the whole file.
	                        skipped_lines += 1
	                elif len(parts) == 1:
	                    # Blank lines yield a single empty field; ignore them.
	                    if parts[0]:
	                        word_counts[parts[0].upper()] = 1
	                else:
	                    skipped_lines += 1
	    except FileNotFoundError:
	        print(f"❌ File not found: {filepath}")
	        return {}
	    except (OSError, UnicodeDecodeError) as e:
	        print(f"❌ Error loading {filepath}: {e}")
	        return {}

	    if skipped_lines:
	        print(f"⚠️ Skipped {skipped_lines:,} malformed line(s) in {filepath}")

	    # Every rank-based analysis treats dict order as frequency rank, so the
	    # order is enforced here. sorted() is stable, which makes this a no-op
	    # for files that are already sorted by count.
	    ranked = dict(sorted(word_counts.items(), key=lambda item: item[1], reverse=True))

	    print(f"✅ Loaded {len(ranked):,} words from {filepath}")
	    return ranked

	def analyze_word_lengths(words):
	    """Measure every word and tally how often each length occurs.

	    Args:
	        words: Iterable of strings.

	    Returns:
	        Tuple ``(lengths, length_dist)``: the list of individual word lengths
	        in input order, and a Counter mapping length -> number of words.
	    """
	    word_sizes = list(map(len, words))
	    return word_sizes, Counter(word_sizes)

	def classify_difficulty(rank, total_words):
	    """Label a word's difficulty from its frequency rank.

	    Args:
	        rank: 1-based frequency rank (1 is the most frequent word).
	        total_words: Size of the vocabulary the rank refers to.

	    Returns:
	        One of "Very Easy", "Easy", "Medium", "Hard" or "Very Hard".
	    """
	    # (upper share of the vocabulary, label), from most to least frequent.
	    bands = (
	        (0.05, "Very Easy"),
	        (0.20, "Easy"),
	        (0.60, "Medium"),
	        (0.85, "Hard"),
	    )
	    for share, label in bands:
	        if rank <= total_words * share:
	            return label
	    return "Very Hard"

	def _is_crossword_word(word):
	    """Return True for purely alphabetic words that are 3-12 characters long."""
	    return 3 <= len(word) <= 12 and word.isalpha()


	def _analysis_output_name(filename):
	    """Map an input file name to the PNG file name used for its figure."""
	    name = str(filename)
	    if 'count_1w100k' in name:
	        return 'norvig_analysis_100k.png'
	    if 'count_1w.txt' in name:
	        return 'norvig_analysis_full.png'
	    # Fallback for any other file name - make it filesystem safe.
	    safe_name = name.replace('.txt', '').replace('/', '_').replace('count_', '')
	    return f'norvig_analysis_{safe_name}.png'


	def create_comprehensive_analysis(word_counts, filename, base_dir):
	    """Render and save the six-panel vocabulary analysis figure.

	    Panels: (1) rank/frequency log-log plot with a ``top_count / rank``
	    reference line, (2) word length histogram, (3) rank-based difficulty
	    pie, (4) cumulative frequency coverage, (5) crossword suitability pie and
	    (6) number of crossword-suitable words per frequency-rank band.

	    Words are ranked by descending count inside this function (ties keep
	    their input order), so the result does not depend on the mapping already
	    being sorted. "Crossword-suitable" means alphabetic and 3-12 characters
	    long; the same rule is used for panels 5 and 6 and for the example words.

	    Args:
	        word_counts: Mapping of word -> count. Must not be empty.
	        filename: Name of the analysed file; shown in the title and used to
	            derive the output file name.
	        base_dir: Directory (str or Path) the PNG is written into.

	    Returns:
	        Tuple ``(fig, crossword_suitable)``: the matplotlib figure (left open
	        for the caller) and a dict of the suitable words and their counts in
	        rank order.

	    Raises:
	        ValueError: If ``word_counts`` is empty.
	    """
	    if not word_counts:
	        raise ValueError("word_counts is empty; nothing to analyze")

	    # Rank by frequency; sorted() is stable so pre-sorted input is unchanged.
	    ranked = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
	    words = [word for word, _ in ranked]
	    counts = [count for _, count in ranked]
	    ranks = list(range(1, len(counts) + 1))

	    # Create figure with subplots - 2x3 layout with good spacing
	    fig = plt.figure(figsize=(18, 12))
	    fig.suptitle(f'Norvig Word Count Analysis - {filename}',
	                 fontsize=16, fontweight='bold', y=0.95)

	    # 1. Zipf's Law Analysis (log-log plot)
	    ax1 = fig.add_subplot(2, 3, 1)
	    ax1.loglog(ranks, counts, 'b-', alpha=0.7, linewidth=2)
	    ax1.set_xlabel('Rank (log scale)')
	    ax1.set_ylabel('Frequency (log scale)')
	    ax1.set_title('Zipf\'s Law Validation', fontweight='bold')
	    ax1.grid(True, alpha=0.3)

	    # Reference line: frequency inversely proportional to rank.
	    theoretical_zipf = [counts[0] / r for r in ranks]
	    ax1.loglog(ranks, theoretical_zipf, 'r--', alpha=0.5, label='Theoretical')
	    ax1.legend()

	    # 2. Word Length Distribution
	    ax2 = fig.add_subplot(2, 3, 2)
	    _, length_dist = analyze_word_lengths(words)
	    lengths_list = sorted(length_dist)
	    counts_list = [length_dist[length] for length in lengths_list]

	    length_bars = ax2.bar(lengths_list, counts_list, alpha=0.7,
	                          color='skyblue', edgecolor='navy')
	    ax2.set_xlabel('Word Length (characters)')
	    ax2.set_ylabel('Number of Words')
	    ax2.set_title('Word Length Distribution', fontweight='bold')

	    # Green for the crossword range (3-12), red for very short/long words;
	    # lengths 13-15 keep the default colour.
	    for length, bar in zip(lengths_list, length_bars):
	        if 3 <= length <= 12:
	            bar.set_color('lightgreen')
	        elif length < 3 or length > 15:
	            bar.set_color('lightcoral')

	    ax2.axvspan(3, 12, alpha=0.2, color='green', label='Crossword Range')
	    ax2.legend()

	    # 3. Difficulty Distribution
	    ax3 = fig.add_subplot(2, 3, 3)
	    difficulty_dist = Counter(classify_difficulty(rank, len(ranks)) for rank in ranks)

	    # Colours are keyed by label so a missing band (tiny vocabularies) can
	    # not shift the colours of the remaining wedges.
	    difficulty_colors = {
	        'Very Easy': 'darkgreen',
	        'Easy': 'green',
	        'Medium': 'orange',
	        'Hard': 'red',
	        'Very Hard': 'darkred',
	    }
	    diff_labels = [label for label in difficulty_colors if difficulty_dist[label]]
	    diff_counts = [difficulty_dist[label] for label in diff_labels]

	    ax3.pie(diff_counts, labels=diff_labels, autopct='%1.1f%%',
	            colors=[difficulty_colors[label] for label in diff_labels],
	            startangle=90)
	    ax3.set_title('Difficulty Distribution', fontweight='bold')

	    # 4. Cumulative Frequency Coverage
	    ax4 = fig.add_subplot(2, 3, 4)
	    cumulative_freq = np.cumsum(counts, dtype=np.float64)
	    total_freq = cumulative_freq[-1]
	    if total_freq > 0:
	        coverage_pct = (cumulative_freq / total_freq) * 100
	    else:
	        # All counts are zero: there is no frequency mass to cover.
	        coverage_pct = np.zeros_like(cumulative_freq)

	    ax4.plot(ranks, coverage_pct, 'g-', linewidth=2)
	    ax4.set_xlabel('Vocabulary Size')
	    ax4.set_ylabel('Coverage (%)')
	    ax4.set_title('Cumulative Coverage', fontweight='bold')
	    ax4.grid(True, alpha=0.3)

	    # Add key milestone markers
	    for milestone in (1000, 5000, 10000, 25000, 50000):
	        if milestone < len(coverage_pct):
	            ax4.axvline(x=milestone, color='red', linestyle='--', alpha=0.5)

	    # 5. Crossword Suitability
	    ax5 = fig.add_subplot(2, 3, 5)
	    crossword_suitable = {word: count for word, count in ranked
	                          if _is_crossword_word(word)}

	    total_words = len(words)
	    suitable_words = len(crossword_suitable)
	    unsuitable_words = total_words - suitable_words

	    ax5.pie([suitable_words, unsuitable_words],
	            labels=[f'Suitable\n{suitable_words:,}',
	                    f'Not Suitable\n{unsuitable_words:,}'],
	            autopct='%1.1f%%', colors=['lightgreen', 'lightcoral'],
	            startangle=90)
	    ax5.set_title('Crossword Suitability', fontweight='bold')

	    # 6. Difficulty Categories for Crosswords
	    ax6 = fig.add_subplot(2, 3, 6)

	    # Rank cut-offs between the easy / medium / hard bands.
	    easy_threshold = 5000
	    medium_threshold = 25000

	    easy_words = sum(map(_is_crossword_word, words[:easy_threshold]))
	    medium_words = sum(map(_is_crossword_word, words[easy_threshold:medium_threshold]))
	    hard_words = sum(map(_is_crossword_word, words[medium_threshold:]))

	    categories = ['Easy', 'Medium', 'Hard']
	    word_counts_cat = [easy_words, medium_words, hard_words]

	    category_bars = ax6.bar(categories, word_counts_cat,
	                            color=['lightgreen', 'gold', 'lightcoral'], alpha=0.8)
	    ax6.set_ylabel('Crossword Words')
	    ax6.set_title('Difficulty Categories\n(Based on Frequency Rank)', fontweight='bold')

	    # Add value labels on bars
	    label_offset = max(word_counts_cat) * 0.02
	    for bar, count in zip(category_bars, word_counts_cat):
	        height = bar.get_height()
	        if height > 0:
	            ax6.text(bar.get_x() + bar.get_width() / 2, height + label_offset,
	                     f'{count:,}', ha='center', va='bottom', fontweight='bold')

	    # Example words for each band, taken from fixed rank windows.
	    def examples(start, stop):
	        picked = [w for w in words[start:stop] if _is_crossword_word(w)][:3]
	        # Small vocabularies may not reach a window at all.
	        return ", ".join(picked) or "n/a"

	    explanation = ('Easy: Ranks 1-5,000 (most frequent)\n'
	                   f' e.g., {examples(0, 100)}\n'
	                   'Medium: Ranks 5,001-25,000\n'
	                   f' e.g., {examples(7000, 12000)}\n'
	                   'Hard: Ranks 25,001+ (least frequent)\n'
	                   f' e.g., {examples(30000, 35000)}\n\n'
	                   'Lower rank = higher frequency = easier')

	    ax6.text(0.98, 0.98, explanation, transform=ax6.transAxes,
	             fontsize=8, verticalalignment='top', horizontalalignment='right',
	             bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.9))

	    # Adjust layout with proper spacing
	    fig.subplots_adjust(left=0.08, bottom=0.08, right=0.95, top=0.88,
	                        wspace=0.35, hspace=0.45)

	    # Save the figure under a name derived from the input file name.
	    output_path = Path(base_dir) / _analysis_output_name(filename)
	    fig.savefig(output_path, dpi=300, bbox_inches='tight')
	    print(f"📊 Comprehensive analysis saved to: {output_path}")

	    return fig, crossword_suitable

	def print_summary_statistics(word_counts, filename, crossword_suitable):
	    """Print the textual summary of the vocabulary analysis to stdout.

	    Sections: basic totals, word length statistics, crossword suitability,
	    crossword-suitable words per frequency-rank band, the ten most and least
	    frequent words, the log-log rank/frequency correlation and a fixed list
	    of recommendations.

	    Words are ranked by descending count inside this function (ties keep
	    their input order). The difficulty bands count the suitable words whose
	    rank in the *whole* vocabulary falls in 1-5,000, 5,001-25,000 and
	    25,001+, which is what the printed labels state.

	    Args:
	        word_counts: Mapping of word -> count for the whole vocabulary.
	        filename: Name of the analysed file, echoed in the header.
	        crossword_suitable: Mapping of the crossword-suitable subset of
	            ``word_counts`` (word -> count).

	    Returns:
	        None. If ``word_counts`` is empty only the header and a notice are
	        printed.
	    """
	    print("\n" + "="*80)
	    print("📊 NORVIG VOCABULARY STATISTICAL ANALYSIS")
	    print(f"📁 File: {filename}")
	    print("="*80)

	    total_words = len(word_counts)
	    if total_words == 0:
	        # Every statistic below divides by the vocabulary size.
	        print("\n❌ No words to summarize.")
	        print("="*80)
	        return

	    def percent(part, whole):
	        """Percentage of part in whole; 0.0 when whole is zero."""
	        return (part / whole) * 100 if whole else 0.0

	    # Rank by frequency once and reuse the lists in every section.
	    ranked = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
	    words_list = [word for word, _ in ranked]
	    counts_list = [count for _, count in ranked]

	    # Basic statistics
	    total_frequency = sum(counts_list)

	    print("\n📚 BASIC STATISTICS:")
	    print(f" • Total words: {total_words:,}")
	    print(f" • Total frequency: {total_frequency:,}")
	    print(f" • Average frequency: {total_frequency/total_words:.2f}")

	    # Word length analysis
	    lengths, length_dist = analyze_word_lengths(words_list)
	    avg_length = np.mean(lengths)
	    crossword_length_words = sum(count for length, count in length_dist.items()
	                                 if 3 <= length <= 12)
	    crossword_length_pct = percent(crossword_length_words, total_words)
	    most_common_lengths = sorted(length_dist.items(), key=lambda x: x[1], reverse=True)[:5]

	    print("\n📏 WORD LENGTH ANALYSIS:")
	    print(f" • Average word length: {avg_length:.1f} characters")
	    print(f" • Words 3-12 characters: {crossword_length_words:,} ({crossword_length_pct:.1f}%)")
	    print(f" • Most common lengths: {most_common_lengths}")

	    # Crossword suitability
	    suitable_count = len(crossword_suitable)
	    suitable_pct = percent(suitable_count, total_words)
	    suitable_freq = sum(crossword_suitable.values())
	    suitable_freq_pct = percent(suitable_freq, total_frequency)

	    print("\n🧩 CROSSWORD SUITABILITY:")
	    print(f" • Suitable words (3-12 letters, alphabetic): {suitable_count:,} ({suitable_pct:.1f}%)")
	    print(f" • Suitable word frequency coverage: {suitable_freq_pct:.1f}%")

	    # Difficulty distribution for crosswords. The band is decided by the
	    # word's rank in the full vocabulary, not by its position among the
	    # suitable words only.
	    easy_limit = 5000
	    medium_limit = 25000
	    easy_words = medium_words = hard_words = 0
	    for rank, word in enumerate(words_list, start=1):
	        if word not in crossword_suitable:
	            continue
	        if rank <= easy_limit:
	            easy_words += 1
	        elif rank <= medium_limit:
	            medium_words += 1
	        else:
	            hard_words += 1

	    print("\n🎯 CROSSWORD DIFFICULTY DISTRIBUTION:")
	    print(f" • Easy (rank 1-5K): {easy_words:,} words")
	    print(f" • Medium (rank 5K-25K): {medium_words:,} words")
	    print(f" • Hard (rank 25K+): {hard_words:,} words")

	    # Top and bottom words examples
	    print("\n🔝 TOP 10 MOST FREQUENT WORDS:")
	    for i, word in enumerate(words_list[:10], 1):
	        print(f" {i:2d}. {word:<12} ({word_counts[word]:,})")

	    print("\n🔚 BOTTOM 10 LEAST FREQUENT WORDS:")
	    for i, word in enumerate(words_list[-10:], 1):
	        print(f" {i:2d}. {word:<12} ({word_counts[word]:,})")

	    # Zipf's law validation: correlation of log(rank) with log(frequency).
	    # Non-positive counts have no logarithm, so they are left out.
	    positive = [(rank, count) for rank, count in enumerate(counts_list, start=1)
	                if count > 0]
	    correlation = float('nan')
	    if len(positive) >= 2:
	        log_ranks = np.log([rank for rank, _ in positive])
	        log_freqs = np.log([count for _, count in positive])
	        # Identical counts have zero variance; that yields NaN, not an error.
	        with np.errstate(invalid='ignore', divide='ignore'):
	            correlation = np.corrcoef(log_ranks, log_freqs)[0, 1]

	    print("\n📈 ZIPF'S LAW VALIDATION:")
	    if np.isnan(correlation):
	        print(" • Log-log correlation: n/a (needs at least two distinct positive counts)")
	    else:
	        if abs(correlation) > 0.95:
	            compliance = '✅ Excellent'
	        elif abs(correlation) > 0.8:
	            compliance = '⚠️ Moderate'
	        else:
	            compliance = '❌ Poor'
	        print(f" • Log-log correlation: {correlation:.4f}")
	        print(f" • Zipf compliance: {compliance}")

	    # Recommendations
	    print("\n💡 RECOMMENDATIONS FOR CROSSWORD GENERATION:")
	    print(f" • Dataset size: {total_words:,} words with excellent coverage")
	    print(f" • Filter to 3-12 letters: Reduces to {suitable_count:,} words ({suitable_pct:.1f}%)")
	    print(" • Difficulty thresholds (for crossword-suitable words):")
	    print(f" - Easy: ranks 1-5,000 ({easy_words:,} suitable words)")
	    print(f" - Medium: ranks 5,001-25,000 ({medium_words:,} suitable words)")
	    print(f" - Hard: ranks 25,001+ ({hard_words:,} suitable words)")
	    print(" • Quality: ✅ No garbage entries (unlike crossword-specific lists)")
	    print(" • Source credibility: ✅ Peter Norvig (Google) + Google Books corpus")

	    print("="*80)

	def main():
	    """Run the complete analysis for the file named on the command line.

	    Loads the word counts, renders and saves the six-panel figure next to
	    this script, prints the summary statistics and finally reports where the
	    figure was written.

	    A relative input path is looked up in the script directory first (the
	    historical behaviour) and, if it does not exist there, relative to the
	    current working directory.

	    Returns:
	        int: Process exit status - 0 on success, 1 when no words could be
	        loaded from the input file.
	    """
	    # Parse command line arguments
	    args = parse_arguments()

	    # File paths
	    base_dir = Path(__file__).parent
	    input_file = Path(args.filename)

	    if not input_file.is_absolute():
	        script_relative = base_dir / input_file
	        if script_relative.exists() or not input_file.exists():
	            # Also taken when the file exists nowhere, so the error message
	            # names the same location as before.
	            input_file = script_relative
	        else:
	            input_file = input_file.resolve()

	    print("🔍 Norvig Vocabulary Statistical Analysis")
	    print("=" * 50)
	    print(f"📁 Analyzing: {input_file}")

	    # Load data
	    word_counts = load_word_counts(input_file)

	    if not word_counts:
	        print(f"❌ Could not load word list from {input_file}. Please check file path.")
	        return 1

	    # Create comprehensive analysis
	    fig, crossword_suitable = create_comprehensive_analysis(word_counts, input_file.name, base_dir)

	    # Print summary statistics
	    print_summary_statistics(word_counts, input_file.name, crossword_suitable)

	    # The figure is already saved and is not shown interactively in CLI
	    # usage, so release it.
	    plt.close(fig)

	    # Same output file name rule as used when the figure is saved.
	    if 'count_1w100k' in input_file.name:
	        output_name = 'norvig_analysis_100k.png'
	    elif 'count_1w.txt' in input_file.name:
	        output_name = 'norvig_analysis_full.png'
	    else:
	        safe_name = input_file.name.replace('.txt', '').replace('/', '_').replace('count_', '')
	        output_name = f'norvig_analysis_{safe_name}.png'

	    print(f"\n✅ Analysis complete! Check {base_dir / output_name} for detailed plots.")
	    return 0


	if __name__ == "__main__":
	    # Propagate the status so shells and CI can detect a failed load.
	    sys.exit(main())