abc123 / hack /analyze_norvig_vocabulary.py
vimalk78's picture
docs: add soft minimum visualization ideas and vocabulary alternatives analysis
bfd6ff4
#!/usr/bin/env python3
"""
Statistical Analysis of Norvig Word Count Files
Analyzes a single Norvig word count file (count_1w.txt or count_1w100k.txt)
from norvig.com/ngrams/ to understand vocabulary characteristics for crossword generation.
Usage:
python analyze_norvig_vocabulary.py <filename>
python analyze_norvig_vocabulary.py --help
Examples:
python analyze_norvig_vocabulary.py norvig/count_1w100k.txt
python analyze_norvig_vocabulary.py norvig/count_1w.txt
"""
import os
import sys
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter, defaultdict
import seaborn as sns
from pathlib import Path
# Global plot styling, applied once as a side effect of importing this module:
# matplotlib's 'seaborn-v0_8' style sheet plus seaborn's "husl" colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
def parse_arguments():
    """Parse the command line (``sys.argv``) for the analysis script.

    Returns:
        argparse.Namespace with one attribute, ``filename``: the path of the
        Norvig word count file to analyze.

    Raises:
        SystemExit: raised by argparse for ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description='Analyze Norvig word count files for crossword generation',
        # The raw formatter prints the epilog exactly as laid out below.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # The output file names listed here mirror the names the analysis
        # actually saves (they depend on the input file name).
        epilog="""
Examples:
python analyze_norvig_vocabulary.py norvig/count_1w100k.txt
python analyze_norvig_vocabulary.py norvig/count_1w.txt
python analyze_norvig_vocabulary.py --help
File formats supported:
- count_1w100k.txt: Top 100,000 most frequent words
- count_1w.txt: Full word count dataset (1M+ words)
Output:
- Comprehensive statistical analysis
- 6-panel visualization saved as norvig_analysis_100k.png,
norvig_analysis_full.png, or norvig_analysis_<name>.png (by input file)
- Summary statistics printed to console
"""
    )
    parser.add_argument(
        'filename',
        help='Path to Norvig word count file (e.g., norvig/count_1w100k.txt)'
    )
    return parser.parse_args()
def load_word_counts(filepath):
    """Load a word count file into a ``{WORD: count}`` dict.

    Each line is expected to look like ``word<TAB>count``. Words are
    upper-cased. A non-empty line with no tab is kept with a count of 1.
    Lines whose count is not an integer, or that have more than two
    tab-separated fields, are skipped instead of aborting the whole load.

    Args:
        filepath: Path (str or path-like) of the file to read as UTF-8.

    Returns:
        dict mapping upper-cased word to int count, in file order. An empty
        dict is returned when the file is missing or cannot be read/decoded.
    """
    word_counts = {}
    skipped_lines = 0
    print(f"Loading {filepath}...")
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue
                parts = stripped.split('\t')
                if len(parts) == 2:
                    word, count = parts
                    try:
                        word_counts[word.upper()] = int(count)
                    except ValueError:
                        # One bad count must not discard every other entry.
                        skipped_lines += 1
                elif len(parts) == 1:
                    # Handle case where count might be missing
                    word_counts[parts[0].upper()] = 1
                else:
                    skipped_lines += 1
    except FileNotFoundError:
        print(f"❌ File not found: {filepath}")
        return {}
    except (OSError, UnicodeDecodeError) as e:
        print(f"❌ Error loading {filepath}: {e}")
        return {}

    if skipped_lines:
        print(f"Skipped {skipped_lines:,} malformed line(s) in {filepath}")
    print(f"βœ… Loaded {len(word_counts):,} words from {filepath}")
    return word_counts
def analyze_word_lengths(words):
    """Measure every word and tally how often each length occurs.

    Args:
        words: Iterable of strings.

    Returns:
        Tuple ``(lengths, length_dist)`` where ``lengths`` is the list of
        word lengths in iteration order and ``length_dist`` is a Counter
        mapping each length to the number of words having it.
    """
    word_sizes = list(map(len, words))
    return word_sizes, Counter(word_sizes)
def classify_difficulty(rank, total_words):
    """Map a frequency rank to a difficulty label.

    The label depends on where ``rank`` falls as a share of ``total_words``:
    top 5% is "Very Easy", top 20% "Easy", top 60% "Medium", top 85% "Hard",
    and everything beyond that "Very Hard".

    Args:
        rank: 1-based frequency rank (1 = most frequent).
        total_words: Size of the vocabulary the rank belongs to.

    Returns:
        One of the five difficulty label strings.
    """
    # (cumulative share of the vocabulary, label), most frequent band first.
    bands = (
        (0.05, "Very Easy"),
        (0.20, "Easy"),
        (0.60, "Medium"),
        (0.85, "Hard"),
    )
    for share, label in bands:
        if rank <= total_words * share:
            return label
    return "Very Hard"
def _is_crossword_suitable(word):
    """Return True for purely alphabetic words of 3 to 12 characters."""
    return 3 <= len(word) <= 12 and word.isalpha()


def _analysis_output_name(filename):
    """Map an input file name to the PNG file name used for the saved figure."""
    if 'count_1w100k' in filename:
        return 'norvig_analysis_100k.png'
    if 'count_1w.txt' in filename:
        return 'norvig_analysis_full.png'
    # Fallback for any other filename - make it filesystem safe
    safe_name = filename.replace('.txt', '').replace('/', '_').replace('count_', '')
    return f'norvig_analysis_{safe_name}.png'


def create_comprehensive_analysis(word_counts, filename, base_dir):
    """Build the six-panel analysis figure and save it as a PNG.

    Panels: (1) rank/frequency log-log plot with a 1/rank reference line,
    (2) word length histogram, (3) difficulty pie from ``classify_difficulty``,
    (4) cumulative frequency coverage, (5) crossword suitability pie and
    (6) crossword-suitable word counts per frequency-rank band.

    Words are ranked by descending count (ties keep their input order), so
    the rank-based panels stay correct even if the input is not pre-sorted.

    Args:
        word_counts: Non-empty dict mapping word to its count.
        filename: Input file name; shown in the title and used to derive the
            output PNG name.
        base_dir: ``pathlib.Path`` of the directory the PNG is written to.

    Returns:
        Tuple ``(fig, crossword_suitable)``: the matplotlib figure and a dict
        ``{word: count}`` of the alphabetic 3-12 letter words, in rank order.

    Raises:
        ValueError: If ``word_counts`` is empty.
    """
    if not word_counts:
        raise ValueError("word_counts is empty; nothing to analyze")

    # Rank 1 = highest count. sorted() is stable, so ties keep input order.
    ranked = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    words = [word for word, _ in ranked]
    counts = [count for _, count in ranked]
    ranks = list(range(1, len(counts) + 1))
    total_words = len(words)

    # Create figure with subplots - 2x3 layout with good spacing
    fig = plt.figure(figsize=(18, 12))
    fig.suptitle(f'Norvig Word Count Analysis - {filename}',
                 fontsize=16, fontweight='bold', y=0.95)

    # 1. Zipf's Law Analysis (log-log plot)
    plt.subplot(2, 3, 1)
    plt.loglog(ranks, counts, 'b-', alpha=0.7, linewidth=2)
    plt.xlabel('Rank (log scale)')
    plt.ylabel('Frequency (log scale)')
    plt.title('Zipf\'s Law Validation', fontweight='bold')
    plt.grid(True, alpha=0.3)
    # Reference line: the top count divided by rank.
    theoretical_zipf = [counts[0] / r for r in ranks]
    plt.loglog(ranks, theoretical_zipf, 'r--', alpha=0.5, label='Theoretical')
    plt.legend()

    # 2. Word Length Distribution
    plt.subplot(2, 3, 2)
    _, length_dist = analyze_word_lengths(words)
    lengths_list = sorted(length_dist.keys())
    counts_list = [length_dist[length] for length in lengths_list]
    bars = plt.bar(lengths_list, counts_list, alpha=0.7, color='skyblue', edgecolor='navy')
    plt.xlabel('Word Length (characters)')
    plt.ylabel('Number of Words')
    plt.title('Word Length Distribution', fontweight='bold')
    # Highlight crossword-suitable range (3-12 letters); lengths 13-15 keep
    # the default bar colour.
    for length, bar in zip(lengths_list, bars):
        if 3 <= length <= 12:
            bar.set_color('lightgreen')
        elif length < 3 or length > 15:
            bar.set_color('lightcoral')
    plt.axvspan(3, 12, alpha=0.2, color='green', label='Crossword Range')
    plt.legend()

    # 3. Difficulty Distribution
    plt.subplot(2, 3, 3)
    # Colours are keyed by label so a missing category (possible for tiny
    # vocabularies) cannot shift the colours of the remaining ones.
    difficulty_colors = {
        'Very Easy': 'darkgreen',
        'Easy': 'green',
        'Medium': 'orange',
        'Hard': 'red',
        'Very Hard': 'darkred',
    }
    difficulty_dist = Counter(classify_difficulty(rank, total_words) for rank in ranks)
    diff_labels = [label for label in difficulty_colors if difficulty_dist[label]]
    diff_counts = [difficulty_dist[label] for label in diff_labels]
    plt.pie(diff_counts, labels=diff_labels, autopct='%1.1f%%',
            colors=[difficulty_colors[label] for label in diff_labels], startangle=90)
    plt.title('Difficulty Distribution', fontweight='bold')

    # 4. Cumulative Frequency Coverage
    plt.subplot(2, 3, 4)
    cumulative_freq = np.cumsum(counts)
    total_freq = cumulative_freq[-1]
    coverage_pct = (cumulative_freq / total_freq) * 100
    plt.plot(ranks, coverage_pct, 'g-', linewidth=2)
    plt.xlabel('Vocabulary Size')
    plt.ylabel('Coverage (%)')
    plt.title('Cumulative Coverage', fontweight='bold')
    plt.grid(True, alpha=0.3)
    # Add key milestone markers
    milestones = [1000, 5000, 10000, 25000, 50000]
    for milestone in milestones:
        if milestone < len(coverage_pct):
            plt.axvline(x=milestone, color='red', linestyle='--', alpha=0.5)

    # 5. Crossword Suitability
    plt.subplot(2, 3, 5)
    crossword_suitable = {word: count for word, count in ranked
                          if _is_crossword_suitable(word)}
    suitable_words = len(crossword_suitable)
    unsuitable_words = total_words - suitable_words
    labels = [f'Suitable\n{suitable_words:,}', f'Not Suitable\n{unsuitable_words:,}']
    sizes = [suitable_words, unsuitable_words]
    plt.pie(sizes, labels=labels, autopct='%1.1f%%',
            colors=['lightgreen', 'lightcoral'], startangle=90)
    plt.title('Crossword Suitability', fontweight='bold')

    # 6. Difficulty Categories for Crosswords
    ax6 = plt.subplot(2, 3, 6)
    # Define crossword difficulty thresholds (frequency ranks)
    easy_threshold = 5000
    medium_threshold = 25000
    # Use the same suitability rule as panel 5 so both panels agree.
    easy_words = sum(1 for word in words[:easy_threshold]
                     if _is_crossword_suitable(word))
    medium_words = sum(1 for word in words[easy_threshold:medium_threshold]
                       if _is_crossword_suitable(word))
    hard_words = sum(1 for word in words[medium_threshold:]
                     if _is_crossword_suitable(word))
    categories = ['Easy', 'Medium', 'Hard']
    word_counts_cat = [easy_words, medium_words, hard_words]
    colors_cat = ['lightgreen', 'gold', 'lightcoral']
    bars = plt.bar(categories, word_counts_cat, color=colors_cat, alpha=0.8)
    plt.ylabel('Crossword Words')
    plt.title('Difficulty Categories\n(Based on Frequency Rank)', fontweight='bold')
    # Add value labels on bars
    label_offset = max(word_counts_cat) * 0.02
    for bar, count in zip(bars, word_counts_cat):
        height = bar.get_height()
        if height > 0:
            plt.text(bar.get_x() + bar.get_width() / 2, height + label_offset,
                     f'{count:,}', ha='center', va='bottom', fontweight='bold')

    # Add explanation text box with up to three example words per category,
    # drawn from a sample window inside each rank band.
    easy_examples = [w for w in words[:100] if _is_crossword_suitable(w)][:3]
    medium_examples = [w for w in words[7000:12000] if _is_crossword_suitable(w)][:3]
    hard_examples = [w for w in words[30000:35000] if _is_crossword_suitable(w)][:3]
    explanation = (f'Easy: Ranks 1-5,000 (most frequent)\n'
                   f' e.g., {", ".join(easy_examples)}\n'
                   f'Medium: Ranks 5,001-25,000\n'
                   f' e.g., {", ".join(medium_examples)}\n'
                   f'Hard: Ranks 25,001+ (least frequent)\n'
                   f' e.g., {", ".join(hard_examples)}\n\n'
                   'Lower rank = higher frequency = easier')
    plt.text(0.98, 0.98, explanation, transform=ax6.transAxes,
             fontsize=8, verticalalignment='top', horizontalalignment='right',
             bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.9))

    # Adjust layout with proper spacing
    plt.subplots_adjust(left=0.08, bottom=0.08, right=0.95, top=0.88, wspace=0.35, hspace=0.45)

    # Save the comprehensive analysis under a name derived from the input file
    output_path = base_dir / _analysis_output_name(filename)
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"πŸ“Š Comprehensive analysis saved to: {output_path}")
    return fig, crossword_suitable
def print_summary_statistics(word_counts, filename, crossword_suitable):
    """Print a console report describing the vocabulary.

    Sections: basic totals, word length analysis, crossword suitability,
    crossword-suitable words per frequency-rank band, the ten most and ten
    least frequent words, a log-log rank/frequency correlation, and
    recommendations.

    Words are ranked by descending count (ties keep their input order). The
    difficulty counts are the number of crossword-suitable words whose rank
    in the full vocabulary falls inside each band (1-5,000, 5,001-25,000,
    25,001+).

    Args:
        word_counts: dict mapping word to its count.
        filename: Name of the analyzed file, shown in the report header.
        crossword_suitable: dict ``{word: count}`` holding the subset of
            ``word_counts`` considered suitable for crosswords.

    Returns:
        None. Everything is written to standard output.
    """
    print("\n" + "="*80)
    print("πŸ“Š NORVIG VOCABULARY STATISTICAL ANALYSIS")
    print(f"πŸ“ File: {filename}")
    print("="*80)

    total_words = len(word_counts)
    if total_words == 0:
        # Every percentage below would divide by zero.
        print("\nNo words loaded; nothing to summarize.")
        print("="*80)
        return

    # Rank 1 = highest count. sorted() is stable, so ties keep input order.
    ranked = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    ranked_words = [word for word, _ in ranked]
    ranked_counts = [count for _, count in ranked]

    # Basic statistics
    total_frequency = sum(ranked_counts)
    print(f"\nπŸ“š BASIC STATISTICS:")
    print(f" β€’ Total words: {total_words:,}")
    print(f" β€’ Total frequency: {total_frequency:,}")
    print(f" β€’ Average frequency: {total_frequency/total_words:.2f}")

    # Word length analysis
    lengths, length_dist = analyze_word_lengths(ranked_words)
    avg_length = np.mean(lengths)
    crossword_length_words = sum(count for length, count in length_dist.items() if 3 <= length <= 12)
    crossword_length_pct = (crossword_length_words / total_words) * 100
    top_lengths = sorted(length_dist.items(), key=lambda x: x[1], reverse=True)[:5]
    print(f"\nπŸ“ WORD LENGTH ANALYSIS:")
    print(f" β€’ Average word length: {avg_length:.1f} characters")
    print(f" β€’ Words 3-12 characters: {crossword_length_words:,} ({crossword_length_pct:.1f}%)")
    print(f" β€’ Most common lengths: {top_lengths}")

    # Crossword suitability
    suitable_count = len(crossword_suitable)
    suitable_pct = (suitable_count / total_words) * 100
    suitable_freq = sum(crossword_suitable.values())
    # Guard against a file whose counts are all zero.
    suitable_freq_pct = (suitable_freq / total_frequency) * 100 if total_frequency else 0.0
    print(f"\n🧩 CROSSWORD SUITABILITY:")
    print(f" β€’ Suitable words (3-12 letters, alphabetic): {suitable_count:,} ({suitable_pct:.1f}%)")
    print(f" β€’ Suitable word frequency coverage: {suitable_freq_pct:.1f}%")

    # Difficulty distribution for crosswords: bands are defined on the rank
    # within the FULL vocabulary, then filtered to the suitable words.
    easy_words = sum(1 for w in ranked_words[:5000] if w in crossword_suitable)
    medium_words = sum(1 for w in ranked_words[5000:25000] if w in crossword_suitable)
    hard_words = sum(1 for w in ranked_words[25000:] if w in crossword_suitable)
    print(f"\n🎯 CROSSWORD DIFFICULTY DISTRIBUTION:")
    print(f" β€’ Easy (rank 1-5K): {easy_words:,} words")
    print(f" β€’ Medium (rank 5K-25K): {medium_words:,} words")
    print(f" β€’ Hard (rank 25K+): {hard_words:,} words")

    # Top and bottom words examples
    print(f"\nπŸ” TOP 10 MOST FREQUENT WORDS:")
    for i, (word, count) in enumerate(ranked[:10], 1):
        print(f" {i:2d}. {word:<12} ({count:,})")
    print(f"\nπŸ”š BOTTOM 10 LEAST FREQUENT WORDS:")
    for i, (word, count) in enumerate(ranked[-10:], 1):
        print(f" {i:2d}. {word:<12} ({count:,})")

    # Zipf's law validation: correlation of log(rank) against log(count).
    # Zero counts sit at the tail of the ranking and have no logarithm, so
    # they are left out; fewer than two points gives no correlation (NaN).
    positive_counts = [count for count in ranked_counts if count > 0]
    if len(positive_counts) >= 2:
        log_ranks = np.log(np.arange(1, len(positive_counts) + 1))
        log_freqs = np.log(positive_counts)
        correlation = np.corrcoef(log_ranks, log_freqs)[0, 1]
    else:
        correlation = float('nan')
    if abs(correlation) > 0.95:
        compliance = 'βœ… Excellent'
    elif abs(correlation) > 0.8:
        compliance = '⚠️ Moderate'
    else:
        compliance = '❌ Poor'
    print(f"\nπŸ“ˆ ZIPF'S LAW VALIDATION:")
    print(f" β€’ Log-log correlation: {correlation:.4f}")
    print(f" β€’ Zipf compliance: {compliance}")

    # Recommendations
    print(f"\nπŸ’‘ RECOMMENDATIONS FOR CROSSWORD GENERATION:")
    print(f" β€’ Dataset size: {total_words:,} words with excellent coverage")
    print(f" β€’ Filter to 3-12 letters: Reduces to {suitable_count:,} words ({suitable_pct:.1f}%)")
    print(f" β€’ Difficulty thresholds (for crossword-suitable words):")
    print(f" - Easy: ranks 1-5,000 ({easy_words:,} suitable words)")
    print(f" - Medium: ranks 5,001-25,000 ({medium_words:,} suitable words)")
    print(f" - Hard: ranks 25,001+ ({hard_words:,} suitable words)")
    print(f" β€’ Quality: βœ… No garbage entries (unlike crossword-specific lists)")
    print(f" β€’ Source credibility: βœ… Peter Norvig (Google) + Google Books corpus")
    print("="*80)
def main():
    """Run the full analysis for the file named on the command line.

    Resolves the input path (relative paths are taken relative to this
    script's directory), loads the word counts, saves the figure, prints the
    summary report and finally points the user at the saved PNG. Returns
    early with an error message when no words could be loaded.
    """
    cli_args = parse_arguments()

    script_dir = Path(__file__).parent
    source_path = Path(cli_args.filename)
    # Relative paths are resolved against the script directory, not the CWD.
    if not source_path.is_absolute():
        source_path = script_dir / source_path

    print("πŸ” Norvig Vocabulary Statistical Analysis")
    print("=" * 50)
    print(f"πŸ“ Analyzing: {source_path}")

    vocabulary = load_word_counts(source_path)
    if not vocabulary:
        print(f"❌ Could not load word list from {source_path}. Please check file path.")
        return

    source_name = source_path.name
    # The figure is only saved to disk, never shown interactively.
    _fig, suitable_subset = create_comprehensive_analysis(vocabulary, source_name, script_dir)
    print_summary_statistics(vocabulary, source_name, suitable_subset)

    # Rebuild the PNG name with the same rules the analysis step applies.
    known_outputs = (
        ('count_1w100k', 'norvig_analysis_100k.png'),
        ('count_1w.txt', 'norvig_analysis_full.png'),
    )
    for marker, png_name in known_outputs:
        if marker in source_name:
            output_name = png_name
            break
    else:
        safe_name = source_name.replace('.txt', '').replace('/', '_').replace('count_', '')
        output_name = f'norvig_analysis_{safe_name}.png'
    print(f"\nβœ… Analysis complete! Check {script_dir}/{output_name} for detailed plots.")


# Execute only when run as a script, not when imported as a module.
if __name__ == "__main__":
    main()