|
|
|
|
|
""" |
|
|
Statistical Analysis of Norvig Word Count Files |
|
|
|
|
|
Analyzes a single Norvig word count file (count_1w.txt or count_1w100k.txt) |
|
|
from norvig.com/ngrams/ to understand vocabulary characteristics for crossword generation. |
|
|
|
|
|
Usage: |
|
|
python analyze_norvig_vocabulary.py <filename> |
|
|
python analyze_norvig_vocabulary.py --help |
|
|
|
|
|
Examples: |
|
|
python analyze_norvig_vocabulary.py norvig/count_1w100k.txt |
|
|
python analyze_norvig_vocabulary.py norvig/count_1w.txt |
|
|
""" |
|
|
|
|
|
import os |
|
|
import sys |
|
|
import argparse |
|
|
import numpy as np |
|
|
import matplotlib.pyplot as plt |
|
|
import pandas as pd |
|
|
from collections import Counter, defaultdict |
|
|
import seaborn as sns |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
# Global plot styling, applied once when the module is imported.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# Matplotlib release (older versions may only know 'seaborn') - confirm
# against the version this script is run with.
plt.style.use('seaborn-v0_8')
# Select seaborn's "husl" palette for subsequently created plots.
sns.set_palette("husl")
|
|
|
|
|
def parse_arguments(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse falls back to ``sys.argv[1:]``, which is what
            the script did before this parameter existed.

    Returns:
        argparse.Namespace with one attribute, ``filename``: the path to the
        word count file exactly as it was typed on the command line.
    """
    parser = argparse.ArgumentParser(
        description='Analyze Norvig word count files for crossword generation',
        # Raw formatter keeps the hand-laid-out epilog below intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python analyze_norvig_vocabulary.py norvig/count_1w100k.txt
  python analyze_norvig_vocabulary.py norvig/count_1w.txt
  python analyze_norvig_vocabulary.py --help

File formats supported:
  - count_1w100k.txt: Top 100,000 most frequent words
  - count_1w.txt: Full word count dataset (1M+ words)

Output:
  - Comprehensive statistical analysis
  - 6-panel visualization saved next to this script as
    norvig_analysis_100k.png, norvig_analysis_full.png or
    norvig_analysis_<name>.png (depending on the input file name)
  - Summary statistics printed to console
        """
    )

    parser.add_argument(
        'filename',
        help='Path to Norvig word count file (e.g., norvig/count_1w100k.txt)'
    )

    return parser.parse_args(argv)
|
|
|
|
|
def load_word_counts(filepath):
    """Load a word count file and return a dict of {WORD: count}.

    Each non-blank line is expected to be either ``word<TAB>count`` or a bare
    ``word`` (which is given a count of 1). Words are upper-cased; if two
    lines map to the same upper-cased word, the later line wins. Insertion
    order follows file order.

    Lines that cannot be interpreted (non-integer count, or more than two
    tab-separated fields) are skipped and reported, instead of aborting the
    whole load.

    Args:
        filepath: Path (``str`` or ``Path``) of the UTF-8 text file to read.

    Returns:
        dict mapping upper-cased word -> int count. An empty dict is returned
        when the file is missing or cannot be read/decoded.
    """
    word_counts = {}
    total_lines = 0
    skipped_lines = 0

    print(f"Loading {filepath}...")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                total_lines += 1
                stripped = line.strip()
                if not stripped:
                    continue

                parts = stripped.split('\t')
                if len(parts) == 2:
                    word, count = parts
                    try:
                        word_counts[word.upper()] = int(count)
                    except ValueError:
                        # One bad count must not discard the rest of the file.
                        skipped_lines += 1
                elif len(parts) == 1:
                    # Bare word without a count: treat as seen once.
                    word_counts[parts[0].upper()] = 1
                else:
                    skipped_lines += 1
    except FileNotFoundError:
        print(f"❌ File not found: {filepath}")
        return {}
    except (OSError, UnicodeError) as e:
        # Unreadable or undecodable file: report and signal failure with {}.
        print(f"❌ Error loading {filepath}: {e}")
        return {}

    if skipped_lines:
        print(f"⚠️ Skipped {skipped_lines:,} malformed line(s) out of {total_lines:,}")
    print(f"✅ Loaded {len(word_counts):,} words from {filepath}")
    return word_counts
|
|
|
|
|
def analyze_word_lengths(words):
    """Measure every word and tally how often each length occurs.

    Args:
        words: Iterable of strings.

    Returns:
        Tuple ``(lengths, length_dist)``: the per-word lengths in input
        order, and a Counter mapping each length to its number of occurrences.
    """
    per_word = []
    tally = Counter()
    for entry in words:
        size = len(entry)
        per_word.append(size)
        tally[size] += 1
    return per_word, tally
|
|
|
|
|
def classify_difficulty(rank, total_words):
    """Map a frequency rank onto one of five difficulty labels.

    The rank is compared against fixed fractions of ``total_words``: the top
    5% is "Very Easy", up to 20% "Easy", up to 60% "Medium", up to 85%
    "Hard", and everything beyond that "Very Hard".

    Args:
        rank: 1-based frequency rank (1 = most frequent).
        total_words: Size of the vocabulary the rank refers to.

    Returns:
        The difficulty label as a string.
    """
    bands = (
        (0.05, "Very Easy"),
        (0.20, "Easy"),
        (0.60, "Medium"),
        (0.85, "Hard"),
    )
    for fraction, label in bands:
        if rank <= total_words * fraction:
            return label
    return "Very Hard"
|
|
|
|
|
def create_comprehensive_analysis(word_counts, filename, base_dir):
    """Create the 6-panel statistical overview figure and save it as a PNG.

    Panels (2 rows x 3 columns):
        1. Rank vs. frequency on log-log axes with a ``counts[0] / rank``
           reference curve.
        2. Word length distribution with the 3-12 character range highlighted.
        3. Share of words in each ``classify_difficulty`` band.
        4. Cumulative frequency coverage by vocabulary size.
        5. Crossword-suitable vs. unsuitable word counts.
        6. Crossword-suitable words per fixed rank band
           (1-5,000 / 5,001-25,000 / 25,001+).

    The iteration order of ``word_counts`` is used as the frequency rank, so
    the mapping is expected to be ordered from most to least frequent.

    Args:
        word_counts: Mapping of word -> count, ordered by rank.
        filename: Name of the input file; shown in the title and used to
            derive the output image name.
        base_dir: Directory (``str`` or ``Path``) the PNG is written to.

    Returns:
        Tuple ``(fig, crossword_suitable)``: the Matplotlib figure and a dict
        of the words that are 3-12 characters long and purely alphabetic, in
        the same order as ``word_counts``.

    Raises:
        ValueError: If ``word_counts`` is empty.
    """
    if not word_counts:
        # Fail before a figure is allocated; the plots need at least one word.
        raise ValueError("word_counts is empty; nothing to analyze")

    def is_suitable(word):
        """Single definition of 'crossword suitable' shared by panels 5 and 6."""
        return 3 <= len(word) <= 12 and word.isalpha()

    fig = plt.figure(figsize=(18, 12))
    fig.suptitle(f'Norvig Word Count Analysis - {filename}',
                 fontsize=16, fontweight='bold', y=0.95)

    words = list(word_counts.keys())
    counts = list(word_counts.values())
    ranks = list(range(1, len(counts) + 1))

    # --- Panel 1: Zipf's law (rank vs. frequency, log-log) -----------------
    plt.subplot(2, 3, 1)
    plt.loglog(ranks, counts, 'b-', alpha=0.7, linewidth=2)
    plt.xlabel('Rank (log scale)')
    plt.ylabel('Frequency (log scale)')
    plt.title('Zipf\'s Law Validation', fontweight='bold')
    plt.grid(True, alpha=0.3)

    # Reference curve: frequency of the top word divided by rank.
    theoretical_zipf = [counts[0] / r for r in ranks]
    plt.loglog(ranks, theoretical_zipf, 'r--', alpha=0.5, label='Theoretical')
    plt.legend()

    # --- Panel 2: word length distribution ---------------------------------
    plt.subplot(2, 3, 2)
    _, length_dist = analyze_word_lengths(words)
    lengths_list = sorted(length_dist.keys())
    counts_list = [length_dist[length] for length in lengths_list]

    bars = plt.bar(lengths_list, counts_list, alpha=0.7, color='skyblue', edgecolor='navy')
    plt.xlabel('Word Length (characters)')
    plt.ylabel('Number of Words')
    plt.title('Word Length Distribution', fontweight='bold')

    # Green = crossword range, red = clearly out of range; lengths 13-15 keep
    # the default bar colour.
    for length, bar in zip(lengths_list, bars):
        if 3 <= length <= 12:
            bar.set_color('lightgreen')
        elif length < 3 or length > 15:
            bar.set_color('lightcoral')

    plt.axvspan(3, 12, alpha=0.2, color='green', label='Crossword Range')
    plt.legend()

    # --- Panel 3: difficulty distribution ----------------------------------
    plt.subplot(2, 3, 3)
    difficulty_dist = Counter(
        classify_difficulty(rank, len(ranks)) for rank in ranks
    )

    diff_labels = list(difficulty_dist.keys())
    diff_counts = list(difficulty_dist.values())
    # Colours are looked up by label so a missing band (tiny vocabularies)
    # cannot shift the remaining colours onto the wrong category.
    difficulty_colors = {
        'Very Easy': 'darkgreen',
        'Easy': 'green',
        'Medium': 'orange',
        'Hard': 'red',
        'Very Hard': 'darkred',
    }

    plt.pie(diff_counts, labels=diff_labels, autopct='%1.1f%%',
            colors=[difficulty_colors[label] for label in diff_labels],
            startangle=90)
    plt.title('Difficulty Distribution', fontweight='bold')

    # --- Panel 4: cumulative coverage --------------------------------------
    plt.subplot(2, 3, 4)
    cumulative_freq = np.cumsum(counts)
    total_freq = cumulative_freq[-1]
    coverage_pct = (cumulative_freq / total_freq) * 100

    plt.plot(ranks, coverage_pct, 'g-', linewidth=2)
    plt.xlabel('Vocabulary Size')
    plt.ylabel('Coverage (%)')
    plt.title('Cumulative Coverage', fontweight='bold')
    plt.grid(True, alpha=0.3)

    # Mark vocabulary-size milestones that fall inside the data.
    milestones = [1000, 5000, 10000, 25000, 50000]
    for milestone in milestones:
        if milestone < len(coverage_pct):
            plt.axvline(x=milestone, color='red', linestyle='--', alpha=0.5)

    # --- Panel 5: crossword suitability ------------------------------------
    plt.subplot(2, 3, 5)
    crossword_suitable = {word: count for word, count in word_counts.items()
                          if is_suitable(word)}

    total_words = len(word_counts)
    suitable_words = len(crossword_suitable)
    unsuitable_words = total_words - suitable_words

    labels = [f'Suitable\n{suitable_words:,}', f'Not Suitable\n{unsuitable_words:,}']
    sizes = [suitable_words, unsuitable_words]
    suitability_colors = ['lightgreen', 'lightcoral']

    plt.pie(sizes, labels=labels, autopct='%1.1f%%', colors=suitability_colors, startangle=90)
    plt.title('Crossword Suitability', fontweight='bold')

    # --- Panel 6: suitable words per rank band -----------------------------
    ax6 = plt.subplot(2, 3, 6)

    easy_threshold = 5000
    medium_threshold = 25000

    # Slicing already clamps to the list length, so no extra bounds checks.
    easy_words = sum(1 for word in words[:easy_threshold] if is_suitable(word))
    medium_words = sum(1 for word in words[easy_threshold:medium_threshold] if is_suitable(word))
    hard_words = sum(1 for word in words[medium_threshold:] if is_suitable(word))

    categories = ['Easy', 'Medium', 'Hard']
    word_counts_cat = [easy_words, medium_words, hard_words]
    colors_cat = ['lightgreen', 'gold', 'lightcoral']

    bars = plt.bar(categories, word_counts_cat, color=colors_cat, alpha=0.8)
    plt.ylabel('Crossword Words')
    plt.title('Difficulty Categories\n(Based on Frequency Rank)', fontweight='bold')

    # Value labels slightly above each non-empty bar.
    label_offset = max(word_counts_cat) * 0.02
    for bar, count in zip(bars, word_counts_cat):
        height = bar.get_height()
        if height > 0:
            plt.text(bar.get_x() + bar.get_width()/2, height + label_offset,
                     f'{count:,}', ha='center', va='bottom', fontweight='bold')

    # Up to three sample words per band (empty when the file is too short).
    easy_examples = [w for w in words[:100] if is_suitable(w)][:3]
    medium_examples = [w for w in words[7000:12000] if is_suitable(w)][:3]
    hard_examples = [w for w in words[30000:35000] if is_suitable(w)][:3]

    explanation = (f'Easy: Ranks 1-5,000 (most frequent)\n'
                   f' e.g., {", ".join(easy_examples)}\n'
                   f'Medium: Ranks 5,001-25,000\n'
                   f' e.g., {", ".join(medium_examples)}\n'
                   f'Hard: Ranks 25,001+ (least frequent)\n'
                   f' e.g., {", ".join(hard_examples)}\n\n'
                   'Lower rank = higher frequency = easier')

    plt.text(0.98, 0.98, explanation, transform=ax6.transAxes,
             fontsize=8, verticalalignment='top', horizontalalignment='right',
             bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.9))

    plt.subplots_adjust(left=0.08, bottom=0.08, right=0.95, top=0.88, wspace=0.35, hspace=0.45)

    # Output name: fixed names for the two standard files, otherwise derived
    # from the input file name.
    if 'count_1w100k' in filename:
        output_name = 'norvig_analysis_100k.png'
    elif 'count_1w.txt' in filename:
        output_name = 'norvig_analysis_full.png'
    else:
        safe_name = filename.replace('.txt', '').replace('/', '_').replace('count_', '')
        output_name = f'norvig_analysis_{safe_name}.png'

    output_path = Path(base_dir) / output_name
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"📊 Comprehensive analysis saved to: {output_path}")

    return fig, crossword_suitable
|
|
|
|
|
def print_summary_statistics(word_counts, filename, crossword_suitable):
    """Print a console report summarizing the vocabulary.

    Sections: basic statistics, word length analysis, crossword suitability,
    suitable words per rank band, top/bottom 10 words, a log-log rank vs.
    frequency correlation (Zipf check) and recommendations.

    The iteration order of ``word_counts`` is used as the frequency rank
    (first entry = rank 1).

    Args:
        word_counts: Mapping of word -> count, ordered by rank.
        filename: Name of the analyzed file, shown in the report header.
        crossword_suitable: Mapping of the crossword-suitable subset of
            ``word_counts`` (word -> count).

    Returns:
        None. All output goes to stdout.
    """
    print("\n" + "="*80)
    print("📊 NORVIG VOCABULARY STATISTICAL ANALYSIS")
    print(f"📁 File: {filename}")
    print("="*80)

    total_words = len(word_counts)
    if total_words == 0:
        # Every percentage below divides by the word count.
        print("\n❌ No words to analyze.")
        print("="*80)
        return

    total_frequency = sum(word_counts.values())

    print("\n📈 BASIC STATISTICS:")
    print(f" • Total words: {total_words:,}")
    print(f" • Total frequency: {total_frequency:,}")
    print(f" • Average frequency: {total_frequency/total_words:.2f}")

    # Word length analysis
    lengths, length_dist = analyze_word_lengths(word_counts.keys())
    avg_length = np.mean(lengths)
    crossword_length_words = sum(count for length, count in length_dist.items() if 3 <= length <= 12)
    crossword_length_pct = (crossword_length_words / total_words) * 100

    print("\n📏 WORD LENGTH ANALYSIS:")
    print(f" • Average word length: {avg_length:.1f} characters")
    print(f" • Words 3-12 characters: {crossword_length_words:,} ({crossword_length_pct:.1f}%)")
    print(f" • Most common lengths: {sorted(length_dist.items(), key=lambda x: x[1], reverse=True)[:5]}")

    # Crossword suitability
    suitable_count = len(crossword_suitable)
    suitable_pct = (suitable_count / total_words) * 100
    suitable_freq = sum(crossword_suitable.values())
    suitable_freq_pct = (suitable_freq / total_frequency) * 100 if total_frequency else 0.0

    print("\n🧩 CROSSWORD SUITABILITY:")
    print(f" • Suitable words (3-12 letters, alphabetic): {suitable_count:,} ({suitable_pct:.1f}%)")
    print(f" • Suitable word frequency coverage: {suitable_freq_pct:.1f}%")

    # Count suitable words per band using each word's rank in the FULL list.
    # Slicing the suitable subset by position would report the first 5,000
    # suitable words as "rank 1-5K" regardless of their actual rank.
    easy_words = medium_words = hard_words = 0
    for rank, word in enumerate(word_counts, start=1):
        if word not in crossword_suitable:
            continue
        if rank <= 5000:
            easy_words += 1
        elif rank <= 25000:
            medium_words += 1
        else:
            hard_words += 1

    print("\n🎯 CROSSWORD DIFFICULTY DISTRIBUTION:")
    print(f" • Easy (rank 1-5K): {easy_words:,} words")
    print(f" • Medium (rank 5K-25K): {medium_words:,} words")
    print(f" • Hard (rank 25K+): {hard_words:,} words")

    # Top and bottom of the frequency list
    words_list = list(word_counts.keys())
    print("\n🔝 TOP 10 MOST FREQUENT WORDS:")
    for i, word in enumerate(words_list[:10], 1):
        print(f" {i:2d}. {word:<12} ({word_counts[word]:,})")

    print("\n📉 BOTTOM 10 LEAST FREQUENT WORDS:")
    for i, word in enumerate(words_list[-10:], 1):
        print(f" {i:2d}. {word:<12} ({word_counts[word]:,})")

    # Zipf check: correlation between log(rank) and log(frequency)
    counts_list = list(word_counts.values())

    print("\n📈 ZIPF'S LAW VALIDATION:")
    if total_words >= 2 and min(counts_list) > 0:
        log_ranks = np.log(np.arange(1, total_words + 1))
        log_freqs = np.log(counts_list)
        correlation = np.corrcoef(log_ranks, log_freqs)[0, 1]

        if abs(correlation) > 0.95:
            compliance = '✅ Excellent'
        elif abs(correlation) > 0.8:
            compliance = '⚠️ Moderate'
        else:
            compliance = '❌ Poor'

        print(f" • Log-log correlation: {correlation:.4f}")
        print(f" • Zipf compliance: {compliance}")
    else:
        # log(0) and single-point correlations are undefined.
        print(" • Log-log correlation: n/a (needs at least two words with positive counts)")

    # Recommendations
    print("\n💡 RECOMMENDATIONS FOR CROSSWORD GENERATION:")
    print(f" • Dataset size: {total_words:,} words with excellent coverage")
    print(f" • Filter to 3-12 letters: Reduces to {suitable_count:,} words ({suitable_pct:.1f}%)")
    print(" • Difficulty thresholds (for crossword-suitable words):")
    print(f" - Easy: ranks 1-5,000 ({easy_words:,} suitable words)")
    print(f" - Medium: ranks 5,001-25,000 ({medium_words:,} suitable words)")
    print(f" - Hard: ranks 25,001+ ({hard_words:,} suitable words)")
    print(" • Quality: ✅ No garbage entries (unlike crossword-specific lists)")
    print(" • Source credibility: ✅ Peter Norvig (Google) + Google Books corpus")

    print("="*80)
|
|
|
|
|
def main():
    """Run the full analysis for the file named on the command line.

    Steps: parse arguments, resolve the input path, load the word counts,
    build and save the 6-panel figure, print the summary report, and finally
    point the user at the generated image.

    A relative input path is looked up next to this script first (the
    original behavior); if nothing exists there but the path exists relative
    to the current working directory, that file is used instead.

    Exits with status 1 when the word list cannot be loaded.
    """
    args = parse_arguments()

    base_dir = Path(__file__).parent
    input_file = Path(args.filename)

    if not input_file.is_absolute():
        script_relative = base_dir / input_file
        if not script_relative.exists() and input_file.exists():
            # Path typed relative to the shell's working directory.
            input_file = input_file.resolve()
        else:
            input_file = script_relative

    print("🔍 Norvig Vocabulary Statistical Analysis")
    print("=" * 50)
    print(f"📁 Analyzing: {input_file}")

    word_counts = load_word_counts(input_file)

    if not word_counts:
        print(f"❌ Could not load word list from {input_file}. Please check file path.")
        # Non-zero status so shells and pipelines can detect the failure.
        sys.exit(1)

    # Build and save the figure; also yields the crossword-suitable subset.
    fig, crossword_suitable = create_comprehensive_analysis(word_counts, input_file.name, base_dir)

    print_summary_statistics(word_counts, input_file.name, crossword_suitable)

    # The figure is already on disk; release its memory.
    plt.close(fig)

    # Mirror the naming rule used when the figure was saved so the final
    # message points at the right file.
    if 'count_1w100k' in input_file.name:
        output_name = 'norvig_analysis_100k.png'
    elif 'count_1w.txt' in input_file.name:
        output_name = 'norvig_analysis_full.png'
    else:
        safe_name = input_file.name.replace('.txt', '').replace('/', '_').replace('count_', '')
        output_name = f'norvig_analysis_{safe_name}.png'

    print(f"\n✅ Analysis complete! Check {base_dir}/{output_name} for detailed plots.")
|
|
|
|
|
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()