File size: 16,033 Bytes
bfd6ff4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
#!/usr/bin/env python3
"""
Statistical Analysis of Norvig Word Count Files

Analyzes a single Norvig word count file (count_1w.txt or count_1w100k.txt) 
from norvig.com/ngrams/ to understand vocabulary characteristics for crossword generation.

Usage:
    python analyze_norvig_vocabulary.py <filename>
    python analyze_norvig_vocabulary.py --help

Examples:
    python analyze_norvig_vocabulary.py norvig/count_1w100k.txt
    python analyze_norvig_vocabulary.py norvig/count_1w.txt
"""

import os
import sys
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter, defaultdict
import seaborn as sns
from pathlib import Path

# Global plot styling, applied once when this module is imported:
#   - matplotlib style sheet 'seaborn-v0_8'
#   - seaborn colour palette "husl"
# NOTE(review): the 'seaborn-v0_8' style name is presumably only available in
# newer matplotlib releases (3.6+) - confirm against the installed version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

def parse_arguments():
    """Parse the command-line arguments of the analysis script.

    The arguments are read from ``sys.argv`` by ``argparse``; ``--help``
    prints the usage text (including the epilog below, kept verbatim by
    ``RawDescriptionHelpFormatter``) and exits.

    Returns:
        argparse.Namespace: Namespace whose ``filename`` attribute holds the
        word count file path given on the command line.
    """
    parser = argparse.ArgumentParser(
        description='Analyze Norvig word count files for crossword generation',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # The "Output" section lists the file names the script really writes;
        # it previously advertised a PNG name that is never produced.
        epilog="""
Examples:
  python analyze_norvig_vocabulary.py norvig/count_1w100k.txt
  python analyze_norvig_vocabulary.py norvig/count_1w.txt
  python analyze_norvig_vocabulary.py --help

File formats supported:
  - count_1w100k.txt: Top 100,000 most frequent words
  - count_1w.txt: Full word count dataset (1M+ words)

Output:
  - Comprehensive statistical analysis
  - 6-panel visualization saved next to this script as
    norvig_analysis_100k.png (count_1w100k.txt),
    norvig_analysis_full.png (count_1w.txt) or
    norvig_analysis_<name>.png (any other input file)
  - Summary statistics printed to console
        """
    )

    parser.add_argument(
        'filename',
        help='Path to Norvig word count file (e.g., norvig/count_1w100k.txt)'
    )

    return parser.parse_args()

def load_word_counts(filepath):
    """Load a tab-separated word count file into a ``{WORD: count}`` dict.

    Every non-blank line is expected to look like ``word<TAB>count``. A line
    with a single field is stored with a count of 1. Words are upper-cased
    before being stored, and entries keep the order in which they appear in
    the file.

    Lines that cannot be parsed (a count that is not an integer, or more
    than two tab-separated fields) are skipped and reported in a single
    warning. Previously one bad count made the whole load return ``{}``.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        dict: Upper-cased word -> integer count. An empty dict is returned
        when the file is missing, cannot be read, or cannot be decoded.
    """
    word_counts = {}
    skipped_lines = 0

    print(f"Loading {filepath}...")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue

                parts = stripped.split('\t')
                if len(parts) == 2:
                    word, count = parts
                    try:
                        word_counts[word.upper()] = int(count)
                    except ValueError:
                        # Bad count on this line only; keep the rest of the file.
                        skipped_lines += 1
                elif len(parts) == 1:
                    # Handle case where count might be missing
                    word_counts[parts[0].upper()] = 1
                else:
                    skipped_lines += 1
    except FileNotFoundError:
        print(f"❌ File not found: {filepath}")
        return {}
    except (OSError, UnicodeError) as e:
        # Unreadable or undecodable file: report it and return nothing.
        print(f"❌ Error loading {filepath}: {e}")
        return {}

    if skipped_lines:
        print(f"⚠️ Skipped {skipped_lines:,} malformed line(s) in {filepath}")
    print(f"βœ… Loaded {len(word_counts):,} words from {filepath}")
    return word_counts

def analyze_word_lengths(words):
    """Measure every word and tally how often each length occurs.

    Args:
        words: Iterable of strings.

    Returns:
        tuple: ``(lengths, length_dist)`` where ``lengths`` is the list of
        word lengths in iteration order and ``length_dist`` is a ``Counter``
        mapping length -> number of words of that length.
    """
    word_lengths = list(map(len, words))
    return word_lengths, Counter(word_lengths)

def classify_difficulty(rank, total_words):
    """Map a frequency rank to a difficulty label.

    The rank is compared against fixed fractions of the vocabulary size;
    the first tier whose upper bound contains the rank wins.

    Args:
        rank: Frequency rank of the word (lower = more frequent).
        total_words: Size of the vocabulary the rank belongs to.

    Returns:
        str: "Very Easy", "Easy", "Medium", "Hard" or "Very Hard".
    """
    # (upper bound as a fraction of the vocabulary, label), checked in order
    tiers = (
        (0.05, "Very Easy"),  # Top 5%
        (0.20, "Easy"),       # Top 20%
        (0.60, "Medium"),     # Top 60%
        (0.85, "Hard"),       # Top 85%
    )
    for fraction, label in tiers:
        if rank <= total_words * fraction:
            return label
    return "Very Hard"

def create_comprehensive_analysis(word_counts, filename, base_dir):
    """Render the six-panel vocabulary figure and save it as a PNG.

    Panels (2x3 grid):
      1. Rank/frequency log-log plot with a ``counts[0] / rank`` reference line.
      2. Word length histogram with the 3-12 letter range highlighted.
      3. Pie chart of the ``classify_difficulty`` tiers.
      4. Cumulative frequency coverage by vocabulary size.
      5. Pie chart of crossword-suitable vs. unsuitable words.
      6. Suitable-word counts per frequency-rank bucket, with example words.

    The position of a word in ``word_counts`` is used as its frequency rank
    (first entry = rank 1); the mapping is not re-sorted here.

    Args:
        word_counts: Mapping of word -> count. Must be non-empty, because the
            first count anchors the reference line of panel 1.
        filename: Input file name; shown in the figure title and used to pick
            the output file name.
        base_dir: Directory the PNG is written to. It is joined with ``/``,
            so a ``pathlib.Path`` is expected.

    Returns:
        tuple: ``(fig, crossword_suitable)`` - the matplotlib figure and the
        dict (word -> count) of words that are 3-12 characters long and
        purely alphabetic.
    """
    # Rank boundaries of the crossword difficulty buckets shown in panel 6.
    easy_threshold = 5000
    medium_threshold = 25000

    def is_crossword_word(word):
        """Suitability rule shared by panels 5 and 6."""
        return 3 <= len(word) <= 12 and word.isalpha()

    # Create figure with subplots - 2x3 layout with good spacing
    fig = plt.figure(figsize=(18, 12))
    fig.suptitle(f'Norvig Word Count Analysis - {filename}',
                 fontsize=16, fontweight='bold', y=0.95)

    # Parallel lists in dict order; index i corresponds to rank i + 1
    words = list(word_counts.keys())
    counts = list(word_counts.values())
    ranks = list(range(1, len(counts) + 1))

    # 1. Zipf's Law Analysis (log-log plot)
    plt.subplot(2, 3, 1)
    plt.loglog(ranks, counts, 'b-', alpha=0.7, linewidth=2)
    plt.xlabel('Rank (log scale)')
    plt.ylabel('Frequency (log scale)')
    plt.title('Zipf\'s Law Validation', fontweight='bold')
    plt.grid(True, alpha=0.3)

    # Add theoretical Zipf line for comparison
    theoretical_zipf = [counts[0] / r for r in ranks]
    plt.loglog(ranks, theoretical_zipf, 'r--', alpha=0.5, label='Theoretical')
    plt.legend()

    # 2. Word Length Distribution
    plt.subplot(2, 3, 2)
    _, length_dist = analyze_word_lengths(words)
    lengths_list = sorted(length_dist.keys())
    counts_list = [length_dist[length] for length in lengths_list]

    bars = plt.bar(lengths_list, counts_list, alpha=0.7, color='skyblue', edgecolor='navy')
    plt.xlabel('Word Length (characters)')
    plt.ylabel('Number of Words')
    plt.title('Word Length Distribution', fontweight='bold')

    # Highlight crossword-suitable range (3-12 letters); lengths 13-15 keep
    # the default bar colour.
    for length, bar in zip(lengths_list, bars):
        if 3 <= length <= 12:
            bar.set_color('lightgreen')
        elif length < 3 or length > 15:
            bar.set_color('lightcoral')

    plt.axvspan(3, 12, alpha=0.2, color='green', label='Crossword Range')
    plt.legend()

    # 3. Difficulty Distribution
    plt.subplot(2, 3, 3)
    # Colours are tied to the tier label, not to the slice position, so a
    # tier that is absent (small vocabularies) cannot shift the colours of
    # the remaining tiers.
    difficulty_colors = {
        'Very Easy': 'darkgreen',
        'Easy': 'green',
        'Medium': 'orange',
        'Hard': 'red',
        'Very Hard': 'darkred',
    }
    difficulty_dist = Counter(classify_difficulty(rank, len(ranks)) for rank in ranks)
    diff_labels = [label for label in difficulty_colors if label in difficulty_dist]
    diff_counts = [difficulty_dist[label] for label in diff_labels]

    plt.pie(diff_counts, labels=diff_labels, autopct='%1.1f%%',
            colors=[difficulty_colors[label] for label in diff_labels], startangle=90)
    plt.title('Difficulty Distribution', fontweight='bold')

    # 4. Cumulative Frequency Coverage
    plt.subplot(2, 3, 4)
    cumulative_freq = np.cumsum(counts)
    total_freq = cumulative_freq[-1]
    coverage_pct = (cumulative_freq / total_freq) * 100

    plt.plot(ranks, coverage_pct, 'g-', linewidth=2)
    plt.xlabel('Vocabulary Size')
    plt.ylabel('Coverage (%)')
    plt.title('Cumulative Coverage', fontweight='bold')
    plt.grid(True, alpha=0.3)

    # Add key milestone markers (only those inside the plotted range)
    milestones = [1000, 5000, 10000, 25000, 50000]
    for milestone in milestones:
        if milestone < len(coverage_pct):
            plt.axvline(x=milestone, color='red', linestyle='--', alpha=0.5)

    # 5. Crossword Suitability
    plt.subplot(2, 3, 5)
    crossword_suitable = {word: count for word, count in word_counts.items()
                          if is_crossword_word(word)}

    total_words = len(word_counts)
    suitable_words = len(crossword_suitable)
    unsuitable_words = total_words - suitable_words

    labels = [f'Suitable\n{suitable_words:,}', f'Not Suitable\n{unsuitable_words:,}']
    sizes = [suitable_words, unsuitable_words]
    colors = ['lightgreen', 'lightcoral']

    plt.pie(sizes, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90)
    plt.title('Crossword Suitability', fontweight='bold')

    # 6. Difficulty Categories for Crosswords
    ax6 = plt.subplot(2, 3, 6)

    # Count suitable words per rank bucket, using the same suitability rule
    # as panel 5. Slices never run past the end of the list, so no extra
    # index guard is needed.
    easy_words = sum(1 for word in words[:easy_threshold] if is_crossword_word(word))
    medium_words = sum(1 for word in words[easy_threshold:medium_threshold]
                       if is_crossword_word(word))
    hard_words = sum(1 for word in words[medium_threshold:] if is_crossword_word(word))

    categories = ['Easy', 'Medium', 'Hard']
    word_counts_cat = [easy_words, medium_words, hard_words]
    colors_cat = ['lightgreen', 'gold', 'lightcoral']

    bars = plt.bar(categories, word_counts_cat, color=colors_cat, alpha=0.8)
    plt.ylabel('Crossword Words')
    plt.title('Difficulty Categories\n(Based on Frequency Rank)', fontweight='bold')

    # Add value labels on bars, offset by 2% of the tallest bar
    tallest = max(word_counts_cat)
    for bar, count in zip(bars, word_counts_cat):
        height = bar.get_height()
        if height > 0:
            plt.text(bar.get_x() + bar.get_width()/2, height + tallest*0.02,
                     f'{count:,}', ha='center', va='bottom', fontweight='bold')

    # Up to three example words per bucket, taken from fixed rank windows
    # that lie inside each bucket.
    easy_examples = [w for w in words[:100] if is_crossword_word(w)][:3]
    medium_examples = [w for w in words[7000:12000] if is_crossword_word(w)][:3]
    hard_examples = [w for w in words[30000:35000] if is_crossword_word(w)][:3]

    explanation = (f'Easy: Ranks 1-{easy_threshold:,} (most frequent)\n'
                   f'  e.g., {", ".join(easy_examples)}\n'
                   f'Medium: Ranks {easy_threshold + 1:,}-{medium_threshold:,}\n'
                   f'  e.g., {", ".join(medium_examples)}\n'
                   f'Hard: Ranks {medium_threshold + 1:,}+ (least frequent)\n'
                   f'  e.g., {", ".join(hard_examples)}\n\n'
                   'Lower rank = higher frequency = easier')

    plt.text(0.98, 0.98, explanation, transform=ax6.transAxes,
             fontsize=8, verticalalignment='top', horizontalalignment='right',
             bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.9))

    # Adjust layout with proper spacing
    plt.subplots_adjust(left=0.08, bottom=0.08, right=0.95, top=0.88, wspace=0.35, hspace=0.45)

    # Pick the output file name from the input file name
    if 'count_1w100k' in filename:
        output_name = 'norvig_analysis_100k.png'
    elif 'count_1w.txt' in filename:
        output_name = 'norvig_analysis_full.png'
    else:
        # Fallback for any other filename - make it filesystem safe
        safe_name = filename.replace('.txt', '').replace('/', '_').replace('count_', '')
        output_name = f'norvig_analysis_{safe_name}.png'

    output_path = base_dir / output_name
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"πŸ“Š Comprehensive analysis saved to: {output_path}")

    return fig, crossword_suitable

def print_summary_statistics(word_counts, filename, crossword_suitable):
    """Print the console report for a loaded word count file.

    Sections: basic totals, word length analysis, crossword suitability,
    difficulty distribution, top/bottom ten words, a log-log rank/frequency
    correlation, and recommendations.

    The position of a word in ``word_counts`` is used as its frequency rank
    (first entry = rank 1).

    Args:
        word_counts: Mapping of word -> count. Must be non-empty, because
            averages and percentages divide by its size and total.
        filename: Name printed in the report header.
        crossword_suitable: Mapping of word -> count holding the subset of
            ``word_counts`` considered suitable for crosswords.
    """

    print("\n" + "="*80)
    print("πŸ“Š NORVIG VOCABULARY STATISTICAL ANALYSIS")
    print(f"πŸ“ File: {filename}")
    print("="*80)

    # Basic statistics
    total_words = len(word_counts)
    total_frequency = sum(word_counts.values())

    print(f"\nπŸ“š BASIC STATISTICS:")
    print(f"   β€’ Total words: {total_words:,}")
    print(f"   β€’ Total frequency: {total_frequency:,}")
    print(f"   β€’ Average frequency: {total_frequency/total_words:.2f}")

    # Word length analysis
    lengths, length_dist = analyze_word_lengths(word_counts.keys())
    avg_length = np.mean(lengths)
    crossword_length_words = sum(count for length, count in length_dist.items() if 3 <= length <= 12)
    crossword_length_pct = (crossword_length_words / total_words) * 100

    print(f"\nπŸ“ WORD LENGTH ANALYSIS:")
    print(f"   β€’ Average word length: {avg_length:.1f} characters")
    print(f"   β€’ Words 3-12 characters: {crossword_length_words:,} ({crossword_length_pct:.1f}%)")
    print(f"   β€’ Most common lengths: {sorted(length_dist.items(), key=lambda x: x[1], reverse=True)[:5]}")

    # Crossword suitability
    suitable_count = len(crossword_suitable)
    suitable_pct = (suitable_count / total_words) * 100
    suitable_freq = sum(crossword_suitable.values())
    suitable_freq_pct = (suitable_freq / total_frequency) * 100

    print(f"\n🧩 CROSSWORD SUITABILITY:")
    print(f"   β€’ Suitable words (3-12 letters, alphabetic): {suitable_count:,} ({suitable_pct:.1f}%)")
    print(f"   β€’ Suitable word frequency coverage: {suitable_freq_pct:.1f}%")

    # Difficulty distribution for crosswords: count the suitable words whose
    # OVERALL frequency rank falls into each bucket. Slicing the suitable
    # subset by its own position would always report min(5000, n) "easy"
    # words, regardless of where those words rank in the full vocabulary.
    easy_words = medium_words = hard_words = 0
    for rank, word in enumerate(word_counts, start=1):
        if word not in crossword_suitable:
            continue
        if rank <= 5000:
            easy_words += 1
        elif rank <= 25000:
            medium_words += 1
        else:
            hard_words += 1

    print(f"\n🎯 CROSSWORD DIFFICULTY DISTRIBUTION:")
    print(f"   β€’ Easy (rank 1-5K): {easy_words:,} words")
    print(f"   β€’ Medium (rank 5K-25K): {medium_words:,} words")
    print(f"   β€’ Hard (rank 25K+): {hard_words:,} words")

    # Top and bottom words examples
    words_list = list(word_counts.keys())
    print(f"\nπŸ” TOP 10 MOST FREQUENT WORDS:")
    for i, word in enumerate(words_list[:10], 1):
        print(f"   {i:2d}. {word:<12} ({word_counts[word]:,})")

    print(f"\nπŸ”š BOTTOM 10 LEAST FREQUENT WORDS:")
    for i, word in enumerate(words_list[-10:], 1):
        print(f"   {i:2d}. {word:<12} ({word_counts[word]:,})")

    # Zipf's law validation: correlation between log(rank) and log(frequency)
    counts_list = list(word_counts.values())
    log_ranks = np.log(range(1, len(counts_list) + 1))
    log_freqs = np.log(counts_list)
    correlation = np.corrcoef(log_ranks, log_freqs)[0, 1]

    if abs(correlation) > 0.95:
        compliance = 'βœ… Excellent'
    elif abs(correlation) > 0.8:
        compliance = '⚠️ Moderate'
    else:
        compliance = '❌ Poor'

    print(f"\nπŸ“ˆ ZIPF'S LAW VALIDATION:")
    print(f"   β€’ Log-log correlation: {correlation:.4f}")
    print(f"   β€’ Zipf compliance: {compliance}")

    # Recommendations
    print(f"\nπŸ’‘ RECOMMENDATIONS FOR CROSSWORD GENERATION:")
    print(f"   β€’ Dataset size: {total_words:,} words with excellent coverage")
    print(f"   β€’ Filter to 3-12 letters: Reduces to {suitable_count:,} words ({suitable_pct:.1f}%)")
    print(f"   β€’ Difficulty thresholds (for crossword-suitable words):")
    print(f"     - Easy: ranks 1-5,000 ({easy_words:,} suitable words)")
    print(f"     - Medium: ranks 5,001-25,000 ({medium_words:,} suitable words)")
    print(f"     - Hard: ranks 25,001+ ({hard_words:,} suitable words)")
    print(f"   β€’ Quality: βœ… No garbage entries (unlike crossword-specific lists)")
    print(f"   β€’ Source credibility: βœ… Peter Norvig (Google) + Google Books corpus")

    print("="*80)

def main():
    """Run the command-line analysis end to end.

    Steps: parse the arguments, load the word count file, render and save
    the six-panel figure, print the summary statistics, and report where
    the PNG was written (the directory containing this script).

    A relative input path is resolved against the script directory first;
    if nothing exists there but the path exists relative to the current
    working directory, that file is used instead.

    Raises:
        SystemExit: With status 1 when no words could be loaded, so that
            shell callers can detect the failure.
    """

    # Parse command line arguments
    args = parse_arguments()

    # File paths
    base_dir = Path(__file__).parent
    input_file = Path(args.filename)

    if not input_file.is_absolute():
        script_relative = base_dir / input_file
        if script_relative.exists() or not input_file.exists():
            # Default: resolve against the script directory
            input_file = script_relative
        else:
            # Only the working-directory location exists; use it
            input_file = input_file.resolve()

    print("πŸ” Norvig Vocabulary Statistical Analysis")
    print("=" * 50)
    print(f"πŸ“ Analyzing: {input_file}")

    # Load data
    word_counts = load_word_counts(input_file)

    if not word_counts:
        print(f"❌ Could not load word list from {input_file}. Please check file path.")
        # Non-zero exit status instead of a silent "successful" return
        sys.exit(1)

    # Create comprehensive analysis (also saves the PNG)
    fig, crossword_suitable = create_comprehensive_analysis(word_counts, input_file.name, base_dir)

    # Print summary statistics
    print_summary_statistics(word_counts, input_file.name, crossword_suitable)

    # The plot is saved, not shown interactively; release the figure.
    plt.close(fig)

    # Same output-name rules as create_comprehensive_analysis, repeated here
    # because that function does not return the path it wrote.
    if 'count_1w100k' in input_file.name:
        output_name = 'norvig_analysis_100k.png'
    elif 'count_1w.txt' in input_file.name:
        output_name = 'norvig_analysis_full.png'
    else:
        safe_name = input_file.name.replace('.txt', '').replace('/', '_').replace('count_', '')
        output_name = f'norvig_analysis_{safe_name}.png'

    print(f"\nβœ… Analysis complete! Check {base_dir / output_name} for detailed plots.")

# Run the analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()