Spaces:

vimalk78
/

abc123

Sleeping

File size: 16,033 Bytes

bfd6ff4

#!/usr/bin/env python3
"""
Statistical Analysis of Norvig Word Count Files

Analyzes a single Norvig word count file (count_1w.txt or count_1w100k.txt) 
from norvig.com/ngrams/ to understand vocabulary characteristics for crossword generation.

Usage:
    python analyze_norvig_vocabulary.py <filename>
    python analyze_norvig_vocabulary.py --help

Examples:
    python analyze_norvig_vocabulary.py norvig/count_1w100k.txt
    python analyze_norvig_vocabulary.py norvig/count_1w.txt
"""

import os
import sys
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter, defaultdict
import seaborn as sns
from pathlib import Path

# Set style for better plots.
# The 'seaborn-v0_8' style name only exists in Matplotlib >= 3.6, where the
# bundled seaborn styles were renamed; older releases ship the same style as
# 'seaborn'. Fall back rather than failing at import time.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette("husl")

def parse_arguments(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the process command line, which is
            what the script entry point relies on.

    Returns:
        argparse.Namespace with one attribute, ``filename``: the path of the
        Norvig word count file to analyze, exactly as given by the user.
    """
    parser = argparse.ArgumentParser(
        description='Analyze Norvig word count files for crossword generation',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python analyze_norvig_vocabulary.py norvig/count_1w100k.txt
  python analyze_norvig_vocabulary.py norvig/count_1w.txt
  python analyze_norvig_vocabulary.py --help

File formats supported:
  - count_1w100k.txt: Top 100,000 most frequent words
  - count_1w.txt: Full word count dataset (1M+ words)

Output:
  - Comprehensive statistical analysis
  - 6-panel visualization saved next to this script as
    norvig_analysis_100k.png, norvig_analysis_full.png or
    norvig_analysis_<name>.png, depending on the input file
  - Summary statistics printed to console
        """
    )

    parser.add_argument(
        'filename',
        help='Path to Norvig word count file (e.g., norvig/count_1w100k.txt)'
    )

    # parse_args(None) falls back to sys.argv[1:], so existing callers that
    # pass nothing keep their behavior.
    return parser.parse_args(argv)

def load_word_counts(filepath):
    """Load a word count file and return a dict of {WORD: count}.

    Each non-empty line is expected to be ``word<TAB>count``. A line that
    holds only a word (no tab) is accepted with a count of 1. Words are
    upper-cased; if the same upper-cased word appears more than once, the
    last count read wins.

    Lines whose count is not an integer, or that have more than two
    tab-separated fields, are skipped and reported in a single summary line
    instead of aborting the whole load.

    Args:
        filepath: Path (str or path-like) of the file, read as UTF-8.

    Returns:
        dict mapping upper-cased word to int count, in file order. An empty
        dict is returned when the file cannot be opened, read or decoded.
    """
    word_counts = {}
    skipped_lines = 0

    print(f"Loading {filepath}...")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue

                parts = stripped.split('\t')
                if len(parts) == 2:
                    word, count = parts
                    try:
                        # int() is evaluated before the assignment, so a bad
                        # count never leaves a partial entry behind.
                        word_counts[word.upper()] = int(count)
                    except ValueError:
                        skipped_lines += 1
                elif len(parts) == 1:
                    # Handle case where count might be missing
                    word_counts[parts[0].upper()] = 1
                else:
                    skipped_lines += 1
    except FileNotFoundError:
        print(f"❌ File not found: {filepath}")
        return {}
    except (OSError, UnicodeError) as e:
        # Covers permission problems, directories, and undecodable bytes.
        print(f"❌ Error loading {filepath}: {e}")
        return {}

    if skipped_lines:
        print(f"⚠️ Skipped {skipped_lines:,} malformed line(s) in {filepath}")
    print(f"✅ Loaded {len(word_counts):,} words from {filepath}")
    return word_counts

def analyze_word_lengths(words):
    """Measure every word and tally how often each length occurs.

    Args:
        words: Iterable of strings.

    Returns:
        Tuple ``(lengths, length_dist)``: the list of per-word lengths in
        iteration order, and a Counter mapping length -> number of words
        with that length.
    """
    lengths = list(map(len, words))
    return lengths, Counter(lengths)

def classify_difficulty(rank, total_words):
    """Map a frequency rank to a difficulty label.

    The rank is compared against fixed fractions of the vocabulary size:
    top 5% -> "Very Easy", top 20% -> "Easy", top 60% -> "Medium",
    top 85% -> "Hard", everything else -> "Very Hard".

    Args:
        rank: Frequency rank of the word (the caller in this file passes
            1-based ranks).
        total_words: Number of words in the vocabulary.

    Returns:
        One of the five difficulty label strings.
    """
    # (cumulative share of the vocabulary, label), checked from easiest up.
    tiers = (
        (0.05, "Very Easy"),
        (0.20, "Easy"),
        (0.60, "Medium"),
        (0.85, "Hard"),
    )
    for share, label in tiers:
        if rank <= total_words * share:
            return label
    return "Very Hard"

def _analysis_output_name(filename):
    """Return the PNG file name used for the analysis of *filename*."""
    if 'count_1w100k' in filename:
        return 'norvig_analysis_100k.png'
    if 'count_1w.txt' in filename:
        return 'norvig_analysis_full.png'
    # Fallback for any other filename - make it filesystem safe
    safe_name = filename.replace('.txt', '').replace('/', '_').replace('count_', '')
    return f'norvig_analysis_{safe_name}.png'


def _is_crossword_suitable(word):
    """Return True for purely alphabetic words that are 3-12 characters long."""
    return 3 <= len(word) <= 12 and word.isalpha()


def create_comprehensive_analysis(word_counts, filename, base_dir):
    """Create the 6-panel statistical analysis figure and save it as a PNG.

    Panels: (1) rank/frequency log-log plot with a theoretical 1/rank line,
    (2) word length distribution, (3) difficulty tiers as assigned by
    ``classify_difficulty``, (4) cumulative frequency coverage,
    (5) crossword suitability split, (6) suitable words per rank bucket.

    Words are ranked by descending count (ties keep their order in
    ``word_counts``), so the plots do not depend on the mapping already
    being in frequency order.

    Args:
        word_counts: Mapping of word -> count. Must not be empty.
        filename: Name of the analyzed file; used in the figure title and to
            derive the output PNG name.
        base_dir: Directory (``Path`` or str) the PNG is written to.

    Returns:
        Tuple ``(fig, crossword_suitable)``: the Matplotlib figure and a dict
        {word: count}, in rank order, of the alphabetic words that are 3-12
        characters long.

    Raises:
        ValueError: If ``word_counts`` is empty.
    """
    # Fail before any figure is allocated; the Zipf reference line needs at
    # least one count.
    if not word_counts:
        raise ValueError("word_counts is empty; nothing to analyze")

    # Rank explicitly. sorted() is stable (also with reverse=True), so a
    # file that is already in frequency order keeps exactly that order.
    ranked = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    words = [word for word, _ in ranked]
    counts = [count for _, count in ranked]
    ranks = list(range(1, len(counts) + 1))

    # Create figure with subplots - 2x3 layout with good spacing
    fig = plt.figure(figsize=(18, 12))
    fig.suptitle(f'Norvig Word Count Analysis - {filename}',
                 fontsize=16, fontweight='bold', y=0.95)

    # 1. Zipf's Law Analysis (log-log plot)
    plt.subplot(2, 3, 1)
    plt.loglog(ranks, counts, 'b-', alpha=0.7, linewidth=2)
    plt.xlabel('Rank (log scale)')
    plt.ylabel('Frequency (log scale)')
    plt.title('Zipf\'s Law Validation', fontweight='bold')
    plt.grid(True, alpha=0.3)

    # Add theoretical Zipf line for comparison: top count divided by rank
    theoretical_zipf = [counts[0] / r for r in ranks]
    plt.loglog(ranks, theoretical_zipf, 'r--', alpha=0.5, label='Theoretical')
    plt.legend()

    # 2. Word Length Distribution
    plt.subplot(2, 3, 2)
    _, length_dist = analyze_word_lengths(words)
    lengths_list = sorted(length_dist)
    counts_list = [length_dist[length] for length in lengths_list]

    bars = plt.bar(lengths_list, counts_list, alpha=0.7, color='skyblue', edgecolor='navy')
    plt.xlabel('Word Length (characters)')
    plt.ylabel('Number of Words')
    plt.title('Word Length Distribution', fontweight='bold')

    # Highlight crossword-suitable range (3-12 letters); lengths 13-15 keep
    # the default bar colour.
    for bar, length in zip(bars, lengths_list):
        if 3 <= length <= 12:
            bar.set_color('lightgreen')
        elif length < 3 or length > 15:
            bar.set_color('lightcoral')

    plt.axvspan(3, 12, alpha=0.2, color='green', label='Crossword Range')
    plt.legend()

    # 3. Difficulty Distribution
    plt.subplot(2, 3, 3)
    # Colours are bound to labels, not to position, so a small vocabulary
    # that lacks some tiers is still coloured correctly.
    difficulty_colors = {
        'Very Easy': 'darkgreen',
        'Easy': 'green',
        'Medium': 'orange',
        'Hard': 'red',
        'Very Hard': 'darkred',
    }
    difficulty_dist = Counter(classify_difficulty(rank, len(ranks)) for rank in ranks)
    diff_labels = list(difficulty_dist)
    diff_counts = [difficulty_dist[label] for label in diff_labels]
    diff_colors = [difficulty_colors.get(label, 'gray') for label in diff_labels]

    plt.pie(diff_counts, labels=diff_labels, autopct='%1.1f%%',
            colors=diff_colors, startangle=90)
    plt.title('Difficulty Distribution', fontweight='bold')

    # 4. Cumulative Frequency Coverage
    plt.subplot(2, 3, 4)
    cumulative_freq = np.cumsum(counts)
    total_freq = cumulative_freq[-1]
    coverage_pct = (cumulative_freq / total_freq) * 100

    plt.plot(ranks, coverage_pct, 'g-', linewidth=2)
    plt.xlabel('Vocabulary Size')
    plt.ylabel('Coverage (%)')
    plt.title('Cumulative Coverage', fontweight='bold')
    plt.grid(True, alpha=0.3)

    # Add key milestone markers that fall inside the vocabulary
    for milestone in (1000, 5000, 10000, 25000, 50000):
        if milestone < len(coverage_pct):
            plt.axvline(x=milestone, color='red', linestyle='--', alpha=0.5)

    # 5. Crossword Suitability
    plt.subplot(2, 3, 5)
    crossword_suitable = {word: count for word, count in ranked
                          if _is_crossword_suitable(word)}

    total_words = len(word_counts)
    suitable_words = len(crossword_suitable)
    unsuitable_words = total_words - suitable_words

    labels = [f'Suitable\n{suitable_words:,}', f'Not Suitable\n{unsuitable_words:,}']
    sizes = [suitable_words, unsuitable_words]

    plt.pie(sizes, labels=labels, autopct='%1.1f%%',
            colors=['lightgreen', 'lightcoral'], startangle=90)
    plt.title('Crossword Suitability', fontweight='bold')

    # 6. Difficulty Categories for Crosswords
    ax6 = plt.subplot(2, 3, 6)

    # Define crossword difficulty thresholds (overall frequency ranks)
    easy_threshold = 5000
    medium_threshold = 25000

    def count_suitable(rank_slice):
        # Same suitability rule as panel 5, so both panels agree.
        return sum(1 for word in rank_slice if _is_crossword_suitable(word))

    easy_words = count_suitable(words[:easy_threshold])
    medium_words = count_suitable(words[easy_threshold:medium_threshold])
    hard_words = count_suitable(words[medium_threshold:])

    categories = ['Easy', 'Medium', 'Hard']
    word_counts_cat = [easy_words, medium_words, hard_words]
    colors_cat = ['lightgreen', 'gold', 'lightcoral']

    bars = plt.bar(categories, word_counts_cat, color=colors_cat, alpha=0.8)
    plt.ylabel('Crossword Words')
    plt.title('Difficulty Categories\n(Based on Frequency Rank)', fontweight='bold')

    # Add value labels on bars, lifted by 2% of the tallest bar
    label_offset = max(word_counts_cat) * 0.02
    for bar, count in zip(bars, word_counts_cat):
        height = bar.get_height()
        if height > 0:
            plt.text(bar.get_x() + bar.get_width()/2, height + label_offset,
                     f'{count:,}', ha='center', va='bottom', fontweight='bold')

    # Add explanation text box with up to three example words per category.
    # The example windows are empty for vocabularies shorter than the window.
    def examples(rank_slice):
        return ", ".join([w for w in rank_slice if _is_crossword_suitable(w)][:3])

    easy_text = examples(words[:100])
    medium_text = examples(words[7000:12000])
    hard_text = examples(words[30000:35000])

    explanation = (f'Easy: Ranks 1-5,000 (most frequent)\n'
                   f'  e.g., {easy_text}\n'
                   f'Medium: Ranks 5,001-25,000\n'
                   f'  e.g., {medium_text}\n'
                   f'Hard: Ranks 25,001+ (least frequent)\n'
                   f'  e.g., {hard_text}\n\n'
                   'Lower rank = higher frequency = easier')

    plt.text(0.98, 0.98, explanation, transform=ax6.transAxes,
             fontsize=8, verticalalignment='top', horizontalalignment='right',
             bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.9))

    # Adjust layout with proper spacing
    plt.subplots_adjust(left=0.08, bottom=0.08, right=0.95, top=0.88, wspace=0.35, hspace=0.45)

    # Save the comprehensive analysis with filename in the output name
    output_path = Path(base_dir) / _analysis_output_name(filename)
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"📊 Comprehensive analysis saved to: {output_path}")

    return fig, crossword_suitable

def print_summary_statistics(word_counts, filename, crossword_suitable):
    """Print comprehensive summary statistics to stdout.

    Words are ranked by descending count (ties keep their order in
    ``word_counts``). The Easy/Medium/Hard figures count the
    crossword-suitable words whose *overall* rank falls in 1-5,000,
    5,001-25,000 and 25,001+, which is what the printed labels describe.

    Args:
        word_counts: Mapping of word -> count. An empty mapping prints the
            header and a short notice instead of statistics.
        filename: Name of the analyzed file, shown in the header.
        crossword_suitable: Mapping of word -> count holding the subset of
            ``word_counts`` considered suitable for crosswords.

    Returns:
        None.
    """
    print("\n" + "="*80)
    print("📊 NORVIG VOCABULARY STATISTICAL ANALYSIS")
    print(f"📁 File: {filename}")
    print("="*80)

    # Basic statistics
    total_words = len(word_counts)
    if total_words == 0:
        # Every percentage below divides by the word count.
        print("\n   • No words loaded - nothing to summarize")
        print("="*80)
        return

    total_frequency = sum(word_counts.values())

    print(f"\n📚 BASIC STATISTICS:")
    print(f"   • Total words: {total_words:,}")
    print(f"   • Total frequency: {total_frequency:,}")
    print(f"   • Average frequency: {total_frequency/total_words:.2f}")

    # Word length analysis
    lengths, length_dist = analyze_word_lengths(word_counts.keys())
    avg_length = np.mean(lengths)
    crossword_length_words = sum(count for length, count in length_dist.items() if 3 <= length <= 12)
    crossword_length_pct = (crossword_length_words / total_words) * 100

    print(f"\n📏 WORD LENGTH ANALYSIS:")
    print(f"   • Average word length: {avg_length:.1f} characters")
    print(f"   • Words 3-12 characters: {crossword_length_words:,} ({crossword_length_pct:.1f}%)")
    print(f"   • Most common lengths: {sorted(length_dist.items(), key=lambda x: x[1], reverse=True)[:5]}")

    # Crossword suitability
    suitable_count = len(crossword_suitable)
    suitable_pct = (suitable_count / total_words) * 100
    suitable_freq = sum(crossword_suitable.values())
    # Guard against a file whose counts are all zero.
    suitable_freq_pct = (suitable_freq / total_frequency) * 100 if total_frequency else 0.0

    print(f"\n🧩 CROSSWORD SUITABILITY:")
    print(f"   • Suitable words (3-12 letters, alphabetic): {suitable_count:,} ({suitable_pct:.1f}%)")
    print(f"   • Suitable word frequency coverage: {suitable_freq_pct:.1f}%")

    # Rank all words by descending count; the stable sort keeps the order
    # of a file that is already in frequency order.
    ranked = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    words_list = [word for word, _ in ranked]
    counts_list = [count for _, count in ranked]

    # Difficulty distribution for crosswords, bucketed by overall rank
    def suitable_in(rank_slice):
        return sum(1 for word in rank_slice if word in crossword_suitable)

    easy_words = suitable_in(words_list[:5000])
    medium_words = suitable_in(words_list[5000:25000])
    hard_words = suitable_in(words_list[25000:])

    print(f"\n🎯 CROSSWORD DIFFICULTY DISTRIBUTION:")
    print(f"   • Easy (rank 1-5K): {easy_words:,} words")
    print(f"   • Medium (rank 5K-25K): {medium_words:,} words")
    print(f"   • Hard (rank 25K+): {hard_words:,} words")

    # Top and bottom words examples
    print(f"\n🔝 TOP 10 MOST FREQUENT WORDS:")
    for i, word in enumerate(words_list[:10], 1):
        print(f"   {i:2d}. {word:<12} ({word_counts[word]:,})")

    print(f"\n🔚 BOTTOM 10 LEAST FREQUENT WORDS:")
    for i, word in enumerate(words_list[-10:], 1):
        print(f"   {i:2d}. {word:<12} ({word_counts[word]:,})")

    # Zipf's law validation: correlation of log(rank) with log(frequency).
    # Non-positive counts sit at the tail of the ranking and have no
    # logarithm, so they are dropped; at least two points are needed.
    positive_counts = np.asarray([c for c in counts_list if c > 0], dtype=float)
    if positive_counts.size >= 2:
        log_ranks = np.log(np.arange(1, positive_counts.size + 1))
        log_freqs = np.log(positive_counts)
        correlation = np.corrcoef(log_ranks, log_freqs)[0, 1]
    else:
        correlation = float('nan')

    if abs(correlation) > 0.95:
        compliance = '✅ Excellent'
    elif abs(correlation) > 0.8:
        compliance = '⚠️ Moderate'
    else:
        compliance = '❌ Poor'

    print(f"\n📈 ZIPF'S LAW VALIDATION:")
    print(f"   • Log-log correlation: {correlation:.4f}")
    print(f"   • Zipf compliance: {compliance}")

    # Recommendations
    print(f"\n💡 RECOMMENDATIONS FOR CROSSWORD GENERATION:")
    print(f"   • Dataset size: {total_words:,} words with excellent coverage")
    print(f"   • Filter to 3-12 letters: Reduces to {suitable_count:,} words ({suitable_pct:.1f}%)")
    print(f"   • Difficulty thresholds (for crossword-suitable words):")
    print(f"     - Easy: ranks 1-5,000 ({easy_words:,} suitable words)")
    print(f"     - Medium: ranks 5,001-25,000 ({medium_words:,} suitable words)")
    print(f"     - Hard: ranks 25,001+ ({hard_words:,} suitable words)")
    print(f"   • Quality: ✅ No garbage entries (unlike crossword-specific lists)")
    print(f"   • Source credibility: ✅ Peter Norvig (Google) + Google Books corpus")

    print("="*80)

def main():
    """Run the full analysis for the file named on the command line.

    Loads the word counts, renders and saves the analysis figure into the
    directory containing this script, and prints the summary statistics.
    A relative input path is resolved against the script directory, not the
    current working directory.

    Exits the process with status 1 when no words could be loaded, so shell
    callers can detect the failure.
    """
    # Parse command line arguments
    args = parse_arguments()

    # File paths
    base_dir = Path(__file__).parent
    input_file = Path(args.filename)

    # Make path relative to script directory if not absolute
    if not input_file.is_absolute():
        input_file = base_dir / input_file

    print("🔍 Norvig Vocabulary Statistical Analysis")
    print("=" * 50)
    print(f"📁 Analyzing: {input_file}")

    # Load data
    word_counts = load_word_counts(input_file)

    if not word_counts:
        print(f"❌ Could not load word list from {input_file}. Please check file path.")
        sys.exit(1)

    # Create comprehensive analysis
    fig, crossword_suitable = create_comprehensive_analysis(word_counts, input_file.name, base_dir)

    # Print summary statistics
    print_summary_statistics(word_counts, input_file.name, crossword_suitable)

    # The figure has already been saved; it is never shown interactively,
    # so release it instead of leaving it registered with pyplot.
    plt.close(fig)

    # Generate the same output filename logic for final message
    if 'count_1w100k' in input_file.name:
        output_name = 'norvig_analysis_100k.png'
    elif 'count_1w.txt' in input_file.name:
        output_name = 'norvig_analysis_full.png'
    else:
        safe_name = input_file.name.replace('.txt', '').replace('/', '_').replace('count_', '')
        output_name = f'norvig_analysis_{safe_name}.png'

    print(f"\n✅ Analysis complete! Check {base_dir}/{output_name} for detailed plots.")

# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()