#!/usr/bin/env python3
"""
Analyze the distribution of spanning trees and identify what makes some
triangulations "vastly more forested" than others.

Reads a JSON results file produced by the spanning-tree enumeration step
(keys: ``raw_data.triangulations``, ``parameters.n_vertices``), prints
summary statistics and extreme cases, and saves distribution plots under
``results/plots/``.
"""
import sys
from pathlib import Path

# Make the project root importable when this script is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))

import json
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter


def _print_log_stats(header: str, counts: np.ndarray) -> None:
    """Print log(1+x) mean/median/std for one partition of triangulations.

    Guards the empty-partition case (e.g. every triangulation realizable),
    where numpy would otherwise emit RuntimeWarnings and print ``nan``.
    """
    print(header)
    if counts.size == 0:
        print(" (no triangulations in this group)")
        return
    logs = np.log1p(counts)
    print(f" Log mean: {np.mean(logs):.3f}")
    print(f" Log median: {np.median(logs):.3f}")
    print(f" Log std: {np.std(logs):.3f}")


def _print_ranked_table(entries) -> None:
    """Print the ranked table shared by the top-20 / bottom-20 listings."""
    print(f"{'Rank':<6} {'Index':<8} {'Spanning':<10} {'Vertices':<10} {'Edges':<8} {'Std Real':<10} {'Strict Real':<12}")
    print("-"*70)
    for rank, t in enumerate(entries, 1):
        print(f"{rank:<6} {t['index']:<8} {t['n_spanning_trees']:<10} "
              f"{t['n_vertices']:<10} {t['n_edges']:<8} "
              f"{'Yes' if t['standard_realizable'] else 'No':<10} "
              f"{'Yes' if t['strict_realizable'] else 'No':<12}")


def analyze_distribution(data_file: str):
    """Analyze spanning tree distribution in detail.

    Parameters
    ----------
    data_file : str
        Path to the spanning-trees JSON file. Each triangulation record is
        expected to carry ``n_spanning_trees``, ``n_vertices``, ``n_edges``,
        ``index``, ``standard_realizable`` and ``strict_realizable``.
    """
    with open(data_file, 'r') as f:
        data = json.load(f)

    tris = data['raw_data']['triangulations']

    print("="*70)
    print("SPANNING TREE DISTRIBUTION ANALYSIS")
    print("="*70)
    print(f"\nTotal triangulations: {len(tris)}")

    # Extract spanning tree counts
    spanning_trees = np.array([t['n_spanning_trees'] for t in tris])
    log_spanning_trees = np.log1p(spanning_trees)  # log(1+x) to handle zeros

    # Partition by realizability (standard vs strict are independent flags)
    standard_real = [t for t in tris if t['standard_realizable']]
    standard_nonreal = [t for t in tris if not t['standard_realizable']]
    strict_real = [t for t in tris if t['strict_realizable']]
    strict_nonreal = [t for t in tris if not t['strict_realizable']]

    st_standard_real = np.array([t['n_spanning_trees'] for t in standard_real])
    st_standard_nonreal = np.array([t['n_spanning_trees'] for t in standard_nonreal])
    st_strict_real = np.array([t['n_spanning_trees'] for t in strict_real])
    st_strict_nonreal = np.array([t['n_spanning_trees'] for t in strict_nonreal])

    # Log-transformed statistics for each partition
    print("\n" + "="*70)
    print("LOG-TRANSFORMED STATISTICS: log(1 + spanning_trees)")
    print("="*70)

    print("\n--- STANDARD REALIZABILITY ---")
    _print_log_stats("Realizable:", st_standard_real)
    _print_log_stats("\nNon-realizable:", st_standard_nonreal)

    print("\n--- STRICT REALIZABILITY ---")
    _print_log_stats("Strict realizable:", st_strict_real)
    _print_log_stats("\nStrict non-realizable:", st_strict_nonreal)

    # Identify extreme cases
    print("\n" + "="*70)
    print("EXTREME CASES: Most Forested Triangulations")
    print("="*70)

    # Sort by spanning tree count, descending, for the "most forested" table
    tris_sorted = sorted(tris, key=lambda t: t['n_spanning_trees'], reverse=True)

    print("\nTop 20 most forested triangulations:")
    _print_ranked_table(tris_sorted[:20])

    # Analyze bottom cases
    print("\n" + "="*70)
    print("EXTREME CASES: Least Forested Triangulations")
    print("="*70)

    # Count zeros (disconnected graphs have no spanning tree at all)
    n_zero = sum(1 for t in tris if t['n_spanning_trees'] == 0)
    print(f"\nTriangulations with ZERO spanning trees: {n_zero} ({100*n_zero/len(tris):.2f}%)")

    if n_zero > 0:
        zero_tris = [t for t in tris if t['n_spanning_trees'] == 0]
        n_real = sum(1 for t in zero_tris if t['standard_realizable'])
        n_strict = sum(1 for t in zero_tris if t['strict_realizable'])
        print(f" Standard realizable: {n_real} ({100*n_real/n_zero:.1f}%)")
        print(f" Strict realizable: {n_strict} ({100*n_strict/n_zero:.1f}%)")

    print("\nBottom 20 least forested (non-zero):")
    nonzero_tris = [t for t in tris if t['n_spanning_trees'] > 0]
    tris_sorted_bottom = sorted(nonzero_tris, key=lambda t: t['n_spanning_trees'])
    _print_ranked_table(tris_sorted_bottom[:20])

    # Analyze correlation with graph properties
    print("\n" + "="*70)
    print("CORRELATION WITH GRAPH PROPERTIES")
    print("="*70)

    vertices = np.array([t['n_vertices'] for t in tris])
    edges = np.array([t['n_edges'] for t in tris])

    # Pearson correlations, both raw and against the log-transformed counts
    corr_vertices = np.corrcoef(spanning_trees, vertices)[0, 1]
    corr_edges = np.corrcoef(spanning_trees, edges)[0, 1]
    corr_log_vertices = np.corrcoef(log_spanning_trees, vertices)[0, 1]
    corr_log_edges = np.corrcoef(log_spanning_trees, edges)[0, 1]

    print(f"\nPearson correlation (raw):")
    print(f" Spanning trees vs vertices: {corr_vertices:.4f}")
    print(f" Spanning trees vs edges: {corr_edges:.4f}")
    print(f"\nPearson correlation (log-transformed):")
    print(f" log(spanning trees) vs vertices: {corr_log_vertices:.4f}")
    print(f" log(spanning trees) vs edges: {corr_log_edges:.4f}")

    # Percentile analysis
    print("\n" + "="*70)
    print("PERCENTILE ANALYSIS")
    print("="*70)

    percentiles = [0, 1, 5, 10, 25, 50, 75, 90, 95, 99, 100]
    values = np.percentile(spanning_trees, percentiles)

    print(f"\n{'Percentile':<12} {'Value':<12} {'log(1+value)':<15}")
    print("-"*40)
    for p, v in zip(percentiles, values):
        print(f"{p:<12} {int(v):<12} {np.log1p(v):<15.3f}")

    # Create visualizations
    create_visualizations(data, tris, spanning_trees, log_spanning_trees,
                          st_standard_real, st_standard_nonreal,
                          st_strict_real, st_strict_nonreal)


def create_visualizations(data, tris, spanning_trees, log_spanning_trees,
                          st_standard_real, st_standard_nonreal,
                          st_strict_real, st_strict_nonreal):
    """Create distribution plots.

    Produces a 2x3 grid: overall histograms (linear and log scale),
    realizability comparisons (standard and strict), and scatter plots of
    vertices/edges against log(1 + spanning trees). The figure is saved to
    ``results/plots/spanning_tree_analysis_n{n}.png``.
    """
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # 1. Overall distribution (linear)
    ax = axes[0, 0]
    ax.hist(spanning_trees, bins=100, alpha=0.7, edgecolor='black')
    ax.set_xlabel('Number of spanning trees')
    ax.set_ylabel('Frequency')
    ax.set_title('Overall Distribution (Linear Scale)')
    ax.axvline(np.mean(spanning_trees), color='red', linestyle='--',
               label=f'Mean: {np.mean(spanning_trees):.1f}')
    ax.axvline(np.median(spanning_trees), color='blue', linestyle='--',
               label=f'Median: {np.median(spanning_trees):.1f}')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # 2. Overall distribution (log)
    ax = axes[0, 1]
    ax.hist(log_spanning_trees, bins=100, alpha=0.7, edgecolor='black')
    ax.set_xlabel('log(1 + spanning trees)')
    ax.set_ylabel('Frequency')
    ax.set_title('Overall Distribution (Log-Transformed)')
    ax.axvline(np.mean(log_spanning_trees), color='red', linestyle='--',
               label=f'Mean: {np.mean(log_spanning_trees):.2f}')
    ax.axvline(np.median(log_spanning_trees), color='blue', linestyle='--',
               label=f'Median: {np.median(log_spanning_trees):.2f}')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # 3. Standard realizability comparison (log)
    ax = axes[0, 2]
    ax.hist(np.log1p(st_standard_real), bins=50, alpha=0.5,
            label=f'Realizable (n={len(st_standard_real)})',
            color='green', edgecolor='black')
    ax.hist(np.log1p(st_standard_nonreal), bins=50, alpha=0.5,
            label=f'Non-realizable (n={len(st_standard_nonreal)})',
            color='red', edgecolor='black')
    ax.set_xlabel('log(1 + spanning trees)')
    ax.set_ylabel('Frequency')
    ax.set_title('Standard Realizability (Log Scale)')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # 4. Strict realizability comparison (log)
    ax = axes[1, 0]
    ax.hist(np.log1p(st_strict_real), bins=50, alpha=0.5,
            label=f'Strict (n={len(st_strict_real)})',
            color='blue', edgecolor='black')
    ax.hist(np.log1p(st_strict_nonreal), bins=50, alpha=0.5,
            label=f'Non-strict (n={len(st_strict_nonreal)})',
            color='orange', edgecolor='black')
    ax.set_xlabel('log(1 + spanning trees)')
    ax.set_ylabel('Frequency')
    ax.set_title('Strict Realizability (Log Scale)')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # 5. Scatter: vertices vs log(spanning trees)
    ax = axes[1, 1]
    vertices = np.array([t['n_vertices'] for t in tris])
    ax.scatter(vertices, log_spanning_trees, alpha=0.3, s=10)
    ax.set_xlabel('Number of vertices')
    ax.set_ylabel('log(1 + spanning trees)')
    ax.set_title('Vertices vs Log(Spanning Trees)')
    ax.grid(True, alpha=0.3)

    # 6. Scatter: edges vs log(spanning trees)
    ax = axes[1, 2]
    edges = np.array([t['n_edges'] for t in tris])
    ax.scatter(edges, log_spanning_trees, alpha=0.3, s=10)
    ax.set_xlabel('Number of edges')
    ax.set_ylabel('log(1 + spanning trees)')
    ax.set_title('Edges vs Log(Spanning Trees)')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save figure, creating the output directory if needed
    n = data['parameters']['n_vertices']
    output_path = f'results/plots/spanning_tree_analysis_n{n}.png'
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(output_path, dpi=150, bbox_inches='tight')

    print(f"\n{'='*70}")
    print(f"Plots saved to: {output_path}")
    print(f"{'='*70}")

    plt.close()


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Analyze spanning tree distribution')
    parser.add_argument('--data', type=str,
                        default='results/spanning_trees_n10.json',
                        help='Path to spanning trees data JSON')
    args = parser.parse_args()

    analyze_distribution(args.data)