# %% [markdown]
# # COP Results Joiner
# This notebook joins all Excel files from `data/cop_modelling` into a single Parquet file.

# %%
import pandas as pd
import os
from pathlib import Path

# %%
# Define paths
# Try to resolve data path dynamically based on current working directory
current_dir = Path.cwd()
if (current_dir / "data" / "cop_modelling").exists():
    data_path = current_dir / "data" / "cop_modelling"
elif (current_dir.parent / "data" / "cop_modelling").exists():
    data_path = current_dir.parent / "data" / "cop_modelling"
else:
    # Fallback
    data_path = Path("..") / "data" / "cop_modelling"

output_file = data_path / "joined_results.parquet"

# Configuration
LOAD_FROM_PARQUET = True # Set to False to rebuild from Excel files

# %%
if LOAD_FROM_PARQUET and output_file.exists():
    print(f"Loading data directly from {output_file.name}...")
    joined_df = pd.read_parquet(output_file)
    print(f"Loaded shape: {joined_df.shape}")
else:
    # Get all Excel files
    excel_files = list(data_path.glob("*.xlsx"))
    print(f"Found {len(excel_files)} files in {data_path.resolve()}: {[f.name for f in excel_files]}")

    # Load and join
    dfs = []
    for f in excel_files:
        try:
            # Results are in 'Results' sheet
            df = pd.read_excel(f, sheet_name='Results')
            
            # Drop the first row (which usually contains units)
            df = df.iloc[1:].reset_index(drop=True)
            
            # Add a column to identify the source
            df['source_file'] = f.name
            
            # Convert columns to numerical if possible, else convert to strings
            for col in df.columns:
                try:
                    df[col] = pd.to_numeric(df[col], errors='raise')
                except (ValueError, TypeError):
                    df[col] = df[col].astype(str)
                    
            dfs.append(df)
        except Exception as e:
            print(f"Error reading {f}: {e}")

    if not dfs:
        raise ValueError(f"No objects to concatenate. Could not find or read any valid Excel files in {data_path.resolve()}.")

    joined_df = pd.concat(dfs, ignore_index=True)
    print(f"Joined shape: {joined_df.shape}")

    # Save to parquet
    joined_df.to_parquet(output_file)
    print(f"Saved to {output_file}")

# %%
# Quick preview
joined_df


# %%
import matplotlib.pyplot as plt
import seaborn as sns

# Set visual style
sns.set_theme(style="whitegrid")

# 1. Correlation matrix for numerical variables
# Select only numerical columns
numerical_cols = joined_df.select_dtypes(include=['number']).columns

if len(numerical_cols) > 0:
    plt.figure(figsize=(10, 8))
    corr_matrix = joined_df[numerical_cols].corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
    plt.title('Correlation Matrix of Numerical Variables')
    plt.tight_layout()
    plt.show()
else:
    print("No numerical variables found for correlation matrix.")

# %%
# 2. Visualize the distribution of every variable
# Numerical variables - Histograms
for col in numerical_cols:
    plt.figure(figsize=(8, 4))
    sns.histplot(joined_df[col].dropna(), kde=True, bins=30)
    plt.title(f'Distribution of {col}')
    plt.tight_layout()
    plt.show()

# Categorical variables - Count plots (limiting to top 20 categories if there are many)
categorical_cols = joined_df.select_dtypes(exclude=['number']).columns
for col in categorical_cols:
    plt.figure(figsize=(10, 5))
    # Get top 20 most frequent categories to avoid unreadable plots
    top_categories = joined_df[col].value_counts().nlargest(20).index
    sns.countplot(data=joined_df[joined_df[col].isin(top_categories)], x=col, order=top_categories)
    plt.title(f'Distribution of {col} (Top 20 categories)')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# %%
import plotly.graph_objects as go
import pandas as pd
import numpy as np

print("Generating 3D Surface Plot...")

# Find columns case-insensitively or by partial match to ensure it works
cols = joined_df.columns.tolist()
cols_lower = [c.lower() for c in cols]

def find_col(possible_names):
    for name in possible_names:
        for idx, c in enumerate(cols_lower):
            if name.lower() in c:
                return cols[idx]
    return None

# Attempting to match your exact columns or close variations
col_quelle = find_col(['t_vorlauf_quelle', 'quelle'])
col_senke = find_col(['t_vorlauf_senke', 'senke'])
col_cop = find_col(['cop'])
col_komp = find_col(['kompressor', 'stufe'])
col_kalt = find_col(['kältemittel', 'kaltemittel', 'kaeltemittel', 'refrigerant'])

required_cols = {'Quelle (X)': col_quelle, 'Senke (Y)': col_senke, 'COP (Z)': col_cop, 'Kompressor': col_komp, 'Kältemittel': col_kalt}
missing = {k: v for k, v in required_cols.items() if v is None}

if missing:
    print(f"Could not automatically find columns for: {list(missing.keys())}")
    print(f"Available columns are: {cols}")
    print("Please update the column names in the code below.")
else:
    fig = go.Figure()

    # Drop NaNs for the relevant columns
    plot_df = joined_df.dropna(subset=list(required_cols.values())).copy()
    
    # Create combinations of Kompressor and Kältemittel
    combinations = plot_df.groupby([col_komp, col_kalt]).size().reset_index()
    
    traces = []
    buttons = []
    
    for i, row in combinations.iterrows():
        komp_val = str(row[col_komp])
        kalt_val = str(row[col_kalt])
        
        # Filter dataframe for this specific combination
        subset = plot_df[(plot_df[col_komp] == row[col_komp]) & (plot_df[col_kalt] == row[col_kalt])]
        
        if len(subset) < 3:
            continue # Skip if not enough points
            
        # Create a pivot table to form the 2D grid for the surface
        # Aggregating by mean in case there are duplicate combinations of (Quelle, Senke)
        pivot = subset.pivot_table(values=col_cop, 
                                   index=col_senke, 
                                   columns=col_quelle, 
                                   aggfunc='mean')
        
        trace_name = f"{komp_val} | {kalt_val}"
        
        trace = go.Surface(
            x=pivot.columns.values,
            y=pivot.index.values,
            z=pivot.values,
            name=trace_name,
            visible=(len(traces) == 0) # Only the first trace is visible initially
        )
        
        traces.append(trace)
        fig.add_trace(trace)
        
    # Generate the dropdown buttons
    for i, trace in enumerate(traces):
        visibility = [False] * len(traces)
        visibility[i] = True
        
        button = dict(
            label=trace.name,
            method="update",
            args=[{"visible": visibility},
                  {"title": f"COP Surface - {trace.name}"}]
        )
        buttons.append(button)

    if traces:
        # Add the dropdown menu to the layout
        fig.update_layout(
            updatemenus=[
                dict(
                    active=0,
                    buttons=buttons,
                    direction="down",
                    pad={"r": 10, "t": 10},
                    showactive=True,
                    x=0.1,
                    xanchor="left",
                    y=1.15,
                    yanchor="top"
                )
            ],
            title=f"COP Surface - {traces[0].name}",
            scene=dict(
                xaxis_title=col_quelle,
                yaxis_title=col_senke,
                zaxis_title=col_cop
            ),
            autosize=False,
            width=800, 
            height=700,
            margin=dict(l=65, r=50, b=65, t=90)
        )
        # Show in a new browser tab to bypass nbformat issues in VS Code interactive mode
        fig.show(renderer="browser")
    else:
        print("Not enough data to plot surfaces for any combination.")

# %%
# Second Visual: Stacked Surfaces for Kältemittel
print("Generating Stacked 3D Surface Plot by Kältemittel...")

if not missing:
    fig2 = go.Figure()

    # List of built-in Plotly colorscales to differentiate the Kältemittel
    colorscales = ['Viridis', 'Plasma', 'Inferno', 'Magma', 'Cividis', 'Blues', 'Greens', 'Reds']
    
    unique_kalt = plot_df[col_kalt].dropna().unique()
    
    for idx, kalt_val in enumerate(unique_kalt):
        kalt_val = str(kalt_val)
        subset = plot_df[plot_df[col_kalt] == kalt_val]
        
        if len(subset) < 3:
            continue
            
        pivot = subset.pivot_table(values=col_cop, 
                                   index=col_senke, 
                                   columns=col_quelle, 
                                   aggfunc='mean')
        
        cscale = colorscales[idx % len(colorscales)]
        
        trace = go.Surface(
            x=pivot.columns.values,
            y=pivot.index.values,
            z=pivot.values,
            name=kalt_val,
            showscale=False, # Hide individual colorbars so it's not cluttered
            colorscale=cscale,
            showlegend=True, # Allow selecting/deselecting from legend
            hovertemplate=f"Kältemittel: {kalt_val}<br>Quelle (X): %{{x}}<br>Senke (Y): %{{y}}<br>COP (Z): %{{z}}<extra></extra>"
        )
        fig2.add_trace(trace)
        
    fig2.update_layout(
        title="Stacked COP Surfaces by Kältemittel",
        scene=dict(
            xaxis_title=col_quelle,
            yaxis_title=col_senke,
            zaxis_title=col_cop
        ),
        legend=dict(
            title="Kältemittel<br>(Click to toggle)",
            x=1.05,
            y=0.9
        ),
        autosize=False,
        width=900, 
        height=800,
        margin=dict(l=65, r=50, b=65, t=90)
    )
    
    fig2.show(renderer="browser")