# %% [markdown] # # COP Results Joiner # This notebook joins all Excel files from `data/cop_modelling` into a single Parquet file. # %% import pandas as pd import os from pathlib import Path # %% # Define paths # Try to resolve data path dynamically based on current working directory current_dir = Path.cwd() if (current_dir / "data" / "cop_modelling").exists(): data_path = current_dir / "data" / "cop_modelling" elif (current_dir.parent / "data" / "cop_modelling").exists(): data_path = current_dir.parent / "data" / "cop_modelling" else: # Fallback data_path = Path("..") / "data" / "cop_modelling" output_file = data_path / "joined_results.parquet" # Configuration LOAD_FROM_PARQUET = True # Set to False to rebuild from Excel files # %% if LOAD_FROM_PARQUET and output_file.exists(): print(f"Loading data directly from {output_file.name}...") joined_df = pd.read_parquet(output_file) print(f"Loaded shape: {joined_df.shape}") else: # Get all Excel files excel_files = list(data_path.glob("*.xlsx")) print(f"Found {len(excel_files)} files in {data_path.resolve()}: {[f.name for f in excel_files]}") # Load and join dfs = [] for f in excel_files: try: # Results are in 'Results' sheet df = pd.read_excel(f, sheet_name='Results') # Drop the first row (which usually contains units) df = df.iloc[1:].reset_index(drop=True) # Add a column to identify the source df['source_file'] = f.name # Convert columns to numerical if possible, else convert to strings for col in df.columns: try: df[col] = pd.to_numeric(df[col], errors='raise') except (ValueError, TypeError): df[col] = df[col].astype(str) dfs.append(df) except Exception as e: print(f"Error reading {f}: {e}") if not dfs: raise ValueError(f"No objects to concatenate. Could not find or read any valid Excel files in {data_path.resolve()}.") joined_df = pd.concat(dfs, ignore_index=True) print(f"Joined shape: {joined_df.shape}") # Save to parquet joined_df.to_parquet(output_file) print(f"Saved to {output_file}") # %% # Quick preview joined_df # %% import matplotlib.pyplot as plt import seaborn as sns # Set visual style sns.set_theme(style="whitegrid") # 1. Correlation matrix for numerical variables # Select only numerical columns numerical_cols = joined_df.select_dtypes(include=['number']).columns if len(numerical_cols) > 0: plt.figure(figsize=(10, 8)) corr_matrix = joined_df[numerical_cols].corr() sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1) plt.title('Correlation Matrix of Numerical Variables') plt.tight_layout() plt.show() else: print("No numerical variables found for correlation matrix.") # %% # 2. Visualize the distribution of every variable # Numerical variables - Histograms for col in numerical_cols: plt.figure(figsize=(8, 4)) sns.histplot(joined_df[col].dropna(), kde=True, bins=30) plt.title(f'Distribution of {col}') plt.tight_layout() plt.show() # Categorical variables - Count plots (limiting to top 20 categories if there are many) categorical_cols = joined_df.select_dtypes(exclude=['number']).columns for col in categorical_cols: plt.figure(figsize=(10, 5)) # Get top 20 most frequent categories to avoid unreadable plots top_categories = joined_df[col].value_counts().nlargest(20).index sns.countplot(data=joined_df[joined_df[col].isin(top_categories)], x=col, order=top_categories) plt.title(f'Distribution of {col} (Top 20 categories)') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() # %% import plotly.graph_objects as go import pandas as pd import numpy as np print("Generating 3D Surface Plot...") # Find columns case-insensitively or by partial match to ensure it works cols = joined_df.columns.tolist() cols_lower = [c.lower() for c in cols] def find_col(possible_names): for name in possible_names: for idx, c in enumerate(cols_lower): if name.lower() in c: return cols[idx] return None # Attempting to match your exact columns or close variations col_quelle = find_col(['t_vorlauf_quelle', 'quelle']) col_senke = find_col(['t_vorlauf_senke', 'senke']) col_cop = find_col(['cop']) col_komp = find_col(['kompressor', 'stufe']) col_kalt = find_col(['kältemittel', 'kaltemittel', 'kaeltemittel', 'refrigerant']) required_cols = {'Quelle (X)': col_quelle, 'Senke (Y)': col_senke, 'COP (Z)': col_cop, 'Kompressor': col_komp, 'Kältemittel': col_kalt} missing = {k: v for k, v in required_cols.items() if v is None} if missing: print(f"Could not automatically find columns for: {list(missing.keys())}") print(f"Available columns are: {cols}") print("Please update the column names in the code below.") else: fig = go.Figure() # Drop NaNs for the relevant columns plot_df = joined_df.dropna(subset=list(required_cols.values())).copy() # Create combinations of Kompressor and Kältemittel combinations = plot_df.groupby([col_komp, col_kalt]).size().reset_index() traces = [] buttons = [] for i, row in combinations.iterrows(): komp_val = str(row[col_komp]) kalt_val = str(row[col_kalt]) # Filter dataframe for this specific combination subset = plot_df[(plot_df[col_komp] == row[col_komp]) & (plot_df[col_kalt] == row[col_kalt])] if len(subset) < 3: continue # Skip if not enough points # Create a pivot table to form the 2D grid for the surface # Aggregating by mean in case there are duplicate combinations of (Quelle, Senke) pivot = subset.pivot_table(values=col_cop, index=col_senke, columns=col_quelle, aggfunc='mean') trace_name = f"{komp_val} | {kalt_val}" trace = go.Surface( x=pivot.columns.values, y=pivot.index.values, z=pivot.values, name=trace_name, visible=(len(traces) == 0) # Only the first trace is visible initially ) traces.append(trace) fig.add_trace(trace) # Generate the dropdown buttons for i, trace in enumerate(traces): visibility = [False] * len(traces) visibility[i] = True button = dict( label=trace.name, method="update", args=[{"visible": visibility}, {"title": f"COP Surface - {trace.name}"}] ) buttons.append(button) if traces: # Add the dropdown menu to the layout fig.update_layout( updatemenus=[ dict( active=0, buttons=buttons, direction="down", pad={"r": 10, "t": 10}, showactive=True, x=0.1, xanchor="left", y=1.15, yanchor="top" ) ], title=f"COP Surface - {traces[0].name}", scene=dict( xaxis_title=col_quelle, yaxis_title=col_senke, zaxis_title=col_cop ), autosize=False, width=800, height=700, margin=dict(l=65, r=50, b=65, t=90) ) # Show in a new browser tab to bypass nbformat issues in VS Code interactive mode fig.show(renderer="browser") else: print("Not enough data to plot surfaces for any combination.") # %% # Second Visual: Stacked Surfaces for Kältemittel print("Generating Stacked 3D Surface Plot by Kältemittel...") if not missing: fig2 = go.Figure() # List of built-in Plotly colorscales to differentiate the Kältemittel colorscales = ['Viridis', 'Plasma', 'Inferno', 'Magma', 'Cividis', 'Blues', 'Greens', 'Reds'] unique_kalt = plot_df[col_kalt].dropna().unique() for idx, kalt_val in enumerate(unique_kalt): kalt_val = str(kalt_val) subset = plot_df[plot_df[col_kalt] == kalt_val] if len(subset) < 3: continue pivot = subset.pivot_table(values=col_cop, index=col_senke, columns=col_quelle, aggfunc='mean') cscale = colorscales[idx % len(colorscales)] trace = go.Surface( x=pivot.columns.values, y=pivot.index.values, z=pivot.values, name=kalt_val, showscale=False, # Hide individual colorbars so it's not cluttered colorscale=cscale, showlegend=True, # Allow selecting/deselecting from legend hovertemplate=f"Kältemittel: {kalt_val}
Quelle (X): %{{x}}
Senke (Y): %{{y}}
COP (Z): %{{z}}" ) fig2.add_trace(trace) fig2.update_layout( title="Stacked COP Surfaces by Kältemittel", scene=dict( xaxis_title=col_quelle, yaxis_title=col_senke, zaxis_title=col_cop ), legend=dict( title="Kältemittel
(Click to toggle)", x=1.05, y=0.9 ), autosize=False, width=900, height=800, margin=dict(l=65, r=50, b=65, t=90) ) fig2.show(renderer="browser")