HeatTransPlan / cop_analysis /join_cop copy.py
drzg15's picture
Initial code commit with LFS for binaries
c993983
# %% [markdown]
# # COP Results Joiner
# This notebook joins all Excel files from `data/cop_modelling` into a single Parquet file.
# %%
import pandas as pd
import os
from pathlib import Path
# %%
# Define paths
# Try to resolve data path dynamically based on current working directory
current_dir = Path.cwd()
if (current_dir / "data" / "cop_modelling").exists():
data_path = current_dir / "data" / "cop_modelling"
elif (current_dir.parent / "data" / "cop_modelling").exists():
data_path = current_dir.parent / "data" / "cop_modelling"
else:
# Fallback
data_path = Path("..") / "data" / "cop_modelling"
output_file = data_path / "joined_results.parquet"
# Configuration
LOAD_FROM_PARQUET = True # Set to False to rebuild from Excel files
# %%
if LOAD_FROM_PARQUET and output_file.exists():
print(f"Loading data directly from {output_file.name}...")
joined_df = pd.read_parquet(output_file)
print(f"Loaded shape: {joined_df.shape}")
else:
# Get all Excel files
excel_files = list(data_path.glob("*.xlsx"))
print(f"Found {len(excel_files)} files in {data_path.resolve()}: {[f.name for f in excel_files]}")
# Load and join
dfs = []
for f in excel_files:
try:
# Results are in 'Results' sheet
df = pd.read_excel(f, sheet_name='Results')
# Drop the first row (which usually contains units)
df = df.iloc[1:].reset_index(drop=True)
# Add a column to identify the source
df['source_file'] = f.name
# Convert columns to numerical if possible, else convert to strings
for col in df.columns:
try:
df[col] = pd.to_numeric(df[col], errors='raise')
except (ValueError, TypeError):
df[col] = df[col].astype(str)
dfs.append(df)
except Exception as e:
print(f"Error reading {f}: {e}")
if not dfs:
raise ValueError(f"No objects to concatenate. Could not find or read any valid Excel files in {data_path.resolve()}.")
joined_df = pd.concat(dfs, ignore_index=True)
print(f"Joined shape: {joined_df.shape}")
# Save to parquet
joined_df.to_parquet(output_file)
print(f"Saved to {output_file}")
# %%
# Quick preview
joined_df
# %%
import matplotlib.pyplot as plt
import seaborn as sns
# Set visual style
sns.set_theme(style="whitegrid")
# 1. Correlation matrix for numerical variables
# Select only numerical columns
numerical_cols = joined_df.select_dtypes(include=['number']).columns
if len(numerical_cols) > 0:
plt.figure(figsize=(10, 8))
corr_matrix = joined_df[numerical_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numerical Variables')
plt.tight_layout()
plt.show()
else:
print("No numerical variables found for correlation matrix.")
# %%
# 2. Visualize the distribution of every variable
# Numerical variables - Histograms
for col in numerical_cols:
plt.figure(figsize=(8, 4))
sns.histplot(joined_df[col].dropna(), kde=True, bins=30)
plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()
# Categorical variables - Count plots (limiting to top 20 categories if there are many)
categorical_cols = joined_df.select_dtypes(exclude=['number']).columns
for col in categorical_cols:
plt.figure(figsize=(10, 5))
# Get top 20 most frequent categories to avoid unreadable plots
top_categories = joined_df[col].value_counts().nlargest(20).index
sns.countplot(data=joined_df[joined_df[col].isin(top_categories)], x=col, order=top_categories)
plt.title(f'Distribution of {col} (Top 20 categories)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# %%
import plotly.graph_objects as go
import pandas as pd
import numpy as np
print("Generating 3D Surface Plot...")
# Find columns case-insensitively or by partial match to ensure it works
cols = joined_df.columns.tolist()
cols_lower = [c.lower() for c in cols]
def find_col(possible_names):
for name in possible_names:
for idx, c in enumerate(cols_lower):
if name.lower() in c:
return cols[idx]
return None
# Attempting to match your exact columns or close variations
col_quelle = find_col(['t_vorlauf_quelle', 'quelle'])
col_senke = find_col(['t_vorlauf_senke', 'senke'])
col_cop = find_col(['cop'])
col_komp = find_col(['kompressor', 'stufe'])
col_kalt = find_col(['kältemittel', 'kaltemittel', 'kaeltemittel', 'refrigerant'])
required_cols = {'Quelle (X)': col_quelle, 'Senke (Y)': col_senke, 'COP (Z)': col_cop, 'Kompressor': col_komp, 'Kältemittel': col_kalt}
missing = {k: v for k, v in required_cols.items() if v is None}
if missing:
print(f"Could not automatically find columns for: {list(missing.keys())}")
print(f"Available columns are: {cols}")
print("Please update the column names in the code below.")
else:
fig = go.Figure()
# Drop NaNs for the relevant columns
plot_df = joined_df.dropna(subset=list(required_cols.values())).copy()
# Create combinations of Kompressor and Kältemittel
combinations = plot_df.groupby([col_komp, col_kalt]).size().reset_index()
traces = []
buttons = []
for i, row in combinations.iterrows():
komp_val = str(row[col_komp])
kalt_val = str(row[col_kalt])
# Filter dataframe for this specific combination
subset = plot_df[(plot_df[col_komp] == row[col_komp]) & (plot_df[col_kalt] == row[col_kalt])]
if len(subset) < 3:
continue # Skip if not enough points
# Create a pivot table to form the 2D grid for the surface
# Aggregating by mean in case there are duplicate combinations of (Quelle, Senke)
pivot = subset.pivot_table(values=col_cop,
index=col_senke,
columns=col_quelle,
aggfunc='mean')
trace_name = f"{komp_val} | {kalt_val}"
trace = go.Surface(
x=pivot.columns.values,
y=pivot.index.values,
z=pivot.values,
name=trace_name,
visible=(len(traces) == 0) # Only the first trace is visible initially
)
traces.append(trace)
fig.add_trace(trace)
# Generate the dropdown buttons
for i, trace in enumerate(traces):
visibility = [False] * len(traces)
visibility[i] = True
button = dict(
label=trace.name,
method="update",
args=[{"visible": visibility},
{"title": f"COP Surface - {trace.name}"}]
)
buttons.append(button)
if traces:
# Add the dropdown menu to the layout
fig.update_layout(
updatemenus=[
dict(
active=0,
buttons=buttons,
direction="down",
pad={"r": 10, "t": 10},
showactive=True,
x=0.1,
xanchor="left",
y=1.15,
yanchor="top"
)
],
title=f"COP Surface - {traces[0].name}",
scene=dict(
xaxis_title=col_quelle,
yaxis_title=col_senke,
zaxis_title=col_cop
),
autosize=False,
width=800,
height=700,
margin=dict(l=65, r=50, b=65, t=90)
)
# Show in a new browser tab to bypass nbformat issues in VS Code interactive mode
fig.show(renderer="browser")
else:
print("Not enough data to plot surfaces for any combination.")
# %%
# Second Visual: Stacked Surfaces for Kältemittel
print("Generating Stacked 3D Surface Plot by Kältemittel...")
if not missing:
fig2 = go.Figure()
# List of built-in Plotly colorscales to differentiate the Kältemittel
colorscales = ['Viridis', 'Plasma', 'Inferno', 'Magma', 'Cividis', 'Blues', 'Greens', 'Reds']
unique_kalt = plot_df[col_kalt].dropna().unique()
for idx, kalt_val in enumerate(unique_kalt):
kalt_val = str(kalt_val)
subset = plot_df[plot_df[col_kalt] == kalt_val]
if len(subset) < 3:
continue
pivot = subset.pivot_table(values=col_cop,
index=col_senke,
columns=col_quelle,
aggfunc='mean')
cscale = colorscales[idx % len(colorscales)]
trace = go.Surface(
x=pivot.columns.values,
y=pivot.index.values,
z=pivot.values,
name=kalt_val,
showscale=False, # Hide individual colorbars so it's not cluttered
colorscale=cscale,
showlegend=True, # Allow selecting/deselecting from legend
hovertemplate=f"Kältemittel: {kalt_val}<br>Quelle (X): %{{x}}<br>Senke (Y): %{{y}}<br>COP (Z): %{{z}}<extra></extra>"
)
fig2.add_trace(trace)
fig2.update_layout(
title="Stacked COP Surfaces by Kältemittel",
scene=dict(
xaxis_title=col_quelle,
yaxis_title=col_senke,
zaxis_title=col_cop
),
legend=dict(
title="Kältemittel<br>(Click to toggle)",
x=1.05,
y=0.9
),
autosize=False,
width=900,
height=800,
margin=dict(l=65, r=50, b=65, t=90)
)
fig2.show(renderer="browser")