Spaces:
Running
Running
| # %% [markdown] | |
| # # COP Results Joiner | |
| # This notebook joins all Excel files from `data/cop_modelling` into a single Parquet file. | |
| # %% | |
| import pandas as pd | |
| import os | |
| from pathlib import Path | |
| # %% | |
| # Define paths | |
| # Try to resolve data path dynamically based on current working directory | |
| current_dir = Path.cwd() | |
| if (current_dir / "data" / "cop_modelling").exists(): | |
| data_path = current_dir / "data" / "cop_modelling" | |
| elif (current_dir.parent / "data" / "cop_modelling").exists(): | |
| data_path = current_dir.parent / "data" / "cop_modelling" | |
| else: | |
| # Fallback | |
| data_path = Path("..") / "data" / "cop_modelling" | |
| output_file = data_path / "joined_results.parquet" | |
| # Configuration | |
| LOAD_FROM_PARQUET = True # Set to False to rebuild from Excel files | |
| # %% | |
| if LOAD_FROM_PARQUET and output_file.exists(): | |
| print(f"Loading data directly from {output_file.name}...") | |
| joined_df = pd.read_parquet(output_file) | |
| print(f"Loaded shape: {joined_df.shape}") | |
| else: | |
| # Get all Excel files | |
| excel_files = list(data_path.glob("*.xlsx")) | |
| print(f"Found {len(excel_files)} files in {data_path.resolve()}: {[f.name for f in excel_files]}") | |
| # Load and join | |
| dfs = [] | |
| for f in excel_files: | |
| try: | |
| # Results are in 'Results' sheet | |
| df = pd.read_excel(f, sheet_name='Results') | |
| # Drop the first row (which usually contains units) | |
| df = df.iloc[1:].reset_index(drop=True) | |
| # Add a column to identify the source | |
| df['source_file'] = f.name | |
| # Convert columns to numerical if possible, else convert to strings | |
| for col in df.columns: | |
| try: | |
| df[col] = pd.to_numeric(df[col], errors='raise') | |
| except (ValueError, TypeError): | |
| df[col] = df[col].astype(str) | |
| dfs.append(df) | |
| except Exception as e: | |
| print(f"Error reading {f}: {e}") | |
| if not dfs: | |
| raise ValueError(f"No objects to concatenate. Could not find or read any valid Excel files in {data_path.resolve()}.") | |
| joined_df = pd.concat(dfs, ignore_index=True) | |
| print(f"Joined shape: {joined_df.shape}") | |
| # Save to parquet | |
| joined_df.to_parquet(output_file) | |
| print(f"Saved to {output_file}") | |
| # %% | |
| # Quick preview | |
| joined_df | |
| # %% | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| # Set visual style | |
| sns.set_theme(style="whitegrid") | |
| # 1. Correlation matrix for numerical variables | |
| # Select only numerical columns | |
| numerical_cols = joined_df.select_dtypes(include=['number']).columns | |
| if len(numerical_cols) > 0: | |
| plt.figure(figsize=(10, 8)) | |
| corr_matrix = joined_df[numerical_cols].corr() | |
| sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1) | |
| plt.title('Correlation Matrix of Numerical Variables') | |
| plt.tight_layout() | |
| plt.show() | |
| else: | |
| print("No numerical variables found for correlation matrix.") | |
| # %% | |
| # 2. Visualize the distribution of every variable | |
| # Numerical variables - Histograms | |
| for col in numerical_cols: | |
| plt.figure(figsize=(8, 4)) | |
| sns.histplot(joined_df[col].dropna(), kde=True, bins=30) | |
| plt.title(f'Distribution of {col}') | |
| plt.tight_layout() | |
| plt.show() | |
| # Categorical variables - Count plots (limiting to top 20 categories if there are many) | |
| categorical_cols = joined_df.select_dtypes(exclude=['number']).columns | |
| for col in categorical_cols: | |
| plt.figure(figsize=(10, 5)) | |
| # Get top 20 most frequent categories to avoid unreadable plots | |
| top_categories = joined_df[col].value_counts().nlargest(20).index | |
| sns.countplot(data=joined_df[joined_df[col].isin(top_categories)], x=col, order=top_categories) | |
| plt.title(f'Distribution of {col} (Top 20 categories)') | |
| plt.xticks(rotation=45, ha='right') | |
| plt.tight_layout() | |
| plt.show() | |
| # %% | |
| import plotly.graph_objects as go | |
| import pandas as pd | |
| import numpy as np | |
| print("Generating 3D Surface Plot...") | |
| # Find columns case-insensitively or by partial match to ensure it works | |
| cols = joined_df.columns.tolist() | |
| cols_lower = [c.lower() for c in cols] | |
| def find_col(possible_names): | |
| for name in possible_names: | |
| for idx, c in enumerate(cols_lower): | |
| if name.lower() in c: | |
| return cols[idx] | |
| return None | |
| # Attempting to match your exact columns or close variations | |
| col_quelle = find_col(['t_vorlauf_quelle', 'quelle']) | |
| col_senke = find_col(['t_vorlauf_senke', 'senke']) | |
| col_cop = find_col(['cop']) | |
| col_komp = find_col(['kompressor', 'stufe']) | |
| col_kalt = find_col(['kältemittel', 'kaltemittel', 'kaeltemittel', 'refrigerant']) | |
| required_cols = {'Quelle (X)': col_quelle, 'Senke (Y)': col_senke, 'COP (Z)': col_cop, 'Kompressor': col_komp, 'Kältemittel': col_kalt} | |
| missing = {k: v for k, v in required_cols.items() if v is None} | |
| if missing: | |
| print(f"Could not automatically find columns for: {list(missing.keys())}") | |
| print(f"Available columns are: {cols}") | |
| print("Please update the column names in the code below.") | |
| else: | |
| fig = go.Figure() | |
| # Drop NaNs for the relevant columns | |
| plot_df = joined_df.dropna(subset=list(required_cols.values())).copy() | |
| # Create combinations of Kompressor and Kältemittel | |
| combinations = plot_df.groupby([col_komp, col_kalt]).size().reset_index() | |
| traces = [] | |
| buttons = [] | |
| for i, row in combinations.iterrows(): | |
| komp_val = str(row[col_komp]) | |
| kalt_val = str(row[col_kalt]) | |
| # Filter dataframe for this specific combination | |
| subset = plot_df[(plot_df[col_komp] == row[col_komp]) & (plot_df[col_kalt] == row[col_kalt])] | |
| if len(subset) < 3: | |
| continue # Skip if not enough points | |
| # Create a pivot table to form the 2D grid for the surface | |
| # Aggregating by mean in case there are duplicate combinations of (Quelle, Senke) | |
| pivot = subset.pivot_table(values=col_cop, | |
| index=col_senke, | |
| columns=col_quelle, | |
| aggfunc='mean') | |
| trace_name = f"{komp_val} | {kalt_val}" | |
| trace = go.Surface( | |
| x=pivot.columns.values, | |
| y=pivot.index.values, | |
| z=pivot.values, | |
| name=trace_name, | |
| visible=(len(traces) == 0) # Only the first trace is visible initially | |
| ) | |
| traces.append(trace) | |
| fig.add_trace(trace) | |
| # Generate the dropdown buttons | |
| for i, trace in enumerate(traces): | |
| visibility = [False] * len(traces) | |
| visibility[i] = True | |
| button = dict( | |
| label=trace.name, | |
| method="update", | |
| args=[{"visible": visibility}, | |
| {"title": f"COP Surface - {trace.name}"}] | |
| ) | |
| buttons.append(button) | |
| if traces: | |
| # Add the dropdown menu to the layout | |
| fig.update_layout( | |
| updatemenus=[ | |
| dict( | |
| active=0, | |
| buttons=buttons, | |
| direction="down", | |
| pad={"r": 10, "t": 10}, | |
| showactive=True, | |
| x=0.1, | |
| xanchor="left", | |
| y=1.15, | |
| yanchor="top" | |
| ) | |
| ], | |
| title=f"COP Surface - {traces[0].name}", | |
| scene=dict( | |
| xaxis_title=col_quelle, | |
| yaxis_title=col_senke, | |
| zaxis_title=col_cop | |
| ), | |
| autosize=False, | |
| width=800, | |
| height=700, | |
| margin=dict(l=65, r=50, b=65, t=90) | |
| ) | |
| # Show in a new browser tab to bypass nbformat issues in VS Code interactive mode | |
| fig.show(renderer="browser") | |
| else: | |
| print("Not enough data to plot surfaces for any combination.") | |
| # %% | |
| # Second Visual: Stacked Surfaces for Kältemittel | |
| print("Generating Stacked 3D Surface Plot by Kältemittel...") | |
| if not missing: | |
| fig2 = go.Figure() | |
| # List of built-in Plotly colorscales to differentiate the Kältemittel | |
| colorscales = ['Viridis', 'Plasma', 'Inferno', 'Magma', 'Cividis', 'Blues', 'Greens', 'Reds'] | |
| unique_kalt = plot_df[col_kalt].dropna().unique() | |
| for idx, kalt_val in enumerate(unique_kalt): | |
| kalt_val = str(kalt_val) | |
| subset = plot_df[plot_df[col_kalt] == kalt_val] | |
| if len(subset) < 3: | |
| continue | |
| pivot = subset.pivot_table(values=col_cop, | |
| index=col_senke, | |
| columns=col_quelle, | |
| aggfunc='mean') | |
| cscale = colorscales[idx % len(colorscales)] | |
| trace = go.Surface( | |
| x=pivot.columns.values, | |
| y=pivot.index.values, | |
| z=pivot.values, | |
| name=kalt_val, | |
| showscale=False, # Hide individual colorbars so it's not cluttered | |
| colorscale=cscale, | |
| showlegend=True, # Allow selecting/deselecting from legend | |
| hovertemplate=f"Kältemittel: {kalt_val}<br>Quelle (X): %{{x}}<br>Senke (Y): %{{y}}<br>COP (Z): %{{z}}<extra></extra>" | |
| ) | |
| fig2.add_trace(trace) | |
| fig2.update_layout( | |
| title="Stacked COP Surfaces by Kältemittel", | |
| scene=dict( | |
| xaxis_title=col_quelle, | |
| yaxis_title=col_senke, | |
| zaxis_title=col_cop | |
| ), | |
| legend=dict( | |
| title="Kältemittel<br>(Click to toggle)", | |
| x=1.05, | |
| y=0.9 | |
| ), | |
| autosize=False, | |
| width=900, | |
| height=800, | |
| margin=dict(l=65, r=50, b=65, t=90) | |
| ) | |
| fig2.show(renderer="browser") | |