Spaces:
Configuration error
Configuration error
| # -*- coding: utf-8 -*- | |
| """app.py | |
| # Task | |
| Perform exploratory data analysis (EDA) and preprocessing on the dataset located at "/content/predictive-maintenance-dataset.csv" to gain insights for feature engineering. | |
| ## Load data | |
| ### Subtask: | |
| Load the dataset into a pandas DataFrame. | |
| **Reasoning**: | |
| Load the data into a pandas DataFrame and display the first 5 rows. | |
| """ | |
| import pandas as pd | |
# Read the raw CSV with read_csv's default settings and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='predictive-maintenance-dataset.csv')
print(df.head(n=5))
| """## Initial exploration | |
| ### Subtask: | |
| Display the first few rows, check data types, and look for missing values. | |
| **Reasoning**: | |
| Display the first few rows, check data types, and look for missing values in the DataFrame. | |
| """ | |
# Preview the frame again, then report column dtypes / non-null counts
# and the number of null entries per column.
print(df.head(n=5))
df.info()

missing_values = df.isna().sum()
print(missing_values)
| """## Handle missing values | |
| ### Subtask: | |
| Address missing values in the dataset, potentially using imputation or removal. | |
| **Reasoning**: | |
| Parse the single column into multiple columns using the semicolon as a separator, convert the relevant columns to appropriate numerical data types, identify and count the missing values, and then impute missing values in 'ball-bearing' and 'vibration' columns with the mean, as the number of missing values is relatively small compared to the total dataset size. | |
| """ | |
# The CSV was read as one column named 'ball-bearing;humidity;vibration';
# split it on ';' into three separate columns and drop the combined one.
df[['ball-bearing', 'humidity', 'vibration']] = (
    df['ball-bearing;humidity;vibration'].str.split(';', expand=True)
)
df = df.drop(columns=['ball-bearing;humidity;vibration'])

# Replace every "<whitespace><letters>" run in 'vibration' with "00".
# (Original author's note: this gets rid of month names that appear in the
# vibration column.)
df['vibration'] = (
    df['vibration'].astype(str).str.replace(r'\s[A-Za-z]+', '00', regex=True)
)

# Convert to numbers; anything that cannot be parsed becomes NaN so it can be
# counted and imputed below.
for col in ['ball-bearing', 'humidity', 'vibration']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

missing_values_after_conversion = df.isnull().sum()
print("Missing values after splitting and conversion:")
print(missing_values_after_conversion)

# Mean-impute by assigning the result back to the column. Calling
# `df[col].fillna(..., inplace=True)` operates on an intermediate Series, which
# is deprecated and leaves `df` unchanged when pandas Copy-on-Write is active.
df['ball-bearing'] = df['ball-bearing'].fillna(df['ball-bearing'].mean())
df['vibration'] = df['vibration'].fillna(df['vibration'].mean())

print("Missing values after imputation:")
print(df.isnull().sum())
| """## Descriptive statistics | |
| ### Subtask: | |
| Calculate descriptive statistics for the numerical features to understand their distribution. | |
| **Reasoning**: | |
| Calculate and display descriptive statistics for the numerical features using the .describe() method. | |
| """ | |
# Summary statistics produced by DataFrame.describe() for the current frame.
descriptive_stats = df.describe()
print(descriptive_stats)
| """## Visualize data distribution | |
| ### Subtask: | |
| Create visualizations (e.g., histograms, box plots) to understand the distribution of individual features. | |
| **Reasoning**: | |
| Generate and display histograms and box plots for the numerical features to visualize their distributions. | |
| """ | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
# (column name, title label) for each of the three features that get plotted.
feature_titles = (
    ('ball-bearing', 'Ball-Bearing'),
    ('humidity', 'Humidity'),
    ('vibration', 'Vibration'),
)

# Histograms: one panel per feature, side by side.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
for ax, (column, label) in zip(axes, feature_titles):
    sns.histplot(df[column], ax=ax)
    ax.set_title(f'Distribution of {label}')
plt.tight_layout()
plt.show()

# Box plots: same layout, one panel per feature.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
for ax, (column, label) in zip(axes, feature_titles):
    sns.boxplot(y=df[column], ax=ax)
    ax.set_title(f'Box Plot of {label}')
plt.tight_layout()
plt.show()
| """## Analyze time series data | |
| ### Subtask: | |
| Analyze trends, seasonality, and correlations over time. | |
| **Reasoning**: | |
| The dataframe does not have a time-based index. Since there is no explicit timestamp column, I will assume a uniform time interval and create a DatetimeIndex. Then, I will resample the data to an hourly frequency and calculate the mean for each period. Finally, I will plot the resampled data for each feature to visualize trends over time. | |
| """ | |
# The data has no timestamp column, so assume uniform sampling and attach a
# synthetic index: one row per second starting at 2024-01-01.
df.index = pd.date_range(start='2024-01-01', periods=len(df), freq='s')

# Average each feature per hour. The lowercase 'h' alias is used because the
# uppercase 'H' alias is deprecated in recent pandas releases (and the index
# above already uses the lowercase 's' alias).
df_resampled = df.resample('h').mean()

# One stacked panel per feature, all sharing the time axis.
fig, axes = plt.subplots(3, 1, figsize=(15, 10), sharex=True)

axes[0].plot(df_resampled.index, df_resampled['ball-bearing'])
axes[0].set_title('Hourly Average Ball-Bearing over Time')
axes[0].set_ylabel('Ball-Bearing')

axes[1].plot(df_resampled.index, df_resampled['humidity'], color='orange')
axes[1].set_title('Hourly Average Humidity over Time')
axes[1].set_ylabel('Humidity')

axes[2].plot(df_resampled.index, df_resampled['vibration'], color='green')
axes[2].set_title('Hourly Average Vibration over Time')
axes[2].set_ylabel('Vibration')
axes[2].set_xlabel('Time')

plt.tight_layout()
plt.show()
| """**Reasoning**: | |
| I have already created a time-based index, resampled the data, and plotted the trends. The next step according to the instructions is to calculate and display the correlation between the features over time using the resampled data. | |
| """ | |
# Pairwise correlation of the three features, computed on the hourly means.
correlation_matrix = df_resampled.loc[:, ['ball-bearing', 'humidity', 'vibration']].corr()

# Header line followed by the matrix itself.
print("Correlation Matrix of Resampled Data:", correlation_matrix, sep="\n")
| """## Correlation analysis | |
| ### Subtask: | |
| Analyze correlations between features to identify potential relationships. | |
| **Reasoning**: | |
| Calculate and display the correlation matrix for the DataFrame `df`. | |
| """ | |
# Pairwise correlation across the numeric columns of the full-resolution frame.
correlation_matrix = df.corr(numeric_only=True)
print("Correlation Matrix:", correlation_matrix, sep="\n")
| """## Insights | |
| ### Subtask: | |
| Summarize findings from the EDA to guide feature engineering efforts. | |
| **Reasoning**: | |
| Summarize the findings from the EDA steps to guide feature engineering efforts based on descriptive statistics, visualizations, and correlation analysis. | |
| """ | |
# Print the written EDA summary, one line of output per entry.
for summary_line in (
    "Summary of EDA Findings for Feature Engineering:",
    "\n1. Descriptive Statistics:",
    "- The 'ball-bearing' and 'vibration' features have a wide range of values and relatively high standard deviations, indicating significant variability.",
    "- 'Humidity' has a much smaller range and lower standard deviation, suggesting it is more stable.",
    "- The mean and median for 'ball-bearing' and 'vibration' are somewhat different, hinting at potential skewness or outliers.",
    "- The minimum values for 'ball-bearing' and 'vibration' are considerably lower than their means, also suggesting potential outliers or skewed distributions.",
    "\n2. Visualizations (Histograms and Box Plots):",
    "- Histograms show that 'ball-bearing' and 'vibration' distributions are somewhat skewed. 'Humidity' appears more normally distributed but with a narrow range.",
    "- Box plots confirm the presence of potential outliers, particularly in 'ball-bearing' and 'vibration', which could be important indicators of anomalies or events in predictive maintenance.",
    "- The distributions suggest that transformations (e.g., log transformation) might be beneficial for 'ball-bearing' and 'vibration' if they are used in models sensitive to skewed data.",
    "\n3. Time Series Analysis:",
    "- The time series plots of the hourly averaged data show trends and fluctuations over time for all features.",
    "- Capturing these temporal patterns could be crucial for predictive maintenance. Features like moving averages, rolling standard deviations, or lag features could be engineered to represent these time-dependent behaviors.",
    "\n4. Correlation Analysis:",
    "- The correlation matrix shows weak linear relationships between the features.",
    "- 'Ball-bearing' has a slight negative correlation with 'humidity' and 'vibration'. 'Humidity' and 'vibration' have a slight positive correlation.",
    "- While linear correlations are weak, non-linear relationships might exist and could be explored. However, based on the current analysis, complex interaction terms between these features might not be highly informative unless combined with temporal aspects.",
    "\n5. Missing Value Handling:",
    "- Missing values were imputed using the mean, which is a simple approach.",
    "- Depending on the impact of missing data and the modeling technique, more sophisticated imputation methods or models that can handle missing values directly could be considered, although for this analysis, mean imputation was sufficient.",
):
    print(summary_line)
| """## Summary: | |
| ### Data Analysis Key Findings | |
| * The dataset was initially loaded as a single column due to a delimiter issue (semicolons). | |
| * After splitting the column and converting to numeric types, missing values were found in 'ball-bearing' (535) and 'vibration' (5242) but none in 'humidity'. | |
| * Missing values in 'ball-bearing' and 'vibration' were imputed using the mean of their respective columns. | |
| * Descriptive statistics show significant variability and potential skewness/outliers in 'ball-bearing' and 'vibration', while 'humidity' is more stable. | |
| * Visualizations (histograms and box plots) confirm the skewed distributions and presence of potential outliers in 'ball-bearing' and 'vibration'. 'Humidity' appears more normally distributed. | |
| * Time series analysis of hourly averaged data reveals trends and fluctuations over time for all features. | |
| * Correlation analysis shows weak linear relationships between the features: 'ball-bearing' has a slight negative correlation with 'humidity' and 'vibration', while 'humidity' and 'vibration' have a slight positive correlation. | |
| ### Insights or Next Steps | |
| * Temporal features (e.g., moving averages, lag features) should be engineered to capture the observed trends and temporal patterns in the data. | |
| * Statistical features (e.g., rolling standard deviation, variance) calculated over time windows could help capture the changing behavior and variability of 'ball-bearing' and 'vibration' signals. | |
| """ | |
| from sklearn.preprocessing import MinMaxScaler | |
# Scale the data because neural networks work best when the inputs are scaled
# to values between 0 and 1.
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(
    data=scaler.fit(df).transform(df),
    index=df.index,
    columns=df.columns,
)
| """## Modelling""" | |
| import tensorflow as tf | |
| from tensorflow.keras.models import Model | |
| from tensorflow.keras.layers import Input, Dense | |
| import numpy as np | |
# Row position marking 70% of the scaled dataset.
train_split_index = int(len(df_scaled) * 0.7)

# Train on the FIRST 70% of rows. The previous slice `[train_split_index:]`
# selected the last 30% instead, which contradicts the 70% split point.
train_data = df_scaled.iloc[:train_split_index]

# The whole dataset is kept as the evaluation set; it is reconstructed later
# to compute the per-row health index.
test_data = df_scaled
# Autoencoder dimensions: one input per feature column, two-unit bottleneck.
input_dim = train_data.shape[1]
encoding_dim = 2

# Input -> encoder (ReLU) -> decoder.
input_layer = Input(shape=(input_dim,))
encoder = Dense(units=encoding_dim, activation='relu')(input_layer)
# Sigmoid keeps the reconstructed output between 0 and 1.
decoder = Dense(units=input_dim, activation='sigmoid')(encoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
# Mean Absolute Error is used as the reconstruction loss.
autoencoder.compile(loss='mae', optimizer='adam')
autoencoder.summary()
print(30 * "-")
# Fit the autoencoder to reproduce its own input (the target equals the input).
history = autoencoder.fit(
    x=train_data,
    y=train_data,
    batch_size=32,
    epochs=50,
    verbose=1,
    validation_split=0.1,  # use 10% of the data for validation
    shuffle=True,
)
# Health index = per-row reconstruction error.
# Reconstruct the entire dataset with the trained autoencoder.
reconstructions = autoencoder.predict(test_data)

# Mean Absolute Error between the original and the reconstructed rows.
mae_loss = np.abs(reconstructions - test_data.to_numpy()).mean(axis=1)
df['health_index'] = mae_loss

print("Health Index calculated and added to DataFrame.")
print(df[['health_index']].head())
print("-" * 30)
# Visualization: health index against the raw ball-bearing signal.
# Select the style BEFORE creating the figure; a style sheet only affects
# figures and axes created after it is applied, so applying it after
# plt.subplots() left this plot unstyled.
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax1 = plt.subplots(figsize=(15, 7))

# Health index on the primary (left) y-axis.
ax1.plot(df.index, df['health_index'], color='red', label='Health Index (Anomaly Score)')
ax1.set_xlabel('Time', fontsize=12)
ax1.set_ylabel('Health Index (Higher is Worse)', color='red', fontsize=12)
ax1.tick_params(axis='y', labelcolor='red')
ax1.legend(loc='upper left')

# Original ball-bearing values on a secondary (right) y-axis sharing the x-axis.
ax2 = ax1.twinx()
ax2.plot(df.index, df['ball-bearing'], color='blue', linestyle='--', alpha=0.5, label='Ball Bearing Value')
ax2.set_ylabel('Original Ball Bearing Value', color='blue', fontsize=12)
ax2.tick_params(axis='y', labelcolor='blue')
ax2.legend(loc='upper right')

plt.title('Health Index vs. Ball Bearing Value', fontsize=16)
plt.show()
| from joblib import dump | |
# Export: write the trained autoencoder and the fitted scaler to disk.
autoencoder.save(filepath='elevator_health_model.h5')
dump(value=scaler, filename='elevator_data_scaler.joblib')