# datathon-dedrels / model.py
# (uploaded by gracialy — "Upload 7 files", commit 3453176, verified)
# -*- coding: utf-8 -*-
"""model.py — EDA and preprocessing for a predictive-maintenance dataset.

## Load data
### Subtask:
Read 'predictive-maintenance-dataset.csv' into a pandas DataFrame and
preview the first five rows.
"""
import pandas as pd

df = pd.read_csv('predictive-maintenance-dataset.csv')
print(df.head())
"""## Initial exploration
### Subtask:
Display the first few rows, check data types, and look for missing values.
**Reasoning**:
Display the first few rows, check data types, and look for missing values in the DataFrame.
"""
print(df.head())
df.info()
missing_values = df.isnull().sum()
print(missing_values)
"""## Handle missing values
### Subtask:
Address missing values in the dataset, potentially using imputation or removal.
**Reasoning**:
Parse the single column into multiple columns using the semicolon as a separator, convert the relevant columns to appropriate numerical data types, identify and count the missing values, and then impute missing values in 'ball-bearing' and 'vibration' columns with the mean, as the number of missing values is relatively small compared to the total dataset size.
"""
df[['ball-bearing', 'humidity', 'vibration']] = df['ball-bearing;humidity;vibration'].str.split(';', expand=True)
df = df.drop(columns=['ball-bearing;humidity;vibration'])
df['vibration'] = df['vibration'].astype(str).str.replace(r'\s[A-Za-z]+', '00', regex=True) #apus nama bulan di kolom vibration
for col in ['ball-bearing', 'humidity', 'vibration']:
df[col] = pd.to_numeric(df[col], errors='coerce')
missing_values_after_conversion = df.isnull().sum()
print("Missing values after splitting and conversion:")
print(missing_values_after_conversion)
df['ball-bearing'].fillna(df['ball-bearing'].mean(), inplace=True)
df['vibration'].fillna(df['vibration'].mean(), inplace=True)
print("Missing values after imputation:")
print(df.isnull().sum())
display
"""## Descriptive statistics
### Subtask:
Calculate descriptive statistics for the numerical features to understand their distribution.
**Reasoning**:
Calculate and display descriptive statistics for the numerical features using the .describe() method.
"""
descriptive_stats = df.describe()
print(descriptive_stats)
"""## Visualize data distribution
### Subtask:
Create visualizations (e.g., histograms, box plots) to understand the distribution of individual features.
**Reasoning**:
Generate and display histograms and box plots for the numerical features to visualize their distributions.
"""
import matplotlib.pyplot as plt
import seaborn as sns
# Histograms
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.histplot(df['ball-bearing'], ax=axes[0])
axes[0].set_title('Distribution of Ball-Bearing')
sns.histplot(df['humidity'], ax=axes[1])
axes[1].set_title('Distribution of Humidity')
sns.histplot(df['vibration'], ax=axes[2])
axes[2].set_title('Distribution of Vibration')
plt.tight_layout()
plt.show()
# Box plots
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.boxplot(y=df['ball-bearing'], ax=axes[0])
axes[0].set_title('Box Plot of Ball-Bearing')
sns.boxplot(y=df['humidity'], ax=axes[1])
axes[1].set_title('Box Plot of Humidity')
sns.boxplot(y=df['vibration'], ax=axes[2])
axes[2].set_title('Box Plot of Vibration')
plt.tight_layout()
plt.show()
"""## Analyze time series data
### Subtask:
Analyze trends, seasonality, and correlations over time.
**Reasoning**:
The dataframe does not have a time-based index. Since there is no explicit timestamp column, I will assume a uniform time interval and create a DatetimeIndex. Then, I will resample the data to an hourly frequency and calculate the mean for each period. Finally, I will plot the resampled data for each feature to visualize trends over time.
"""
# Assume a uniform time interval and create a time-based index
df.index = pd.date_range(start='2024-01-01', periods=len(df), freq='s')
# Resample the data to hourly frequency and calculate the mean
df_resampled = df.resample('H').mean()
# Plot the resampled data for each feature
fig, axes = plt.subplots(3, 1, figsize=(15, 10), sharex=True)
axes[0].plot(df_resampled.index, df_resampled['ball-bearing'])
axes[0].set_title('Hourly Average Ball-Bearing over Time')
axes[0].set_ylabel('Ball-Bearing')
axes[1].plot(df_resampled.index, df_resampled['humidity'], color='orange')
axes[1].set_title('Hourly Average Humidity over Time')
axes[1].set_ylabel('Humidity')
axes[2].plot(df_resampled.index, df_resampled['vibration'], color='green')
axes[2].set_title('Hourly Average Vibration over Time')
axes[2].set_ylabel('Vibration')
axes[2].set_xlabel('Time')
plt.tight_layout()
plt.show()
"""**Reasoning**:
I have already created a time-based index, resampled the data, and plotted the trends. The next step according to the instructions is to calculate and display the correlation between the features over time using the resampled data.
"""
# Calculate the correlation matrix for the resampled data
correlation_matrix = df_resampled[['ball-bearing', 'humidity', 'vibration']].corr()
# Display the correlation matrix
print("Correlation Matrix of Resampled Data:")
print(correlation_matrix)
"""## Correlation analysis
### Subtask:
Analyze correlations between features to identify potential relationships.
**Reasoning**:
Calculate and display the correlation matrix for the DataFrame `df`.
"""
correlation_matrix = df.corr(numeric_only=True)
print("Correlation Matrix:")
print(correlation_matrix)
"""## Insights
### Subtask:
Summarize findings from the EDA to guide feature engineering efforts.
**Reasoning**:
Summarize the findings from the EDA steps to guide feature engineering efforts based on descriptive statistics, visualizations, and correlation analysis.
"""
print("Summary of EDA Findings for Feature Engineering:")
print("\n1. Descriptive Statistics:")
print("- The 'ball-bearing' and 'vibration' features have a wide range of values and relatively high standard deviations, indicating significant variability.")
print("- 'Humidity' has a much smaller range and lower standard deviation, suggesting it is more stable.")
print("- The mean and median for 'ball-bearing' and 'vibration' are somewhat different, hinting at potential skewness or outliers.")
print("- The minimum values for 'ball-bearing' and 'vibration' are considerably lower than their means, also suggesting potential outliers or skewed distributions.")
print("\n2. Visualizations (Histograms and Box Plots):")
print("- Histograms show that 'ball-bearing' and 'vibration' distributions are somewhat skewed. 'Humidity' appears more normally distributed but with a narrow range.")
print("- Box plots confirm the presence of potential outliers, particularly in 'ball-bearing' and 'vibration', which could be important indicators of anomalies or events in predictive maintenance.")
print("- The distributions suggest that transformations (e.g., log transformation) might be beneficial for 'ball-bearing' and 'vibration' if they are used in models sensitive to skewed data.")
print("\n3. Time Series Analysis:")
print("- The time series plots of the hourly averaged data show trends and fluctuations over time for all features.")
print("- Capturing these temporal patterns could be crucial for predictive maintenance. Features like moving averages, rolling standard deviations, or lag features could be engineered to represent these time-dependent behaviors.")
print("\n4. Correlation Analysis:")
print("- The correlation matrix shows weak linear relationships between the features.")
print("- 'Ball-bearing' has a slight negative correlation with 'humidity' and 'vibration'. 'Humidity' and 'vibration' have a slight positive correlation.")
print("- While linear correlations are weak, non-linear relationships might exist and could be explored. However, based on the current analysis, complex interaction terms between these features might not be highly informative unless combined with temporal aspects.")
print("\n5. Missing Value Handling:")
print("- Missing values were imputed using the mean, which is a simple approach.")
print("- Depending on the impact of missing data and the modeling technique, more sophisticated imputation methods or models that can handle missing values directly could be considered, although for this analysis, mean imputation was sufficient.")
"""## Summary:
### Data Analysis Key Findings
* The dataset was initially loaded as a single column due to a delimiter issue (semicolons).
* After splitting the column and converting to numeric types, missing values were found in 'ball-bearing' (535) and 'vibration' (5242) but none in 'humidity'.
* Missing values in 'ball-bearing' and 'vibration' were imputed using the mean of their respective columns.
* Descriptive statistics show significant variability and potential skewness/outliers in 'ball-bearing' and 'vibration', while 'humidity' is more stable.
* Visualizations (histograms and box plots) confirm the skewed distributions and presence of potential outliers in 'ball-bearing' and 'vibration'. 'Humidity' appears more normally distributed.
* Time series analysis of hourly averaged data reveals trends and fluctuations over time for all features.
* Correlation analysis shows weak linear relationships between the features: 'ball-bearing' has a slight negative correlation with 'humidity' and 'vibration', while 'humidity' and 'vibration' have a slight positive correlation.
### Insights or Next Steps
* Temporal features (e.g., moving averages, lag features) should be engineered to capture the observed trends and temporal patterns in the data.
* Statistical features (e.g., rolling standard deviation, variance) calculated over time windows could help capture the changing behavior and variability of 'ball-bearing' and 'vibration' signals.
"""
from sklearn.preprocessing import MinMaxScaler
#scaling data karena neural networks work best saat input data discaled, nilai antara 0 and 1.
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)
"""## Modelling"""
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
import numpy as np
train_split_index = int(len(df_scaled) * 0.7)
train_data = df_scaled[train_split_index:]
test_data = df_scaled
input_dim = train_data.shape[1]
encoding_dim = 2
# --- Autoencoder architecture: input_dim -> encoding_dim -> input_dim ---
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
# Sigmoid keeps reconstructions in [0, 1], matching the min-max scaled input.
decoded = Dense(input_dim, activation='sigmoid')(encoded)

autoencoder = Model(inputs=input_layer, outputs=decoded)
# MAE reconstruction loss — the same quantity later serves as the anomaly score.
autoencoder.compile(optimizer='adam', loss='mae')
autoencoder.summary()
print("-" * 30)
# Train autoencoder
history = autoencoder.fit(
train_data,
train_data,
epochs=50,
batch_size=32,
shuffle=True,
validation_split=0.1, #pakai 10% data
verbose=1
)
# --- Health index: per-row reconstruction error over the whole dataset ---
reconstructions = autoencoder.predict(test_data)
# Row-wise mean absolute error between original and reconstructed values:
# the worse the reconstruction, the less "normal" the row.
df['health_index'] = np.abs(reconstructions - test_data.values).mean(axis=1)
print("Health Index calculated and added to DataFrame.")
print(df[['health_index']].head())
print("-" * 30)
# --- Visualization: health index vs. raw ball-bearing signal ---
# BUG FIX: plt.style.use only affects figures created AFTER the call; the
# original invoked it after plt.subplots, so this figure was never styled.
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax1 = plt.subplots(figsize=(15, 7))

# Health index (anomaly score) on the primary y-axis.
ax1.plot(df.index, df['health_index'], color='red', label='Health Index (Anomaly Score)')
ax1.set_xlabel('Time', fontsize=12)
ax1.set_ylabel('Health Index (Higher is Worse)', color='red', fontsize=12)
ax1.tick_params(axis='y', labelcolor='red')
ax1.legend(loc='upper left')

# Original ball-bearing values on a secondary y-axis for context.
ax2 = ax1.twinx()
ax2.plot(df.index, df['ball-bearing'], color='blue', linestyle='--', alpha=0.5, label='Ball Bearing Value')
ax2.set_ylabel('Original Ball Bearing Value', color='blue', fontsize=12)
ax2.tick_params(axis='y', labelcolor='blue')
ax2.legend(loc='upper right')

plt.title('Health Index vs. Ball Bearing Value', fontsize=16)
plt.show()
# --- Export trained artifacts for deployment ---
from joblib import dump

# Model saved in Keras legacy HDF5 format; filenames kept for downstream loaders.
autoencoder.save('elevator_health_model.h5')
dump(scaler, 'elevator_data_scaler.joblib')