|
|
import os |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
from sklearn.preprocessing import MinMaxScaler |
|
|
from sklearn.model_selection import StratifiedShuffleSplit |
|
|
from sklearn.metrics import ( |
|
|
confusion_matrix, |
|
|
precision_score, |
|
|
recall_score, |
|
|
f1_score, |
|
|
accuracy_score, |
|
|
classification_report |
|
|
) |
|
|
import torch |
|
|
import torch.nn as nn |
|
|
from torch.utils.data import Dataset, DataLoader, Subset |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------

# Path to the Excel workbook holding the water-quality measurements.
DATA_PATH = "dataset.xlsx"

# Feature columns read from the spreadsheet. These strings must match the
# Excel headers exactly — including the stray backtick in "pH`" and the
# trailing spaces inside "(mg L-1 )", which presumably mirror the source
# file's headers (NOTE(review): verify against the actual spreadsheet).
FEATURE_COLUMNS = [
    "Temp",
    "Turbidity (cm)",
    "DO(mg/L)",
    "BOD (mg/L)",
    "CO2",
    "pH`",
    "Alkalinity (mg L-1 )",
    "Hardness (mg L-1 )",
    "Calcium (mg L-1 )",
    "Ammonia (mg L-1 )",
    "Nitrite (mg L-1 )",
    "Phosphorus (mg L-1 )",
    "H2S (mg L-1 )",
    "Plankton (No. L-1)"
]

# Column holding the integer class label (later in this script, label 2 is
# treated as the anomalous class and labels < 2 as normal).
LABEL_COL = "Water Quality"

# Number of consecutive rows in each sliding window fed to the CNN.
SEQUENCE_LENGTH = 10

# Split ratios; they must sum to 1.0.
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
TEST_RATIO = 0.1

# Model hyper-parameters. INPUT_DIM/CHANNELS: one Conv1d channel per feature.
INPUT_DIM = len(FEATURE_COLUMNS)
SEQ_LEN = SEQUENCE_LENGTH   # alias used by the model code
CHANNELS = INPUT_DIM        # alias used by the model code
AE_LR = 1e-3                # Adam learning rate for the autoencoder
AE_EPOCHS = 50
BATCH_SIZE = 64
RANDOM_STATE = 42           # shared seed for both stratified splits

# Anomaly threshold = mean + THRESHOLD_STD_FACTOR * std of the validation
# reconstruction errors (computed on normal windows only).
THRESHOLD_STD_FACTOR = 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------
# Load the raw dataset and coerce every feature column to float.
# ---------------------------------------------------------------
df = pd.read_excel(DATA_PATH)
df = df.dropna(how="all")  # drop fully-empty rows left over from Excel formatting

# Labels are integer class codes (2 is treated as the anomaly class below).
df[LABEL_COL] = df[LABEL_COL].astype(int)
labels_all = df[LABEL_COL].values

# Some numeric cells may have been exported with a decimal comma
# (e.g. "7,2" instead of "7.2"); normalise those before the float cast.
# Fix: the original also tested `df[col].dtype == str`, which is never true
# for a pandas column — object dtype is the only case that can occur; the
# element-wise `apply` is replaced by the vectorised `.str.replace`.
for col in FEATURE_COLUMNS:
    if pd.api.types.is_object_dtype(df[col]):
        df[col] = df[col].astype(str).str.replace(",", ".", regex=False)
    df[col] = df[col].astype(float)

data_raw = df[FEATURE_COLUMNS].values

# Scale every feature into [0, 1].
# NOTE(review): the scaler is fitted on the FULL dataset before the
# train/val/test split below, leaking test statistics into training —
# consider fitting on training rows only and transforming the rest.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_raw)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class CNNTimeSeriesDataset(Dataset):
    """Sliding-window dataset over a 2-D array of scaled features.

    Each item is a window of ``seq_len`` consecutive rows, transposed to
    shape ``(channels, seq_len)`` so that every feature column becomes one
    convolution channel.
    """

    def __init__(self, data, seq_len):
        self.data = data
        self.seq_len = seq_len
        # One window per starting row that leaves room for a full window
        # plus the "next" row whose label is used elsewhere in the script.
        self.num_items = data.shape[0] - seq_len

    def __len__(self):
        return self.num_items

    def __getitem__(self, idx):
        # Rows idx .. idx+seq_len-1, transposed to (features, time).
        window_t = self.data[idx : idx + self.seq_len].T
        return torch.tensor(window_t, dtype=torch.float32)
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------
# One label per sliding window.
# ---------------------------------------------------------------
num_total = data_scaled.shape[0]
num_items = num_total - SEQUENCE_LENGTH

# The label of window i is the label of the row immediately AFTER the
# window (rows i .. i+SEQ_LEN-1 -> label at i+SEQ_LEN), i.e. next-step
# labelling — TODO confirm this is intended rather than the window's
# last row. The vectorised slice replaces the original per-element loop.
y_seq = np.asarray(labels_all[SEQUENCE_LENGTH:num_total], dtype=int)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------
# Stratified split: carve out the test set first, then divide the
# remainder into train/val so the overall ratios are TRAIN/VAL/TEST.
# `next(...)` extracts the single (n_splits=1) split directly instead
# of the original `for ...: pass` loop, which only kept the last —
# and only — iteration as a side effect.
# ---------------------------------------------------------------
sss1 = StratifiedShuffleSplit(n_splits=1, test_size=TEST_RATIO, random_state=RANDOM_STATE)
train_val_idx, test_idx = next(sss1.split(np.zeros(num_items), y_seq))

# Validation fraction relative to the remaining train+val pool.
val_size_rel = VAL_RATIO / (TRAIN_RATIO + VAL_RATIO)
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=val_size_rel, random_state=RANDOM_STATE)
train_idx_rel, val_idx_rel = next(sss2.split(np.zeros(len(train_val_idx)), y_seq[train_val_idx]))

# Map the second split's relative indices back to absolute window indices.
train_idx = train_val_idx[train_idx_rel]
val_idx = train_val_idx[val_idx_rel]
|
|
|
|
|
def count_labels(indices, y):
    """Return a {label: count} dict for the labels selected by *indices*."""
    values, counts = np.unique(y[indices], return_counts=True)
    return {int(v): int(c) for v, c in zip(values, counts)}
|
|
|
|
|
# Report the class balance of each split — a sanity check that the
# stratified splits preserved the label proportions.
print("Train labels:", count_labels(train_idx, y_seq))
print("Val labels:", count_labels(val_idx, y_seq))
print("Test labels:", count_labels(test_idx, y_seq))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------
# One dataset over ALL windows; each split selects into it by index.
# ---------------------------------------------------------------
dataset_all = CNNTimeSeriesDataset(data_scaled, SEQUENCE_LENGTH)

# The autoencoder is trained/validated on "normal" windows only
# (label < 2); label 2 is the anomalous class.
normal_mask = y_seq < 2
train_normal_idx = train_idx[normal_mask[train_idx]]
val_normal_idx = val_idx[normal_mask[val_idx]]

train_ae_dataset = Subset(dataset_all, train_normal_idx)
val_ae_dataset = Subset(dataset_all, val_normal_idx)
test_dataset = Subset(dataset_all, test_idx)

# drop_last=True on the training loader keeps every gradient step at a
# full batch; evaluation loaders keep every sample.
train_ae_loader = DataLoader(train_ae_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_ae_loader = DataLoader(val_ae_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class CNNAutoencoder(nn.Module):
    """1-D convolutional autoencoder for multichannel time-series windows.

    The encoder widens the channel dimension 32 -> 64 -> 128 while all
    convolutions use kernel_size=3 with padding=1, so the temporal length
    is preserved end to end; the decoder mirrors the encoder and ends in
    a Sigmoid, matching the MinMax-scaled [0, 1] input range.
    """

    def __init__(self, channels, seq_len):
        super().__init__()
        self.channels = channels
        self.seq_len = seq_len

        self.encoder = nn.Sequential(
            nn.Conv1d(channels, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
        )

        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(128, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(64, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(32, channels, kernel_size=3, padding=1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Reconstruct ``x`` of shape (batch, channels, seq_len).

        Returns a tensor of the same shape with values in [0, 1].
        """
        latent = self.encoder(x)
        return self.decoder(latent)
|
|
|
|
|
|
|
|
# Prefer a GPU when one is available; everything below runs on `device`.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
ae_model = CNNAutoencoder(channels=CHANNELS, seq_len=SEQ_LEN).to(device)

# Plain MSE reconstruction loss with Adam — the standard autoencoder pairing.
ae_criterion = nn.MSELoss()
ae_optimizer = torch.optim.Adam(ae_model.parameters(), lr=AE_LR)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------
# Train the autoencoder, checkpointing the weights with the lowest
# validation reconstruction loss, then restore that checkpoint.
# ---------------------------------------------------------------
best_val_loss = float("inf")
best_ae_path = "best_cnn_ae.pth"

for epoch in range(1, AE_EPOCHS + 1):
    # --- training pass ---
    ae_model.train()
    running_train = 0.0
    for batch in train_ae_loader:
        batch = batch.to(device)
        ae_optimizer.zero_grad()
        recon = ae_model(batch)
        batch_loss = ae_criterion(recon, batch)
        batch_loss.backward()
        ae_optimizer.step()
        # Weight by batch size so the epoch value is a per-sample average.
        running_train += batch_loss.item() * batch.size(0)
    train_loss = running_train / len(train_ae_loader.dataset)

    # --- validation pass (no gradients) ---
    ae_model.eval()
    running_val = 0.0
    with torch.no_grad():
        for batch in val_ae_loader:
            batch = batch.to(device)
            recon = ae_model(batch)
            running_val += ae_criterion(recon, batch).item() * batch.size(0)
    val_loss = running_val / len(val_ae_loader.dataset)

    print(f"Epoch {epoch:02d} | AE Train Loss: {train_loss:.6f} | AE Val Loss: {val_loss:.6f}")

    # Keep only the best-performing weights on disk.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(ae_model.state_dict(), best_ae_path)

# Reload the best checkpoint before deriving the anomaly threshold.
ae_model.load_state_dict(torch.load(best_ae_path, map_location=device))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------
# Derive the anomaly threshold from the reconstruction errors of the
# NORMAL validation windows: threshold = mean + k * std.
# ---------------------------------------------------------------
val_norm_errors = []
ae_model.eval()
with torch.no_grad():
    for batch in val_ae_loader:
        batch = batch.to(device)
        recon = ae_model(batch)
        # Per-sample MSE averaged over the (channel, time) axes.
        per_sample = torch.mean((recon - batch) ** 2, dim=(1, 2))
        val_norm_errors.append(per_sample.cpu().numpy())
val_norm_errors = np.concatenate(val_norm_errors, axis=0)

mu_val = np.mean(val_norm_errors)
sigma_val = np.std(val_norm_errors)
threshold = mu_val + THRESHOLD_STD_FACTOR * sigma_val

print(f"\nThreshold (mean + {THRESHOLD_STD_FACTOR}*std) từ validation normal: {threshold:.6f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------
# Score every test window and flag those whose reconstruction error
# exceeds the validation-derived threshold.
# ---------------------------------------------------------------
test_errors = []
ae_model.eval()
with torch.no_grad():
    for batch in test_loader:
        batch = batch.to(device)
        recon = ae_model(batch)
        per_sample = torch.mean((recon - batch) ** 2, dim=(1, 2))
        test_errors.append(per_sample.cpu().numpy())
test_errors = np.concatenate(test_errors, axis=0)

# Boolean anomaly mask over the test windows (positions relative to test set).
anomalies = test_errors > threshold
num_anomalies = np.sum(anomalies)
print(f"Phát hiện {num_anomalies} samples bất thường trong tập test (trên tổng {len(test_errors)})")
print("Chỉ số sample bất thường (relative to test set):", np.where(anomalies)[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------
# Evaluate anomaly detection as binary classification.
# Ground truth: 1 when the window's next-row label is 2 (anomaly),
# 0 otherwise — the same labelling used to build y_seq above.
# ---------------------------------------------------------------
y_true = np.array(
    [1 if labels_all[idx + SEQUENCE_LENGTH] == 2 else 0 for idx in test_idx],
    dtype=int,
)
y_pred = anomalies.astype(int)

# Fix: pin labels=[0, 1] so the matrix is always 2x2; the original
# cm.ravel() 4-way unpack crashes whenever the test split happens to
# contain only a single class.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# zero_division=0 avoids warnings/NaNs when a class is never predicted.
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
accuracy = accuracy_score(y_true, y_pred)

print("\n=== Confusion Matrix ===")
print(cm)
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}\n")

print("=== Metrics for Anomaly Detection ===")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall : {recall:.4f}")
print(f"F1-score : {f1:.4f}\n")

print("=== Classification Report ===")
print(
    classification_report(
        y_true,
        y_pred,
        # labels pinned for the same single-class reason as above.
        labels=[0, 1],
        target_names=["Normal (0)", "Anomaly (1)"],
        zero_division=0,
    )
)
|
|
|