"""CNN autoencoder anomaly detection on sliding windows of water-quality data.

Pipeline (runs top to bottom as a script):

1. Load the Excel sheet and convert the feature columns to floats.
2. Min-max scale the features and cut the rows into fixed-length windows.
3. Stratified train/val/test split of the windows by window label.
4. Train a 1-D convolutional autoencoder on windows that are NOT labelled
   as anomalies.
5. Derive a reconstruction-error threshold (mean + k * std) from the normal
   validation windows.
6. Flag test windows whose error exceeds the threshold and report metrics.
"""
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    classification_report,
)
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset

# ============================================
# 1. GLOBAL SETTINGS
# ============================================
DATA_PATH = "dataset.xlsx"
FEATURE_COLUMNS = [
    "Temp",
    "Turbidity (cm)",
    "DO(mg/L)",
    "BOD (mg/L)",
    "CO2",
    "pH`",
    "Alkalinity (mg L-1 )",
    "Hardness (mg L-1 )",
    "Calcium (mg L-1 )",
    "Ammonia (mg L-1 )",
    "Nitrite (mg L-1 )",
    "Phosphorus (mg L-1 )",
    "H2S (mg L-1 )",
    "Plankton (No. L-1)",
]
LABEL_COL = "Water Quality"
# Label value treated as "anomaly"; every other label value is "normal".
ANOMALY_LABEL = 2

SEQUENCE_LENGTH = 10
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
TEST_RATIO = 0.1

# CNN-AE hyperparameters
INPUT_DIM = len(FEATURE_COLUMNS)  # number of feature columns
SEQ_LEN = SEQUENCE_LENGTH  # time steps per window
CHANNELS = INPUT_DIM  # each feature is one Conv1d channel
AE_LR = 1e-3
AE_EPOCHS = 50
BATCH_SIZE = 64
RANDOM_STATE = 42
THRESHOLD_STD_FACTOR = 2  # threshold = mean + 2*std on validation normal

# Seed torch as well as the sklearn splitters so weight initialisation and
# DataLoader shuffling are repeatable between runs.
torch.manual_seed(RANDOM_STATE)

# ============================================
# 2. LOAD AND PREPROCESS THE DATA
# ============================================
df = pd.read_excel(DATA_PATH)
df = df.dropna(how="all")

# Columns read as text use a decimal comma; switch it to a dot and convert
# to float. A value that still is not a number raises here on purpose.
for col in FEATURE_COLUMNS:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = (
            df[col].astype(str).str.replace(",", ".", regex=False).astype(float)
        )

# A missing label cannot be cast to int and a missing feature would turn the
# reconstruction loss into NaN, so rows lacking any required value are
# removed (and reported) before going further.
rows_before = len(df)
df = df.dropna(subset=FEATURE_COLUMNS + [LABEL_COL]).reset_index(drop=True)
if len(df) < rows_before:
    print(f"Dropped {rows_before - len(df)} rows with missing feature/label values")

# Make sure the label is an integer
df[LABEL_COL] = df[LABEL_COL].astype(int)
labels_all = df[LABEL_COL].values  # shape = (num_total,)

data_raw = df[FEATURE_COLUMNS].values  # shape = (num_total, n_features)

# Min-max normalisation.
# NOTE: the scaler is fitted on every row, including rows that later end up
# in validation/test windows.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_raw)  # shape = (num_total, n_features)


# ============================================
# 3. TIME-SERIES DATASET
# ============================================
class CNNTimeSeriesDataset(Dataset):
    """Sliding-window view over a 2-D array of shape (rows, features).

    Item ``idx`` is the window made of rows ``idx .. idx + seq_len - 1``,
    returned as a float32 tensor of shape (features, seq_len) so that each
    feature is one input channel of a ``Conv1d``.

    Args:
        data: array of shape (rows, features).
        seq_len: number of consecutive rows per window.
    """

    def __init__(self, data, seq_len):
        self.data = data
        self.seq_len = seq_len
        # A series of N rows holds N - seq_len + 1 complete windows
        # (none at all when N < seq_len).
        self.num_items = max(data.shape[0] - seq_len + 1, 0)

    def __len__(self):
        return self.num_items

    def __getitem__(self, idx):
        # Slicing past the end would silently give a shorter window, so
        # reject out-of-range indices explicitly.
        if not 0 <= idx < self.num_items:
            raise IndexError(
                f"window index {idx} out of range for {self.num_items} windows"
            )
        window = self.data[idx : idx + self.seq_len]  # (seq_len, features)
        # Transpose to (features, seq_len), the layout Conv1d expects
        x = window.T
        return torch.tensor(x, dtype=torch.float32)


num_total = data_scaled.shape[0]
num_items = num_total - SEQUENCE_LENGTH + 1
if num_items < 1:
    raise ValueError(
        f"Need at least {SEQUENCE_LENGTH} rows to build one window, got {num_total}"
    )

# y_seq[i] is the label of the LAST row inside window i (row i + L - 1), so
# the label always refers to data the autoencoder actually sees.
y_seq = labels_all[SEQUENCE_LENGTH - 1 :].astype(int)

# ============================================
# 4. STRATIFIED SPLIT (TRAIN/VAL/TEST)
# ============================================
# NOTE: neighbouring windows share SEQUENCE_LENGTH - 1 rows, so a shuffled
# split puts overlapping windows into different partitions.
sss1 = StratifiedShuffleSplit(
    n_splits=1, test_size=TEST_RATIO, random_state=RANDOM_STATE
)
train_val_idx, test_idx = next(sss1.split(np.zeros(num_items), y_seq))

# Validation share expressed relative to the remaining train+val portion
val_size_rel = VAL_RATIO / (TRAIN_RATIO + VAL_RATIO)
sss2 = StratifiedShuffleSplit(
    n_splits=1, test_size=val_size_rel, random_state=RANDOM_STATE
)
train_idx_rel, val_idx_rel = next(
    sss2.split(np.zeros(len(train_val_idx)), y_seq[train_val_idx])
)

# Map positions inside train_val_idx back to window indices
train_idx = train_val_idx[train_idx_rel]
val_idx = train_val_idx[val_idx_rel]


def count_labels(indices, y):
    """Return ``{label: count}`` for the entries of ``y`` picked by ``indices``."""
    u, c = np.unique(y[indices], return_counts=True)
    return dict(zip(u.tolist(), c.tolist()))


print("Train labels:", count_labels(train_idx, y_seq))
print("Val labels:", count_labels(val_idx, y_seq))
print("Test labels:", count_labels(test_idx, y_seq))

# ============================================
# 5. DATALOADERS FOR THE AUTOENCODER (NORMAL WINDOWS ONLY)
# ============================================
dataset_all = CNNTimeSeriesDataset(data_scaled, SEQUENCE_LENGTH)

# The autoencoder is trained/validated only on windows that are not anomalies
train_normal_idx = train_idx[y_seq[train_idx] != ANOMALY_LABEL].tolist()
val_normal_idx = val_idx[y_seq[val_idx] != ANOMALY_LABEL].tolist()
if not train_normal_idx or not val_normal_idx:
    raise ValueError(
        "Train and validation splits must each contain at least one normal window"
    )

train_ae_dataset = Subset(dataset_all, train_normal_idx)
val_ae_dataset = Subset(dataset_all, val_normal_idx)
test_dataset = Subset(dataset_all, test_idx)

train_ae_loader = DataLoader(
    train_ae_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    # Dropping the last partial batch would leave nothing to train on when
    # the whole set is smaller than one batch.
    drop_last=len(train_normal_idx) >= BATCH_SIZE,
)
val_ae_loader = DataLoader(
    val_ae_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False
)
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False
)


# ============================================
# 6. CNN AUTOENCODER
# ============================================
class CNNAutoencoder(nn.Module):
    """1-D convolutional autoencoder that keeps the sequence length fixed.

    Every layer uses kernel_size=3 with padding=1, so only the channel
    depth changes (channels -> 32 -> 64 -> 128 -> 64 -> 32 -> channels).

    Args:
        channels: number of input/output channels (one per feature).
        seq_len: window length; stored for reference, the layers themselves
            do not depend on it.
    """

    def __init__(self, channels, seq_len):
        super().__init__()
        self.channels = channels
        self.seq_len = seq_len

        # Encoder: Conv1d layers, same seq_len, increasing depth
        self.encoder = nn.Sequential(
            nn.Conv1d(in_channels=channels, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(),
        )

        # Decoder: ConvTranspose1d layers mirroring the encoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(in_channels=128, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=64, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=32, out_channels=channels, kernel_size=3, padding=1),
            nn.Sigmoid(),  # output in [0, 1] because the data is min-max scaled
        )

    def forward(self, x):
        """Reconstruct ``x``.

        Args:
            x: tensor of shape (batch, channels, seq_len).

        Returns:
            Tensor of shape (batch, channels, seq_len).
        """
        z = self.encoder(x)
        x_recon = self.decoder(z)
        return x_recon


def compute_reconstruction_errors(model, loader, device):
    """Return the per-window reconstruction MSE for every batch in ``loader``.

    The squared error is averaged over the channel and time axes, giving
    one value per window.

    Args:
        model: autoencoder mapping a batch to its reconstruction.
        loader: iterable of (batch, channels, seq_len) tensors.
        device: device the batches are moved to before the forward pass.

    Returns:
        1-D numpy array with one error per window, in loader order.
    """
    model.eval()
    errors = []
    with torch.no_grad():
        for x_batch in loader:
            x_batch = x_batch.to(device)
            x_recon = model(x_batch)
            batch_errors = torch.mean((x_recon - x_batch) ** 2, dim=(1, 2))
            errors.append(batch_errors.cpu().numpy())
    return np.concatenate(errors, axis=0)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ae_model = CNNAutoencoder(channels=CHANNELS, seq_len=SEQ_LEN).to(device)

ae_criterion = nn.MSELoss()
ae_optimizer = torch.optim.Adam(ae_model.parameters(), lr=AE_LR)

# ============================================
# 7. TRAIN THE CNN-AE
# ============================================
best_val_loss = float("inf")
best_ae_path = "best_cnn_ae.pth"

for epoch in range(1, AE_EPOCHS + 1):
    ae_model.train()
    train_loss_sum = 0.0
    train_seen = 0
    for x_batch in train_ae_loader:
        # x_batch: (batch, channels, seq_len)
        x_batch = x_batch.to(device)
        ae_optimizer.zero_grad()
        x_recon = ae_model(x_batch)
        loss = ae_criterion(x_recon, x_batch)
        loss.backward()
        ae_optimizer.step()
        train_loss_sum += loss.item() * x_batch.size(0)
        train_seen += x_batch.size(0)
    # Average over the samples actually processed: with drop_last=True the
    # loader yields fewer samples than len(dataset).
    train_loss = train_loss_sum / train_seen

    ae_model.eval()
    val_loss_sum = 0.0
    with torch.no_grad():
        for x_batch in val_ae_loader:
            x_batch = x_batch.to(device)
            x_recon = ae_model(x_batch)
            loss = ae_criterion(x_recon, x_batch)
            val_loss_sum += loss.item() * x_batch.size(0)
    val_loss = val_loss_sum / len(val_ae_loader.dataset)

    print(f"Epoch {epoch:02d} | AE Train Loss: {train_loss:.6f} | AE Val Loss: {val_loss:.6f}")

    # Keep the weights of the best epoch on disk
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(ae_model.state_dict(), best_ae_path)

# If no epoch ever improved (e.g. the loss was NaN) nothing was saved in this
# run; loading would fail or pick up a checkpoint left by an earlier run.
if not np.isfinite(best_val_loss):
    raise RuntimeError(
        "Validation loss never improved; no checkpoint was written in this run"
    )
ae_model.load_state_dict(torch.load(best_ae_path, map_location=device))

# ============================================
# 8. RECONSTRUCTION ERROR ON NORMAL VALIDATION WINDOWS
# ============================================
val_norm_errors = compute_reconstruction_errors(ae_model, val_ae_loader, device)
mu_val = np.mean(val_norm_errors)
sigma_val = np.std(val_norm_errors)
threshold = mu_val + THRESHOLD_STD_FACTOR * sigma_val
print(f"\nThreshold (mean + {THRESHOLD_STD_FACTOR}*std) từ validation normal: {threshold:.6f}")

# ============================================
# 9. RECONSTRUCTION ERROR ON TEST & ANOMALY DETECTION
# ============================================
test_errors = compute_reconstruction_errors(ae_model, test_loader, device)
anomalies = test_errors > threshold
num_anomalies = np.sum(anomalies)
print(f"Phát hiện {num_anomalies} samples bất thường trong tập test (trên tổng {len(test_errors)})")
print("Chỉ số sample bất thường (relative to test set):", np.where(anomalies)[0])

# ============================================
# 10. EVALUATION
# ============================================
# Binary ground truth: 1 when the window label is the anomaly class, else 0
y_true = (y_seq[test_idx] == ANOMALY_LABEL).astype(int)
y_pred = anomalies.astype(int)

# labels=[0, 1] pins a 2x2 matrix even when one class is absent from both
# y_true and y_pred; otherwise ravel() would not unpack into four values.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
accuracy = accuracy_score(y_true, y_pred)

print("\n=== Confusion Matrix ===")
print(cm)
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}\n")

print("=== Metrics for Anomaly Detection ===")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall : {recall:.4f}")
print(f"F1-score : {f1:.4f}\n")

print("=== Classification Report ===")
print(
    classification_report(
        y_true,
        y_pred,
        labels=[0, 1],  # keep both rows so target_names always matches
        target_names=["Normal (0)", "Anomaly (1)"],
        zero_division=0,
    )
)