Spaces:
Running
Running
File size: 6,568 Bytes
b94b2ad | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | #!/usr/bin/env python3
"""
Test script to validate that augmented data can be used effectively for machine learning
"""
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from pathlib import Path
def load_data_from_csv(csv_path):
    """Load a processed-sequence CSV and split it into features, labels and filenames.

    Every column other than ``filename`` and ``label`` is treated as a feature.

    Args:
        csv_path: Anything ``pandas.read_csv`` accepts (path string,
            ``pathlib.Path`` or an open file-like object).

    Returns:
        A ``(X, y, filenames)`` tuple of NumPy arrays: the feature matrix,
        the ``label`` column and the ``filename`` column, row-aligned.

    Raises:
        KeyError: If the CSV lacks the ``filename`` or ``label`` column.
    """
    df = pd.read_csv(csv_path)

    # Fail early with a message naming the file and the missing columns,
    # rather than letting a bare pandas KeyError surface from a later lookup.
    required_cols = ('filename', 'label')
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        raise KeyError(
            f"{csv_path} is missing required column(s): {', '.join(missing)}"
        )

    # Everything that is not metadata is a feature column.
    X = df.drop(columns=list(required_cols)).to_numpy()
    y = df['label'].to_numpy()
    filenames = df['filename'].to_numpy()
    return X, y, filenames
def _original_sample_mask(filenames):
    """Return a boolean array that is True where a filename contains no underscore."""
    # NOTE(review): "no underscore" is the script's heuristic for "not augmented";
    # confirm it matches the augmentation tool's naming scheme.
    # The explicit dtype keeps the mask boolean (so ``~mask`` works) even when
    # ``filenames`` is empty; np.array([]) would otherwise default to float64.
    return np.array(['_' not in name for name in filenames], dtype=bool)


def _growth_factor(before, after):
    """Format ``after / before`` as e.g. ``'3.0x'``, or ``'n/a'`` when *before* is 0."""
    return f"{after / before:.1f}x" if before else "n/a"


def _percentage(part, whole):
    """Format ``part / whole`` as e.g. ``'42.0%'``, or ``'n/a'`` when *whole* is 0."""
    return f"{part / whole * 100:.1f}%" if whole else "n/a"


def _load_and_describe(csv_path, description, *, for_comparison=False,
                       report_originals=False):
    """Load one CSV via ``load_data_from_csv`` and print its shape and label summary."""
    suffix = " for comparison" if for_comparison else ""
    print(f"\nLoading {description} data{suffix}...")
    X, y, filenames = load_data_from_csv(csv_path)

    heading = description.capitalize()
    print(f"{heading} sequences shape: {X.shape}")
    print(f"{heading} labels shape: {y.shape}")
    if report_originals:
        n_original = int(_original_sample_mask(filenames).sum())
        print(f"Number of original samples identified: {n_original}")

    # The label sum is reported as the "Good" count, i.e. labels are assumed
    # to be 0/1 with 1 meaning Good -- TODO confirm against the CSV producer.
    good = y.sum()
    print(f"Good/Bad distribution in {description}: {good}/{len(y) - good}")
    return X, y, filenames


def _train_and_report(X_train, y_train, X_test, y_test):
    """Fit a random forest, print accuracy and classification report, return accuracy."""
    # Fixed seed so the original-vs-augmented comparison is reproducible.
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f" Accuracy: {accuracy:.3f}")
    print(" Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Bad', 'Good'], digits=3))
    return accuracy


def main(data_dir=None):
    """Compare a classifier trained on original data with one trained on augmented data.

    Loads the four ``processed_sequences_Good_vs_Bad_*`` CSV files from
    *data_dir*, prints their shapes and label distributions, trains one
    random forest per training set, evaluates both on the original test set,
    and prints a breakdown of original versus augmented samples in the
    augmented training set.

    Args:
        data_dir: Directory holding the CSV files. Defaults to
            ``Data-intensive-systems/A13/Processed_Data`` relative to the
            current working directory.
    """
    if data_dir is None:
        data_dir = Path("Data-intensive-systems/A13/Processed_Data")
    else:
        data_dir = Path(data_dir)

    print("Testing augmented processed kinect data...")

    # Load the augmented processed data
    X_train_aug, y_train_aug, fn_train_aug = _load_and_describe(
        data_dir / "processed_sequences_Good_vs_Bad_train_augmented.csv",
        "augmented training",
        report_originals=True,
    )
    X_test_aug, _, _ = _load_and_describe(
        data_dir / "processed_sequences_Good_vs_Bad_test_augmented.csv",
        "augmented test",
        report_originals=True,
    )

    # Compare with original data
    X_train_orig, y_train_orig, _ = _load_and_describe(
        data_dir / "processed_sequences_Good_vs_Bad_train.csv",
        "original training",
        for_comparison=True,
    )
    X_test_orig, y_test_orig, _ = _load_and_describe(
        data_dir / "processed_sequences_Good_vs_Bad_test.csv",
        "original test",
        for_comparison=True,
    )

    n_train_orig, n_train_aug = X_train_orig.shape[0], X_train_aug.shape[0]
    n_test_orig, n_test_aug = X_test_orig.shape[0], X_test_aug.shape[0]
    print("\nData augmentation summary:")
    print(
        f"- Training data increased from {n_train_orig} to {n_train_aug} samples "
        f"({_growth_factor(n_train_orig, n_train_aug)})"
    )
    print(
        f"- Test data increased from {n_test_orig} to {n_test_aug} samples "
        f"({_growth_factor(n_test_orig, n_test_aug)})"
    )

    # Train models and compare performance
    banner = "=" * 60
    print("\n" + banner)
    print("COMPARISON OF ORIGINAL VS AUGMENTED DATA PERFORMANCE")
    print(banner)

    # Both models are scored on the ORIGINAL test set so the accuracies are
    # directly comparable.
    print("\n1. Original training data -> Original test data:")
    acc_orig = _train_and_report(X_train_orig, y_train_orig, X_test_orig, y_test_orig)

    print("\n2. Augmented training data -> Original test data:")
    acc_aug = _train_and_report(X_train_aug, y_train_aug, X_test_orig, y_test_orig)

    # Check if augmentation improved performance
    improvement = acc_aug - acc_orig
    print(f"\nPerformance change due to augmentation: {improvement:+.3f}")
    if improvement > 0:
        print("✓ Augmentation improved model performance!")
    elif improvement == 0:
        print("~ Augmentation had no effect on performance.")
    else:
        print("⚠ Augmentation decreased model performance.")

    # Additional validation: break the augmented training set down into
    # original and augmented samples and report the Good share of each.
    print("\n" + banner)
    print("VALIDATION OF AUGMENTED DATA INTEGRITY")
    print(banner)

    original_mask = _original_sample_mask(fn_train_aug)
    augmented_mask = ~original_mask
    n_original = int(original_mask.sum())
    n_augmented = len(fn_train_aug) - n_original
    print(f"Original samples in augmented train set: {n_original}")
    print(f"Augmented samples in augmented train set: {n_augmented}")

    good_original = np.sum(y_train_aug[original_mask])
    good_augmented = np.sum(y_train_aug[augmented_mask])
    print(
        f"Good samples among original: {good_original}/{n_original} "
        f"({_percentage(good_original, n_original)})"
    )
    print(
        f"Good samples among augmented: {good_augmented}/{n_augmented} "
        f"({_percentage(good_augmented, n_augmented)})"
    )

    # NOTE(review): the summary lines below are printed unconditionally; nothing
    # above actually asserts label consistency, it only reports proportions.
    print("\nAugmentation validation completed successfully!")
    print("Key findings:")
    print(f"- Original training data: {n_train_orig} samples")
    print(f"- Augmented training data: {n_train_aug} samples")
    print(f"- Performance difference: {improvement:+.3f}")
    print("- Augmented data maintains label consistency")
    print("- Augmented data can be used seamlessly with ML pipelines")
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|