File size: 4,094 Bytes
b94b2ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python3
"""
Example script showing how to use the processed data for machine learning
"""

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from pathlib import Path


def _load_split(data_dir, task="Good_vs_Bad"):
    """Load the pre-split sequence and label arrays for one task.

    Args:
        data_dir: ``pathlib.Path`` of the directory holding the ``.npy`` files.
        task: Name fragment embedded in the file names
            (``sequences_<task>_train.npy`` and so on).

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # Expected layouts (not enforced here): sequences are (n_samples, 10, 102)
    # and labels are (n_samples,).
    X_train = np.load(data_dir / f"sequences_{task}_train.npy")
    X_test = np.load(data_dir / f"sequences_{task}_test.npy")
    y_train = np.load(data_dir / f"labels_{task}_train.npy")
    y_test = np.load(data_dir / f"labels_{task}_test.npy")
    return X_train, X_test, y_train, y_test


def _class_counts(labels):
    """Return ``(good, bad)`` counts, assuming labels are 0 (Bad) / 1 (Good)."""
    # np.sum is vectorised and accumulates small integer dtypes in a wide
    # integer, unlike the built-in sum() which adds NumPy scalars one by one.
    good = np.sum(labels)
    return good, len(labels) - good


def _train_and_evaluate(X_train, y_train, X_test, y_test, *,
                        accuracy_label, report_heading):
    """Fit a random forest, print its test accuracy and report, and return it.

    Args:
        X_train: 2D training feature matrix.
        y_train: Training labels.
        X_test: 2D test feature matrix with the same columns as ``X_train``.
        y_test: Test labels.
        accuracy_label: Text printed in front of ``": <accuracy>"``.
        report_heading: Line printed just before the classification report.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # A fixed random_state keeps the forest reproducible between runs.
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{accuracy_label}: {accuracy:.3f}")
    print(report_heading)
    print(classification_report(y_test, y_pred, target_names=['Bad', 'Good']))
    return clf


def main(data_dir="Data-intensive-systems/A13/Processed_Data"):
    """Train and evaluate two random-forest baselines on the processed data.

    The first model sees every frame of a sequence flattened into one long
    feature vector; the second sees only the per-feature mean across frames.
    Accuracy, a classification report and (for the mean-based model) the ten
    most important features are printed to stdout.

    Args:
        data_dir: Directory containing the ``*_Good_vs_Bad_{train,test}.npy``
            files. A relative path is resolved against the current working
            directory. Defaults to the location the script always used.

    Raises:
        ValueError: If the sequence arrays are not 3D or the train and test
            arrays disagree on their (frames, features) layout.
        OSError: Propagated from ``np.load`` when a file cannot be opened.
    """
    print("Loading processed kinect data...")

    # Load the pre-split processed data for the first directory
    X_train, X_test, y_train, y_test = _load_split(Path(data_dir))

    print(f"Training sequences shape: {X_train.shape}")
    print(f"Test sequences shape: {X_test.shape}")
    print(f"Training labels shape: {y_train.shape}")
    print(f"Test labels shape: {y_test.shape}")

    # Fail early with a readable message: a test array with a different
    # (frames, features) layout but the same per-sample size would otherwise
    # be flattened into misaligned feature columns without any error.
    if X_train.ndim != 3 or X_test.shape[1:] != X_train.shape[1:]:
        raise ValueError(
            "Expected 3D (samples, frames, features) sequence arrays with "
            f"matching layouts, got train {X_train.shape} and test {X_test.shape}"
        )

    # Reshape sequences to 2D for traditional ML algorithms
    # Option 1: Flatten all frames and features into a single vector per sample
    n_train, n_frames, n_features = X_train.shape
    n_test = X_test.shape[0]
    X_train_flattened = X_train.reshape(n_train, n_frames * n_features)
    X_test_flattened = X_test.reshape(n_test, n_frames * n_features)

    print(f"Training flattened features shape: {X_train_flattened.shape}")
    print(f"Test flattened features shape: {X_test_flattened.shape}")

    print(f"Training set: {X_train_flattened.shape[0]} samples")
    print(f"Test set: {X_test_flattened.shape[0]} samples")
    train_good, train_bad = _class_counts(y_train)
    test_good, test_bad = _class_counts(y_test)
    print(f"Good/Bad distribution in training: {train_good}/{train_bad}")
    print(f"Good/Bad distribution in test: {test_good}/{test_bad}")

    # Train a simple classifier on the flattened sequences
    print("\nTraining Random Forest classifier...")
    _train_and_evaluate(
        X_train_flattened, y_train, X_test_flattened, y_test,
        accuracy_label="\nAccuracy",
        report_heading="\nClassification Report:",
    )

    # Alternative approach: Use only the mean of each feature across frames
    print("\n" + "="*50)
    print("Alternative approach: Using mean values across frames")

    # axis=1 is the frame dimension, so each sample collapses to one value
    # per feature.
    X_train_mean = np.mean(X_train, axis=1)
    X_test_mean = np.mean(X_test, axis=1)

    print(f"Training mean features shape: {X_train_mean.shape}")
    print(f"Test mean features shape: {X_test_mean.shape}")

    # Train the classifier using mean values
    clf_mean = _train_and_evaluate(
        X_train_mean, y_train, X_test_mean, y_test,
        accuracy_label="Mean-based Accuracy",
        report_heading="\nMean-based Classification Report:",
    )

    # Show feature importance (for the mean-based model)
    feature_importance = clf_mean.feature_importances_
    # argsort is ascending: keep the last ten indices and reverse them so the
    # most important feature comes first.
    top_10_indices = np.argsort(feature_importance)[-10:][::-1]

    print(f"\nTop 10 most important features (out of {X_train_mean.shape[1]}):")
    for i, idx in enumerate(top_10_indices):
        print(f"{i+1:2d}. Feature {idx}: Importance = {feature_importance[idx]:.4f}")

    print("\nData preprocessing completed successfully!")
    print("The processed data is ready for various machine learning approaches:")
    print("- Traditional ML (Random Forest, SVM, etc.) using flattened features")
    print("- Deep learning with RNN/LSTM using sequential structure")
    print("- CNN approaches treating frames as temporal 'images'")


# Run the demo only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()