Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Example script showing how to use the processed data for machine learning | |
| """ | |
| import numpy as np | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.metrics import accuracy_score, classification_report | |
| from pathlib import Path | |
def _load_split(data_dir, task="Good_vs_Bad"):
    """Load the pre-split sequence and label arrays for one task.

    Reads ``sequences_<task>_{train,test}.npy`` and
    ``labels_<task>_{train,test}.npy`` from *data_dir*.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.

    Raises:
        FileNotFoundError: propagated from ``np.load`` if a file is missing.
    """
    data_dir = Path(data_dir)
    X_train, X_test, y_train, y_test = (
        np.load(data_dir / f"{kind}_{task}_{split}.npy")
        for kind in ("sequences", "labels")
        for split in ("train", "test")
    )
    return X_train, X_test, y_train, y_test


def _class_counts(labels):
    """Return ``(sum_of_labels, len(labels) - sum_of_labels)``.

    Uses ``np.sum`` rather than the builtin ``sum``: the builtin iterates the
    array element by element in Python and accumulates in the array's own
    scalar type, which can wrap around for narrow integer label dtypes
    (e.g. int8/uint8) under NumPy 2 promotion rules. ``np.sum`` accumulates
    narrow integers in the platform integer instead.
    """
    positives = np.sum(labels)
    return positives, len(labels) - positives


def _fit_and_predict(X_train, y_train, X_test):
    """Fit a 100-tree random forest (seed 42) and predict on *X_test*.

    Returns:
        Tuple ``(fitted_classifier, predictions)``.
    """
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    return clf, clf.predict(X_test)


def main(data_dir="Data-intensive-systems/A13/Processed_Data"):
    """Train and evaluate two random-forest baselines on the processed data.

    Two feature representations of the same sequences are compared:
    every frame flattened into one vector per sample, and the per-feature
    mean across frames. Results are printed to stdout.

    Args:
        data_dir: Directory holding the ``*_Good_vs_Bad_{train,test}.npy``
            files. Defaults to the path the script originally hard-coded.

    Raises:
        FileNotFoundError: if any of the four ``.npy`` files is missing.
        ValueError: if the training sequences are not 3-dimensional.
    """
    print("Loading processed kinect data...")

    # Sequences are expected to be (n_samples, n_frames, n_features); the
    # original comments cite (n, 10, 102) -- TODO confirm against the data.
    X_train, X_test, y_train, y_test = _load_split(data_dir)

    print(f"Training sequences shape: {X_train.shape}")
    print(f"Test sequences shape: {X_test.shape}")
    print(f"Training labels shape: {y_train.shape}")
    print(f"Test labels shape: {y_test.shape}")

    # Reshape sequences to 2D for traditional ML algorithms.
    # Option 1: flatten all frames and features into a single vector per
    # sample. The 3-way unpack doubles as a rank check on the input.
    n_train, n_frames, n_features = X_train.shape
    n_test = X_test.shape[0]
    X_train_flattened = X_train.reshape(n_train, n_frames * n_features)
    X_test_flattened = X_test.reshape(n_test, n_frames * n_features)

    print(f"Training flattened features shape: {X_train_flattened.shape}")
    print(f"Test flattened features shape: {X_test_flattened.shape}")
    print(f"Training set: {X_train_flattened.shape[0]} samples")
    print(f"Test set: {X_test_flattened.shape[0]} samples")

    # The label sum is reported as the "Good" count, i.e. this presumes a
    # 0/1 encoding with 1 == Good -- verify against the preprocessing step.
    good_train, bad_train = _class_counts(y_train)
    good_test, bad_test = _class_counts(y_test)
    print(f"Good/Bad distribution in training: {good_train}/{bad_train}")
    print(f"Good/Bad distribution in test: {good_test}/{bad_test}")

    # Train a simple classifier on the flattened features
    print("\nTraining Random Forest classifier...")
    _, y_pred = _fit_and_predict(X_train_flattened, y_train, X_test_flattened)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy: {accuracy:.3f}")
    print("\nClassification Report:")
    # NOTE(review): with two target_names and no explicit `labels=`, this
    # raises if only one class appears in y_test and y_pred combined.
    print(classification_report(y_test, y_pred, target_names=['Bad', 'Good']))

    # Alternative approach: use only the mean of each feature across frames
    print("\n" + "="*50)
    print("Alternative approach: Using mean values across frames")

    # axis=1 is the frame dimension, so each sample collapses to n_features
    X_train_mean = np.mean(X_train, axis=1)
    X_test_mean = np.mean(X_test, axis=1)
    print(f"Training mean features shape: {X_train_mean.shape}")
    print(f"Test mean features shape: {X_test_mean.shape}")

    # Train the classifier using mean values
    clf_mean, y_pred_m = _fit_and_predict(X_train_mean, y_train, X_test_mean)
    accuracy_m = accuracy_score(y_test, y_pred_m)
    print(f"Mean-based Accuracy: {accuracy_m:.3f}")
    print("\nMean-based Classification Report:")
    print(classification_report(y_test, y_pred_m, target_names=['Bad', 'Good']))

    # Show feature importance (for the mean-based model), highest first
    feature_importance = clf_mean.feature_importances_
    top_10_indices = np.argsort(feature_importance)[-10:][::-1]
    print(f"\nTop 10 most important features (out of {X_train_mean.shape[1]}):")
    for rank, idx in enumerate(top_10_indices, start=1):
        print(f"{rank:2d}. Feature {idx}: Importance = {feature_importance[idx]:.4f}")

    print("\nData preprocessing completed successfully!")
    print("The processed data is ready for various machine learning approaches:")
    print("- Traditional ML (Random Forest, SVM, etc.) using flattened features")
    print("- Deep learning with RNN/LSTM using sequential structure")
    print("- CNN approaches treating frames as temporal 'images'")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()