"""Train an SVM-bagging / RandomForest / soft-voting image classifier.

The script scans a dataset folder (one sub-folder per class), extracts a
feature vector from every image in parallel, trains three classifiers, prints
their evaluation reports and pickles the fitted models.
"""

import concurrent.futures
import os
import pickle
import sys
import threading

import cv2
import numpy as np
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC

# Import the feature extraction function (from feature_extractor.py)
from feature_extractor import extract_features_from_image

# Text log appended to by train_model_thread.
training_log = ""

# File extensions (compared case-insensitively) that are treated as images.
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')


# Helper function for parallel processing
def process_image(file_path, class_name):
    """
    Read one image and extract its combined feature vector.

    Args:
        file_path: Path of the image file to read.
        class_name: Label to attach to the extracted features.

    Returns:
        (features, class_name) where features is the 'combined_features'
        entry returned by extract_features_from_image, or (None, None) if
        OpenCV could not read the file.
    """
    image = cv2.imread(file_path, cv2.IMREAD_COLOR)
    if image is None:
        print(f"Warning: Could not read {file_path}")
        return None, None

    feats = extract_features_from_image(image)
    return feats['combined_features'], class_name


# ---------------------------------------------------------------------
# 1. Data Loading with Parallel Feature Extraction
# ---------------------------------------------------------------------
def load_dataset(dataset_folder, max_workers=4):
    """
    Load every image below dataset_folder and turn it into a feature vector.

    Expects dataset_folder to contain subfolders (one per class), each
    holding the images of that class. Images are read and processed in
    parallel by a thread pool; unreadable images are skipped.

    Class folders and image files are visited in sorted order and the results
    are stored in submission order, so the returned arrays are identical from
    run to run. This matters because the later train/test split is seeded:
    a seed only reproduces the same split if the sample order is stable.

    Args:
        dataset_folder: Root folder containing one sub-folder per class.
        max_workers: Number of worker threads used per class.

    Returns:
        X: float32 array of feature vectors.
        y: array of class-name labels, aligned with X.
        classes: list of class names (sub-folder names) in sorted order.
    """
    X = []
    y = []
    classes = []  # list of class names

    print(f"Scanning dataset folder: {dataset_folder}")

    # os.listdir returns entries in arbitrary order; sort for determinism.
    for class_name in sorted(os.listdir(dataset_folder)):
        class_path = os.path.join(dataset_folder, class_name)
        if not os.path.isdir(class_path):
            continue

        classes.append(class_name)
        print(f"\nProcessing class: {class_name}")

        image_files = sorted(
            f for f in os.listdir(class_path)
            if f.lower().endswith(_IMAGE_EXTENSIONS)
        )
        total_images = len(image_files)
        image_count = 0

        # One slot per submitted file; a slot stays None if the image could
        # not be read. Filling by index keeps the output order independent
        # of the order in which the worker threads happen to finish.
        results = [None] * total_images

        # Use ThreadPoolExecutor for parallel extraction
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_index = {}
            for index, filename in enumerate(image_files):
                file_path = os.path.join(class_path, filename)
                future = executor.submit(process_image, file_path, class_name)
                future_to_index[future] = index

            # as_completed is used only to drive the progress display.
            for future in concurrent.futures.as_completed(future_to_index):
                features, label = future.result()
                if features is None:
                    continue
                results[future_to_index[future]] = (features, label)
                image_count += 1
                print(
                    f"\rProcessed {image_count}/{total_images} images in '{class_name}'",
                    end="",
                    flush=True
                )

        for item in results:
            if item is not None:
                features, label = item
                X.append(features)
                y.append(label)

        print(f"\nCompleted class: {class_name} with {image_count} images.")

    X = np.array(X, dtype=np.float32)
    y = np.array(y)
    print(f"Finished loading dataset. Total classes: {len(classes)}. Total images: {len(X)}.")
    return X, y, classes


# ---------------------------------------------------------------------
# 2. Classifier Training
# ---------------------------------------------------------------------
def train_classifiers(X, y):
    """
    Split the data into training/test sets, then train and evaluate:
      - A Bagging ensemble of LinearSVC classifiers (with scaling)
      - A RandomForestClassifier
      - A soft-voting VotingClassifier that combines both

    A classification report and confusion matrix are printed for each model.
    All estimators and the split use the same seed (31) so that a run on the
    same data is reproducible.

    Args:
        X: Array of feature vectors.
        y: Array of labels aligned with X.

    Returns:
        models (dict): A dictionary with keys 'svm', 'rf', 'combined'
        test_data (tuple): (X_test, y_test)
    """
    # Split dataset: 80% train, 20% test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=31, shuffle=True
    )

    # --- Train SVM Ensemble ---
    svm_pipeline = make_pipeline(StandardScaler(), LinearSVC(random_state=31))
    svm_ensemble = BaggingClassifier(
        estimator=svm_pipeline,
        n_estimators=10,  # Adjust for speed/accuracy trade-off
        # Seed the bootstrap sampling too; without it the ensemble differs
        # between runs even though every other component is seeded.
        random_state=31,
        n_jobs=-1,
        verbose=1
    )
    print("Training SVM ensemble classifier...")
    svm_ensemble.fit(X_train, y_train)

    print("\nSVM Ensemble Classification Report:")
    y_pred_svm = svm_ensemble.predict(X_test)
    print(classification_report(y_test, y_pred_svm))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred_svm))

    # --- Train RandomForest ---
    rf = RandomForestClassifier(
        n_estimators=100,  # Adjust as needed
        random_state=31,
        n_jobs=-1
    )
    print("\nTraining RandomForest classifier...")
    rf.fit(X_train, y_train)

    print("\nRandomForest Classification Report:")
    y_pred_rf = rf.predict(X_test)
    print(classification_report(y_test, y_pred_rf))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred_rf))

    # --- Train Combined Voting Classifier ---
    # NOTE(review): per the scikit-learn docs, VotingClassifier.fit trains
    # its own clones of the listed estimators, so this step repeats the two
    # fits above rather than reusing them - confirm if training time matters.
    combined_clf = VotingClassifier(
        estimators=[('svm', svm_ensemble), ('rf', rf)],
        voting='soft'
    )
    print("\nTraining Combined Voting classifier...")
    combined_clf.fit(X_train, y_train)

    print("\nCombined Voting Classifier Report:")
    y_pred_combined = combined_clf.predict(X_test)
    print(classification_report(y_test, y_pred_combined))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred_combined))

    models = {
        'svm': svm_ensemble,
        'rf': rf,
        'combined': combined_clf
    }
    return models, (X_test, y_test)


# ---------------------------------------------------------------------
# 3. Training Thread
# ---------------------------------------------------------------------
def train_model_thread(dataset_folder, model_filename, max_workers=4):
    """
    Run the full pipeline: load the dataset, train the models, save them.

    Progress messages are printed and also appended to the module-level
    training_log string.

    Args:
        dataset_folder: Root folder containing one sub-folder per class.
        model_filename: Path of the pickle file to write. The pickle holds a
            dict with keys 'models' and 'class_names'.
        max_workers: Number of threads used for feature extraction.

    Raises:
        ValueError: If no image in dataset_folder could be loaded.
    """
    global training_log
    training_log += "Starting training...\n"
    print("Starting training...")

    # (A) Load Data (in parallel)
    X, y, classes = load_dataset(dataset_folder, max_workers=max_workers)

    # Fail early with a clear message (also recorded in the log) instead of
    # letting the train/test split raise on an empty array.
    if len(X) == 0:
        message = f"No readable images found in {dataset_folder}"
        training_log += message + "\n"
        raise ValueError(message)

    # (B) Train Classifier and Print Metrics
    models, _ = train_classifiers(X, y)
    print("Training complete.")
    training_log += "Training complete.\n"

    # (C) Save the Model
    model_data = {'models': models, 'class_names': classes}
    with open(model_filename, "wb") as f:
        pickle.dump(model_data, f)

    training_log += f"Model saved to {model_filename}\n"
    print(f"Model saved to {model_filename}")


# ---------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------
if __name__ == "__main__":
    dataset_folder = "./../images_dataset"  # Adjust path as needed
    model_filename = "svm_rf_combined.pkl"

    # Launch training in a separate thread
    # You can tune 'max_workers' to the number of desired threads.
    max_workers = 32
    training_thread = threading.Thread(
        target=train_model_thread,
        args=(dataset_folder, model_filename, max_workers)
    )
    training_thread.start()

    # Wait until training finishes
    training_thread.join()