import os
import pickle
import threading
import concurrent.futures

import cv2
import numpy as np
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Import the feature extraction function (from feature_extractor.py)
from feature_extractor import extract_features_from_image
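
# Assumed contract (feature_extractor.py is not shown here):
# extract_features_from_image takes a BGR image array and returns a dict
# whose 'combined_features' entry is a fixed-length 1-D numeric vector,
# e.g. something along the lines of:
#
#   def extract_features_from_image(image):
#       hist = colour_histogram(image)       # hypothetical helpers
#       texture = texture_descriptor(image)
#       return {'combined_features': np.concatenate([hist, texture])}
#
# Every image must yield a vector of the same length so that np.array(X)
# in load_dataset() produces a 2-D feature matrix.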
training_log = ""  # Status buffer appended to by the training thread (e.g. for a UI to poll)
# Helper function for parallel processing
def process_image(file_path, class_name):
    """
    Reads the image, extracts combined features, and returns (features, label).
    Returns (None, None) if image reading fails.
    """
    image = cv2.imread(file_path, cv2.IMREAD_COLOR)
    if image is None:
        print(f"Warning: Could not read {file_path}")
        return None, None
    feats = extract_features_from_image(image)
    return feats['combined_features'], class_name
# ---------------------------------------------------------------------
# 1. Data Loading with Parallel Feature Extraction
# ---------------------------------------------------------------------
def load_dataset(dataset_folder, max_workers=4):
    """
    Expects dataset_folder to contain subfolders (one per class).
    Each subfolder has images of that class. This function:
      - Reads each image (in parallel)
      - Extracts a feature vector
      - Returns arrays of feature vectors (X) and labels (y).
    """
    X = []
    y = []
    classes = []  # list of class names
    print(f"Scanning dataset folder: {dataset_folder}")
    for class_name in os.listdir(dataset_folder):
        class_path = os.path.join(dataset_folder, class_name)
        if not os.path.isdir(class_path):
            continue
        classes.append(class_name)
        print(f"\nProcessing class: {class_name}")
        image_files = [
            f for f in os.listdir(class_path)
            if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.tiff'))
        ]
        total_images = len(image_files)
        image_count = 0
        # Use ThreadPoolExecutor for parallel extraction
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_file = {}
            for filename in image_files:
                file_path = os.path.join(class_path, filename)
                # Submit tasks
                future = executor.submit(process_image, file_path, class_name)
                future_to_file[future] = filename
            for future in concurrent.futures.as_completed(future_to_file):
                filename = future_to_file[future]
                features, label = future.result()
                if features is not None:
                    X.append(features)
                    y.append(label)
                image_count += 1
                print(
                    f"\rProcessed {image_count}/{total_images} images in '{class_name}'",
                    end="", flush=True
                )
        print(f"\nCompleted class: {class_name} with {image_count} images.")
    X = np.array(X, dtype=np.float32)
    y = np.array(y)
    print(f"Finished loading dataset. Total classes: {len(classes)}. Total images: {len(X)}.")
    return X, y, classes
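
# Example (hypothetical layout): load_dataset expects one subfolder per class,
#
#   images_dataset/
#       roses/   rose_001.jpg, rose_002.png, ...
#       tulips/  tulip_001.jpg, ...
#
# and would be called as:
#
#   X, y, classes = load_dataset("./images_dataset", max_workers=8)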
# ---------------------------------------------------------------------
# 2. Classifier Training
# ---------------------------------------------------------------------
def train_classifiers(X, y):
    """
    Splits data into training/test sets, trains:
      - A Bagging ensemble of LinearSVC classifiers (with scaling)
      - A RandomForestClassifier
      - A VotingClassifier that combines both
    Returns:
        models (dict): A dictionary with keys 'svm', 'rf', 'combined'
        test_data (tuple): (X_test, y_test)
    """
    # Split dataset: 80% train, 20% test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=31, shuffle=True
    )
    # --- Train SVM Ensemble ---
    svm_pipeline = make_pipeline(StandardScaler(),
                                 LinearSVC(random_state=31))
    svm_ensemble = BaggingClassifier(
        estimator=svm_pipeline,
        n_estimators=10,  # Adjust for speed/accuracy trade-off
        n_jobs=-1,
        verbose=1
    )
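    # LinearSVC trains much faster than a kernel SVC on high-dimensional
    # feature vectors; bagging several of them on bootstrap samples trades
    # extra training time (parallelised via n_jobs=-1) for variance reduction.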
print("Training SVM ensemble classifier...")
svm_ensemble.fit(X_train, y_train)
print("\nSVM Ensemble Classification Report:")
y_pred_svm = svm_ensemble.predict(X_test)
print(classification_report(y_test, y_pred_svm))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))
# --- Train RandomForest ---
rf = RandomForestClassifier(
n_estimators=100, # Adjust as needed
random_state=31,
n_jobs=-1
)
print("\nTraining RandomForest classifier...")
rf.fit(X_train, y_train)
print("\nRandomForest Classification Report:")
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
# --- Train Combined Voting Classifier ---
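    # Note on voting='soft': soft voting averages each estimator's
    # predict_proba output. LinearSVC itself has no predict_proba, but
    # BaggingClassifier falls back to per-class vote fractions when its base
    # estimator lacks one (per the scikit-learn docs), so the ensemble still
    # exposes (coarse) probabilities and soft voting works here.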
    combined_clf = VotingClassifier(
        estimators=[('svm', svm_ensemble), ('rf', rf)],
        voting='soft'
    )
    print("\nTraining Combined Voting classifier...")
    combined_clf.fit(X_train, y_train)
    print("\nCombined Voting Classifier Report:")
    y_pred_combined = combined_clf.predict(X_test)
    print(classification_report(y_test, y_pred_combined))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred_combined))
    models = {
        'svm': svm_ensemble,
        'rf': rf,
        'combined': combined_clf
    }
    return models, (X_test, y_test)
# ---------------------------------------------------------------------
# 3. Training Thread
# ---------------------------------------------------------------------
def train_model_thread(dataset_folder, model_filename, max_workers=4):
    global training_log
    training_log += "Starting training...\n"
    print("Starting training...")
    # (A) Load Data (in parallel)
    X, y, classes = load_dataset(dataset_folder, max_workers=max_workers)
    # (B) Train Classifiers and Print Metrics
    models, _ = train_classifiers(X, y)
    print("Training complete.")
    training_log += "Training complete.\n"
    # (C) Save the Model
    model_data = {'models': models, 'class_names': classes}
    with open(model_filename, "wb") as f:
        pickle.dump(model_data, f)
    training_log += f"Model saved to {model_filename}\n"
    print(f"Model saved to {model_filename}")
# ---------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------
if __name__ == "__main__":
    dataset_folder = "./../images_dataset"  # Adjust path as needed
    model_filename = "svm_rf_combined.pkl"
    # Launch training in a separate thread.
    # Tune 'max_workers' to the desired number of feature-extraction threads.
    max_workers = 32
    training_thread = threading.Thread(
        target=train_model_thread,
        args=(dataset_folder, model_filename, max_workers)
    )
    training_thread.start()
    # Wait until training finishes
    training_thread.join()
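
# Example (hypothetical): once training has finished, the saved pickle can be
# loaded for inference on a new image. A minimal sketch, assuming the same
# feature_extractor module is importable and "some_image.jpg" stands in for a
# real file:
#
#   with open(model_filename, "rb") as f:
#       model_data = pickle.load(f)
#   clf = model_data['models']['combined']
#   image = cv2.imread("some_image.jpg", cv2.IMREAD_COLOR)
#   feats = extract_features_from_image(image)['combined_features']
#   # y was trained on class-name strings, so predict() returns the name directly
#   print("Predicted class:", clf.predict(np.array([feats], dtype=np.float32))[0])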