import os
import cv2
import numpy as np
import pickle
import sys
import threading
import concurrent.futures

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Import the feature extraction function (from feature_extractor.py)
from feature_extractor import extract_features_from_image

# Human-readable progress log for the training run. train_model_thread()
# appends status lines to this module-level string as training proceeds.
training_log = ""

# Helper function for parallel processing
def process_image(file_path, class_name):
    """
    Load one image from disk and convert it into a (features, label) pair.

    Args:
        file_path: Path of the image file to read.
        class_name: Label to attach to the extracted features.

    Returns:
        A tuple ``(combined_features, class_name)`` where ``combined_features``
        is the 'combined_features' entry of the result of
        extract_features_from_image(). If OpenCV cannot read the file, a
        warning is printed and ``(None, None)`` is returned instead.
    """
    img = cv2.imread(file_path, cv2.IMREAD_COLOR)

    # cv2.imread signals an unreadable file by returning None, not by raising.
    if img is not None:
        extracted = extract_features_from_image(img)
        return extracted['combined_features'], class_name

    print(f"Warning: Could not read {file_path}")
    return None, None

# ---------------------------------------------------------------------
# 1. Data Loading with Parallel Feature Extraction
# ---------------------------------------------------------------------
def load_dataset(dataset_folder, max_workers=4):
    """
    Build feature and label arrays from a folder-per-class image dataset.

    ``dataset_folder`` is expected to contain one subfolder per class, each
    holding the images of that class. Every image is handed to
    process_image() on a thread pool; images that cannot be read are skipped.

    Class folders and image files are visited in sorted order and samples are
    stored in submission order (not completion order), so the returned arrays
    have the same row order on every run. That makes a later seeded
    train/test split reproducible.

    Args:
        dataset_folder: Root directory containing one subfolder per class.
        max_workers: Number of worker threads used for feature extraction.

    Returns:
        A tuple ``(X, y, classes)``:
          - X: float32 NumPy array built from the per-image feature vectors.
          - y: NumPy array of class-name labels, aligned with X.
          - classes: sorted list of class (subfolder) names.

    Raises:
        Any exception raised by os.listdir() for a missing/unreadable folder,
        or by process_image() inside a worker, is propagated to the caller.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')

    X = []
    y = []
    classes = []  # list of class names

    print(f"Scanning dataset folder: {dataset_folder}")
    # os.listdir() returns entries in arbitrary order; sort for stable output.
    for class_name in sorted(os.listdir(dataset_folder)):
        class_path = os.path.join(dataset_folder, class_name)
        if not os.path.isdir(class_path):
            continue
        classes.append(class_name)

        print(f"\nProcessing class: {class_name}")
        image_files = sorted(
            f for f in os.listdir(class_path)
            if f.lower().endswith(image_extensions)
        )
        total_images = len(image_files)
        image_count = 0

        # Use ThreadPoolExecutor for parallel extraction
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(
                    process_image, os.path.join(class_path, filename), class_name
                )
                for filename in image_files
            ]

            # Report progress as soon as each task finishes.
            for future in concurrent.futures.as_completed(futures):
                features, _ = future.result()
                if features is not None:
                    image_count += 1
                    print(
                        f"\rProcessed {image_count}/{total_images} images in '{class_name}'",
                        end="", flush=True
                    )

            # Collect in submission order so sample order does not depend on
            # thread scheduling.
            for future in futures:
                features, label = future.result()
                if features is not None:
                    X.append(features)
                    y.append(label)

        print(f"\nCompleted class: {class_name} with {image_count} images.")

    X = np.array(X, dtype=np.float32)
    y = np.array(y)
    print(f"Finished loading dataset. Total classes: {len(classes)}. Total images: {len(X)}.")

    return X, y, classes


from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

def _print_evaluation(title, model, X_test, y_test):
    """Print *title*, then the classification report and confusion matrix of *model* on the test split."""
    print(f"\n{title}:")
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))


def train_classifiers(X, y):
    """
    Splits data into training/test sets, trains:
      - A Bagging ensemble of LinearSVC classifiers (with scaling)
      - A RandomForestClassifier
      - A VotingClassifier that combines both

    Every randomized component is seeded with the same constant so that
    repeated runs on identical input produce the same models and reports.
    After each model is fitted, its classification report and confusion
    matrix on the held-out test split are printed.

    Args:
        X: Feature matrix, one row per sample.
        y: Labels aligned with the rows of X.

    Returns:
        models (dict): A dictionary with keys 'svm', 'rf', 'combined'
        test_data (tuple): (X_test, y_test)
    """
    seed = 31

    # Split dataset: 80% train, 20% test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, shuffle=True
    )

    # --- Train SVM Ensemble ---
    svm_pipeline = make_pipeline(StandardScaler(),
                                 LinearSVC(random_state=seed))

    svm_ensemble = BaggingClassifier(
        estimator=svm_pipeline,
        n_estimators=10,  # Adjust for speed/accuracy trade-off
        n_jobs=-1,
        verbose=1,
        # Seed the bootstrap sampling too; without it the ensemble differs
        # from run to run even though every other component is seeded.
        random_state=seed
    )

    print("Training SVM ensemble classifier...")
    svm_ensemble.fit(X_train, y_train)
    _print_evaluation("SVM Ensemble Classification Report", svm_ensemble, X_test, y_test)

    # --- Train RandomForest ---
    rf = RandomForestClassifier(
        n_estimators=100,  # Adjust as needed
        random_state=seed,
        n_jobs=-1
    )

    print("\nTraining RandomForest classifier...")
    rf.fit(X_train, y_train)
    _print_evaluation("RandomForest Classification Report", rf, X_test, y_test)

    # --- Train Combined Voting Classifier ---
    # NOTE(review): per the scikit-learn docs, VotingClassifier.fit trains
    # clones of the given estimators, so both sub-models are fitted a second
    # time here rather than reusing the fits above.
    combined_clf = VotingClassifier(
        estimators=[('svm', svm_ensemble), ('rf', rf)],
        voting='soft'
    )

    print("\nTraining Combined Voting classifier...")
    combined_clf.fit(X_train, y_train)
    _print_evaluation("Combined Voting Classifier Report", combined_clf, X_test, y_test)

    models = {
        'svm': svm_ensemble,
        'rf': rf,
        'combined': combined_clf
    }

    return models, (X_test, y_test)


# ---------------------------------------------------------------------
# 3. Training Thread
# ---------------------------------------------------------------------
def train_model_thread(dataset_folder, model_filename, max_workers=4):
    """
    Run the full training pipeline; intended as a thread target.

    Steps: load the dataset with load_dataset(), train the models with
    train_classifiers(), then pickle ``{'models': ..., 'class_names': ...}``
    to ``model_filename``. Progress is printed and also appended to the
    module-level ``training_log`` string.

    Args:
        dataset_folder: Root directory of the folder-per-class image dataset.
        model_filename: Path of the pickle file to write.
        max_workers: Number of threads used for feature extraction.

    Raises:
        ValueError: If no readable images were found in ``dataset_folder``.
        Exception: Any error from loading, training or saving is re-raised
            after being recorded in ``training_log``.
    """
    global training_log

    training_log += "Starting training...\n"
    print("Starting training...")

    try:
        # (A) Load Data (in parallel)
        X, y, classes = load_dataset(dataset_folder, max_workers=max_workers)

        # Fail early with a clear message instead of an obscure error from
        # the train/test split on an empty array.
        if len(X) == 0:
            raise ValueError(
                f"No readable images found in {dataset_folder!r}; nothing to train on."
            )

        # (B) Train Classifier and Print Metrics
        models, _ = train_classifiers(X, y)
        print("Training complete.")
        training_log += "Training complete.\n"

        # (C) Save the Model
        model_data = {'models': models, 'class_names': classes}
        with open(model_filename, "wb") as f:
            pickle.dump(model_data, f)
        training_log += f"Model saved to {model_filename}\n"
        print(f"Model saved to {model_filename}")
    except Exception as exc:
        # Record the failure so anything reading training_log can tell the run
        # ended, then let the error propagate unchanged.
        training_log += f"Training failed: {exc}\n"
        raise

# ---------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------
if __name__ == "__main__":
    # Input dataset location and output model file; adjust paths as needed.
    dataset_folder = "./../images_dataset"
    model_filename = "svm_rf_combined.pkl"

    # Number of feature-extraction threads handed to the training routine.
    max_workers = 32

    # Run training on its own thread and block until it has finished.
    training_thread = threading.Thread(
        target=train_model_thread,
        kwargs={
            "dataset_folder": dataset_folder,
            "model_filename": model_filename,
            "max_workers": max_workers,
        },
    )
    training_thread.start()
    training_thread.join()