"""Train a face-shape classifier (heart / oblong / oval / round / square).

Detects faces with an OpenCV Haar cascade, crops the largest face per
image, resizes it to IMG_SIZE grayscale, flattens the pixels into a
feature vector, and fits a RandomForest. The fitted model and the label
encoder's class names are pickled to MODEL_PATH / *_classes.pkl for later
inference.
"""

import os
import pickle

import cv2
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Configuration
DATASET_DIR = "/home/codernotme/Projects/Github/katariaoptics/dataset"
MODEL_PATH = "/home/codernotme/Projects/Github/katariaoptics/ai_service/face_shape_model.pkl"
IMG_SIZE = (64, 64)  # Resize detected faces to this size

# Load Haar Cascade for face detection (shipped with the OpenCV install)
cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
face_cascade = cv2.CascadeClassifier(cascade_path)


def load_dataset():
    """Load face crops and labels from DATASET_DIR.

    Returns:
        (images, labels): ``images`` is an (N, 64*64) array of flattened
        grayscale face crops; ``labels`` is an (N,) array of class-name
        strings. Files that fail to read or contain no detectable face
        are skipped.
    """
    images = []
    labels = []
    # Expected classes — one sub-folder per class under DATASET_DIR
    classes = ["heart", "oblong", "oval", "round", "square"]
    print(f"Loading dataset from {DATASET_DIR}...")

    for label in classes:
        folder_path = os.path.join(DATASET_DIR, label)
        if not os.path.isdir(folder_path):
            print(f"Warning: Folder {folder_path} not found.")
            continue

        print(f"Processing class: {label}")
        count = 0
        # List the directory once (the original listed it twice) and sort
        # for a deterministic processing order across runs.
        filenames = sorted(os.listdir(folder_path))
        total_files = len(filenames)

        for i, filename in enumerate(filenames):
            if i % 50 == 0:
                print(f"  Processed {i}/{total_files} images...")
            img_path = os.path.join(folder_path, filename)
            try:
                # Read image; cv2.imread returns None on unreadable files.
                img = cv2.imread(img_path)
                if img is None:
                    continue

                # Downscale very wide images to speed up face detection.
                h, w = img.shape[:2]
                if w > 1000:
                    scale = 1000 / w
                    img = cv2.resize(img, (1000, int(h * scale)))

                # Convert to grayscale for the cascade detector.
                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

                # Detect faces (scaleFactor=1.1, minNeighbors=4).
                faces = face_cascade.detectMultiScale(gray, 1.1, 4)

                if len(faces) > 0:
                    # Keep the largest detection by area. Use fw/fh so we
                    # don't shadow the image's h/w read above.
                    (x, y, fw, fh) = max(faces, key=lambda f: f[2] * f[3])
                    face_roi = gray[y:y + fh, x:x + fw]

                    # Resize to the fixed input size and flatten to a vector.
                    resized = cv2.resize(face_roi, IMG_SIZE)
                    images.append(resized.flatten())
                    labels.append(label)
                    count += 1
            except Exception as e:
                # Best-effort loading: log the bad file and keep going.
                print(f"Error processing {img_path}: {e}")

        print(f"\n  Loaded {count} images for {label}")

    return np.array(images), np.array(labels)


def train_model():
    """Train, evaluate, and pickle the face-shape RandomForest model."""
    X, y = load_dataset()

    if len(X) == 0:
        print("Error: No images loaded. Dataset might be empty or paths incorrect.")
        return

    print(f"Total dataset size: {len(X)} samples")

    # Encode string labels to integer class indices.
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Save label encoder classes so inference can map indices back to names.
    with open(MODEL_PATH.replace(".pkl", "_classes.pkl"), "wb") as f:
        pickle.dump(le.classes_, f)
    print(f"Saved class labels: {le.classes_}")

    # Split dataset; stratify so each class keeps its proportion in the
    # held-out test set (class counts may be uneven after face detection).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )

    # Train Random Forest
    print("Training Random Forest Classifier...")
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))

    # Persist the fitted model.
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(clf, f)
    print(f"Model saved to {MODEL_PATH}")


if __name__ == "__main__":
    train_model()