"""Final validation checks for the retail customer-segmentation pipeline."""

import os

import pandas as pd

# joblib and scikit-learn are needed only for the model check. Importing them
# defensively lets the CSV checks run (and report) even where they are absent;
# the model check then reports a failure instead of the script dying on import.
try:
    import joblib
    from sklearn.metrics import silhouette_score
except ImportError:
    joblib = None
    silhouette_score = None


def _check_cleaned_data(csv_path):
    """Print the data-cleaning checks for *csv_path*; return True if all pass.

    The CSV must provide the columns ``Country``, ``Invoice``, ``Quantity``
    and ``Price``. A missing file or column raises, as pandas normally does.
    """
    df_clean = pd.read_csv(csv_path)

    n_countries = df_clean['Country'].nunique()
    # Literal (non-regex) substring match: any invoice id containing 'C'.
    is_cancellation = df_clean['Invoice'].astype(str).str.contains(
        'C', na=False, regex=False
    )
    n_cancellations = int(is_cancellation.sum())
    n_bad_qty = int((df_clean['Quantity'] <= 0).sum())
    n_bad_price = int((df_clean['Price'] <= 0).sum())

    print(f"Cleaned Data Rows: {len(df_clean)}")
    print(f"Unique Countries: {n_countries} (Should be 1 for UK-only)")
    print(f"Cancellations in Cleaned Data: {n_cancellations} (Should be 0)")
    # The comparison is `<= 0`, so zero quantities are counted too; the label
    # says so, matching the price line below.
    print(f"Zero/Negative Quantities: {n_bad_qty} (Should be 0)")
    print(f"Zero/Negative Prices: {n_bad_price} (Should be 0)")

    return (
        n_countries == 1
        and n_cancellations == 0
        and n_bad_qty == 0
        and n_bad_price == 0
    )


def _check_model(scaled_path, model_path):
    """Print cluster count and silhouette score; return True if computed."""
    if not (os.path.exists(scaled_path) and os.path.exists(model_path)):
        print("Model/Scaled data files missing!")
        return False
    if joblib is None or silhouette_score is None:
        print("joblib/scikit-learn not installed; cannot validate the model!")
        return False

    # NOTE(review): joblib.load unpickles its input and can execute arbitrary
    # code; only point this at artifacts produced by this project.
    data = joblib.load(scaled_path)
    model = joblib.load(model_path)
    X = data['rfm_scaled']
    labels = model.labels_

    # silhouette_score needs exactly one label per sample. A mismatch means
    # the scaled data and the fitted model are out of sync, so report it
    # rather than surfacing an opaque ValueError.
    if len(X) != len(labels):
        print(
            f"Scaled data rows ({len(X)}) do not match "
            f"model labels ({len(labels)})!"
        )
        return False

    score = silhouette_score(X, labels)
    print(f"Model Clusters (k): {model.n_clusters}")
    print(f"Silhouette Score (k={model.n_clusters}): {score:.4f}")
    return True


def validate_project(data_dir='data/processed',
                     model_path='models/kmeans_model.pkl'):
    """Run the final sanity checks on the pipeline outputs and print a report.

    Checks, in order: the cleaned retail CSV (one country, no cancellation
    invoices, no zero/negative quantities or prices), the RFM feature CSV
    (row count only), and the saved KMeans model against the scaled RFM data
    (cluster count and silhouette score).

    Args:
        data_dir: Directory holding ``cleaned_retail_data.csv``,
            ``rfm_features.csv`` and ``scaled_rfm_data.pkl``.
        model_path: Path of the pickled clustering model.

    Returns:
        True when every stated expectation holds and the model score could be
        computed; False otherwise.

    Raises:
        FileNotFoundError: If either CSV file is missing (raised by pandas).
        KeyError: If an expected column or the ``'rfm_scaled'`` key is absent.
    """
    print("--- Final Project Validation ---")

    # 1. Check data cleaning
    data_ok = _check_cleaned_data(
        os.path.join(data_dir, 'cleaned_retail_data.csv')
    )

    # 2. Check RFM features (informational only: no expectation is stated)
    df_rfm = pd.read_csv(os.path.join(data_dir, 'rfm_features.csv'))
    print(f"RFM Customers: {len(df_rfm)}")

    # 3. Check model and score
    model_ok = _check_model(
        os.path.join(data_dir, 'scaled_rfm_data.pkl'), model_path
    )

    return data_ok and model_ok


if __name__ == "__main__":
    validate_project()