| import pandas as pd | |
| import joblib | |
| import os | |
| from sklearn.metrics import silhouette_score | |
def validate_project():
    """Print a sanity-check report for the processed data and the saved model.

    All paths are resolved relative to the current working directory. The
    report covers three areas:

    1. Cleaned retail data: row count, number of distinct ``Country`` values,
       rows whose ``Invoice`` still contains the ``C`` marker, and rows whose
       ``Quantity`` or ``Price`` is zero or negative.
    2. RFM feature table: row count.
    3. Saved model: cluster count and silhouette score of the stored labels
       on the stored scaled matrix, or a notice if either artifact is absent.

    Returns:
        None. The report is written to stdout.

    Raises:
        FileNotFoundError: If either CSV input is missing. Only the two
            ``.pkl`` artifacts are treated as optional.
        KeyError: If an expected column, or the ``rfm_scaled`` entry of the
            scaled-data artifact, is absent.
    """
    print("--- Final Project Validation ---")

    # 1. Check Data Cleaning
    df_clean = pd.read_csv('data/processed/cleaned_retail_data.csv')
    print(f"Cleaned Data Rows: {len(df_clean)}")
    n_countries = df_clean['Country'].nunique()
    print(f"Unique Countries: {n_countries} (Should be 1 for UK-only)")

    # Check for Cancellations.
    # regex=False makes the literal match on 'C' explicit; na=False keeps any
    # missing invoice id from being counted. Boolean masks are summed directly
    # instead of materialising filtered copies just to take their length.
    invoice_text = df_clean['Invoice'].astype(str)
    is_cancellation = invoice_text.str.contains('C', regex=False, na=False)
    n_cancellations = int(is_cancellation.sum())
    print(f"Cancellations in Cleaned Data: {n_cancellations} (Should be 0)")

    # Check for zero/negative values. Both filters are `<= 0`, so zeros are
    # counted as well; the labels say so.
    n_bad_qty = int((df_clean['Quantity'] <= 0).sum())
    n_bad_price = int((df_clean['Price'] <= 0).sum())
    print(f"Zero/Negative Quantities: {n_bad_qty} (Should be 0)")
    print(f"Zero/Negative Prices: {n_bad_price} (Should be 0)")

    # 2. Check RFM Features
    df_rfm = pd.read_csv('data/processed/rfm_features.csv')
    print(f"RFM Customers: {len(df_rfm)}")

    # 3. Check Model & Score
    scaled_path = 'data/processed/scaled_rfm_data.pkl'
    model_path = 'models/kmeans_model.pkl'
    if not (os.path.exists(scaled_path) and os.path.exists(model_path)):
        print("Model/Scaled data files missing!")
        return

    # NOTE(security): joblib.load unpickles its input, which can execute
    # arbitrary code. Only run this against artifacts produced by this
    # project's own pipeline.
    data = joblib.load(scaled_path)
    model = joblib.load(model_path)

    X = data['rfm_scaled']
    # Scores the labels stored on the fitted model against the stored matrix;
    # presumably both come from the same training run - confirm if they can
    # be regenerated independently.
    score = silhouette_score(X, model.labels_)
    print(f"Model Clusters (k): {model.n_clusters}")
    print(f"Silhouette Score (k={model.n_clusters}): {score:.4f}")
# Run the validation report only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    validate_project()