# File size: 1,589 Bytes | 72d0706
import pandas as pd
import joblib
import os
from sklearn.metrics import silhouette_score
def validate_project():
    """Print a validation report for the pipeline's processed artifacts.

    Three independent checks are run against fixed paths, resolved relative
    to the current working directory:

    1. ``data/processed/cleaned_retail_data.csv`` -- row count, number of
       distinct ``Country`` values, rows whose ``Invoice`` contains ``'C'``,
       and rows with ``Quantity <= 0`` or ``Price <= 0``.
    2. ``data/processed/rfm_features.csv`` -- row count.
    3. ``data/processed/scaled_rfm_data.pkl`` and ``models/kmeans_model.pkl``
       -- the model's ``n_clusters`` and the silhouette score of its stored
       labels on the scaled data.

    A missing artifact is reported with a message and the remaining checks
    still run, so one absent file does not hide the state of the others.

    Returns:
        None. All results are written to stdout.
    """
    print("--- Final Project Validation ---")
    _validate_cleaned_data('data/processed/cleaned_retail_data.csv')
    _validate_rfm_features('data/processed/rfm_features.csv')
    _validate_model('data/processed/scaled_rfm_data.pkl', 'models/kmeans_model.pkl')


def _validate_cleaned_data(csv_path):
    """Report row, country, cancellation and non-positive value counts for the cleaned CSV."""
    try:
        df_clean = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Cleaned data file missing: {csv_path}")
        return

    print(f"Cleaned Data Rows: {len(df_clean)}")
    print(f"Unique Countries: {df_clean['Country'].nunique()} (Should be 1 for UK-only)")

    # Invoices are compared as text so numeric-looking ids are handled too.
    is_cancellation = df_clean['Invoice'].astype(str).str.contains('C', na=False)
    print(f"Cancellations in Cleaned Data: {int(is_cancellation.sum())} (Should be 0)")

    # Both checks use <= 0, so zeros are counted alongside negative values.
    bad_qty = int((df_clean['Quantity'] <= 0).sum())
    bad_price = int((df_clean['Price'] <= 0).sum())
    print(f"Zero/Negative Quantities: {bad_qty} (Should be 0)")
    print(f"Zero/Negative Prices: {bad_price} (Should be 0)")


def _validate_rfm_features(csv_path):
    """Report how many rows the RFM feature CSV contains."""
    try:
        df_rfm = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"RFM features file missing: {csv_path}")
        return
    print(f"RFM Customers: {len(df_rfm)}")


def _validate_model(scaled_path, model_path):
    """Report cluster count and silhouette score for the persisted model."""
    if not (os.path.exists(scaled_path) and os.path.exists(model_path)):
        print("Model/Scaled data files missing!")
        return

    # NOTE(review): joblib.load unpickles arbitrary objects; only point this
    # at artifacts produced by this project, never at untrusted files.
    data = joblib.load(scaled_path)
    model = joblib.load(model_path)

    # The scaled matrix is stored under the 'rfm_scaled' key of the payload.
    X = data['rfm_scaled']
    score = silhouette_score(X, model.labels_)
    print(f"Model Clusters (k): {model.n_clusters}")
    print(f"Silhouette Score (k={model.n_clusters}): {score:.4f}")
# Script entry point: run the validation report only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    validate_project()
|