File size: 1,589 Bytes
72d0706
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
import joblib
import os
from sklearn.metrics import silhouette_score

def validate_project():
    """Print a sanity report for the processed data and clustering artifacts.

    The report covers three areas, in order:

    1. The cleaned retail CSV: row count, number of distinct countries,
       rows whose invoice contains ``'C'``, and rows with non-positive
       quantity or price.
    2. The RFM feature CSV: row count.
    3. The clustering artifacts: number of clusters and silhouette score,
       computed only when both pickle files exist.

    Results are printed only; nothing is returned and no individual check
    aborts the run.

    Raises:
        FileNotFoundError: If either CSV is missing. Unlike the model
            artifacts, the CSVs are read without an existence check.
    """
    print("--- Final Project Validation ---")

    # 1. Check Data Cleaning
    df_clean = pd.read_csv('data/processed/cleaned_retail_data.csv')
    print(f"Cleaned Data Rows: {len(df_clean)}")
    print(f"Unique Countries: {df_clean['Country'].nunique()} (Should be 1 for UK-only)")

    # Check for Cancellations. Invoices are cast to str first because the
    # column may be parsed as numeric when no letter-prefixed invoices remain.
    is_cancellation = df_clean['Invoice'].astype(str).str.contains('C', na=False)
    print(f"Cancellations in Cleaned Data: {int(is_cancellation.sum())} (Should be 0)")

    # Check for Zero/Negative values. Both checks use <= 0, so both labels
    # say "Zero/Negative" to describe exactly what is counted.
    non_positive_qty = int((df_clean['Quantity'] <= 0).sum())
    non_positive_price = int((df_clean['Price'] <= 0).sum())
    print(f"Zero/Negative Quantities: {non_positive_qty} (Should be 0)")
    print(f"Zero/Negative Prices: {non_positive_price} (Should be 0)")

    # 2. Check RFM Features
    df_rfm = pd.read_csv('data/processed/rfm_features.csv')
    print(f"RFM Customers: {len(df_rfm)}")

    # 3. Check Model & Score
    scaled_path = 'data/processed/scaled_rfm_data.pkl'
    model_path = 'models/kmeans_model.pkl'
    if not (os.path.exists(scaled_path) and os.path.exists(model_path)):
        print("Model/Scaled data files missing!")
        return

    # NOTE(review): joblib.load unpickles arbitrary objects; only load
    # artifacts that were produced by this project's own pipeline.
    data = joblib.load(scaled_path)
    model = joblib.load(model_path)
    X = data['rfm_scaled']
    # Scores the labels stored on the fitted model against X.
    # NOTE(review): presumably X is the matrix the model was fitted on - confirm.
    score = silhouette_score(X, model.labels_)
    print(f"Model Clusters (k): {model.n_clusters}")
    print(f"Silhouette Score (k={model.n_clusters}): {score:.4f}")

# Run the validation report only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    validate_project()