Spaces:

Divya499
/

segmentx-behavioral-intelligence

Sleeping

DIVYANSHI SINGH

Initial commit: SegmentX Behavioral Intelligence Portal

72d0706 2 months ago

1.59 kB

	import pandas as pd
	import joblib
	import os
	from sklearn.metrics import silhouette_score

	def validate_project():
	    """Print a sanity-check report for the processed data and saved model.

	    Three independent checks run in order:

	    1. Cleaned retail data: row count, number of distinct countries,
	       invoices whose id contains "C" (treated as cancellations), and
	       rows whose Quantity or Price is zero or negative.
	    2. RFM features: row count.
	    3. Clustering model: cluster count and the silhouette score of the
	       model's stored labels against the stored scaled data.

	    All paths are relative to the current working directory. A missing
	    input is reported on stdout and the remaining checks still run, so
	    one absent artifact does not hide the state of the others.

	    Returns:
	        None. The report is written to stdout.
	    """
	    cleaned_csv = 'data/processed/cleaned_retail_data.csv'
	    rfm_csv = 'data/processed/rfm_features.csv'
	    scaled_pkl = 'data/processed/scaled_rfm_data.pkl'
	    model_pkl = 'models/kmeans_model.pkl'

	    print("--- Final Project Validation ---")

	    # 1. Check Data Cleaning
	    try:
	        df_clean = pd.read_csv(cleaned_csv)
	    except FileNotFoundError:
	        # Report instead of crashing, matching how missing model files
	        # are handled further down.
	        print(f"Cleaned data file missing: {cleaned_csv}")
	    else:
	        print(f"Cleaned Data Rows: {len(df_clean)}")
	        print(f"Unique Countries: {df_clean['Country'].nunique()} (Should be 1 for UK-only)")

	        # Check for Cancellations: cast to str first so purely numeric
	        # invoice ids do not break the .str accessor.
	        is_cancellation = df_clean['Invoice'].astype(str).str.contains(
	            'C', na=False, regex=False
	        )
	        print(f"Cancellations in Cleaned Data: {int(is_cancellation.sum())} (Should be 0)")

	        # Check for non-positive values. Both counts include zeros, even
	        # though the quantity label only says "Negative".
	        bad_qty = int((df_clean['Quantity'] <= 0).sum())
	        bad_price = int((df_clean['Price'] <= 0).sum())
	        print(f"Negative Quantities: {bad_qty} (Should be 0)")
	        print(f"Zero/Negative Prices: {bad_price} (Should be 0)")

	    # 2. Check RFM Features
	    try:
	        df_rfm = pd.read_csv(rfm_csv)
	    except FileNotFoundError:
	        print(f"RFM features file missing: {rfm_csv}")
	    else:
	        print(f"RFM Customers: {len(df_rfm)}")

	    # 3. Check Model & Score
	    if os.path.exists(scaled_pkl) and os.path.exists(model_pkl):
	        # NOTE: joblib.load unpickles arbitrary objects. Only load
	        # artifacts produced by this project's own pipeline.
	        data = joblib.load(scaled_pkl)
	        model = joblib.load(model_pkl)
	        score = silhouette_score(data['rfm_scaled'], model.labels_)
	        print(f"Model Clusters (k): {model.n_clusters}")
	        print(f"Silhouette Score (k={model.n_clusters}): {score:.4f}")
	    else:
	        print("Model/Scaled data files missing!")

	# Run the validation report only when executed as a script, not on import.
	if __name__ == "__main__":
	    validate_project()