MHuzaifaa
/

finalhackathon

machine-learning

crime-prediction

Model card Files Files and versions

finalhackathon / src /verify_pipeline.py

MHuzaifaa's picture

Upload project

100fb60 5 months ago

3.48 kB

	import pandas as pd
	import numpy as np
	import os
	import joblib
	from preprocessing import preprocess_pipeline

	def verify_data_integrity():
	print("=== Starting Deep Verification ===")

	# Paths
	data_dir = os.path.join(os.path.dirname(__file__), '../data/crimedataset')
	train_path = os.path.join(data_dir, 'train.csv')
	test_path = os.path.join(data_dir, 'test.csv')

	# 1. Load Data
	print("\n[1] Loading Data...")
	if not os.path.exists(train_path):
	print("X Train file missing!")
	return

	df_train = pd.read_csv(train_path, parse_dates=['Dates'])
	print(f"OK Train Data Loaded: {df_train.shape}")

	# 2. Check for Duplicates
	print("\n[2] Checking for Duplicates...")
	duplicates = df_train.duplicated().sum()
	if duplicates > 0:
	print(f"! Warning: {duplicates} duplicate rows found in training data.")
	else:
	print("OK No duplicates found.")

	# 3. Class Balance
	print("\n[3] Checking Class Balance...")
	violent_categories = [
	'ASSAULT', 'ROBBERY', 'SEX OFFENSES FORCIBLE', 'KIDNAPPING', 'HOMICIDE', 'ARSON'
	]
	df_train['IsViolent'] = df_train['Category'].apply(lambda x: 1 if x in violent_categories else 0)
	balance = df_train['IsViolent'].value_counts(normalize=True)
	print(f"Violent Crime Ratio: {balance.get(1, 0)*100:.2f}%")
	print(f"Non-Violent Crime Ratio: {balance.get(0, 0)*100:.2f}%")

	if balance.get(1, 0) < 0.1:
	print("! Severe Class Imbalance detected (<10% positive class). Model may struggle with Recall.")

	# 4. Check for Data Leakage (Train vs Test overlap)
	# Since test data might not have labels, we check for exact feature matches if test exists
	if os.path.exists(test_path):
	print("\n[4] Checking for Data Leakage (Train/Test Overlap)...")
	df_test = pd.read_csv(test_path, parse_dates=['Dates'])
	# Check intersection of Dates and Location
	# This is a heuristic; exact row match might be too slow for large data
	# We'll check a sample
	train_dates = set(df_train['Dates'].dt.date.unique())
	test_dates = set(df_test['Dates'].dt.date.unique())

	overlap = train_dates.intersection(test_dates)
	if len(overlap) > 0:
	print(f"! Warning: Found {len(overlap)} days present in BOTH Train and Test sets. Possible leakage if splitting by time.")
	else:
	print("OK No date overlap between Train and Test.")

	# 5. Verify Model Artifacts
	print("\n[5] Verifying Model Artifacts...")
	models_dir = os.path.join(os.path.dirname(__file__), '../models')
	required_files = ['best_model.pkl', 'label_encoders.pkl', 'kmeans.pkl']

	all_exist = True
	for f in required_files:
	fpath = os.path.join(models_dir, f)
	if os.path.exists(fpath):
	print(f"OK Found {f}")
	# Try loading
	try:
	joblib.load(fpath)
	print(f" -> Successfully loaded {f}")
	except Exception as e:
	print(f" X Failed to load {f}: {e}")
	all_exist = False
	else:
	print(f"X Missing {f}")
	all_exist = False

	if all_exist:
	print("\n=== Verification Complete: SYSTEM HEALTHY ===")
	else:
	print("\n=== Verification Complete: ISSUES DETECTED ===")

	if __name__ == "__main__":
	verify_data_integrity()