| |
| |
| """ |
| Last Modified on Sep 20, 2025 |
| @purpose: This Code produces a stratified random sample of a large |
| dataset with a nominal target and a nominal predictor |
| @notes: Uses sci-kit learn train_test_split with stratify option |
| @data: CellphoneActivity.csv with n=165,633 cases and k=19 columns |
| Saves CellphoneActivity_StratifiedRS.csv with n=33127 (20% of full data) |
| @email: ejones@tamu.edu |
| """ |
| |
| RED = "\033[38;5;197m" |
| GOLD = "\033[38;5;185m" |
| TEAL = "\033[38;5;50m" |
| GREEN = "\033[38;5;82m" |
| RESET = "\033[0m" |
|
|
| from AdvancedAnalytics.ReplaceImputeEncode import DT, ReplaceImputeEncode |
| from sklearn.model_selection import train_test_split |
| import warnings |
| import pandas as pd |
|
|
| warnings.filterwarnings("ignore", |
| category=UserWarning) |
| data_map = { |
| 'activity':[DT.Nominal, ('sitting', 'sittingdown', |
| 'standing', 'standingup', 'walking')], |
| 'user': [DT.Nominal,('debora', 'jose_carlos', 'katia', 'wallace')], |
| 'gender': [DT.Binary, ('Man', 'Woman')], |
| 'age': [DT.Interval,(20, 80)], |
| 'height': [DT.Interval,(1.5, 1.75)], |
| 'weight': [DT.Interval,(50, 85)], |
| 'BMI': [DT.Interval,(20, 30)], |
| 'x1': [DT.Interval,(-750, +750)], |
| 'y1': [DT.Interval,(-750, +750)], |
| 'z1': [DT.Interval,(-750, +750)], |
| 'x2': [DT.Interval,(-750, +750)], |
| 'y2': [DT.Interval,(-750, +750)], |
| 'z2': [DT.Interval,(-750, +750)], |
| 'x3': [DT.Interval,(-750, +750)], |
| 'y3': [DT.Interval,(-750, +750)], |
| 'z3': [DT.Interval,(-750, +750)], |
| 'x4': [DT.Interval,(-750, +750)], |
| 'y4': [DT.Interval,(-750, +750)], |
| 'z4': [DT.Interval,(-750, +750)] |
| } |
| target = "activity" |
| check_var = ["weight", "x1", "y1", "z1"] |
| size = 0.2 |
| file_in = "../data/CellphoneActivity.csv" |
| file_out = "../data/CellphoneActivity_StratifiedRS.csv" |
| |
# Columns whose joint category combinations define the stratification cells:
# every Nominal or Binary column in the metadata map.
categorical_list = [
    col for col, (dtype, _levels) in data_map.items()
    if dtype == DT.Nominal or dtype == DT.Binary
]
print("")
print(f"{RED}--> READING ORIGINAL DATASET: {file_in} {RESET}")
# index_col=None: treat every CSV column as data; a fresh RangeIndex is created.
df = pd.read_csv(file_in, index_col=None)
|
|
# Draw the stratified sample: Xt is the `size` fraction (20%) of the rows,
# stratified on the joint combination of all categorical columns so the
# sample preserves the full data's category proportions.  Xv (the remainder)
# is discarded.  random_state makes the draw reproducible.
Xt, Xv = train_test_split(df, train_size=size, stratify=df[categorical_list],
                          random_state=12345)
print(f"{RED}--> SAVING RANDOM STRATIFIED SAMPLE: {file_out} {RESET}\n")
# index=False keeps the saved CSV to the original 19 data columns; writing the
# default index would add a spurious "Unnamed: 0" column when the file is re-read.
Xt.to_csv(file_out, index=False)
|
|
# Re-read the saved sample so the comparison below uses exactly what was written.
df2 = pd.read_csv(file_out)
target1 = df[target].value_counts()    # class counts in the full data
target2 = df2[target].value_counts()   # class counts in the sample
banner = f"{TEAL}" + "=" * 39 + f"{RESET}"
print(banner)
print(f"{GREEN}" + "*" * 2, "COMPARING ORIGINAL TO SAMPLE DATA",
      "*" * 2 + f"{RESET}")
print(banner)
print()
|
|
| print(f"{TEAL}"+36*"="+f"{RESET}") |
| print(f"{GREEN}SAMPLE SIZES{RESET}") |
| print(f"{GREEN}TARGET ORI OUT RATIO{RESET}") |
| print(f"{TEAL}"+36*"-"+f"{RESET}") |
| for target_val in target1.index: |
| ori = target1[target_val] |
| out = target2[target_val] |
| ratio = ori/out |
| print(f"{target_val:15s}{ori: 7.0f}{out: 7.0f}{ratio: 7.1f}") |
|
|
# For each check variable, compare per-class means between the full data and
# the sample; a ratio near 1.00 indicates the stratified sample is representative.
for var in check_var:
    print("")
    full_means = df.groupby(target)[var].mean()
    samp_means = df2.groupby(target)[var].mean()
    print(f"{TEAL}" + "=" * 36 + f"{RESET}")
    print(f"{GREEN}CHECK AVERAGES FOR {var.upper():5s}{RESET}")
    print(f"{GREEN}TARGET ORI OUT RATIO{RESET}")
    print(f"{TEAL}" + "-" * 36 + f"{RESET}")
    for category, mean_full in full_means.items():
        mean_samp = samp_means[category]
        print(f"{category:15s}{mean_full: 7.2f}{mean_samp: 7.2f}"
              f"{mean_full / mean_samp: 7.2f}")