# dr_jones / Logistic_Regression / NominalLogistic_Template.py
# NOTE(review): the lines below were Hugging Face page metadata accidentally
# captured with the file ("anly656's picture", "Upload 50 files",
# "8643b59 verified"); commented out so the module parses.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Last Modified on Sep 20, 2025
@purpose: This Code produces a stratified random sample of a large
dataset with a nominal target and a nominal predictor
@notes: Uses sci-kit learn train_test_split with stratify option
@data: CellphoneActivity.csv with n=165,633 cases and k=19 columns
Saves CellphoneActivity_StratifiedRS.csv with n=33127 (20% of full data)
@email: ejones@tamu.edu
"""
# ANSI color codes - use high intensity bright red for maximum visibility on black background
# (256-color escape sequences; RESET restores default terminal attributes)
RED = "\033[38;5;197m" # Bright 256-color red - used for file read/save banners
GOLD = "\033[38;5;185m" # Bright 256-color gold - defined for the palette; unused below
TEAL = "\033[38;5;50m" # Bright 256-color teal - used for table separator rules
GREEN = "\033[38;5;82m" # Bright 256-color green - used for table headings
RESET = "\033[0m" # Reset all terminal attributes back to default
from AdvancedAnalytics.ReplaceImputeEncode import DT, ReplaceImputeEncode
from sklearn.model_selection import train_test_split
import warnings
import pandas as pd
# Silence pandas/sklearn UserWarnings so the comparison tables print cleanly.
warnings.filterwarnings("ignore",
category=UserWarning)
# Column metadata for ReplaceImputeEncode-style processing:
#   DT.Nominal/DT.Binary entries list the allowed category labels;
#   DT.Interval entries give the (low, high) plausible-value range.
# The x/y/z 1-4 columns are the four accelerometer axes readings.
data_map = {
'activity':[DT.Nominal, ('sitting', 'sittingdown',
'standing', 'standingup', 'walking')],
'user': [DT.Nominal,('debora', 'jose_carlos', 'katia', 'wallace')],
'gender': [DT.Binary, ('Man', 'Woman')],
'age': [DT.Interval,(20, 80)],
'height': [DT.Interval,(1.5, 1.75)],
'weight': [DT.Interval,(50, 85)],
'BMI': [DT.Interval,(20, 30)],
'x1': [DT.Interval,(-750, +750)],
'y1': [DT.Interval,(-750, +750)],
'z1': [DT.Interval,(-750, +750)],
'x2': [DT.Interval,(-750, +750)],
'y2': [DT.Interval,(-750, +750)],
'z2': [DT.Interval,(-750, +750)],
'x3': [DT.Interval,(-750, +750)],
'y3': [DT.Interval,(-750, +750)],
'z3': [DT.Interval,(-750, +750)],
'x4': [DT.Interval,(-750, +750)],
'y4': [DT.Interval,(-750, +750)],
'z4': [DT.Interval,(-750, +750)]
}
# Run configuration for the sampling step below.
target = "activity" # A Nominal target
check_var = ["weight", "x1", "y1", "z1"] # Interval Check Variables
size = 0.2 # Sample Size (fraction of rows kept in the stratified sample)
file_in = "../data/CellphoneActivity.csv"
file_out = "../data/CellphoneActivity_StratifiedRS.csv"
#--------- Create Random Stratified Sample -----------------------------------
# Collect every categorical (Nominal or Binary) column from the data map;
# these columns jointly define the strata for the split below.
categorical_list = [col for col, spec in data_map.items()
                    if spec[0] in (DT.Nominal, DT.Binary)]
print("")
print(f"{RED}--> READING ORIGINAL DATASET: {file_in} {RESET}")
# Read the full dataset; index_col=None keeps the default RangeIndex.
df = pd.read_csv(file_in, index_col=None)
# Stratify on all categorical columns jointly so the sample preserves their
# joint distribution.  train_size=size keeps that fraction of rows in Xt
# (the sample); the remainder Xv is discarded.  Fixed random_state makes
# the sample reproducible.
Xt, Xv = train_test_split(df, train_size = size, stratify=df[categorical_list],
                          random_state=12345)
print(f"{RED}--> SAVING RANDOM STRATIFIED SAMPLE: {file_out} {RESET}\n")
# index=False: do not write the RangeIndex as a spurious unnamed column
# (the original call wrote it, polluting the saved CSV).  Also removed the
# dead assignment `l = len(file_in)` — the value was never used.
Xt.to_csv(file_out, index=False)
# Re-read the saved sample so the comparisons below validate the file on
# disk, not just the in-memory DataFrame.
df2 = pd.read_csv(file_out)
target2 = df2[target].value_counts()
target1 = df[target].value_counts()
# Banner plus per-class table comparing target counts in the original data
# (ORI) to the saved sample (OUT).  RATIO ~ 1/size (= 5 for a 20% sample)
# for every class confirms the stratification worked.
print(f"{TEAL}" + "=" * 39 + f"{RESET}")
print(f"{GREEN}" + "*" * 2, "COMPARING ORIGINAL TO SAMPLE DATA",
      "*" * 2 + f"{RESET}")
print(f"{TEAL}" + "=" * 39 + f"{RESET}\n")
print(f"{TEAL}" + "=" * 36 + f"{RESET}")
print(f"{GREEN}SAMPLE SIZES{RESET}")
print(f"{GREEN}TARGET ORI OUT RATIO{RESET}")
print(f"{TEAL}" + "-" * 36 + f"{RESET}")
for target_val, ori in target1.items():
    out = target2[target_val]
    print(f"{target_val:15s}{ori: 7.0f}{out: 7.0f}{ori / out: 7.1f}")
# For each interval check variable, compare the per-class mean in the
# original data (ORI) to the mean in the saved sample (OUT).  A ratio near
# 1.00 indicates the sample preserved the variable's distribution.
for var in check_var:
    print("")
    means_full = df.groupby(target)[var].mean()
    means_samp = df2.groupby(target)[var].mean()
    print(f"{TEAL}" + "=" * 36 + f"{RESET}")
    print(f"{GREEN}CHECK AVERAGES FOR {var.upper():5s}{RESET}")
    print(f"{GREEN}TARGET ORI OUT RATIO{RESET}")
    print(f"{TEAL}" + "-" * 36 + f"{RESET}")
    for target_val, ori in means_full.items():
        out = means_samp[target_val]
        print(f"{target_val:15s}{ori: 7.2f}{out: 7.2f}{ori / out: 7.2f}")