user-churn / utils /data_prep.py
VasithaTilakumara
chatinterface layout change
7ae4950
import pandas as pd
from sklearn.model_selection import train_test_split
# ---------------- App Data ---------------- #
def load_app_data(path="data/lsapp.tsv"):
    """Read the tab-separated app-usage log into a DataFrame.

    The column literally named ``"lsapp.tsv"`` is renamed to ``"userid"``
    (presumably that is how the user-id header appears in the raw file --
    confirm against the dataset), and the ``"timestamp"`` column is parsed
    into pandas datetimes.

    Args:
        path: Location of the TSV file; anything ``pd.read_csv`` accepts.

    Returns:
        The loaded DataFrame with ``userid`` and a datetime ``timestamp``.
    """
    usage = pd.read_csv(path, sep="\t").rename(columns={"lsapp.tsv": "userid"})
    usage["timestamp"] = pd.to_datetime(usage["timestamp"])
    return usage
def preprocess_app(df, churn_after_days=30):
    """Build per-user features (session count, recency) and a churn label.

    Args:
        df: Event-level frame with a ``userid`` column and a datetime
            ``timestamp`` column (one row per session/event).
        churn_after_days: A user is labelled churned when strictly more than
            this many whole days separate their last event from the latest
            event in ``df``. Defaults to 30, the previously hard-coded value.

    Returns:
        A DataFrame with one row per user and the columns ``userid``,
        ``session_count``, ``timestamp`` (the user's last event),
        ``recency`` (whole days) and ``churn`` (0/1).
    """
    # Group once and reuse the grouper for both aggregations.
    by_user = df.groupby("userid")
    user_sessions = by_user.size().reset_index(name="session_count")
    last_session_time = by_user["timestamp"].max().reset_index()

    # Recency is measured against the newest event in the whole dataset,
    # not against the wall clock.
    dataset_end = df["timestamp"].max()
    last_session_time["recency"] = (
        dataset_end - last_session_time["timestamp"]
    ).dt.days

    features = pd.merge(user_sessions, last_session_time, on="userid")
    features["churn"] = (features["recency"] > churn_after_days).astype(int)
    return features
def split_app(features):
    """Split the app features into stratified train/test partitions.

    Uses ``session_count`` and ``recency`` as predictors and ``churn`` as the
    target; 30% of rows go to the test set, with a fixed seed of 42 and
    stratification on the target.

    Returns:
        Whatever ``train_test_split`` returns for the (predictors, target)
        pair.
    """
    predictor_columns = ["session_count", "recency"]
    target = features["churn"]
    predictors = features[predictor_columns]
    return train_test_split(
        predictors,
        target,
        test_size=0.3,
        random_state=42,
        stratify=target,
    )
# ---------------- Fitness Data ---------------- #
def load_fitness_data(path="data/DadosV3.csv"):
    """Read the fitness-membership CSV and numerically encode ``Gender``.

    ``"Male"`` becomes 1 and ``"Female"`` becomes 0; any other value in the
    column maps to NaN because ``Series.map`` is used with a dict.

    Args:
        path: Location of the CSV file; anything ``pd.read_csv`` accepts.

    Returns:
        The loaded DataFrame with the encoded ``Gender`` column.
    """
    gender_codes = {"Male": 1, "Female": 0}
    members = pd.read_csv(path)
    members["Gender"] = members["Gender"].map(gender_codes)
    return members
def preprocess_fitness(df):
    """Select the modelling columns (predictors plus ``Dropout``) from ``df``.

    Columns are returned in the fixed order listed below; any other columns
    in ``df`` are left out. A missing column raises ``KeyError`` from pandas.
    """
    modelling_columns = [
        "Age",
        "Gender",
        "EnrollmentDuration",
        "DaysWithoutFrequency",
        "AttendedClasses",
        "NumberOfActivities",
        "NumberOfRenewals",
        "Dropout",
    ]
    return df[modelling_columns]
def split_fitness(features):
    """Split the fitness features into stratified train/test partitions.

    Every column except ``Dropout`` is a predictor and ``Dropout`` is the
    target; 30% of rows go to the test set, with a fixed seed of 42 and
    stratification on the target.

    Returns:
        Whatever ``train_test_split`` returns for the (predictors, target)
        pair.
    """
    target = features["Dropout"]
    predictors = features.drop(columns=["Dropout"])
    return train_test_split(
        predictors,
        target,
        test_size=0.3,
        random_state=42,
        stratify=target,
    )