"""Loading, feature engineering and train/test splitting for two churn datasets."""

import pandas as pd
from sklearn.model_selection import train_test_split


# ---------------- App Data ---------------- #
def load_app_data(path: str = "data/lsapp.tsv") -> pd.DataFrame:
    """Read the tab-separated app-usage log and parse its timestamps.

    Args:
        path: Location of the TSV file.

    Returns:
        The loaded frame with ``timestamp`` converted to datetime.
    """
    df = pd.read_csv(path, sep="\t")
    # NOTE(review): presumably the raw header of the user column is literally
    # "lsapp.tsv" -- confirm against the file. ``rename`` ignores keys that are
    # not present, so this is a no-op when the column is already named.
    df = df.rename(columns={"lsapp.tsv": "userid"})
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    return df


def preprocess_app(df: pd.DataFrame, churn_days: int = 30) -> pd.DataFrame:
    """Build per-user features: session count, recency and a churn label.

    Args:
        df: Frame with at least ``userid`` and datetime ``timestamp`` columns.
        churn_days: A user is labelled as churned when their last activity is
            more than this many whole days before the newest timestamp in
            ``df``.

    Returns:
        One row per ``userid`` with the columns ``userid``, ``session_count``
        (number of rows for that user), ``timestamp`` (the user's latest
        timestamp), ``recency`` (whole days between the newest timestamp in
        ``df`` and the user's latest one) and ``churn`` (1 or 0).
    """
    # Recency is measured against the newest event in the data set itself,
    # not against the wall clock.
    reference_time = df["timestamp"].max()

    # One pass over the groups instead of two groupbys followed by a merge.
    features = (
        df.groupby("userid")
        .agg(
            session_count=("timestamp", "size"),
            timestamp=("timestamp", "max"),
        )
        .reset_index()
    )
    features["recency"] = (reference_time - features["timestamp"]).dt.days
    features["churn"] = (features["recency"] > churn_days).astype(int)
    return features


def split_app(features: pd.DataFrame) -> list:
    """Split the app features into stratified train and test sets (70/30).

    Args:
        features: Frame with ``session_count``, ``recency`` and ``churn``.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    X = features[["session_count", "recency"]]
    y = features["churn"]
    # Fixed seed for reproducibility; stratify to keep the churn ratio equal
    # in both partitions.
    return train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


# ---------------- Fitness Data ---------------- #
def load_fitness_data(path: str = "data/DadosV3.csv") -> pd.DataFrame:
    """Read the fitness-membership CSV and encode ``Gender`` numerically.

    Args:
        path: Location of the CSV file.

    Returns:
        The loaded frame with ``Gender`` mapped to 1 (``"Male"``) or
        0 (``"Female"``).
    """
    df = pd.read_csv(path)
    # Any value other than "Male"/"Female" (including missing ones) becomes
    # NaN, because ``Series.map`` with a dict yields NaN for unmapped keys.
    df["Gender"] = df["Gender"].map({"Male": 1, "Female": 0})
    return df


def preprocess_fitness(df: pd.DataFrame) -> pd.DataFrame:
    """Select the model inputs and the ``Dropout`` target from the raw frame.

    Args:
        df: Frame containing at least the selected columns.

    Returns:
        A new frame holding only the feature columns plus ``Dropout``.
    """
    columns = [
        "Age",
        "Gender",
        "EnrollmentDuration",
        "DaysWithoutFrequency",
        "AttendedClasses",
        "NumberOfActivities",
        "NumberOfRenewals",
        "Dropout",
    ]
    # Return an explicit copy so callers can modify the result without
    # touching ``df`` or triggering SettingWithCopyWarning.
    return df[columns].copy()


def split_fitness(features: pd.DataFrame) -> list:
    """Split the fitness features into stratified train and test sets (70/30).

    Args:
        features: Frame whose ``Dropout`` column is the target; every other
            column is used as an input feature.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    X = features.drop(columns=["Dropout"])
    y = features["Dropout"]
    # Fixed seed for reproducibility; stratify to keep the dropout ratio equal
    # in both partitions.
    return train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)