File size: 1,623 Bytes
53b92fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
from sklearn.model_selection import train_test_split

# ---------------- App Data ---------------- #
def load_app_data(path="data/lsapp.tsv"):
    """Load the app-usage log from a tab-separated file.

    Args:
        path: Location of the TSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        A DataFrame in which a column named ``"lsapp.tsv"`` (if present) has
        been renamed to ``"userid"`` and the ``"timestamp"`` column has been
        parsed to datetimes.

    Raises:
        KeyError: If the file has no ``"timestamp"`` column.
    """
    raw = pd.read_csv(path, sep="\t")
    # NOTE(review): presumably the shipped file's user-id header is literally
    # "lsapp.tsv"; confirm against the dataset. Other headers are untouched.
    events = raw.rename(columns={"lsapp.tsv": "userid"})
    events["timestamp"] = pd.to_datetime(events["timestamp"])
    return events

def preprocess_app(df, churn_days=30):
    """Build one row of churn features per user from the raw app log.

    Args:
        df: Event-level DataFrame with a ``"userid"`` column and a datetime
            ``"timestamp"`` column. It is not modified.
        churn_days: A user is labelled as churned when more than this many
            whole days separate their last event from the latest event in
            ``df``. Defaults to 30.

    Returns:
        A DataFrame sorted by ``"userid"`` with the columns ``"userid"``,
        ``"session_count"`` (number of rows for the user), ``"timestamp"``
        (the user's latest event), ``"recency"`` (whole days between that
        event and the latest event overall) and ``"churn"`` (1 or 0).

    Raises:
        KeyError: If ``"userid"`` or ``"timestamp"`` is missing from ``df``.
    """
    # One grouped pass yields both the per-user row count and the last event
    # time, so no separate merge step is needed.
    per_user = df.groupby("userid")["timestamp"]
    features = per_user.agg(session_count="size", timestamp="max").reset_index()

    # Recency is measured against the newest event in the data set rather
    # than wall-clock time, so results are reproducible for a fixed input.
    reference_time = df["timestamp"].max()
    features["recency"] = (reference_time - features["timestamp"]).dt.days

    # NOTE(review): churn is derived directly from recency, and split_app
    # uses recency as a predictor, so the label is trivially recoverable
    # from the features. Confirm this is intended.
    features["churn"] = (features["recency"] > churn_days).astype(int)
    return features

def split_app(features):
    """Split the app features into stratified train and test sets.

    Args:
        features: DataFrame with ``"session_count"``, ``"recency"`` and
            ``"churn"`` columns.

    Returns:
        The list returned by ``train_test_split``: ``X_train``, ``X_test``,
        ``y_train``, ``y_test``, with 30% of the rows held out for testing.
    """
    target = features["churn"]
    predictors = features[["session_count", "recency"]]
    # Fixed seed keeps the split reproducible; stratifying on the label keeps
    # the churn ratio the same in both partitions.
    return train_test_split(
        predictors,
        target,
        test_size=0.3,
        random_state=42,
        stratify=target,
    )

# ---------------- Fitness Data ---------------- #
def load_fitness_data(path="data/DadosV3.csv"):
    """Load the fitness-membership CSV and encode the gender column.

    Args:
        path: Location of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        The loaded DataFrame with ``"Gender"`` replaced by 1 for ``"Male"``
        and 0 for ``"Female"``. Any other value becomes NaN.

    Raises:
        KeyError: If the file has no ``"Gender"`` column.
    """
    gender_codes = {"Male": 1, "Female": 0}
    members = pd.read_csv(path)
    # Series.map leaves values that are not keys of the mapping as NaN.
    members["Gender"] = members["Gender"].map(gender_codes)
    return members

def preprocess_fitness(df):
    """Keep only the modelling columns of the fitness data.

    Args:
        df: DataFrame containing at least the columns listed below.

    Returns:
        A DataFrame with the seven predictor columns followed by the
        ``"Dropout"`` label, in the order listed.

    Raises:
        KeyError: If any of the listed columns is missing from ``df``.
    """
    selected_columns = [
        "Age",
        "Gender",
        "EnrollmentDuration",
        "DaysWithoutFrequency",
        "AttendedClasses",
        "NumberOfActivities",
        "NumberOfRenewals",
        "Dropout",  # label column, kept last
    ]
    return df[selected_columns]

def split_fitness(features):
    """Split the fitness features into stratified train and test sets.

    Args:
        features: DataFrame whose ``"Dropout"`` column is the label; every
            other column is used as a predictor.

    Returns:
        The list returned by ``train_test_split``: ``X_train``, ``X_test``,
        ``y_train``, ``y_test``, with 30% of the rows held out for testing.
    """
    label_column = "Dropout"
    target = features[label_column]
    predictors = features.drop(columns=[label_column])
    # Fixed seed keeps the split reproducible; stratifying on the label keeps
    # the dropout ratio the same in both partitions.
    return train_test_split(
        predictors,
        target,
        test_size=0.3,
        random_state=42,
        stratify=target,
    )