################ Dicts with encodings ################
# Disabled cabin encoding, kept for reference:
# cabin_dict= "Cabin": {"N": 0, "C": 1, "E": 2, "G": 3, "D":4, "A": 5, "B": 6, "F": 7, "T": 8}

# String value -> integer code for the "sex" column.
sex_dict = dict(female=1, male=0)

# String value -> integer code for the "embarked" column.
embarked_dict = dict(S=0, C=1, Q=2)

# Nested {column name: {value: code}} mapping that feat_eng() hands to
# DataFrame.replace(). The keys are lower-case because feat_eng() lower-cases
# the column names before encoding. Independent copies are stored so that
# mutating sex_dict / embarked_dict later does not silently change this mapping.
# NOTE: the identifier is misspelled ("catergories") but is kept as-is because
# other code refers to it by this name.
cleanup_catergories = {
    "sex": dict(sex_dict),
    "embarked": dict(embarked_dict),
}

# Reversed layout (integer code -> list of titles). The definition below is
# disabled: it lives inside a bare string literal, so title_dict is never
# actually created.
# NOTE(review): "Nme" looks like a typo for "Mme" -- confirm before enabling.
"""
title_dict = {
    0: ["Mr"],
    1: ["Miss"],
    2: ["Mrs"],
    3: ["Master"],
    # Rare titles, not worth individual categorys
    4: [
        "Dr",
        "Rev",
        "Mlle",
        "Major",
        "Col",
        "Countess",
        "Capt",
        "Ms",
        "Sir",
        "Lady",
        "Nme",
        "Don",
        "Jonkheer",
    ],
}
"""
#####################################################
def feat_eng(df):
    """
    Main function containing the feature engineering part
    of the pipeline.

    Steps, in order:

    1. Drop the "Ticket", "Cabin", "Fare", "PassengerId" and "Name" columns.
    2. Drop every row whose "Embarked" or "Age" value is missing.
    3. Cast "Age" to int (fractional ages are truncated).
    4. Lower-case all column names (workaround noted as needed for hopsworks).
    5. Encode the "sex" and "embarked" columns with the module-level
       ``cleanup_catergories`` mapping.

    Args:
        df: pandas DataFrame holding the raw passenger data. It must contain
            the five columns dropped in step 1 plus "Embarked" and "Age".
            The frame is NOT modified; all work happens on copies.

    Returns:
        A new DataFrame with the transformations above applied.

    Raises:
        KeyError: if one of the required columns is absent from ``df``.
    """
    # NOTE: earlier experiments were disabled in this function and have been
    # removed from the body: extracting a "Title" feature from "Name",
    # sampling missing ages from the age distribution of passengers sharing
    # the same title, binning Age / Fare / SibSp with pd.cut, and reducing
    # "Cabin" to its first letter with "N" for missing values.

    # Drop features. drop() without inplace returns a new frame, so the
    # caller's DataFrame keeps all of its columns.
    features = df.drop(columns=["Ticket", "Cabin", "Fare", "PassengerId", "Name"])

    # Drop NaNs. Rows without an age have to go as well: no imputation is
    # performed, and casting NaN to int below would raise.
    features = features.dropna(subset=["Embarked", "Age"])

    # Cast age to int. DataFrame.astype builds a new frame instead of
    # assigning into a filtered slice, which avoids SettingWithCopyWarning.
    features = features.astype({"Age": "int"})

    # Fixes for hopsworks...
    features.columns = features.columns.str.lower()

    # Final encoding
    return features.replace(cleanup_catergories)