Spaces:

antonbol
/

titanic

Runtime error

App Files Files Community

titanic / feature_engineering.py

antonbol

removed title again lol

20f693c about 3 years ago

raw

history blame contribute delete

2.65 kB


	################ Encoding tables ################
	# Disabled cabin-deck encoding, kept for reference:
	# cabin_dict= "Cabin": {"N": 0, "C": 1, "E": 2, "G": 3, "D":4, "A": 5, "B": 6, "F": 7, "T": 8}

	# Per-column lookup tables: raw category value -> integer code.
	sex_dict = dict(female=1, male=0)
	embarked_dict = dict(S=0, C=1, Q=2)

	# Nested {column name: {raw value: code}} mapping. It holds its own copies
	# of the tables above, so changing one does not affect the other.
	cleanup_catergories = {"sex": dict(sex_dict), "embarked": dict(embarked_dict)}

	# Disabled title table (code -> list of titles). It is written as a bare
	# string literal, so nothing is bound to a name here.
	"""
	title_dict = {
	0: ["Mr"],
	1: ["Miss"],
	2: ["Mrs"],
	3: ["Master"],
	# Rare titles, not worth individual categorys
	4: [
	"Dr",
	"Rev",
	"Mlle",
	"Major",
	"Col",
	"Countess",
	"Capt",
	"Ms",
	"Sir",
	"Lady",
	"Nme",
	"Don",
	"Jonkheer",
	],
	}
	"""
	#####################################################

	def feat_eng(df):
		"""
		Main function containing the feature engineering part
		of the pipeline.

		The input frame is not modified; a new frame is returned.

		Steps, in order:
		1. Drop the unused columns Ticket, Cabin, Fare, PassengerId and Name
		   (any of them that is absent is skipped).
		2. Drop rows whose Embarked or Age value is missing.
		3. Cast Age to int.
		4. Lower-case every column name.
		5. Encode values through the module-level ``cleanup_catergories``
		   mapping.

		Args:
			df: pandas DataFrame holding at least the ``Embarked`` and
				``Age`` columns.

		Returns:
			A new DataFrame with lower-case column names and encoded values.

		Raises:
			KeyError: if the ``Embarked`` or ``Age`` column is missing.
		"""
		# NOTE: earlier versions of this step also derived a Title feature from
		# Name, sampled missing ages per title, binned Age / Fare / SibSp and
		# reduced Cabin to its first letter. All of that is currently disabled.

		# Drop features. A non-mutating drop keeps the caller's frame intact,
		# and errors="ignore" tolerates input that lacks some of these columns.
		unused_columns = ["Ticket", "Cabin", "Fare", "PassengerId", "Name"]
		df = df.drop(columns=unused_columns, errors="ignore")

		# Drop NaNs. Age has to be included: with the age interpolation
		# disabled, a missing age would make the int cast below raise.
		# The explicit copy avoids chained-assignment warnings further down.
		df = df.dropna(subset=["Embarked", "Age"]).copy()

		# Cast age to int
		df["Age"] = df["Age"].astype("int")

		# Fixes for hopsworks...
		# Must happen before the encoding: the mapping is keyed by the
		# lower-case column names.
		df.columns = df.columns.str.lower()

		# Final encoding
		return df.replace(cleanup_catergories)