removed title
Browse files- feature_engineering.py +15 -15
feature_engineering.py
CHANGED
|
@@ -48,30 +48,30 @@ def feat_eng(df):
|
|
| 48 |
# )
|
| 49 |
|
| 50 |
# Drop features and NaNs
|
| 51 |
-
df.drop(["Ticket", "Cabin", "Fare", "PassengerId"], axis=1, inplace=True)
|
| 52 |
df = df[df["Embarked"].notna()]
|
| 53 |
|
| 54 |
# Feature engineering
|
| 55 |
# Create a title feature
|
| 56 |
-
if "Name" in df.columns:
|
| 57 |
-
|
| 58 |
-
|
| 59 |
|
| 60 |
-
# Interpolate missing ages
|
| 61 |
-
for title in df["Title"].unique():
|
| 62 |
-
|
| 63 |
-
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
|
| 74 |
-
|
| 75 |
|
| 76 |
# Cast age to int
|
| 77 |
df["Age"] = df["Age"].astype("int")
|
|
|
|
| 48 |
# )
|
| 49 |
|
| 50 |
# Drop features and NaNs
|
| 51 |
+
df.drop(["Ticket", "Cabin", "Fare", "PassengerId", "Title"], axis=1, inplace=True)
|
| 52 |
df = df[df["Embarked"].notna()]
|
| 53 |
|
| 54 |
# Feature engineering
|
| 55 |
# Create a title feature
|
| 56 |
+
# if "Name" in df.columns:
|
| 57 |
+
# df["Title"] = df.Name.str.extract("([A-Za-z]+)\\.")
|
| 58 |
+
# df.drop("Name", axis=1, inplace=True)
|
| 59 |
|
| 60 |
+
# # Interpolate missing ages
|
| 61 |
+
# for title in df["Title"].unique():
|
| 62 |
+
# # This should be optimized
|
| 63 |
+
# mask = (df["Title"] == title) & df["Age"].isna()
|
| 64 |
|
| 65 |
+
# # Get suitable candidates for age sampling
|
| 66 |
+
# candidates = df.loc[(df["Title"] == title) & df["Age"].notna()]
|
| 67 |
|
| 68 |
+
# g = candidates.groupby("Age", dropna=True)["Age"].count()
|
| 69 |
+
# g = g.apply(lambda x: x / g.sum())
|
| 70 |
|
| 71 |
+
# weights = g.to_numpy()
|
| 72 |
+
# ages = g.index
|
| 73 |
|
| 74 |
+
# df.update(df["Age"][mask].apply(lambda x: np.random.choice(ages, p=weights)))
|
| 75 |
|
| 76 |
# Cast age to int
|
| 77 |
df["Age"] = df["Age"].astype("int")
|