Create ModelOptimization.py
Browse files- ModelOptimization.py +103 -0
ModelOptimization.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import json
|
| 4 |
+
from Quin.Core import ModelOptimization
|
| 5 |
+
|
| 6 |
+
# Small offset added to ``price`` before taking its logarithm in
# ``FeatureEngineer.transform`` so that a price of 0 does not yield -inf.
EPSILON = 1e-5
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class FeatureEngineer(ModelOptimization):
    """Derive a numeric-only feature frame from raw listing records.

    ``transform`` reads the columns ``features`` (an iterable of strings per
    row), ``bathrooms``, ``bedrooms``, ``photos``, ``created`` and ``price``.
    It adds one 0/1 indicator column per entry of ``_FLAG_RULES`` plus a few
    derived numeric columns, and finally keeps only columns whose dtype is
    listed in ``_KEPT_DTYPES``.
    """

    # Ordered (column name, predicate) pairs.  Each predicate receives the
    # normalised feature text of one row (lower-case, tags joined by single
    # spaces, spaces inside a tag and '-' replaced by '_') and returns a bool.
    _FLAG_RULES = (
        ('dishwasher', lambda x: 'dishwasher' in x),
        ('doorman', lambda x: 'doorman' in x or 'concierge' in x),
        # The positive keywords are grouped so that 'no_pets' vetoes all of
        # them; without the parentheses ``and`` would bind to 'cats' only.
        # ('pets' needs no separate test: it always contains 'pet'.)
        ('pets', lambda x: ('pet' in x or 'dog' in x or 'cats' in x)
                           and 'no_pets' not in x),
        ('air_conditioning', lambda x: 'air_conditioning' in x or 'central' in x),
        ('parking', lambda x: 'parking' in x),
        ('balcony', lambda x: 'balcony' in x or 'deck' in x
                              or 'terrace' in x or 'patio' in x),
        ('bike', lambda x: 'bike' in x),
        ('storage', lambda x: 'storage' in x),
        ('outdoor', lambda x: 'outdoor' in x or 'courtyard' in x or 'garden' in x),
        ('roof', lambda x: 'roof' in x),
        ('gym', lambda x: 'gym' in x or 'fitness' in x),
        ('pool', lambda x: 'pool' in x),
        ('backyard', lambda x: 'backyard' in x),
        ('laundry', lambda x: 'laundry' in x),
        ('hardwood_floors', lambda x: 'hardwood_floors' in x),
        ('new_construction', lambda x: 'new_construction' in x),
        ('dryer', lambda x: 'dryer' in x),
        ('elevator', lambda x: 'elevator' in x),
        ('garage', lambda x: 'garage' in x),
        ('pre_war', lambda x: 'pre_war' in x or 'prewar' in x),
        ('post_war', lambda x: 'post_war' in x or 'postwar' in x),
        ('no_fee', lambda x: 'no_fee' in x),
        ('low_fee', lambda x: 'reduced_fee' in x or 'low_fee' in x),
        ('fire', lambda x: 'fireplace' in x),
        ('private', lambda x: 'private' in x),
        ('wheelchair', lambda x: 'wheelchair' in x),
        ('internet', lambda x: 'wifi' in x or 'wi_fi' in x or 'internet' in x),
        ('yoga', lambda x: 'yoga' in x),
        ('furnished', lambda x: 'furnished' in x),
        ('multi_level', lambda x: 'multi_level' in x),
        ('exclusive', lambda x: 'exclusive' in x),
        ('high_ceil', lambda x: 'high_ceil' in x),
        ('green', lambda x: 'green_b' in x),
        ('stainless', lambda x: 'stainless_' in x),
        ('simplex', lambda x: 'simplex' in x),
        ('public', lambda x: 'public' in x),
    )

    # Only columns whose dtype name appears here are returned by ``transform``.
    _KEPT_DTYPES = ('int64', 'float64', 'int8')

    # Columns removed from the output regardless of their dtype.
    _DROPPED_COLUMNS = ('latitude', 'longitude', 'listing_id')

    def apply(self, df, k, condition):
        """Add indicator column ``k`` to ``df`` in place.

        ``condition`` is called with each value of ``df['features']``; its
        result is stored as ``np.int8``.
        """
        df[k] = df['features'].apply(condition).astype(np.int8)

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; the transformation has no fitted state."""
        return self

    def transform(self, X, y=None):
        """Return a new numeric frame derived from ``X``; ``X`` is not modified.

        ``y`` is accepted but unused.
        """
        df = X.copy()

        # Count the tags while ``features`` is still a collection; taking
        # len() after the join below would count characters instead.
        num_features = df['features'].apply(len)

        # Normalise each row's tags into one lower-case string in which
        # spaces inside a tag and hyphens are replaced by underscores.
        df['features'] = df['features'].apply(
            lambda tags: ' '.join(tag.replace(' ', '_') for tag in tags)
            .lower()
            .replace('-', '_')
        )

        for column, predicate in self._FLAG_RULES:
            self.apply(df, column, predicate)

        # Cap room counts at 5 (anything not below 5 becomes 5).
        for column in ('bathrooms', 'bedrooms'):
            df[column] = df[column].apply(lambda n: n if n < 5 else 5)

        df['num_photos'] = df['photos'].apply(len)
        df['num_features'] = num_features

        created = pd.to_datetime(df.pop('created'))
        # Age in whole days relative to the moment ``transform`` runs.
        df['listing_age'] = (pd.to_datetime('today') - created).apply(lambda x: x.days)

        df['room_dif'] = df['bedrooms'] - df['bathrooms']
        df['room_sum'] = df['bedrooms'] + df['bathrooms']

        # Clamp the divisor to at least 0.5 so listings with zero rooms do
        # not divide by zero.
        divisor = df['room_sum'].apply(lambda total: max(total, .5))
        df['price_per_room'] = df['price'] / divisor
        df['bedrooms_share'] = df['bedrooms'] / divisor

        df['price'] = np.log(df['price'] + EPSILON)

        # Keep only the accepted numeric dtypes and leave out the excluded
        # columns.  Selecting by name tolerates excluded columns that are
        # absent or were already filtered out by dtype.
        kept = [
            name for name, dtype in df.dtypes.items()
            if dtype.name in self._KEPT_DTYPES
            and name not in self._DROPPED_COLUMNS
        ]
        return df[kept]
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def encode(x):
    """Map an interest-level label to an integer code.

    Returns 0 for ``'low'``, 1 for ``'medium'`` and 2 for ``'high'``.
    Any other value yields ``None``.
    """
    if x == 'low':
        return 0
    if x == 'medium':
        return 1
    if x == 'high':
        return 2
    # Unrecognised label: made explicit so every path returns a value.
    return None
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def get_data(path='train.json'):
    """Load listings from a JSON file and return ``(features, target)``.

    Parameters
    ----------
    path : str
        Location of the JSON file to read.  Defaults to ``'train.json'`` in
        the current working directory.

    Returns
    -------
    tuple
        ``(df, target)`` where ``df`` is the output of
        ``FeatureEngineer().fit_transform`` and ``target`` is the
        ``interest_level`` column passed through ``encode``.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(path, 'r', encoding='utf-8') as raw_data:
        data = json.load(raw_data)

    df = pd.DataFrame(data)
    # Remove the label column before feature engineering so it cannot leak
    # into the feature frame.
    target = df.pop('interest_level').apply(encode)

    df = FeatureEngineer().fit_transform(df)
    return df, target
|