In [96]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np

In [2]:
# Record the pandas version this notebook was run with
pd.__version__

'2.3.2'

In [55]:
# Load the dataset from a CSV file located in the current working directory
df = pd.read_csv("car_fuel_efficiency.csv")

In [56]:
# Preview the first five rows to see the columns and typical values
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


## 1. No. of Records

In [57]:
# (number of rows, number of columns); the first entry is the record count
df.shape

(9704, 11)

## 2. Distinct fuel types

In [58]:
# Pull the fuel-type column out on its own so it can be inspected directly
fuels = df.loc[:, 'fuel_type']
fuels.head(5)

0    Gasoline
1    Gasoline
2    Gasoline
3      Diesel
4    Gasoline
Name: fuel_type, dtype: object

In [59]:
# Number of distinct fuel types (nunique() ignores NaN by default)
fuels.nunique()

2

## 3. Null Values

In [60]:
# Missing-value count per column
df.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

Clearly, `fuel_type` has no missing values; only `num_cylinders`, `horsepower`, `acceleration`, and `num_doors` contain nulls.

## 4. Max fuel efficiency of cars from Asia

In [61]:
# List the column names to find the origin and fuel-efficiency columns
df.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [62]:
# Look at sample rows again to see which values the origin column takes
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [63]:
# Boolean mask that is True for rows whose origin is Asia
mask_asia = df['origin'].eq('Asia')
# Keep only the Asian cars
eff = df.loc[mask_asia]
eff.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
8,250,1.0,174.0,2714.21931,10.3,2016,Asia,Diesel,Front-wheel drive,-1.0,16.823554
12,320,5.0,145.0,2783.868974,15.1,2010,Asia,Diesel,All-wheel drive,1.0,16.17582
14,200,6.0,160.0,3582.687368,14.9,2007,Asia,Diesel,All-wheel drive,0.0,11.871091
20,150,3.0,197.0,2231.808142,18.7,2011,Asia,Gasoline,Front-wheel drive,1.0,18.889083
21,160,4.0,133.0,2659.431451,,2016,Asia,Gasoline,Front-wheel drive,-1.0,16.07773


In [64]:
# Highest fuel efficiency among the Asian cars.
# Series.max() is vectorised and skips NaN, whereas the builtin max() walks the
# Series element by element in Python and gives order-dependent results if a
# NaN is present.
max_eff = eff['fuel_efficiency_mpg'].max()
max_eff

23.759122836520497

## 5. Median value of horsepower

In [65]:
# Missing-value count per column, to see how many horsepower values are absent
df.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [66]:
# Median of the horsepower column before imputation (median() skips NaN by default)
df['horsepower'].median()

149.0

In [67]:
# Frequency of each horsepower value, most frequent first (NaN is excluded by default)
df['horsepower'].value_counts()

horsepower
152.0    142
145.0    141
151.0    134
148.0    130
141.0    130
        ... 
40.0       1
57.0       1
245.0      1
252.0      1
61.0       1
Name: count, Length: 192, dtype: int64

In [74]:
# Alternatively, mode() returns the most frequent value(s) directly, sorted
# ascending; [0] takes the first (smallest) one if there is a tie
mode_horsepower = df['horsepower'].mode()[0]
mode_horsepower

np.float64(152.0)

In [80]:
# Fill the missing values in the horsepower column with the mode.
# Assign the result back rather than calling fillna(..., inplace=True) on
# df['horsepower']: that form is chained assignment on an intermediate object.
# pandas 2.x flags it with a FutureWarning (hidden here by the warnings filter),
# and under Copy-on-Write in pandas 3.0 it no longer updates df at all.
df['horsepower'] = df['horsepower'].fillna(mode_horsepower)

In [81]:
# Confirm that horsepower no longer has any missing values
df.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower               0
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

Clearly the null values in `horsepower` have been imputed (its null count is now 0); the other columns are unchanged.

In [82]:
# Recalculate the median now that the missing values are filled with the mode
df['horsepower'].median()

152.0

## 6. Model building

In [84]:
# Preview the frame after imputation; row 3's horsepower now shows the filled-in mode
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,152.0,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [83]:
# mask_asia was built in the max-fuel-efficiency section; preview it before reusing it
mask_asia.head()

0    False
1    False
2    False
3    False
4    False
Name: origin, dtype: bool

In [88]:
# Feature columns used to build the matrix X
columns_to_keep = ['vehicle_weight', 'model_year']

In [94]:
# Restrict to the Asian cars, then keep the first seven rows of the two feature columns
df_asia = df.loc[mask_asia]
df_asia_final = df_asia.loc[:, columns_to_keep].iloc[:7]
df_asia_final

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
34,2844.227534,2014
38,3761.994038,2019


In [100]:
# Convert the feature frame into a plain NumPy matrix
X = df_asia_final.to_numpy()
X.shape

(7, 2)

In [110]:
# Gram matrix: multiply the transpose of X by X, (2, 7) @ (7, 2) -> (2, 2)
XTX = X.T @ X
XTX.shape

(2, 2)

In [111]:
# Invert the 2x2 matrix XTX; np.linalg.inv raises LinAlgError if it is singular
XTX_inv = np.linalg.inv(XTX)
XTX_inv.shape

(2, 2)

In [112]:
# Target vector with seven values
y = np.asarray([1100, 1300, 800, 900, 1000, 1100, 1200])
y

array([1100, 1300,  800,  900, 1000, 1100, 1200])

In [114]:
# Normal equation: w = (X^T X)^-1 X^T y, computed in two steps
step = np.matmul(XTX_inv, X.T)  # inverse of XTX times the transpose of X
w = np.matmul(step, y)          # then times y gives the weights

In [115]:
# Add up every element of the weight vector
w.sum()

np.float64(0.5187709081074016)

## End of Week 1