# 1. Load the Dataset

The Iris dataset contains three classes of Iris species (setosa, versicolor, virginica) and four features (sepal length, sepal width, petal length, petal width). These features can be used to predict the species.

In [76]:
!pip install datasets



In [77]:
from datasets import load_dataset

# Load the "scikit-learn/iris" dataset through the `datasets` library.
# The result is indexed by split name; the 'train' split is used below.
ds = load_dataset("scikit-learn/iris")

In [78]:
import pandas as pd

# Convert the 'train' split into a pandas DataFrame for inspection and preprocessing.
data = pd.DataFrame(ds['train'])
# Preview the first rows to confirm the columns loaded as expected.
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


# 2. Preprocess the Data
We need to split the data into training and testing sets for evaluation. We’ll also normalize the data to improve model performance.


In [79]:
# Summarize column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [80]:
# `Id` appears to be a running row identifier, not a measurement, so it is
# removed before modelling. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run after `Id` is already gone.
data = data.drop(columns='Id', errors='ignore')

In [81]:
# Preview the frame again to confirm that the `Id` column is no longer present.
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


refs: [Data science | Data Pre-processing using Scikit-learn| Iris dataset| Jay Patel@medium](https://jay190301.medium.com/data-science-data-pre-processing-using-scikit-learn-iris-dataset-1ba0a9ae04e6)

## Data Encoding

1. Label encoding

In [82]:
from sklearn.preprocessing import LabelEncoder

# Map each species name to an integer code and store the codes back in `data`.
le = LabelEncoder()
species_codes = le.fit_transform(data['Species'])
data['Species'] = species_codes

# Show how many rows carry each encoded label.
data['Species'].value_counts()

Unnamed: 0_level_0,count
Species,Unnamed: 1_level_1
0,50
1,50
2,50


In [83]:
# Original species names seen by the encoder; position i is the name encoded as integer i.
le.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

2. One-hot encoding

In [84]:
from sklearn.preprocessing import OneHotEncoder

# The encoder is fed a 2-D array, so the label vector is reshaped into one column.
species_column = data['Species'].values.reshape(-1, 1)

ohe = OneHotEncoder()
encoded_species = ohe.fit_transform(species_column)
# Convert the encoder output to a dense array for display.
transformed_data = encoded_species.toarray()

In [85]:
# Categories learned by the one-hot encoder (one array per input column).
ohe.categories_

[array([0, 1, 2])]

In [86]:
# Wrap the one-hot array in a DataFrame aligned with `data`, one column per species.
one_hot_columns = ['setosa', 'versicolor', 'virginica']
transformed_data = pd.DataFrame(data=transformed_data, index=data.index, columns=one_hot_columns)
transformed_data.head()

Unnamed: 0,setosa,versicolor,virginica
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0


## Normalization

$$
x' = \frac{x - \text{min}(x)}{\text{max}(x) - \text{min}(x)}
$$

In [87]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Rescale every column of `data` to the [0, 1] range.
# NOTE(review): `data` still contains the encoded `Species` label, so the label
# is rescaled as well — fine for a demo, but confirm this is intended.
mms = MinMaxScaler(feature_range=(0, 1)).fit(data)
normalized_data = mms.transform(data)

# The scaler returns a plain array; restore the column names for display.
pd.DataFrame(normalized_data, columns=data.columns)


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,0.222222,0.625000,0.067797,0.041667,0.0
1,0.166667,0.416667,0.067797,0.041667,0.0
2,0.111111,0.500000,0.050847,0.041667,0.0
3,0.083333,0.458333,0.084746,0.041667,0.0
4,0.194444,0.666667,0.067797,0.041667,0.0
...,...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667,1.0
146,0.555556,0.208333,0.677966,0.750000,1.0
147,0.611111,0.416667,0.711864,0.791667,1.0
148,0.527778,0.583333,0.745763,0.916667,1.0


## Standardization

$$
z = \frac{x - \mu}{\sigma}
$$

In [88]:
# Standardize every column of `data` (subtract the mean, divide by the standard deviation).
# NOTE(review): the encoded `Species` label is standardized too — confirm this is intended.
standard_scaler = StandardScaler().fit(data)
standardized_data = standard_scaler.transform(data)

# Restore the column names for display.
pd.DataFrame(standardized_data, columns=data.columns)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,-0.900681,1.032057,-1.341272,-1.312977,-1.224745
1,-1.143017,-0.124958,-1.341272,-1.312977,-1.224745
2,-1.385353,0.337848,-1.398138,-1.312977,-1.224745
3,-1.506521,0.106445,-1.284407,-1.312977,-1.224745
4,-1.021849,1.263460,-1.341272,-1.312977,-1.224745
...,...,...,...,...,...
145,1.038005,-0.124958,0.819624,1.447956,1.224745
146,0.553333,-1.281972,0.705893,0.922064,1.224745
147,0.795669,-0.124958,0.819624,1.053537,1.224745
148,0.432165,0.800654,0.933356,1.447956,1.224745


## Imputation of missing values


In [89]:
from sklearn.impute import SimpleImputer
import numpy as np

# Replace NaN entries with the column mean. `data.info()` reported no missing
# values, so this only demonstrates the API on this dataset.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputed_data = imputer.fit_transform(data)

# Count remaining missing values per column (expected to be zero everywhere).
imputed_frame = pd.DataFrame(imputed_data)
imputed_frame.isna().sum()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


## Discretization

1. Quantile Discretization Transform

In [90]:
from sklearn.preprocessing import KBinsDiscretizer

# Bin each column into (up to) 10 ordinal bins using the 'quantile' strategy.
trans = KBinsDiscretizer(strategy='quantile', encode='ordinal', n_bins=10).fit(data)
new_data = trans.transform(data)

# Show the bin index assigned to every value, with the original column names.
pd.DataFrame(new_data, columns=data.columns)



Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,2.0,7.0,1.0,1.0,0.0
1,1.0,4.0,1.0,1.0,0.0
2,0.0,6.0,0.0,1.0,0.0
3,0.0,5.0,2.0,1.0,0.0
4,2.0,7.0,1.0,1.0,0.0
...,...,...,...,...,...
145,8.0,4.0,7.0,8.0,1.0
146,7.0,1.0,7.0,7.0,1.0
147,7.0,4.0,7.0,7.0,1.0
148,6.0,7.0,8.0,8.0,1.0


2. Uniform Discretization Transform

In [91]:
# Bin each column into 10 ordinal bins using the 'uniform' strategy.
trans = KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=10).fit(data)
new_data = trans.transform(data)

# Show the bin index assigned to every value, with the original column names.
pd.DataFrame(new_data, columns=data.columns)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,2.0,6.0,0.0,0.0,0.0
1,1.0,4.0,0.0,0.0,0.0
2,1.0,5.0,0.0,0.0,0.0
3,0.0,4.0,0.0,0.0,0.0
4,1.0,6.0,0.0,0.0,0.0
...,...,...,...,...,...
145,6.0,4.0,7.0,9.0,9.0
146,5.0,2.0,6.0,7.0,9.0
147,6.0,4.0,7.0,7.0,9.0
148,5.0,5.0,7.0,9.0,9.0


3. KMeans Discretization Transform

In [92]:
# Bin each column into (up to) 10 ordinal bins using the 'kmeans' strategy.
# NOTE(review): the recorded output shows a warning raised during this fit —
# presumably because `Species` has only three distinct values; confirm.
trans = KBinsDiscretizer(strategy='kmeans', encode='ordinal', n_bins=10).fit(data)
new_data = trans.transform(data)

# Show the bin index assigned to every value, with the original column names.
pd.DataFrame(new_data, columns=data.columns)

  return fit_method(estimator, *args, **kwargs)


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,2.0,6.0,0.0,0.0,0.0
1,1.0,4.0,0.0,0.0,0.0
2,1.0,4.0,0.0,0.0,0.0
3,0.0,4.0,0.0,0.0,0.0
4,1.0,6.0,0.0,0.0,0.0
...,...,...,...,...,...
145,6.0,4.0,5.0,9.0,2.0
146,5.0,2.0,5.0,7.0,2.0
147,6.0,4.0,5.0,7.0,2.0
148,5.0,5.0,6.0,9.0,2.0


## For this dataset we use standardization


In [93]:
# Separate the target column (encoded species) from the four measurement columns.
y = data['Species']
X = data.drop(columns='Species')

In [94]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [95]:
# Fit the scaler on the training features only, then apply the same
# statistics to both splits so the test set does not influence the scaling.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# 3. Choose and Train a Model

https://en.wikipedia.org/wiki/Logistic_regression

In [96]:
from sklearn.linear_model import LogisticRegression

# Create a logistic-regression classifier with default settings and fit it
# on the standardized training data.
model = LogisticRegression().fit(X_train, y_train)
# Display the fitted estimator as the cell output.
model


# 4. Make Predictions with the Trained Model


In [97]:
# Predict the encoded species for the held-out (standardized) test features.
y_pred = model.predict(X_test)

# 5. Evaluate the Model


In [98]:
from sklearn.metrics import accuracy_score, classification_report

# Compare the test-set predictions with the true labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the overall accuracy followed by the per-class precision/recall/F1 table.
print(f"Accuracy: {accuracy:.2f}", "Classification Report:", report, sep="\n")

Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [99]:
import joblib

# Persist the fitted classifier to disk so it can be reloaded later without retraining.
model_path = 'iris_logistic_regression_model.pkl'
joblib.dump(model, model_path)
print("Model saved to iris_logistic_regression_model.pkl")


Model saved to iris_logistic_regression_model.pkl


In [100]:
# Reload the classifier from the file written by the save step.
# The previous path, 'logistic_regression_model.pkl', did not match the saved
# file name, so a fresh run would either fail or pick up a stale model.
# NOTE: joblib.load unpickles arbitrary objects — only load files you trust.
loaded_model = joblib.load('iris_logistic_regression_model.pkl')
print("Model loaded successfully")


# Sanity check: predict on the test set with the reloaded model.
new_predictions = loaded_model.predict(X_test)
print(new_predictions)

Model loaded successfully
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]


In [101]:
# Score the reloaded model's predictions against the true test labels.
accuracy = accuracy_score(y_test, new_predictions)
report = classification_report(y_test, new_predictions)

# Print the overall accuracy followed by the per-class precision/recall/F1 table.
print(f"Accuracy: {accuracy:.2f}", "Classification Report:", report, sep="\n")

Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

