# (removed non-Python scrape residue: "Spaces / Sleeping" status-banner lines)
| # -*- coding: utf-8 -*- | |
| """Fraud Detection.ipynb | |
| Automatically generated by Colab. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1JTnx_TGE4NuRxerkz1nbP9jrCP59prdN | |
| Dependencies Import | |
| """ | |
| # this code will import all the libraries that we need for this model. | |
| import numpy as np | |
| import pandas as pd | |
| from matplotlib import pyplot as plt | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.metrics import accuracy_score | |
# Load the credit-card transaction dataset.
# NOTE(review): path is Colab-specific — adjust when running outside Colab.
credit_card_data = pd.read_csv('/content/creditcard.csv')

# In a plain .py script a bare expression's result is discarded (only a
# notebook auto-displays it), so the exploratory views are wrapped in print().
# First 5 rows of the dataset.
print(credit_card_data.head())
# Last 5 rows of the dataset.
print(credit_card_data.tail())
# Column dtypes and non-null counts; info() writes to stdout itself.
credit_card_data.info()
# Number of missing values in each column.
print(credit_card_data.isnull().sum())
# Distribution of acceptable (0) vs fraudulent (1) transactions.
print(credit_card_data['Class'].value_counts())
| """Looking at the above values it shows that the dataset is very unstable and inbalanced. | |
| 0---> would represent acceptable transaction | |
| 1---> would the represent fradulent transaction. | |
| """ | |
| # data seperation for analysis. | |
| acceptable = credit_card_data[credit_card_data.Class == 0] | |
| fraudulent = credit_card_data[credit_card_data.Class == 1] | |
| print(acceptable.shape) | |
| print(fraudulent.shape) | |
| # statistical measures. | |
| acceptable.Amount.describe() | |
| fraudulent.Amount.describe() | |
| # value comparison of both transactions. | |
| credit_card_data.groupby('Class').mean() | |
| """Data Sampling""" | |
| # before sampling the dataset... | |
| # calculate the percentage of both the acceptable and fraudulent transactions of | |
| # the dataset. | |
| classes=credit_card_data['Class'].value_counts() | |
| acceptable_percent=classes[0]/credit_card_data['Class'].count()*100 | |
| fraudulent_percent=classes[1]/credit_card_data['Class'].count()*100 | |
| print(acceptable_percent) | |
| print(fraudulent_percent) | |
| labels = ['Acceptable','Fraudulent'] | |
| count = credit_card_data.value_counts(credit_card_data['Class']) | |
| count.plot(kind = "bar",rot=0) | |
| plt.title("Labels") | |
| plt.ylabel("Count") | |
| plt.xticks(range(2), labels) | |
| plt.show() | |
| """create a sample dataset that contains a normal distribution of both transactions.""" | |
| acceptable_sample=acceptable.sample(n=492) | |
| """Concatinate the sample dataset into the already existing fraudulent dataframe""" | |
| # axis = 0 (rows) | |
| # axis = 1 (columns) | |
| new_dataset=pd.concat([acceptable_sample,fraudulent],axis=0) | |
| # view the first 5 rows of the new dataset picked at random. | |
| new_dataset.head() | |
| new_dataset.tail() | |
| new_dataset['Class'].value_counts() | |
| new_dataset.groupby('Class').mean() | |
| """Splitting dataset into training and testind sets. (80%, 10%)""" | |
| # x represent the features(content of the dataset) and y represent the class. | |
| x = new_dataset.drop(columns = 'Class',axis = 1) | |
| y = new_dataset['Class'] | |
| # this prints the data except the class column. | |
| print(x) | |
| # prints the classes. | |
| print(y) | |
| """Splitting dataset into training and testind sets. (80%, 20%)""" | |
| x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y,random_state=2) | |
| # original data, the training data, test data split. | |
| print(x.shape,x_train.shape,x_test.shape) | |
| """Training the Model. | |
| Logistics Regression Model | |
| """ | |
| model = LogisticRegression() | |
| # train the logistics regression model using the training data. | |
| model.fit(x_train, y_train) | |
| # check the accuracy of the training data. | |
| x_train_prediction = model.predict(x_train) | |
| training_data_accuracy = accuracy_score(x_train_prediction, y_train) | |
| # if the accuracy is above 80% then it means the model is good | |
| print('Accuracy on the Training data : ', training_data_accuracy * 100) | |
| # accuracy on the test data | |
| x_test_prediction = model.predict(x_test) | |
| test_data_accuracy = accuracy_score(x_test_prediction, y_test) | |
| print('Accuracy on the Test data : ', test_data_accuracy * 100) | |
import pickle

# Persist the trained model to disk. The context manager guarantees the
# file is flushed and closed even if pickling raises, unlike the original
# bare open() whose handle was never closed.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)