# Fraud_Detection / fraud_detection.py
# Uploaded by cmasukume ("Upload 21 files", commit 045d34f verified)
# -*- coding: utf-8 -*-
"""Fraud Detection.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1JTnx_TGE4NuRxerkz1nbP9jrCP59prdN
Dependencies Import
"""
# this code will import all the libraries that we need for this model.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Load the credit card transaction dataset.
# NOTE(review): '/content/creditcard.csv' is a Colab-specific path — adjust
# when running this script outside Colab.
credit_card_data = pd.read_csv('/content/creditcard.csv')
# First 5 rows. As a bare expression this displays only in a notebook;
# in a plain script the return value is discarded.
credit_card_data.head()
# Last 5 rows (notebook display only, as above).
credit_card_data.tail()
# Column dtypes and non-null counts (info() prints directly, so this
# works in scripts too).
credit_card_data.info()
# Number of missing values per column.
credit_card_data.isnull().sum()
# Class distribution: 0 = acceptable, 1 = fraudulent transactions.
credit_card_data['Class'].value_counts()
"""Looking at the values above, the dataset is highly imbalanced.
0 ---> represents an acceptable transaction.
1 ---> represents a fraudulent transaction.
"""
# Split the transactions by label for separate analysis.
class_labels = credit_card_data['Class']
acceptable = credit_card_data.loc[class_labels == 0]
fraudulent = credit_card_data.loc[class_labels == 1]
print(acceptable.shape)
print(fraudulent.shape)
# Summary statistics of the transaction amounts per class
# (bare expressions: displayed in notebooks, discarded in scripts).
acceptable.Amount.describe()
fraudulent.Amount.describe()
# Per-class means of every feature, for side-by-side comparison.
credit_card_data.groupby('Class').mean()
"""Data Sampling"""
# Before resampling, quantify the class imbalance as percentages of the
# whole dataset.
classes = credit_card_data['Class'].value_counts()
total = credit_card_data['Class'].count()
acceptable_percent = classes[0] / total * 100
fraudulent_percent = classes[1] / total * 100
print(acceptable_percent)
print(fraudulent_percent)
# Bar chart of the class distribution. Reuse the counts computed above
# instead of recomputing them via DataFrame.value_counts(series); sorting
# by index guarantees bar 0 is class 0 so the labels line up.
labels = ['Acceptable', 'Fraudulent']
count = classes.sort_index()
count.plot(kind="bar", rot=0)
plt.title("Labels")
plt.ylabel("Count")
plt.xticks(range(2), labels)
plt.show()
"""Create a balanced sample containing an equal number of both transaction classes."""
# Undersample the majority class to match the number of fraudulent rows.
# len(fraudulent) replaces the hard-coded 492 so the code still works if
# the dataset changes; random_state makes the sample reproducible.
acceptable_sample = acceptable.sample(n=len(fraudulent), random_state=2)
"""Concatenate the sample with the existing fraudulent dataframe."""
# axis=0 stacks the two frames row-wise (axis=1 would join column-wise).
new_dataset = pd.concat([acceptable_sample, fraudulent], axis=0)
# Inspect the balanced dataset (bare expressions display in notebooks only).
new_dataset.head()
new_dataset.tail()
new_dataset['Class'].value_counts()
new_dataset.groupby('Class').mean()
"""Split features from labels, then split into training and testing sets (80%, 20%)."""
# x holds the feature columns; y holds the target class.
x = new_dataset.drop(columns='Class', axis=1)
y = new_dataset['Class']
# Display the features and the labels.
print(x)
print(y)
# stratify=y keeps the balanced class ratio in both splits;
# random_state=2 makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=2)
# Shapes of the original data, the training split, and the test split.
print(x.shape, x_train.shape, x_test.shape)
"""Training the Model.
Logistic Regression Model
"""
# max_iter raised from the default 100: the unscaled Amount/Time features
# typically prevent lbfgs from converging within 100 iterations.
model = LogisticRegression(max_iter=1000)
# Train the logistic regression model on the training split.
model.fit(x_train, y_train)
# Accuracy on the training data (accuracy_score expects y_true first,
# then y_pred — the original had them swapped, harmless for accuracy
# but wrong by the documented contract).
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)
# A training accuracy above ~80% suggests the model has learned the task.
print('Accuracy on the Training data : ', training_data_accuracy * 100)
# Accuracy on the held-out test data.
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)
print('Accuracy on the Test data : ', test_data_accuracy * 100)
import pickle

# Persist the trained model. The with-block guarantees the file handle is
# flushed and closed — the original pickle.dump(model, open(...)) leaked
# the handle.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)