# -*- coding: utf-8 -*-
"""Fraud Detection.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1JTnx_TGE4NuRxerkz1nbP9jrCP59prdN
Dependencies Import
"""
# this code will import all the libraries that we need for this model.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# read the dataset that is holding the information for the credit card
# transactions.
# NOTE(review): the path is Colab-specific ('/content/...'); running this
# outside Colab requires a local copy of creditcard.csv at that path.
credit_card_data = pd.read_csv('/content/creditcard.csv')
# this will display the first 5 rows of the dataset.
# NOTE: head()/tail()/info() results are only rendered in a notebook cell;
# as a plain script these expressions are evaluated and discarded.
credit_card_data.head()
# this will display the last 5 rows of the dataset.
credit_card_data.tail()
# information about the dataset showcasing the datatypes used and whether the
#dataset contains null values or not.
credit_card_data.info()
# check for the number of missing values on each column.
credit_card_data.isnull().sum()
# check the distribution between acceptable transaction and fraudulent transaction.
# 'Class' is the label column: 0 = acceptable, 1 = fraudulent (see below).
credit_card_data['Class'].value_counts()
"""Looking at the above values it shows that the dataset is very unstable and inbalanced.
0---> would represent acceptable transaction
1---> would the represent fradulent transaction.
"""
# Split the transactions by label so each group can be analysed on its own.
acceptable = credit_card_data[credit_card_data['Class'] == 0]
fraudulent = credit_card_data[credit_card_data['Class'] == 1]
print(acceptable.shape)
print(fraudulent.shape)
# Summary statistics of the transaction amount within each group.
acceptable['Amount'].describe()
fraudulent['Amount'].describe()
# Per-class feature means, for a quick side-by-side comparison.
credit_card_data.groupby('Class').mean()
"""Data Sampling"""
# before sampling the dataset...
# calculate the percentage of both the acceptable and fraudulent transactions of
# the dataset.
classes=credit_card_data['Class'].value_counts()
acceptable_percent=classes[0]/credit_card_data['Class'].count()*100
fraudulent_percent=classes[1]/credit_card_data['Class'].count()*100
print(acceptable_percent)
print(fraudulent_percent)
labels = ['Acceptable','Fraudulent']
count = credit_card_data.value_counts(credit_card_data['Class'])
count.plot(kind = "bar",rot=0)
plt.title("Labels")
plt.ylabel("Count")
plt.xticks(range(2), labels)
plt.show()
"""create a sample dataset that contains a normal distribution of both transactions."""
acceptable_sample=acceptable.sample(n=492)
"""Concatinate the sample dataset into the already existing fraudulent dataframe"""
# axis = 0 (rows)
# axis = 1 (columns)
new_dataset=pd.concat([acceptable_sample,fraudulent],axis=0)
# view the first 5 rows of the new dataset picked at random.
new_dataset.head()
new_dataset.tail()
new_dataset['Class'].value_counts()
new_dataset.groupby('Class').mean()
"""Splitting dataset into training and testind sets. (80%, 10%)"""
# x represent the features(content of the dataset) and y represent the class.
x = new_dataset.drop(columns = 'Class',axis = 1)
y = new_dataset['Class']
# this prints the data except the class column.
print(x)
# prints the classes.
print(y)
"""Splitting dataset into training and testind sets. (80%, 20%)"""
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y,random_state=2)
# original data, the training data, test data split.
print(x.shape,x_train.shape,x_test.shape)
"""Training the Model.
Logistics Regression Model
"""
model = LogisticRegression()
# train the logistics regression model using the training data.
model.fit(x_train, y_train)
# check the accuracy of the training data.
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)
# if the accuracy is above 80% then it means the model is good
print('Accuracy on the Training data : ', training_data_accuracy * 100)
# accuracy on the test data
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)
print('Accuracy on the Test data : ', test_data_accuracy * 100)
import pickle
pickle.dump(model,open('model.pkl','wb')) |