File size: 4,119 Bytes
045d34f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# -*- coding: utf-8 -*-
"""Fraud Detection.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1JTnx_TGE4NuRxerkz1nbP9jrCP59prdN

Dependencies Import
"""

# this code will import all the libraries that we need for this model.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the credit-card transaction dataset (Colab-local path; each row is one
# transaction, the 'Class' column is the fraud label).
credit_card_data = pd.read_csv('/content/creditcard.csv')

# Peek at the first 5 rows (bare expression so Colab renders the table).
credit_card_data.head()

# Peek at the last 5 rows.
credit_card_data.tail()

# Column dtypes, non-null counts and memory usage — quick null check.
credit_card_data.info()

# Per-column count of missing values (expected to be all zeros).
credit_card_data.isnull().sum()

# Label distribution: how many acceptable (0) vs. fraudulent (1) transactions.
credit_card_data['Class'].value_counts()

"""Looking at the above values it shows that the dataset is very unstable and inbalanced.

0---> would represent acceptable transaction

1---> would the represent fradulent transaction.
"""

# Split the transactions by label for separate analysis:
# Class 0 = acceptable, Class 1 = fraudulent.
acceptable = credit_card_data.loc[credit_card_data['Class'] == 0]
fraudulent = credit_card_data.loc[credit_card_data['Class'] == 1]

# Row/column counts of each subset.
print(acceptable.shape)
print(fraudulent.shape)

# Summary statistics of the transaction amount, per subset.
acceptable['Amount'].describe()

fraudulent['Amount'].describe()

# Mean of every feature per class, to compare the two groups.
credit_card_data.groupby('Class').mean()

"""Data Sampling"""

# Before sampling: quantify the class imbalance as a percentage of the
# whole dataset for each label.
classes = credit_card_data['Class'].value_counts()
total = credit_card_data['Class'].count()  # hoisted: was computed twice
acceptable_percent = classes[0] / total * 100
fraudulent_percent = classes[1] / total * 100
print(acceptable_percent)
print(fraudulent_percent)

# Bar chart of the label distribution. Reuse the per-class counts computed
# above — the original called DataFrame.value_counts with a Series as the
# `subset` argument, which only produced the right counts through the
# groupby fallback and is not a supported usage.
labels = ['Acceptable', 'Fraudulent']
classes.plot(kind="bar", rot=0)
plt.title("Labels")
plt.ylabel("Count")
plt.xticks(range(2), labels)
plt.show()

"""create a sample dataset that contains a normal distribution of both transactions."""

# Undersample the majority (acceptable) class down to the size of the
# minority (fraudulent) class so the combined dataset is balanced.
# len(fraudulent) replaces the hard-coded 492 so this still works if the
# dataset changes; a fixed random_state makes the sample reproducible.
acceptable_sample = acceptable.sample(n=len(fraudulent), random_state=2)

"""Concatinate the sample dataset into the already existing fraudulent dataframe"""

# Stack the undersampled acceptable rows on top of all fraudulent rows.
# axis=0 concatenates along the row axis (axis=1 would join columns).
new_dataset = pd.concat([acceptable_sample, fraudulent], axis=0)

# Sanity checks on the balanced dataset (bare expressions render in Colab).
new_dataset.head()   # first 5 rows — acceptable samples, in sampled order

new_dataset.tail()   # last 5 rows — fraudulent transactions

# Class balance after undersampling.
new_dataset['Class'].value_counts()

# Per-class feature means on the balanced data.
new_dataset.groupby('Class').mean()

"""Splitting dataset into training and testind sets. (80%, 10%)"""

# x holds the feature columns, y the binary fraud label.
# NOTE: the original passed both columns='Class' and axis=1 to drop();
# `columns=` already implies the column axis, so the extra argument was
# redundant and is dropped here.
x = new_dataset.drop(columns='Class')
y = new_dataset['Class']

# Features without the label column.
print(x)

# Labels only.
print(y)

"""Splitting dataset into training and testind sets. (80%, 20%)"""

# Hold out 20% of the balanced data for testing. stratify=y keeps the same
# class ratio in both splits; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=2
)

# Shapes: full feature matrix vs. the train and test partitions.
print(x.shape, x_train.shape, x_test.shape)

"""Training the Model.

Logistics Regression Model
"""

# Logistic regression classifier. max_iter is raised from the default (100)
# because the unscaled monetary features in this dataset routinely keep the
# lbfgs solver from converging within the default budget (ConvergenceWarning).
model = LogisticRegression(max_iter=1000)

# Train on the training split.
model.fit(x_train, y_train)

# Accuracy on the training data. accuracy_score expects (y_true, y_pred);
# the original passed them reversed — accuracy is symmetric so the value
# was unaffected, but the conventional order is restored here.
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

# Notebook's rule of thumb: above 80% is considered a good model.
print('Accuracy on the Training data : ', training_data_accuracy * 100)

# Accuracy on the held-out test data (the number that actually matters).
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

print('Accuracy on the Test data : ', test_data_accuracy * 100)

import pickle

# Persist the trained model. A context manager guarantees the file handle is
# closed (and bytes flushed) even if pickling raises — the original passed a
# bare open() into pickle.dump and leaked the handle.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)