File size: 4,119 Bytes
045d34f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# -*- coding: utf-8 -*-
"""Fraud Detection.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1JTnx_TGE4NuRxerkz1nbP9jrCP59prdN

Dependencies Import
"""

# this code will import all the libraries that we need for this model.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the credit-card transaction dataset (Colab-local path; each row is one
# transaction, the 'Class' column is the fraud label).
credit_card_data = pd.read_csv('/content/creditcard.csv')

# Peek at the first 5 rows (bare expression so Colab renders the table).
credit_card_data.head()

# Peek at the last 5 rows.
credit_card_data.tail()

# Column dtypes, non-null counts and memory usage — quick null check.
credit_card_data.info()

# Per-column count of missing values (expected to be all zeros).
credit_card_data.isnull().sum()

# Label distribution: how many acceptable (0) vs. fraudulent (1) transactions.
credit_card_data['Class'].value_counts()

"""Looking at the above values it shows that the dataset is very unstable and inbalanced.

0---> would represent acceptable transaction

1---> would the represent fradulent transaction.
"""

# Split the transactions by label for separate analysis:
# Class 0 = acceptable, Class 1 = fraudulent.
acceptable = credit_card_data.loc[credit_card_data['Class'] == 0]
fraudulent = credit_card_data.loc[credit_card_data['Class'] == 1]

# Row/column counts of each subset.
print(acceptable.shape)
print(fraudulent.shape)

# Summary statistics of the transaction amount, per subset.
acceptable['Amount'].describe()

fraudulent['Amount'].describe()

# Mean of every feature per class, to compare the two groups.
credit_card_data.groupby('Class').mean()

"""Data Sampling"""

# Before sampling: quantify the class imbalance as a percentage of the
# whole dataset for each label.
classes = credit_card_data['Class'].value_counts()
total = credit_card_data['Class'].count()  # hoisted: was computed twice
acceptable_percent = classes[0] / total * 100
fraudulent_percent = classes[1] / total * 100
print(acceptable_percent)
print(fraudulent_percent)

# Bar chart of the label distribution. Reuse the per-class counts computed
# above — the original called DataFrame.value_counts with a Series as the
# `subset` argument, which only produced the right counts through the
# groupby fallback and is not a supported usage.
labels = ['Acceptable', 'Fraudulent']
classes.plot(kind="bar", rot=0)
plt.title("Labels")
plt.ylabel("Count")
plt.xticks(range(2), labels)
plt.show()

"""create a sample dataset that contains a normal distribution of both transactions."""

# Undersample the majority (acceptable) class down to the size of the
# minority (fraudulent) class so the combined dataset is balanced.
# len(fraudulent) replaces the hard-coded 492 so this still works if the
# dataset changes; a fixed random_state makes the sample reproducible.
acceptable_sample = acceptable.sample(n=len(fraudulent), random_state=2)

"""Concatinate the sample dataset into the already existing fraudulent dataframe"""

# Stack the undersampled acceptable rows on top of all fraudulent rows.
# axis=0 concatenates along the row axis (axis=1 would join columns).
new_dataset = pd.concat([acceptable_sample, fraudulent], axis=0)

# Sanity checks on the balanced dataset (bare expressions render in Colab).
new_dataset.head()   # first 5 rows — acceptable samples, in sampled order

new_dataset.tail()   # last 5 rows — fraudulent transactions

# Class balance after undersampling.
new_dataset['Class'].value_counts()

# Per-class feature means on the balanced data.
new_dataset.groupby('Class').mean()

"""Splitting dataset into training and testind sets. (80%, 10%)"""

# x holds the feature columns, y the binary fraud label.
# NOTE: the original passed both columns='Class' and axis=1 to drop();
# `columns=` already implies the column axis, so the extra argument was
# redundant and is dropped here.
x = new_dataset.drop(columns='Class')
y = new_dataset['Class']

# Features without the label column.
print(x)

# Labels only.
print(y)

"""Splitting dataset into training and testind sets. (80%, 20%)"""

# Hold out 20% of the balanced data for testing. stratify=y keeps the same
# class ratio in both splits; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=2
)

# Shapes: full feature matrix vs. the train and test partitions.
print(x.shape, x_train.shape, x_test.shape)

"""Training the Model.

Logistics Regression Model
"""

# Logistic regression classifier. max_iter is raised from the default (100)
# because the unscaled monetary features in this dataset routinely keep the
# lbfgs solver from converging within the default budget (ConvergenceWarning).
model = LogisticRegression(max_iter=1000)

# Train on the training split.
model.fit(x_train, y_train)

# Accuracy on the training data. accuracy_score expects (y_true, y_pred);
# the original passed them reversed — accuracy is symmetric so the value
# was unaffected, but the conventional order is restored here.
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

# Notebook's rule of thumb: above 80% is considered a good model.
print('Accuracy on the Training data : ', training_data_accuracy * 100)

# Accuracy on the held-out test data (the number that actually matters).
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

print('Accuracy on the Test data : ', test_data_accuracy * 100)

import pickle

# Persist the trained model. A context manager guarantees the file handle is
# closed (and bytes flushed) even if pickling raises — the original passed a
# bare open() into pickle.dump and leaked the handle.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)