# -*- coding: utf-8 -*-
"""Fraud Detection.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1JTnx_TGE4NuRxerkz1nbP9jrCP59prdN

Dependencies Import
"""

# Import all the libraries that we need for this model.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read the dataset holding the credit card transaction records.
credit_card_data = pd.read_csv('/content/creditcard.csv')

# Display the first 5 rows of the dataset.
# NOTE: bare expressions only render inside a notebook; print() makes the
# output visible when this file is executed as a plain script.
print(credit_card_data.head())

# Display the last 5 rows of the dataset.
print(credit_card_data.tail())

# Information about the dataset: column dtypes and non-null counts
# (info() prints directly, so no print() wrapper is needed).
credit_card_data.info()

# Check the number of missing values in each column.
print(credit_card_data.isnull().sum())

# Check the distribution of acceptable vs fraudulent transactions.
print(credit_card_data['Class'].value_counts())

"""The counts above show that the dataset is highly imbalanced.

0 ---> acceptable transaction
1 ---> fraudulent transaction
"""

# Data separation for analysis.
acceptable = credit_card_data[credit_card_data.Class == 0]
fraudulent = credit_card_data[credit_card_data.Class == 1]
print(acceptable.shape)
print(fraudulent.shape)

# Statistical measures of the transaction amounts for each class.
print(acceptable.Amount.describe())
print(fraudulent.Amount.describe())

# Compare the mean of every feature across the two classes.
print(credit_card_data.groupby('Class').mean())

"""Data Sampling"""

# Before sampling the dataset, calculate the percentage of acceptable
# and fraudulent transactions in the full dataset.
classes = credit_card_data['Class'].value_counts()
acceptable_percent = classes[0] / credit_card_data['Class'].count() * 100
fraudulent_percent = classes[1] / credit_card_data['Class'].count() * 100
print(acceptable_percent)
print(fraudulent_percent)

# Bar chart of the class distribution.
labels = ['Acceptable', 'Fraudulent']
# Count occurrences of each class directly on the 'Class' column
# (df.value_counts(<Series>) mis-uses the `subset` parameter).
count = credit_card_data['Class'].value_counts()
count.plot(kind="bar", rot=0)
plt.title("Labels")
plt.ylabel("Count")
plt.xticks(range(2), labels)
plt.show()

"""Create a sample dataset with a balanced distribution of both
transaction classes (random under-sampling of the majority class)."""

# Sample exactly as many acceptable transactions as there are fraudulent
# ones (previously hard-coded to 492, which silently breaks on any other
# dataset); random_state makes the under-sampling reproducible.
acceptable_sample = acceptable.sample(n=len(fraudulent), random_state=2)

"""Concatenate the sample with the already existing fraudulent dataframe."""

# axis = 0 stacks rows; axis = 1 would align columns side by side.
new_dataset = pd.concat([acceptable_sample, fraudulent], axis=0)

# Inspect the new balanced dataset.
print(new_dataset.head())
print(new_dataset.tail())
print(new_dataset['Class'].value_counts())
print(new_dataset.groupby('Class').mean())

"""Splitting the dataset into training and testing sets (80% / 20%)."""

# x holds the features (dataset content); y holds the target class.
x = new_dataset.drop(columns='Class', axis=1)
y = new_dataset['Class']

# Print the data except the class column, then the classes.
print(x)
print(y)

# stratify=y keeps the class ratio identical in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=2)

# Original data shape vs the training and test split shapes.
print(x.shape, x_train.shape, x_test.shape)

"""Training the Model.

Logistic Regression Model
"""

# max_iter raised from the default (100) so the lbfgs solver can converge
# on the unscaled 'Time'/'Amount' features without a ConvergenceWarning.
model = LogisticRegression(max_iter=1000)

# Train the logistic regression model using the training data.
model.fit(x_train, y_train)

# Check the accuracy of the training data.
# Accuracy on the training data.
x_train_prediction = model.predict(x_train)
# accuracy_score expects (y_true, y_pred) — ground truth first.  Accuracy
# itself is symmetric, but keeping the documented order prevents a silent
# bug if this is ever switched to precision/recall/F1.
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

# If the accuracy is above 80% then it means the model is good.
print('Accuracy on the Training data : ', training_data_accuracy * 100)

# Accuracy on the test data.
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)
print('Accuracy on the Test data : ', test_data_accuracy * 100)

# Persist the trained model; 'with' guarantees the file handle is closed
# even if pickling fails (the original open(...) handle was never closed).
import pickle

with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)