# Fraud_Detection / fraud_detection.py
# Uploaded by cmasukume ("Upload 21 files", commit 045d34f verified)
# -*- coding: utf-8 -*-
"""Fraud Detection.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1JTnx_TGE4NuRxerkz1nbP9jrCP59prdN
Dependencies Import
"""
# this code will import all the libraries that we need for this model.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Load the credit card transaction dataset.
# NOTE(review): '/content/creditcard.csv' is a Colab-specific path — adjust
# when running this script outside Colab.
credit_card_data = pd.read_csv('/content/creditcard.csv')
# First 5 rows. As a bare expression this displays only in a notebook;
# in a plain script the return value is discarded.
credit_card_data.head()
# Last 5 rows (notebook display only, as above).
credit_card_data.tail()
# Column dtypes and non-null counts (info() prints directly, so this
# works in scripts too).
credit_card_data.info()
# Number of missing values per column.
credit_card_data.isnull().sum()
# Class distribution: 0 = acceptable, 1 = fraudulent transactions.
credit_card_data['Class'].value_counts()
"""Looking at the values above, the dataset is highly imbalanced.
0 ---> represents an acceptable transaction.
1 ---> represents a fraudulent transaction.
"""
# Split the transactions by label for separate analysis.
class_labels = credit_card_data['Class']
acceptable = credit_card_data.loc[class_labels == 0]
fraudulent = credit_card_data.loc[class_labels == 1]
print(acceptable.shape)
print(fraudulent.shape)
# Summary statistics of the transaction amounts per class
# (bare expressions: displayed in notebooks, discarded in scripts).
acceptable.Amount.describe()
fraudulent.Amount.describe()
# Per-class means of every feature, for side-by-side comparison.
credit_card_data.groupby('Class').mean()
"""Data Sampling"""
# Before resampling, quantify the class imbalance as percentages of the
# whole dataset.
classes = credit_card_data['Class'].value_counts()
total = credit_card_data['Class'].count()
acceptable_percent = classes[0] / total * 100
fraudulent_percent = classes[1] / total * 100
print(acceptable_percent)
print(fraudulent_percent)
# Bar chart of the class distribution. Reuse the counts computed above
# instead of recomputing them via DataFrame.value_counts(series); sorting
# by index guarantees bar 0 is class 0 so the labels line up.
labels = ['Acceptable', 'Fraudulent']
count = classes.sort_index()
count.plot(kind="bar", rot=0)
plt.title("Labels")
plt.ylabel("Count")
plt.xticks(range(2), labels)
plt.show()
"""Create a balanced sample containing an equal number of both transaction classes."""
# Undersample the majority class to match the number of fraudulent rows.
# len(fraudulent) replaces the hard-coded 492 so the code still works if
# the dataset changes; random_state makes the sample reproducible.
acceptable_sample = acceptable.sample(n=len(fraudulent), random_state=2)
"""Concatenate the sample with the existing fraudulent dataframe."""
# axis=0 stacks the two frames row-wise (axis=1 would join column-wise).
new_dataset = pd.concat([acceptable_sample, fraudulent], axis=0)
# Inspect the balanced dataset (bare expressions display in notebooks only).
new_dataset.head()
new_dataset.tail()
new_dataset['Class'].value_counts()
new_dataset.groupby('Class').mean()
"""Split features from labels, then split into training and testing sets (80%, 20%)."""
# x holds the feature columns; y holds the target class.
x = new_dataset.drop(columns='Class', axis=1)
y = new_dataset['Class']
# Display the features and the labels.
print(x)
print(y)
# stratify=y keeps the balanced class ratio in both splits;
# random_state=2 makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=2)
# Shapes of the original data, the training split, and the test split.
print(x.shape, x_train.shape, x_test.shape)
"""Training the Model.
Logistic Regression Model
"""
# max_iter raised from the default 100: the unscaled Amount/Time features
# typically prevent lbfgs from converging within 100 iterations.
model = LogisticRegression(max_iter=1000)
# Train the logistic regression model on the training split.
model.fit(x_train, y_train)
# Accuracy on the training data (accuracy_score expects y_true first,
# then y_pred — the original had them swapped, harmless for accuracy
# but wrong by the documented contract).
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)
# A training accuracy above ~80% suggests the model has learned the task.
print('Accuracy on the Training data : ', training_data_accuracy * 100)
# Accuracy on the held-out test data.
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)
print('Accuracy on the Test data : ', test_data_accuracy * 100)
import pickle

# Persist the trained model. The with-block guarantees the file handle is
# flushed and closed — the original pickle.dump(model, open(...)) leaked
# the handle.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)