"""Data loaders and plotting helpers for fraudulent-transaction analysis."""

import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from adjustText import adjust_text
# import math
# from datetime import date, time
# import scikitplot as skplot
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier, RandomForestRegressor
# from sklearn.linear_model import LinearRegression, LogisticRegression
# from sklearn.cluster import KMeans
# from sklearn.decomposition import PCA
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
# from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
# from imblearn.over_sampling import RandomOverSampler

from filepath_help import directory


# Filters on the 'Amount' column, keyed by the range labels accepted by the
# AgeAnalysis amount plots. Kept in one place so the plots cannot drift apart.
_AMOUNT_QUERIES = {
    'Greater than 1000': "Amount > 1000",
    'Less than 1000': "Amount < 1000",
    'Less than 500': "Amount < 500",
    'Less than 300': "Amount < 300",
    'Less than 100': "Amount < 100",
}


def _sample_at_most(data, size):
    """Return ``size`` random rows of ``data``, or ``data`` itself when it is not larger.

    ``DataFrame.sample`` raises ``ValueError`` when asked for more rows than
    exist (without replacement), so every plot samples through this guard.
    """
    return data.sample(size) if size < data.shape[0] else data


def get_us_state_fraudulent_transaction():
    """Read the CSV registered under ``directory['us_state_fraudulent_transaction']``."""
    return pd.read_csv(directory['us_state_fraudulent_transaction'])


def get_us_geo_date():
    """Read the geo file at ``directory['us_geo_data']`` and drop duplicate rows."""
    data = gpd.read_file(directory['us_geo_data'])
    return data.drop_duplicates()


def get_us_street_fraudulent_transaction():
    """Read the CSV registered under ``directory['us_streets']``."""
    return pd.read_csv(directory['us_streets'])


def get_bivariate_data():
    """Read the CSV registered under ``directory['bivariate_analysis']``."""
    return pd.read_csv(directory['bivariate_analysis'])


def get_analysis_data():
    """Return a dict of DataFrames keyed by 'Category', 'Gender', 'Month' and 'Day of Week'."""
    return {
        'Category': pd.read_csv(directory['category']),
        'Gender': pd.read_csv(directory['gender']),
        'Month': pd.read_csv(directory['month']),
        'Day of Week': pd.read_csv(directory['day_of_week']),
    }


def get_gender_analysis_data():
    """Return a dict with the three gender-related tables, keyed by their directory names."""
    return {
        'category_gender': pd.read_csv(directory['category_gender']),
        'category_gender_heatmap': pd.read_csv(
            directory['category_gender_heatmap']),
        'age_group_and_gender_heatmap': pd.read_csv(
            directory['age_group_and_gender_heatmap']),
    }


def get_age_analysis_data():
    """Read the CSV registered under ``directory['age_amount']``."""
    return pd.read_csv(directory['age_amount'])


class FraudDetection:
    """Charts of fraudulent transactions by US state and by street."""

    def __init__(self) -> None:
        self.us_state_fraudulent_transaction = get_us_state_fraudulent_transaction()
        self.us_streets_fraudulent_transaction = get_us_street_fraudulent_transaction()
        self.geo_states = get_us_geo_date()

    def plot_class_imbalance(self, field='CombinedData'):
        """Return an Altair arc chart of ``field`` coloured by the 'isFraud' column.

        The data is read from ``directory['class_imbalance_stats']`` on each call.
        """
        class_imbalance = pd.read_csv(directory['class_imbalance_stats'])
        chart = (
            alt.Chart(data=class_imbalance)
            .mark_arc(cornerRadius=4, padAngle=0.008)
            .encode(
                color='isFraud',
                theta=field,
                tooltip=['isFraud', field],
            )
            .properties(title=f'Class imbalance in the {field}')
        )
        return chart

    def plot_us_states_fraudulent_transaction(self,):
        """Return an Altair bar chart of 'is_fraud' per 'state'."""
        chart = (
            alt.Chart(data=self.us_state_fraudulent_transaction)
            .mark_bar(cornerRadius=5)
            .encode(
                x='state',
                y=alt.Y('is_fraud', title="Fraudulent Transaction"),
            )
            .properties(
                title="Fraudulent Transaction across United States of America")
        )
        return chart

    def plot_geo_data_us_states(self, region):
        """Return a matplotlib figure mapping the states of ``region``.

        Passing ``'All regions'`` plots every row of the geo table; any other
        value keeps only rows whose 'region' column equals it. Each state gets
        its 'STUSPS' code as a marker and its 'NAME' as a label.
        """
        if region == 'All regions':
            states = self.geo_states
        else:
            # Boolean mask instead of a string-built query so a region name
            # containing quotes cannot break the expression.
            states = self.geo_states[self.geo_states['region'] == region]

        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 30))
        states.plot(ax=ax, color='dodgerblue')
        ax.axis(False)

        texts = []
        for short, name, geo in zip(states.STUSPS, states.NAME, states.geometry):
            x, y = geo.centroid.coords[0]
            ax.scatter(x=x, y=y, marker=f'${short}$', s=400)
            texts.append(ax.text(x=x, y=y, s=name))

        # Nudge the labels apart so neighbouring state names do not overlap.
        adjust_text(
            texts,
            arrowprops={'arrowstyle': '->', 'color': 'k'},
            expand_points=(2, 2),
        )
        return fig

    def plot_selected_states(self, selection):
        """Return a figure outlining the selected states, annotated with 'is_fraud'.

        ``selection`` is a collection of 'STUSPS' codes; an empty selection
        falls back to ``['MT', 'NY', 'CA', 'FL']``.
        """
        if len(selection) == 0:
            selection = ['MT', 'NY', 'CA', 'FL']
        data = self.geo_states[self.geo_states['STUSPS'].isin(selection)]

        fig, ax = plt.subplots(nrows=1, ncols=1)
        data.boundary.plot(ax=ax)
        ax.axis(False)

        sample = self.us_state_fraudulent_transaction.merge(
            data, left_on='state', right_on='STUSPS')
        for name, count, geo in zip(sample.NAME, sample.is_fraud, sample.geometry):
            ax.annotate(text=f"{name}={count}", xy=geo.centroid.coords[0])
        return fig

    def plot_street_level_fraudulent_transaction(self, state):
        """Return an Altair bar chart of 'is_fraud' per 'street' for ``state``.

        Raises ``KeyError`` when ``state`` is not present in the street table.
        """
        by_state = self.us_streets_fraudulent_transaction.set_index(
            keys=['state'])
        # Always select with a list of labels: a scalar label that matches a
        # single row would return a Series, which alt.Chart cannot consume.
        labels = [state] if np.isscalar(state) else state
        data = by_state.loc[labels]
        chart = (
            alt.Chart(data=data)
            .mark_bar(cornerRadius=5)
            .encode(
                x='street',
                y=alt.Y('is_fraud', title=""),
                tooltip=[
                    alt.Tooltip('is_fraud', title="Fraudulent Transaction"),
                    alt.Tooltip('street', title="Street"),
                ],
            )
            .properties(
                title="Fraudulent Transaction across American Streets")
        )
        return chart


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ANALYSIS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
class FraudDetectionAnalysis:
    """Single-variable charts over the tables returned by ``get_analysis_data``."""

    def __init__(self):
        self.data = get_analysis_data()

    def plot_univariate(self, key, field=None, title="Transactions"):
        """Return a horizontal Altair bar chart of ``field`` (x) against ``key`` (y).

        ``key`` selects the table in ``self.data`` and is also the column
        plotted on the y axis.
        """
        data = self.data[key]
        chart = (
            alt.Chart(data=data, title=title)
            .mark_bar(cornerRadius=25)
            .encode(
                y=alt.Y(key, title=key.capitalize()),
                x=alt.X(field, title=title),
            )
        )
        return chart

    def pie_chart(self, key, field=None, title='Transactions'):
        """Return an Altair arc chart with ``field`` as angle and ``key`` as colour."""
        data = self.data[key]
        chart = (
            alt.Chart(data=data, title=title)
            .mark_arc()
            .encode(
                theta=alt.Theta(f"{field}:Q", title=title),
                color=alt.Color(f"{key}:N", title=key),
            )
        )
        return chart


class BivariateAnalysis:
    """Fair vs. fraudulent transaction counts grouped by a second column."""

    def __init__(self) -> None:
        self.data = get_bivariate_data()

    def get_data(self, key):
        """Return per-``key`` counts of fair, fraudulent and total transactions.

        Rows are counted per (``key``, 'Fraudulent Transaction') pair; the
        values 0 and 1 of 'Fraudulent Transaction' become the columns
        'Fair Transaction' and 'Fraudulent Transaction'.
        """
        counts = (
            self.data.groupby([key, "Fraudulent Transaction"])
            .size()
            # fill_value=0: a group with no rows in one class counts as zero
            # instead of NaN, which would also turn the total into NaN.
            .unstack(level=1, fill_value=0)
        )
        counts.columns.name = ''
        sample = counts.reset_index().rename(
            {0: "Fair Transaction", 1: "Fraudulent Transaction"}, axis=1)
        sample['Total Transaction'] = (
            sample['Fair Transaction'] + sample['Fraudulent Transaction'])
        return sample


class GenderAnalysis:
    """Heatmaps over the tables returned by ``get_gender_analysis_data``."""

    def __init__(self) -> None:
        self.data = get_gender_analysis_data()

    def plot_heatmap(
            self,
            key,
            index='gender',
            rotation=80,
            cmap='Blues',
            xlabel='X axis',
            ylabel='Gender'
    ):
        """Return a matplotlib figure showing table ``key`` as an annotated heatmap.

        The table is indexed by the ``index`` column; the remaining columns
        form the x axis and every cell is labelled with its value.
        """
        data = self.data[key].set_index(index)
        values = data.values
        n_rows, n_cols = data.shape

        fig, ax = plt.subplots(nrows=1, ncols=1)
        ax.tick_params(color='tab:blue', labelcolor='gray', width=2)
        for spine in ax.spines.values():
            spine.set_edgecolor('tab:blue')
            spine.set_linewidth(2)

        ax.matshow(values, cmap=cmap)
        ax.set_xticks(np.arange(n_cols), data.columns, rotation=rotation)
        ax.set_yticks(np.arange(n_rows), data.index)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)

        for i in range(n_rows):
            for j in range(n_cols):
                ax.text(x=j, y=i, s=f'{values[i][j]}',
                        ha='center', va='center')
        return fig


class AgeAnalysis:
    """Age, age-group and amount plots over the table from ``get_age_analysis_data``."""

    def __init__(self) -> None:
        self.data = get_age_analysis_data()

    def age_group_count_plot(self,):
        """Return a figure with a count plot of the 'AgeGroup' column."""
        with plt.style.context('fivethirtyeight'):
            fig, ax = plt.subplots(nrows=1, ncols=1)
            sns.countplot(data=self.data, x='AgeGroup', ax=ax)
        return fig

    def age_violin_plot(self,):
        """Return a split violin plot of 'Age' by 'Gender' on at most 5000 sampled rows."""
        with plt.style.context('ggplot'):
            fig, ax = plt.subplots(nrows=1, ncols=1)
            # The former `fontsize=16` argument was dropped: violinplot has no
            # documented `fontsize` parameter.
            sns.violinplot(
                data=_sample_at_most(self.data, 5000),
                x='Age', y='Gender', hue='Gender',
                ax=ax, split=True, scale='count', linewidth=4)
        return fig

    def age_gender_stats_interactive(self, category: list[str]):
        """Return a split violin plot of 'Age' per 'AgeGroup', coloured by 'Gender'.

        ``category`` lists the 'AgeGroup' values to keep; an empty list falls
        back to ``["<25", "25-40"]``. At most 5000 rows are plotted.
        """
        category = ["<25", "25-40"] if len(category) == 0 else category
        data = self.data[self.data['AgeGroup'].isin(category)]
        data = _sample_at_most(data, 5000)
        with plt.style.context('ggplot'):
            fig, ax = plt.subplots(nrows=1, ncols=1)
            sns.violinplot(
                data=data, x='Age', y='AgeGroup', hue='Gender', split=True)
        return fig

    def plot_age(
            self,
            color_encode=False,
            element='poly',
            sample_size=1000,
            binrange=None,
            binwith=None,
            kde=False,
            fill=True,
            hatch=''):
        """Return a histogram of 'Age' over at most ``sample_size`` sampled rows.

        ``color_encode`` switches on colouring by 'AgeGroup'. ``binwith`` is
        forwarded as seaborn's ``binwidth``; the remaining options are passed
        to ``sns.histplot`` unchanged.
        """
        fig, ax = plt.subplots(nrows=1, ncols=1)
        ax.tick_params(color='tab:blue', labelcolor='gray', width=2)
        for spine in ax.spines.values():
            spine.set_edgecolor('tab:blue')
            spine.set_linewidth(2)

        with plt.style.context('fivethirtyeight'):
            sns.histplot(
                data=_sample_at_most(self.data, sample_size),
                x='Age',
                element=element,
                hue='AgeGroup' if color_encode else None,
                binrange=binrange,
                binwidth=binwith,
                ax=ax,
                kde=kde,
                fill=fill,
                hatch=hatch)
        return fig

    def age_realted_query(self, query):
        """Return ``(rows, figure)`` for one of the predefined age ranges.

        ``query`` must be one of the labels in the table below (``KeyError``
        otherwise). The figure holds one 'Age' histogram per value of the
        'Fraud' column.
        """
        search = {
            'Less than 25': "Age<25",
            'Between 25 and 50': "Age > 25 and Age < 50 ",
            'Below 50': "Age < 50",
            'Above 50': "Age > 50",
            'Between 50 and 60': "Age > 50 and Age < 60",
            'Above 60': "Age > 60",
            'Above 80': "Age > 80"
        }
        # The two narrow ranges get finer bins.
        narrow_ranges = ['Less than 25', 'Between 50 and 60']
        binwidth = 5 if query in narrow_ranges else 10

        res = self.data.query(search[query])
        grid = sns.FacetGrid(
            data=res,
            col='Fraud',
            sharey=False,
        )
        grid.map_dataframe(func=sns.histplot, x='Age',
                           binwidth=binwidth, hatch='-', ec='white')
        return res, grid.figure

    def KDE_plot_age_group_and_transaction_amount(
            self,
            query,
            sample_size=1000,
            fraud_only=False,
            age_group="All"):
        """Return a filled 2-D KDE of 'Age' against 'Amount', coloured by 'AgeGroup'.

        ``query`` is an amount-range label from ``_AMOUNT_QUERIES``.
        ``fraud_only`` keeps only rows whose 'Fraud' column is 'Yes', and
        ``age_group`` (unless 'All') keeps a single 'AgeGroup'. At most
        ``sample_size`` rows are plotted.
        """
        data = self.data.query(_AMOUNT_QUERIES[query])
        if fraud_only:
            data = data[data['Fraud'] == 'Yes']
        if age_group != 'All':
            data = data[data['AgeGroup'] == age_group]
        data = _sample_at_most(data, sample_size)

        with plt.style.context('fivethirtyeight'):
            fig, ax = plt.subplots(nrows=1, ncols=1)
            sns.kdeplot(
                data=data, x='Age', y='Amount', hue='AgeGroup',
                fill=True, ax=ax)
            ax.grid(color='white')
        return fig

    def compare_transactions_across_age_group(
            self,
            sample_size=1000,
            lowerbound=0,
            upperbound=100
    ):
        """Return a 2-D histogram of 'Amount' against 'AgeGroup'.

        Only rows with ``lowerbound <= Amount <= upperbound`` are used, and at
        most ``sample_size`` of them are plotted.
        """
        in_range = self.data['Amount'].between(lowerbound, upperbound)
        data = _sample_at_most(self.data[in_range], sample_size)

        with plt.style.context('fivethirtyeight'):
            fig, ax = plt.subplots(nrows=1, ncols=1)
            sns.histplot(
                data=data, x='Amount', y='AgeGroup', hue='AgeGroup', ax=ax)
            ax.grid(color='white', linewidth=2)
        return fig

    def transaction_amount_study(self, query: str = 'Less than 500', age_group: str = 'All'):
        """Return a polygon histogram of 'Amount', coloured by 'AgeGroup'.

        ``query`` is an amount-range label from ``_AMOUNT_QUERIES``;
        ``age_group`` (unless 'All') keeps a single 'AgeGroup'.
        """
        data = self.data.query(_AMOUNT_QUERIES[query])
        if age_group != 'All':
            data = data[data['AgeGroup'] == age_group]

        with plt.style.context('fivethirtyeight'):
            fig, ax = plt.subplots(nrows=1, ncols=1)
            sns.histplot(
                data=data, x='Amount', hue='AgeGroup',
                element='poly', ax=ax)
        return fig