Spaces:
Sleeping
Sleeping
| import geopandas as gpd | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import altair as alt | |
| from adjustText import adjust_text | |
| # import math | |
| # from datetime import date, time | |
| # import scikitplot as skplot | |
| # from sklearn.preprocessing import MinMaxScaler | |
| # from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier, RandomForestRegressor | |
| # from sklearn.linear_model import LinearRegression, LogisticRegression | |
| # from sklearn.cluster import KMeans | |
| # from sklearn.decomposition import PCA | |
| # from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split | |
| # from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score | |
| # from imblearn.over_sampling import RandomOverSampler | |
| from filepath_help import directory | |
def get_us_state_fraudulent_transaction():
    """Read the CSV registered under ``directory['us_state_fraudulent_transaction']``.

    Returns:
        The parsed file as a pandas DataFrame.
    """
    csv_path = directory['us_state_fraudulent_transaction']
    return pd.read_csv(csv_path)
def get_us_geo_date():
    """Read the geo file registered under ``directory['us_geo_data']``.

    Returns:
        The table produced by ``geopandas.read_file`` with duplicate rows
        removed.
    """
    geo_path = directory['us_geo_data']
    return gpd.read_file(geo_path).drop_duplicates()
def get_us_street_fraudulent_transaction():
    """Read the CSV registered under ``directory['us_streets']``.

    Returns:
        The parsed file as a pandas DataFrame.
    """
    csv_path = directory['us_streets']
    return pd.read_csv(csv_path)
def get_bivariate_data():
    """Read the CSV registered under ``directory['bivariate_analysis']``.

    Returns:
        The parsed file as a pandas DataFrame.
    """
    csv_path = directory['bivariate_analysis']
    return pd.read_csv(csv_path)
def get_analysis_data():
    """Load the univariate-analysis tables.

    Returns:
        A dict mapping each display label (``'Category'``, ``'Gender'``,
        ``'Month'``, ``'Day of Week'``) to the DataFrame read from the CSV
        registered under the matching ``directory`` key.
    """
    # (display label, key into ``directory``), read in this order.
    label_to_source = (
        ('Category', 'category'),
        ('Gender', 'gender'),
        ('Month', 'month'),
        ('Day of Week', 'day_of_week'),
    )
    return {
        label: pd.read_csv(directory[source])
        for label, source in label_to_source
    }
def get_gender_analysis_data():
    """Load the gender-analysis tables.

    Returns:
        A dict whose keys are ``'category_gender'``,
        ``'category_gender_heatmap'`` and ``'age_group_and_gender_heatmap'``;
        each value is the DataFrame read from the CSV registered under the
        same key in ``directory``.
    """
    table_names = (
        'category_gender',
        'category_gender_heatmap',
        'age_group_and_gender_heatmap',
    )
    return {name: pd.read_csv(directory[name]) for name in table_names}
def get_age_analysis_data():
    """Read the CSV registered under ``directory['age_amount']``.

    Returns:
        The parsed file as a pandas DataFrame.
    """
    return pd.read_csv(directory['age_amount'])
class FraudDetection:
    """Charts of fraudulent-transaction counts by US state and by street.

    The constructor loads three tables through the module-level loader
    functions. Every ``plot_*`` method builds and returns a chart or figure;
    none of them displays it.
    """

    # States drawn by ``plot_selected_states`` when nothing is selected.
    _DEFAULT_STATES = ('MT', 'NY', 'CA', 'FL')

    def __init__(self) -> None:
        self.us_state_fraudulent_transaction = get_us_state_fraudulent_transaction()
        self.us_streets_fraudulent_transaction = get_us_street_fraudulent_transaction()
        self.geo_states = get_us_geo_date()

    def plot_class_imbalance(self, field='CombinedData'):
        """Return an Altair arc chart of ``field`` coloured by ``isFraud``.

        The data is read from the CSV registered under
        ``directory['class_imbalance_stats']`` on every call.

        Args:
            field: Column used for the arc angle, the tooltip and the title.
        """
        class_imbalance = pd.read_csv(directory['class_imbalance_stats'])
        return (
            alt.Chart(data=class_imbalance)
            .mark_arc(cornerRadius=4, padAngle=0.008)
            .encode(color='isFraud', theta=field, tooltip=['isFraud', field])
            .properties(title='Class imbalance in the ' + field)
        )

    def plot_us_states_fraudulent_transaction(self,):
        """Return an Altair bar chart of ``is_fraud`` per ``state``."""
        return (
            alt.Chart(data=self.us_state_fraudulent_transaction)
            .mark_bar(cornerRadius=5)
            .encode(
                x='state',
                y=alt.Y('is_fraud', title="Fraudulent Transaction"),
            )
            .properties(
                title="Fraudulent Transaction across United States of America")
        )

    def plot_geo_data_us_states(self, region):
        """Return a matplotlib figure of the states in ``region``.

        Each state is marked at its centroid with its ``STUSPS`` code and
        labelled with its ``NAME``; ``adjust_text`` then repositions the
        labels.

        Args:
            region: Value matched against the ``region`` column, or the
                literal ``'All regions'`` to draw every state.
        """
        if region == 'All regions':
            states = self.geo_states
        else:
            states = self.geo_states.query(f'region == "{region}"')

        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 30))
        states.plot(ax=ax, color='dodgerblue')
        ax.axis(False)

        texts = []
        for short, name, geo in zip(states.STUSPS, states.NAME, states.geometry):
            x, y = geo.centroid.coords[0]
            # The state code itself is rendered as the scatter marker.
            ax.scatter(x=x, y=y, marker=f'${short}$', s=400)
            texts.append(ax.text(x=x, y=y, s=name))

        adjust_text(
            texts,
            arrowprops={'arrowstyle': '->', 'color': 'k'},
            expand_points=(2, 2),
        )
        return fig

    def plot_selected_states(self, selection):
        """Return a figure outlining the selected states with their counts.

        Args:
            selection: Sequence of ``STUSPS`` codes. ``None`` or an empty
                sequence falls back to MT, NY, CA and FL.
        """
        # ``None`` used to crash on ``len``; treat it like an empty selection.
        if selection is None or len(selection) == 0:
            selection = self._DEFAULT_STATES
        # A plain list gives a repr that ``query`` can parse.
        selection = list(selection)

        data = self.geo_states.query(f"STUSPS in {selection}")
        fig, ax = plt.subplots(nrows=1, ncols=1)
        data.boundary.plot(ax=ax)
        ax.axis(False)

        sample = self.us_state_fraudulent_transaction.merge(
            data, left_on='state', right_on='STUSPS')
        for name, count, geo in zip(sample.NAME, sample.is_fraud, sample.geometry):
            ax.annotate(text=f"{name}={count}", xy=geo.centroid.coords[0])
        return fig

    def _street_rows_for_state(self, state):
        """Return the street rows for ``state`` as a DataFrame indexed by state.

        Selecting with a list of labels keeps the result two-dimensional even
        when only one row matches; a scalar label would collapse that single
        row into a Series.

        Raises:
            KeyError: If ``state`` is not present in the ``state`` column.
        """
        indexed = self.us_streets_fraudulent_transaction.set_index(keys=['state'])
        labels = state if isinstance(state, list) else [state]
        return indexed.loc[labels]

    def plot_street_level_fraudulent_transaction(self, state):
        """Return an Altair bar chart of ``is_fraud`` per ``street`` in ``state``.

        Args:
            state: Value (or list of values) matched against the ``state``
                column.

        Raises:
            KeyError: If ``state`` is not present in the ``state`` column.
        """
        data = self._street_rows_for_state(state)
        return (
            alt.Chart(data=data)
            .mark_bar(cornerRadius=5)
            .encode(
                x='street',
                y=alt.Y('is_fraud', title=""),
                tooltip=[
                    alt.Tooltip('is_fraud', title="Fraudulent Transaction"),
                    alt.Tooltip('street', title="Street"),
                ],
            )
            .properties(title="Fraudulent Transaction across American Streets")
        )
| # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ANALYSIS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # | |
class FraudDetectionAnalysis:
    """Altair charts over the tables returned by ``get_analysis_data``."""

    def __init__(self):
        self.data = get_analysis_data()

    def plot_univariate(self, key, field=None, title="Transactions"):
        """Return a horizontal bar chart of ``field`` for each value of ``key``.

        Args:
            key: Entry of ``self.data`` to plot; also the column on the y axis.
            field: Column on the x axis.
            title: Chart title, reused as the x-axis title.
        """
        table = self.data[key]
        bars = alt.Chart(data=table, title=title).mark_bar(cornerRadius=25)
        y_axis = alt.Y(key, title=key.capitalize())
        x_axis = alt.X(field, title=title)
        return bars.encode(y=y_axis, x=x_axis)

    def pie_chart(self, key, field=None, title='Transactions'):
        """Return an arc chart of ``field`` (quantitative) coloured by ``key``.

        Args:
            key: Entry of ``self.data`` to plot; also the nominal colour column.
            field: Column that sets the angle of each arc.
            title: Chart title, reused as the theta title.
        """
        table = self.data[key]
        arcs = alt.Chart(data=table, title=title).mark_arc()
        angle = alt.Theta(f"{field}:Q", title=title)
        colour = alt.Color(f"{key}:N", title=key)
        return arcs.encode(theta=angle, color=colour)
class BivariateAnalysis:
    """Per-category counts of fair and fraudulent transactions.

    ``self.data`` is the table returned by ``get_bivariate_data``; it must
    contain a ``"Fraudulent Transaction"`` column.
    """

    def __init__(self) -> None:
        self.data = get_bivariate_data()

    def get_data(self, key):
        """Count fair and fraudulent transactions for each value of ``key``.

        Args:
            key: Column of ``self.data`` to break the counts down by.

        Returns:
            A DataFrame with one row per distinct ``key`` value and the
            columns ``key``, ``"Fair Transaction"``, ``"Fraudulent
            Transaction"`` and ``"Total Transaction"``.

        Note:
            The flag values ``0`` and ``1`` are taken to mean fair and
            fraudulent respectively, as the column renaming implies.
        """
        flag = "Fraudulent Transaction"
        counts = (
            self.data[[flag, key]]
            .groupby([key, flag])
            .size()
            # A category with no rows of one class counts as 0, not NaN;
            # otherwise its total below would be NaN as well.
            .unstack(level=1, fill_value=0)
            # Guarantee both class columns exist even when one class never
            # occurs in the data.
            .reindex(columns=[0, 1], fill_value=0)
        )
        counts.columns.name = ''
        sample = counts.reset_index().rename(
            {0: "Fair Transaction", 1: "Fraudulent Transaction"}, axis=1)
        sample['Total Transaction'] = (
            sample['Fair Transaction'] + sample['Fraudulent Transaction'])
        return sample
class GenderAnalysis:
    """Heatmap figures over the tables returned by ``get_gender_analysis_data``."""

    def __init__(self) -> None:
        self.data = get_gender_analysis_data()

    def plot_heatmap(
        self, key,
        index='gender', rotation=80, cmap='Blues',
        xlabel='X axis', ylabel='Gender'
    ):
        """Return a matplotlib figure showing table ``key`` as an annotated heatmap.

        Args:
            key: Entry of ``self.data`` to draw.
            index: Column moved into the index; its values label the rows.
            rotation: Rotation applied to the column tick labels.
            cmap: Colormap handed to ``matshow``.
            xlabel: Label of the x axis.
            ylabel: Label of the y axis.
        """
        table = self.data[key].set_index(index)
        n_rows, n_cols = table.shape

        fig, ax = plt.subplots(nrows=1, ncols=1)

        # Blue ticks and frame around the matrix.
        ax.tick_params(color='tab:blue', labelcolor='gray', width=2)
        for border in ax.spines.values():
            border.set_edgecolor('tab:blue')
            border.set_linewidth(2)

        ax.matshow(table.values, cmap=cmap)
        ax.set_xticks(np.arange(n_cols), table.columns, rotation=rotation)
        ax.set_yticks(np.arange(n_rows), table.index)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)

        # Print every cell value centred on its cell.
        for row in np.arange(n_rows):
            for col in np.arange(n_cols):
                ax.text(
                    x=col, y=row, s=f'{table.values[row][col]}',
                    ha='center', va='center')
        return fig
class AgeAnalysis:
    """Matplotlib/seaborn figures relating customer age to transactions.

    ``self.data`` is the table returned by ``get_age_analysis_data``. The
    methods read its ``Age``, ``AgeGroup``, ``Gender``, ``Fraud`` and
    ``Amount`` columns. Several methods plot a random sample of the rows, so
    repeated calls can produce slightly different figures.
    """

    # Menu label -> pandas ``query`` expression on ``Amount``. Shared by every
    # method that filters on the amount so the labels cannot drift apart.
    _AMOUNT_FILTERS = {
        'Greater than 1000': "Amount > 1000",
        'Less than 1000': "Amount < 1000",
        'Less than 500': "Amount < 500",
        'Less than 300': "Amount < 300",
        # Previously mapped to "Amount < 200", contradicting the label.
        'Less than 100': "Amount < 100",
    }

    # Menu label -> pandas ``query`` expression on ``Age``.
    _AGE_FILTERS = {
        'Less than 25': "Age<25",
        'Between 25 and 50': "Age > 25 and Age < 50 ",
        'Below 50': "Age < 50",
        'Above 50': "Age > 50",
        'Between 50 and 60': "Age > 50 and Age < 60",
        'Above 60': "Age > 60",
        'Above 80': "Age > 80",
    }

    # Age groups used by ``age_gender_stats_interactive`` when none is given.
    _DEFAULT_AGE_GROUPS = ("<25", "25-40")

    def __init__(self) -> None:
        self.data = get_age_analysis_data()

    @staticmethod
    def _sample_at_most(data, sample_size):
        """Return ``sample_size`` random rows, or ``data`` itself if it is smaller.

        ``DataFrame.sample`` raises ``ValueError`` when asked for more rows
        than exist (without replacement); this guard avoids that.
        """
        if sample_size < data.shape[0]:
            return data.sample(sample_size)
        return data

    def age_group_count_plot(self,):
        """Return a count plot of ``AgeGroup`` over the whole table."""
        with plt.style.context('fivethirtyeight'):
            fig, ax = plt.subplots(nrows=1, ncols=1)
            sns.countplot(data=self.data, x='AgeGroup', ax=ax)
        return fig

    def age_violin_plot(self,):
        """Return a split violin plot of ``Age`` by ``Gender`` (up to 5000 rows)."""
        data = self._sample_at_most(self.data, 5000)
        with plt.style.context('ggplot'):
            fig, ax = plt.subplots(nrows=1, ncols=1)
            # NOTE(review): ``fontsize`` is not a documented violinplot
            # argument and ``scale`` is deprecated in newer seaborn releases;
            # kept as-is -- confirm against the pinned seaborn version.
            sns.violinplot(
                data=data, x='Age', y='Gender', hue='Gender',
                ax=ax, split=True, scale='count', linewidth=4, fontsize=16)
        return fig

    def age_gender_stats_interactive(self, category: list[str]):
        """Return a split violin plot of ``Age`` per selected ``AgeGroup``.

        Args:
            category: Age-group labels to include. ``None`` or an empty
                sequence falls back to ``"<25"`` and ``"25-40"``.
        """
        if category is None or len(category) == 0:
            category = self._DEFAULT_AGE_GROUPS
        # A plain list gives a repr that ``query`` can parse.
        category = list(category)

        data = self.data.query(f'AgeGroup in {category}')
        data = self._sample_at_most(data, 5000)
        with plt.style.context('ggplot'):
            fig, ax = plt.subplots(nrows=1, ncols=1)
            # Draw on the axes created above explicitly instead of relying on
            # pyplot's "current axes".
            sns.violinplot(
                data=data,
                x='Age', y='AgeGroup', hue='Gender', split=True, ax=ax)
        return fig

    def plot_age(
            self, color_encode=False, element='poly',
            sample_size=1000, binrange=None, binwith=None,
            kde=False, fill=True, hatch=''):
        """Return a histogram of ``Age`` over a random sample of rows.

        Args:
            color_encode: When true, colour the histogram by ``AgeGroup``.
            element: Forwarded to ``seaborn.histplot``.
            sample_size: Maximum number of rows to plot.
            binrange: Forwarded to ``seaborn.histplot``.
            binwith: Forwarded to ``seaborn.histplot`` as ``binwidth`` (the
                parameter spelling is part of the public signature).
            kde: Forwarded to ``seaborn.histplot``.
            fill: Forwarded to ``seaborn.histplot``.
            hatch: Forwarded to ``seaborn.histplot``.
        """
        data = self._sample_at_most(self.data, sample_size)
        fig, ax = plt.subplots(nrows=1, ncols=1)

        # Blue ticks and frame around the plot.
        ax.tick_params(color='tab:blue', labelcolor='gray', width=2)
        for spine in ax.spines.values():
            spine.set_edgecolor('tab:blue')
            spine.set_linewidth(2)

        with plt.style.context('fivethirtyeight'):
            sns.histplot(
                data=data, x='Age', element=element,
                hue='AgeGroup' if color_encode else None,
                binrange=binrange, binwidth=binwith,
                ax=ax, kde=kde, fill=fill, hatch=hatch)
        return fig

    def age_realted_query(self, query):
        """Filter rows by an age range and plot ``Age`` histograms per ``Fraud`` value.

        Args:
            query: One of the labels in ``_AGE_FILTERS``.

        Returns:
            A ``(rows, figure)`` tuple: the filtered DataFrame and the
            figure of the seaborn ``FacetGrid``.

        Raises:
            KeyError: If ``query`` is not a known label.
        """
        # The two narrow ranges get finer bins.
        narrow_ranges = ('Less than 25', 'Between 50 and 60')
        binwidth = 5 if query in narrow_ranges else 10

        res = self.data.query(self._AGE_FILTERS[query])
        grid = sns.FacetGrid(data=res, col='Fraud', sharey=False)
        grid.map_dataframe(
            func=sns.histplot, x='Age',
            binwidth=binwidth, hatch='-', ec='white')
        return res, grid.figure

    def KDE_plot_age_group_and_transaction_amount(
            self, query, sample_size=1000, fraud_only=False, age_group="All"):
        """Return a 2-D KDE plot of ``Amount`` against ``Age``, coloured by ``AgeGroup``.

        Args:
            query: One of the labels in ``_AMOUNT_FILTERS``.
            sample_size: Maximum number of rows to plot.
            fraud_only: Keep only rows whose ``Fraud`` column equals ``'Yes'``.
            age_group: ``AgeGroup`` value to keep, or ``'All'`` for no filter.

        Raises:
            KeyError: If ``query`` is not a known label.
        """
        data = self.data.query(self._AMOUNT_FILTERS[query])
        if fraud_only:
            data = data.query("Fraud=='Yes'")
        if age_group != 'All':
            data = data.query(f"AgeGroup=='{age_group}'")
        data = self._sample_at_most(data, sample_size)

        with plt.style.context('fivethirtyeight'):
            fig, ax = plt.subplots(nrows=1, ncols=1)
            sns.kdeplot(
                data=data, x='Age', y='Amount', hue='AgeGroup', fill=True, ax=ax)
            ax.grid(color='white')
        return fig

    def compare_transactions_across_age_group(
            self, sample_size=1000, lowerbound=0, upperbound=100
    ):
        """Return a 2-D histogram of ``Amount`` against ``AgeGroup``.

        Args:
            sample_size: Maximum number of rows to plot.
            lowerbound: Smallest ``Amount`` kept (inclusive).
            upperbound: Largest ``Amount`` kept (inclusive).
        """
        data = self.data.query(
            f"Amount >={lowerbound} and Amount <= {upperbound}")
        data = self._sample_at_most(data, sample_size)

        with plt.style.context('fivethirtyeight'):
            fig, ax = plt.subplots(nrows=1, ncols=1)
            sns.histplot(
                data=data, x='Amount',
                y='AgeGroup', hue='AgeGroup', ax=ax)
            ax.grid(color='white', linewidth=2)
        return fig

    def transaction_amount_study(self, query: str = 'Less than 500', age_group: str = 'All'):
        """Return a polygon histogram of ``Amount`` coloured by ``AgeGroup``.

        Args:
            query: One of the labels in ``_AMOUNT_FILTERS``.
            age_group: ``AgeGroup`` value to keep, or ``'All'`` for no filter.

        Raises:
            KeyError: If ``query`` is not a known label.
        """
        data = self.data.query(self._AMOUNT_FILTERS[query])
        if age_group != 'All':
            data = data.query(f"AgeGroup=='{age_group}'")

        with plt.style.context('fivethirtyeight'):
            fig, ax = plt.subplots(nrows=1, ncols=1)
            sns.histplot(
                data=data, x='Amount',
                hue='AgeGroup', element='poly', ax=ax)
        return fig