msi commited on
Commit
d032558
·
1 Parent(s): c6f428a

Add application file

Browse files
app.py.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import streamlit as st
3
+ import json
4
+ import pandas as pd
5
+
6
+ # Load configuration data from JSON
7
+ with open("backend/src/store.json", "r") as file:
8
+ x = json.load(file)
9
+
10
+ # Page Title
11
+ import streamlit as st
12
+ import pandas as pd
13
+ import plotly.express as px
14
+ import requests
15
+ import json
16
+
17
+ # Helper function to map Yes/No to 1/0
18
+ def gett(choice):
19
+ return 1 if choice == "Yes" else 0
20
+
21
+ # Load configuration data from JSON
22
+ with open("backend/src/store.json", "r") as file:
23
+ x = json.load(file)
24
+
25
+ # Load the data
26
+ @st.cache_data
27
+ def load_data():
28
+ file_path = "data/loan_data.csv" # Update this with your file path
29
+ return pd.read_csv(file_path)
30
+
31
+ df = load_data()
32
+
33
+ # Sidebar with radio buttons for navigation
34
+ sidebar_option = st.sidebar.radio("Select Page", ["Dashboard", "Prediction"])
35
+
36
+ if sidebar_option == "Dashboard":
37
+ # Dashboard Page
38
+ st.title("SBA Loans Dashboard")
39
+ st.markdown("Explore loan trends, analyze defaults, and gain insights into SBA loan data.")
40
+
41
+ # Sidebar Filters for Dashboard
42
+ st.sidebar.header("Filters")
43
+ states = st.sidebar.multiselect("Select State(s):", df["State"].unique(), default=df["State"].unique())
44
+ industries = st.sidebar.multiselect("Select Industry(s):", df["Industry"].unique(), default=df["Industry"].unique())
45
+ approval_year = st.sidebar.slider("Approval Fiscal Year Range:",
46
+ int(df["ApprovalFY"].min()),
47
+ int(df["ApprovalFY"].max()),
48
+ (int(df["ApprovalFY"].min()), int(df["ApprovalFY"].max())))
49
+
50
+ # Filter Data
51
+ filtered_data = df[(df["State"].isin(states)) &
52
+ (df["Industry"].isin(industries)) &
53
+ (df["ApprovalFY"] >= approval_year[0]) &
54
+ (df["ApprovalFY"] <= approval_year[1])]
55
+
56
+ # Overview Metrics
57
+ st.header("Key Metrics")
58
+ col1, col2, col3, col4 = st.columns(4)
59
+ col1.metric("Total Loans", len(filtered_data))
60
+ col2.metric("Default Rate", f"{(filtered_data['Default'].mean() * 100):.2f}%")
61
+ col3.metric("Avg Loan Amount", f"${filtered_data['DisbursementGross'].mean():,.2f}")
62
+ col4.metric("Franchise Loans", f"{filtered_data['IsFranchise'].sum()}")
63
+
64
+ # Visualizations
65
+ st.header("Visualizations")
66
+
67
+ # Loan Distribution by State
68
+ fig_state = px.bar(filtered_data.groupby("State").size().reset_index(name="Loans"),
69
+ x="State", y="Loans", title="Loan Distribution by State")
70
+ st.plotly_chart(fig_state)
71
+
72
+ # Loan Amount vs Default Rate
73
+ fig_default = px.scatter(filtered_data,
74
+ x="DisbursementGross", y="Default",
75
+ color="Industry",
76
+ title="Loan Amount vs Default Rate",
77
+ size="GrAppv", hover_data=["State"])
78
+ st.plotly_chart(fig_default)
79
+
80
+ # Trends Over Time
81
+ fig_trend = px.line(filtered_data.groupby("ApprovalFY").size().reset_index(name="Loans"),
82
+ x="ApprovalFY", y="Loans", title="Loan Trends Over Time")
83
+ st.plotly_chart(fig_trend)
84
+
85
+ # Default Rate by Industry
86
+ fig_industry = px.bar(filtered_data.groupby("Industry")["Default"].mean().reset_index(name="Default Rate"),
87
+ x="Industry", y="Default Rate",
88
+ title="Default Rate by Industry", text_auto=".2f")
89
+ st.plotly_chart(fig_industry)
90
+
91
+ elif sidebar_option == "Prediction":
92
+ # Prediction Page
93
+ st.title("Loan Default Prediction")
94
+ st.markdown("Enter loan details to predict whether the loan will default.")
95
+
96
+ # Creating the form fields
97
+ with st.form("form1", clear_on_submit=False):
98
+ state = st.selectbox("Enter your State", tuple(x['State'].keys()))
99
+ category = st.selectbox("Enter your Bank State", tuple(x['BankState'].keys()))
100
+ appY = st.selectbox("Select your Approval Year",
101
+ (1997, 1980, 2006, 1998, 1999, 2000, 2001, 1972, 2003, 2004, 1978,
102
+ 1979, 1981, 2005, 1982, 1983, 1973, 1984, 2007, 1985, 1986, 1987,
103
+ 2008, 1988, 2009, 1989, 1991, 1990, 1974, 2010, 1992, 1993, 2002,
104
+ 1994, 1975, 1977, 1976, 1969, 1995, 1970, 1996, 1971))
105
+ term = st.text_input("Term", "0")
106
+ noemp = st.text_input("Number of Employees", "0")
107
+ urban = st.selectbox("Select the Zone Type", ("Urban", "Rural", "Undefined"))
108
+ rev = st.selectbox("Select Revolving Line of Credit", ("Yes", "No"))
109
+ low = st.selectbox("Select LowDoc Loan Program", ("Yes", "No"))
110
+ disb = st.text_input("Enter the Amount Disbursed", "0")
111
+ merch_long = st.text_input("Enter Gross Amount of Loan Approved by Bank", "0")
112
+ indus = st.selectbox("Enter your Industry Category", tuple(x['Industry'].keys()))
113
+ fran = st.selectbox("Is it a Franchise?", ("Yes", "No"))
114
+ busi = st.selectbox("Is it a New Business?", ("Yes", "No"))
115
+ disY = st.selectbox("Select your Disbursement Year",
116
+ (1999, 1997, 1980, 1998, 2006, 2002, 2001, 2000, 2003, 1982, 2004,
117
+ 2071, 2005, 2009, 2007, 2008, 1981, 2072, 1978, 1979, 1996, 2010,
118
+ 1995, 2012, 1983, 1985, 1984, 2048, 1987, 2073, 1986, 2011, 1988,
119
+ 1989, 2013, 1990, 1991, 2014, 1992, 1993, 1994, 2020, 1974, 2028,
120
+ 1975, 1976, 1977, 2069, 2070))
121
+ days_dis = st.text_input("Enter the Days to Disbursement", "0")
122
+ sba = st.text_input("Enter SBA's Guaranteed Amount of Approved Loan", "0")
123
+ appvD = st.selectbox("Is it AppvDisbursed?", ("Yes", "No"))
124
+ realsta = st.selectbox("Is it Real Estate?", ("Yes", "No"))
125
+ great = st.selectbox("During the Great Recession?", ("Yes", "No"))
126
+
127
+ # Prepare data for API
128
+ dd = {
129
+ "State": x['State'][state],
130
+ "BankState": x['BankState'][category],
131
+ "ApprovalFY": appY,
132
+ "Term": term,
133
+ "NoEmp": noemp,
134
+ "UrbanRural": urban,
135
+ "RevLineCr": gett(rev),
136
+ "LowDoc": gett(low),
137
+ "DisbursementGross": disb,
138
+ "GrAppv": merch_long,
139
+ "Industry": x['Industry'][indus],
140
+ "IsFranchise": gett(fran),
141
+ "NewBusiness": gett(busi),
142
+ "DisbursementFY": disY,
143
+ "DaysToDisbursement": days_dis,
144
+ "SBA_AppvPct": sba,
145
+ "AppvDisbursed": gett(appvD),
146
+ "RealEstate": gett(realsta),
147
+ "GreatRecession": gett(great),
148
+ }
149
+
150
+ submit = st.form_submit_button("Submit this form")
151
+ if submit:
152
+ try:
153
+ res = requests.post("http://127.0.0.1:8000/predict", data=json.dumps(dd))
154
+ predictions = res.json().get("predictions")
155
+ if predictions == [0]:
156
+ st.success("Paid In Full, The loan was successfully repaid. 😃")
157
+ else:
158
+ st.error("Charged Off, The loan defaulted and was written off as a loss. 🚨")
159
+ except Exception as e:
160
+ st.error(f"Error: {e}")
161
+
162
+ # File uploader for historical transactions
163
+ st.subheader("Or Enter your Historical Transactions CSV File")
164
+ data = st.file_uploader("Choose a CSV File")
165
+
166
+ if data is not None:
167
+ try:
168
+ file = {"file": data.getvalue()}
169
+ res = requests.post("http://127.0.0.1:8000/predict/csv", files=file)
170
+ predictions = res.json().get("predictions")
171
+ st.text(predictions)
172
+ except Exception as e:
173
+ st.error(f"Error: {e}")
backend/example_json/transaction_info.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from pydantic import BaseModel
3
+ import datetime
4
+
5
+
6
+ class TransactionModel(BaseModel):
7
+ State: object = Field(...)
8
+ BankState: object = Field(...)
9
+ ApprovalFY: int = Field(...)
10
+ Term: int = Field(...)
11
+ NoEmp: int = Field(...)
12
+ UrbanRural: int = Field(...)
13
+ RevLineCr: int = Field(...)
14
+ LowDoc: int = Field(...)
15
+ DisbursementGross: float = Field(...)
16
+ GrAppv: float = Field(...)
17
+ Industry: object = Field(...)
18
+ IsFranchise: int = Field(...)
19
+ NewBusiness: int = Field(...)
20
+ DisbursementFY: int = Field(...)
21
+ DaysToDisbursement: int = Field(...)
22
+ SBA_AppvPct: float = Field(...)
23
+ AppvDisbursed: int = Field(...)
24
+ RealEstate: int = Field(...)
25
+ GreatRecession: int = Field(...)
26
+
27
+
28
+ class Config:
29
+ populate_by_name = True
30
+ arbitrary_types_allowed = True
31
+ json_schema_extra = {
32
+ "example": {
33
+ "State": "AK",
34
+ "BankState" : "AK",
35
+ "ApprovalFY": 1994,
36
+ "Term": 84,
37
+ "NoEmp": 5,
38
+ "UrbanRural": 0 ,
39
+ "RevLineCr": 0,
40
+ "LowDoc":0,
41
+ "DisbursementGross":60000.0 ,
42
+ 'GrAppv':60000.0,
43
+ 'Industry':"Retail_trade" ,
44
+ 'IsFranchise':0,
45
+ 'NewBusiness':1,
46
+ 'DisbursementFY':1997,
47
+ 'DaysToDisbursement':870,
48
+ 'SBA_AppvPct':0.80,
49
+ 'AppvDisbursed':1,
50
+ 'RealEstate':0,
51
+ 'GreatRecession':1
52
+
53
+ }
54
+ }
backend/main.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from operator import index
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ from datetime import datetime
6
+ import sklearn
7
+ from fastapi import FastAPI, File, UploadFile
8
+ import uvicorn
9
+ import sys
10
+ from fastapi.middleware.cors import CORSMiddleware
11
+ from pydantic import BaseModel
12
+ import mlflow
13
+ from src.clean_data_json import scaling,clean_data
14
+ #from src.clean_data_json import clean_data_json
15
+ from example_json.transaction_info import TransactionModel
16
+ import mlflow.pyfunc
17
+
18
+
19
+
20
+
21
+
22
+
23
+ """
24
+ from dotenv import load_dotenv
25
+ import os
26
+ load_dotenv("../backend/src/.env")
27
+
28
+ DagsHub_username = os.getenv("DagsHub_username")
29
+ DagsHub_token=os.getenv("DagsHub_token")
30
+ os.environ['MLFLOW_TRACKING_USERNAME']= DagsHub_username
31
+ os.environ["MLFLOW_TRACKING_PASSWORD"] = DagsHub_token
32
+ """
33
+
34
+ #setup mlflow
35
+ mlflow.set_tracking_uri("file:///C:/Users/msi/Desktop/mlops/mlruns")
36
+ mlflow.set_experiment("loan_approval_prediction")
37
+ mlflow.sklearn.autolog(disable=True)
38
+
39
+ app = FastAPI()
40
+ origins = ['*']
41
+
42
+ app.add_middleware(
43
+ CORSMiddleware,
44
+ allow_origins=origins,
45
+ allow_credentials=True,
46
+ allow_methods=["*"],
47
+ allow_headers=["*"],
48
+ )
49
+
50
+
51
+ #let's call the model from the model registry ( in production stage)
52
+
53
+ df_mlflow=mlflow.search_runs(filter_string="metrics.F1_score_test < 1")
54
+ run_id = '508f98877c244ee58b2ee59373384b32'
55
+
56
+
57
+
58
+ logged_model = f'runs:/{run_id}/ML_models'
59
+
60
+ # Load model as a PyFuncModel.
61
+ model = mlflow.pyfunc.load_model(logged_model)
62
+
63
+ @app.get("/")
64
+ def read_root():
65
+ return {"Hello": "to fraud detector app version 2"}
66
+
67
+ # this endpoint receives data in the form of csv file (histotical transactions data)
68
+ @app.post("/predict/csv")
69
+ def return_predictions(file: UploadFile = File(...)):
70
+ data = pd.read_csv(file.file)
71
+ data=data.drop(columns=['Unnamed: 0','Default'])
72
+ preprocessed_data = clean_data(data)
73
+ scaled=scaling(preprocessed_data)
74
+ predictions = model.predict(scaled)
75
+ return {"predictions": predictions.tolist()}
76
+
77
+
78
+ # this endpoint receives data in the form of json (informations about one transaction)
79
+ @app.post("/predict")
80
+ def predict(data : TransactionModel):
81
+ received = data.dict()
82
+ df = pd.DataFrame(received,index=[0])
83
+ scl=clean_data(df)
84
+ preprocessed_data = scaling(scl)
85
+ print(preprocessed_data)
86
+ predictions = model.predict(preprocessed_data)
87
+ return {"predictions": predictions.tolist()}
88
+
89
+
90
+ if __name__ == "__main__":
91
+ uvicorn.run("main:app", host="0.0.0.0", port=8080)
92
+
93
+
backend/src/clean_data_json.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+ from sklearn.preprocessing import StandardScaler
4
+
5
+ scale = StandardScaler()
6
+
7
+
8
+ with open("../backend/src/store.json", "r") as file:
9
+ x = json.load(file)
10
+ def clean_data(df):
11
+ df['State'] = df['State'].map(x['State'])
12
+ df['BankState'] = df['BankState'].map(x['BankState'])
13
+ df['Industry'] = df['Industry'].map(x['Industry'])
14
+ return df
15
+
16
+ # Function to scale data
17
+ def scaling(df):
18
+ # Only scale numerical columns
19
+ num_cols = df.select_dtypes(include=['number']).columns
20
+ df[num_cols] = scale.fit_transform(df[num_cols])
21
+ return df
backend/src/data_preprocessing.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+ from sklearn.preprocessing import StandardScaler
4
+ import numpy as np
5
+ from sklearn.preprocessing import LabelEncoder
6
+
7
+ encoder = LabelEncoder()
8
+ scale = StandardScaler()
9
+ def convert_money(x):
10
+ if isinstance(x, str): # Check if x is a string
11
+ x = x[1:].replace(',', '') # Remove the dollar sign and commas
12
+ return float(x)
13
+ elif isinstance(x, (int, float)): # If already numeric, return as is
14
+ return float(x)
15
+ else:
16
+ return None # Handle unexpected types gracefully
17
+
18
+ def clean_year(x):
19
+ if isinstance(x, str):
20
+ return x.replace('A', '')
21
+ return x
22
+
23
+ with open("../backend/src/store.json", "r") as file:
24
+ x = json.load(file)
25
+
26
+ def transform_data(df):
27
+ df[['DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv']] = df[['DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv']].applymap(convert_money)
28
+ df['State'] = df['State'].map(x['State'])
29
+ df['BankState'] = df['BankState'].map(x['BankState'])
30
+ df['Industry'] = df['Industry'].map(x['Industry'])
31
+ df['ApprovalFY'] = df['ApprovalFY'].apply(clean_year).astype('int64')
32
+ df.dropna(subset=['Name', 'City', 'State', 'BankState', 'NewExist','RevLineCr', 'LowDoc', 'DisbursementDate', 'MIS_Status'], inplace=True)
33
+ df = df.astype({'Zip': 'str', 'NewExist': 'int64', 'UrbanRural': 'str', 'DisbursementGross': 'float', 'BalanceGross': 'float',
34
+ 'ChgOffPrinGr': 'float', 'GrAppv': 'float', 'SBA_Appv': 'float'})
35
+ df['Industry'] = df['NAICS'].astype('str').apply(lambda x: x[:2])
36
+ df['Industry'] = df['Industry'].map({
37
+ '0':'Unknown',
38
+ '11': 'Ag/For/Fish/Hunt',
39
+ '21': 'Min/Quar/Oil_Gas_ext',
40
+ '22': 'Utilities',
41
+ '23': 'Construction',
42
+ '31': 'Manufacturing',
43
+ '32': 'Manufacturing',
44
+ '33': 'Manufacturing',
45
+ '42': 'Wholesale_trade',
46
+ '44': 'Retail_trade',
47
+ '45': 'Retail_trade',
48
+ '48': 'Trans/Ware',
49
+ '49': 'Trans/Ware',
50
+ '51': 'Information',
51
+ '52': 'Finance/Insurance',
52
+ '53': 'RE/Rental/Lease',
53
+ '54': 'Prof/Science/Tech',
54
+ '55': 'Mgmt_comp',
55
+ '56': 'Admin_sup/Waste_Mgmt_Rem',
56
+ '61': 'Educational',
57
+ '62': 'Healthcare/Social_assist',
58
+ '71': 'Arts/Entertain/Rec',
59
+ '72': 'Accom/Food_serv',
60
+ '81': 'Other_no_pub',
61
+ '92': 'Public_Admin'
62
+ })
63
+ df.dropna(subset=['Industry'], inplace=True)
64
+ df.loc[(df['FranchiseCode'] <= 1), 'IsFranchise'] = 0
65
+ df.loc[(df['FranchiseCode'] > 1), 'IsFranchise'] = 1
66
+ df = df.astype({'IsFranchise': 'int64'})
67
+ df = df[(df['NewExist'] == 1) | (df['NewExist'] == 2)]
68
+
69
+ # Create NewBusiness field where 0 = Existing business and 1 = New business; based on NewExist field
70
+ df.loc[(df['NewExist'] == 1), 'NewBusiness'] = 0
71
+ df.loc[(df['NewExist'] == 2), 'NewBusiness'] = 1
72
+ df = df[(df['RevLineCr'] == 'Y') | (df['RevLineCr'] == 'N')]
73
+ df = df[(df['LowDoc'] == 'Y') | (df['LowDoc'] == 'N')]
74
+
75
+ # RevLineCr and LowDoc: 0 = No, 1 = Yes
76
+ df['RevLineCr'] = np.where(df['RevLineCr'] == 'N', 0, 1)
77
+ df['LowDoc'] = np.where(df['LowDoc'] == 'N', 0, 1)
78
+ df['Default'] = np.where(df['MIS_Status'] == 'P I F', 0, 1)
79
+ df[['ApprovalDate', 'DisbursementDate']] = df[['ApprovalDate', 'DisbursementDate']].apply(pd.to_datetime)
80
+ df['DisbursementFY'] = df['DisbursementDate'].map(lambda x: x.year)
81
+ df['DaysToDisbursement'] = df['DisbursementDate'] - df['ApprovalDate']
82
+ df['DaysToDisbursement'] = df['DaysToDisbursement'].astype('str').apply(lambda x: x[:x.index('d') - 1]).astype('int64')
83
+ df['SBA_AppvPct'] = df['SBA_Appv'] / df['GrAppv']
84
+ df['AppvDisbursed'] = np.where(df['DisbursementGross'] == df['GrAppv'], 1, 0)
85
+ df = df.astype({'IsFranchise': 'int64', 'NewBusiness': 'int64'})
86
+ df.drop(columns=['LoanNr_ChkDgt', 'Name', 'City', 'Zip', 'Bank', 'NAICS', 'ApprovalDate', 'NewExist', 'FranchiseCode',
87
+ 'ChgOffDate', 'DisbursementDate', 'BalanceGross', 'ChgOffPrinGr', 'SBA_Appv', 'MIS_Status','CreateJob','RetainedJob'], inplace=True)
88
+ # Field for loans backed by Real Estate (loans with a term of at least 20 years)
89
+ df['RealEstate'] = np.where(df['Term'] >= 240, 1, 0)
90
+
91
+ # Field for loans active during the Great Recession (2007-2009)
92
+ df['GreatRecession'] = np.where(((2007 <= df['DisbursementFY']) & (df['DisbursementFY'] <= 2009)) |
93
+ ((df['DisbursementFY'] < 2007) & (df['DisbursementFY'] + (df['Term']/12) >= 2007)), 1, 0)
94
+ df['DisbursedGreaterAppv'] = np.where(df['DisbursementGross'] > df['GrAppv'], 1, 0)
95
+ for column in df.select_dtypes(include='object').columns:
96
+ # Encode the column
97
+ df[column] = encoder.fit_transform(df[column])
98
+ y = df['Default']
99
+ X = df.drop('Default', axis=1)
100
+ return X,y
101
+
102
+
backend/src/store.json ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "State": {
3
+ "AK": 0,
4
+ "AL": 1,
5
+ "AR": 2,
6
+ "AZ": 3,
7
+ "CA": 4,
8
+ "CO": 5,
9
+ "CT": 6,
10
+ "DC": 7,
11
+ "DE": 8,
12
+ "FL": 9,
13
+ "GA": 10,
14
+ "HI": 11,
15
+ "IA": 12,
16
+ "ID": 13,
17
+ "IL": 14,
18
+ "IN": 15,
19
+ "KS": 16,
20
+ "KY": 17,
21
+ "LA": 18,
22
+ "MA": 19,
23
+ "MD": 20,
24
+ "ME": 21,
25
+ "MI": 22,
26
+ "MN": 23,
27
+ "MO": 24,
28
+ "MS": 25,
29
+ "MT": 26,
30
+ "NC": 27,
31
+ "ND": 28,
32
+ "NE": 29,
33
+ "NH": 30,
34
+ "NJ": 31,
35
+ "NM": 32,
36
+ "NV": 33,
37
+ "NY": 34,
38
+ "OH": 35,
39
+ "OK": 36,
40
+ "OR": 37,
41
+ "PA": 38,
42
+ "RI": 39,
43
+ "SC": 40,
44
+ "SD": 41,
45
+ "TN": 42,
46
+ "TX": 43,
47
+ "UT": 44,
48
+ "VA": 45,
49
+ "VT": 46,
50
+ "WA": 47,
51
+ "WI": 48,
52
+ "WV": 49,
53
+ "WY": 50
54
+ },
55
+ "BankState": {
56
+ "AK": 0,
57
+ "AL": 1,
58
+ "AR": 2,
59
+ "AZ": 3,
60
+ "CA": 4,
61
+ "CO": 5,
62
+ "CT": 6,
63
+ "DC": 7,
64
+ "DE": 8,
65
+ "EN": 9,
66
+ "FL": 10,
67
+ "GA": 11,
68
+ "GU": 12,
69
+ "HI": 13,
70
+ "IA": 14,
71
+ "ID": 15,
72
+ "IL": 16,
73
+ "IN": 17,
74
+ "KS": 18,
75
+ "KY": 19,
76
+ "LA": 20,
77
+ "MA": 21,
78
+ "MD": 22,
79
+ "ME": 23,
80
+ "MI": 24,
81
+ "MN": 25,
82
+ "MO": 26,
83
+ "MS": 27,
84
+ "MT": 28,
85
+ "NC": 29,
86
+ "ND": 30,
87
+ "NE": 31,
88
+ "NH": 32,
89
+ "NJ": 33,
90
+ "NM": 34,
91
+ "NV": 35,
92
+ "NY": 36,
93
+ "OH": 37,
94
+ "OK": 38,
95
+ "OR": 39,
96
+ "PA": 40,
97
+ "PR": 41,
98
+ "RI": 42,
99
+ "SC": 43,
100
+ "SD": 44,
101
+ "TN": 45,
102
+ "TX": 46,
103
+ "UT": 47,
104
+ "VA": 48,
105
+ "VT": 49,
106
+ "WA": 50,
107
+ "WI": 51,
108
+ "WV": 52,
109
+ "WY": 53
110
+ },
111
+ "Industry": {
112
+ "Accom/Food_serv": 0,
113
+ "Admin_sup/Waste_Mgmt_Rem": 1,
114
+ "Ag/For/Fish/Hunt": 2,
115
+ "Arts/Entertain/Rec": 3,
116
+ "Construction": 4,
117
+ "Educational": 5,
118
+ "Finance/Insurance": 6,
119
+ "Healthcare/Social_assist": 7,
120
+ "Information": 8,
121
+ "Manufacturing": 9,
122
+ "Mgmt_comp": 10,
123
+ "Min/Quar/Oil_Gas_ext": 11,
124
+ "Other_no_pub": 12,
125
+ "Prof/Science/Tech": 13,
126
+ "Public_Admin": 14,
127
+ "RE/Rental/Lease": 15,
128
+ "Retail_trade": 16,
129
+ "Trans/Ware": 17,
130
+ "Unknown": 18,
131
+ "Utilities": 19,
132
+ "Wholesale_trade": 20
133
+ }
134
+ }
backend/src/store.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {'State':{'AK': 0, 'AL': 1, 'AR': 2, 'AZ': 3, 'CA': 4, 'CO': 5, 'CT': 6, 'DC': 7, 'DE': 8, 'FL': 9, 'GA': 10, 'HI': 11, 'IA': 12, 'ID': 13, 'IL': 14, 'IN': 15, 'KS': 16, 'KY': 17, 'LA': 18, 'MA': 19, 'MD': 20, 'ME': 21, 'MI': 22, 'MN': 23, 'MO': 24, 'MS': 25, 'MT': 26, 'NC': 27, 'ND': 28, 'NE': 29, 'NH': 30, 'NJ': 31, 'NM': 32, 'NV': 33, 'NY': 34, 'OH': 35, 'OK': 36, 'OR': 37, 'PA': 38, 'RI': 39, 'SC': 40, 'SD': 41, 'TN': 42, 'TX': 43, 'UT': 44, 'VA': 45, 'VT': 46, 'WA': 47, 'WI': 48, 'WV': 49, 'WY': 50}}
2
+ {'BankState':{'AK': 0, 'AL': 1, 'AR': 2, 'AZ': 3, 'CA': 4, 'CO': 5, 'CT': 6, 'DC': 7, 'DE': 8, 'EN': 9, 'FL': 10, 'GA': 11, 'GU': 12, 'HI': 13, 'IA': 14, 'ID': 15, 'IL': 16, 'IN': 17, 'KS': 18, 'KY': 19, 'LA': 20, 'MA': 21, 'MD': 22, 'ME': 23, 'MI': 24, 'MN': 25, 'MO': 26, 'MS': 27, 'MT': 28, 'NC': 29, 'ND': 30, 'NE': 31, 'NH': 32, 'NJ': 33, 'NM': 34, 'NV': 35, 'NY': 36, 'OH': 37, 'OK': 38, 'OR': 39, 'PA': 40, 'PR': 41, 'RI': 42, 'SC': 43, 'SD': 44, 'TN': 45, 'TX': 46, 'UT': 47, 'VA': 48, 'VT': 49, 'WA': 50, 'WI': 51, 'WV': 52, 'WY': 53}}
3
+ {'Industry':{'Accom/Food_serv': 0, 'Admin_sup/Waste_Mgmt_Rem': 1, 'Ag/For/Fish/Hunt': 2, 'Arts/Entertain/Rec': 3, 'Construction': 4, 'Educational': 5, 'Finance/Insurance': 6, 'Healthcare/Social_assist': 7, 'Information': 8, 'Manufacturing': 9, 'Mgmt_comp': 10, 'Min/Quar/Oil_Gas_ext': 11, 'Other_no_pub': 12, 'Prof/Science/Tech': 13, 'Public_Admin': 14, 'RE/Rental/Lease': 15, 'Retail_trade': 16, 'Trans/Ware': 17, 'Unknown': 18, 'Utilities': 19, 'Wholesale_trade': 20}}
backend/test_app.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import pickle
3
+ import pandas as pd
4
+ import sys
5
+ from src.clean_data_json import clean_data_json
6
+ from src.data_preprocessing_training import transform_data
7
+ import mlflow
8
+ #from dotenv import load_dotenv
9
+ import os
10
+
11
+ from dotenv import load_dotenv
12
+
13
+ load_dotenv("../backend/src/.env")
14
+
15
+ DagsHub_username = os.getenv("DagsHub_username")
16
+ DagsHub_token=os.getenv("DagsHub_token")
17
+
18
+ os.environ['MLFLOW_TRACKING_USERNAME']= DagsHub_username
19
+ os.environ["MLFLOW_TRACKING_PASSWORD"] = DagsHub_token
20
+
21
+ """
22
+ os.environ['MLFLOW_TRACKING_USERNAME']= "..."
23
+ os.environ["MLFLOW_TRACKING_PASSWORD"] = "..."
24
+ """
25
+
26
+ #setup mlflow
27
+ mlflow.set_tracking_uri('https://dagshub.com/.../....mlflow') #your mlfow tracking uri
28
+
29
+
30
+ #tests if the model works as expected
31
+
32
+ def test_model_use():
33
+
34
+ #let's call the model from the model registry ( in production stage)
35
+
36
+ all_experiments = [exp.experiment_id for exp in mlflow.search_experiments()]
37
+ df_mlflow = mlflow.search_runs(experiment_ids=all_experiments,filter_string="metrics.F1_score_test <1")
38
+ run_id = df_mlflow.loc[df_mlflow['metrics.F1_score_test'].idxmax()]['run_id']
39
+
40
+
41
+ logged_model = f'runs:/{run_id}/ML_models'
42
+
43
+ # Load model as a PyFuncModel.
44
+ model = mlflow.pyfunc.load_model(logged_model)
45
+
46
+ d = {'trans_date_trans_time': "2019-01-18 23:20:16", "category" : "shopping_net",
47
+ "amt": 1334.07,
48
+ "gender": 'F',
49
+ "zip": 29438,
50
+ "lat": 32.5486 ,
51
+ "long": -80.307,
52
+ "dob": "1997-07-05",
53
+ "merch_lat": 31.615611,
54
+ "merch_long": -79.702908}
55
+ df = pd.DataFrame(data=d,index=[0])
56
+ dd = clean_data_json(df)
57
+ predict_result = model.predict(dd)
58
+ print(predict_result[0])
59
+ assert predict_result[0] == 1
requirements.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data Manipulation and Visualization
2
+ pandas==2.1.1
3
+ numpy==1.26.0
4
+ matplotlib==3.8.0
5
+ seaborn==0.12.2
6
+
7
+ # Machine Learning
8
+ scikit-learn==1.3.1
9
+ xgboost==1.7.6
10
+ imbalanced-learn==0.11.0
11
+
12
+ # Deployment
13
+ fastapi==0.103.0
14
+ uvicorn[standard]==0.23.2
15
+
16
+ # Dashboard
17
+ streamlit==1.25.0
18
+
19
+ # Experiment Tracking
20
+ mlflow==2.8.1
21
+
22
+ # Monitoring
23
+ arize-ai==1.12.0
24
+
25
+ # Additional Tools
26
+ joblib==1.4.1
27
+ pyyaml==6.0
28
+