Spaces:
Sleeping
Sleeping
msi committed on
Commit ·
d032558
1
Parent(s): c6f428a
Add application file
Browse files- app.py.py +173 -0
- backend/example_json/transaction_info.py +54 -0
- backend/main.py +93 -0
- backend/src/clean_data_json.py +21 -0
- backend/src/data_preprocessing.py +102 -0
- backend/src/store.json +134 -0
- backend/src/store.txt +3 -0
- backend/test_app.py +59 -0
- requirements.txt +28 -0
app.py.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json

import pandas as pd
import plotly.express as px
import requests
import streamlit as st

# Base URL of the FastAPI backend (see backend/main.py).
API_URL = "http://127.0.0.1:8000"

# Integer encoding of the zone type used by the training data:
# 0 = Undefined, 1 = Urban, 2 = Rural.  TransactionModel.UrbanRural is an
# int, so the label from the select box must be mapped before posting.
ZONE_CODES = {"Undefined": 0, "Urban": 1, "Rural": 2}


# Helper function to map Yes/No to 1/0
def gett(choice):
    """Map a Yes/No select-box choice to the 1/0 encoding the model expects."""
    return 1 if choice == "Yes" else 0


# Load configuration data from JSON: the State / BankState / Industry label
# tables shown in the prediction form.  (Previously this file was opened
# twice and several modules were imported twice; deduplicated here.)
with open("backend/src/store.json", "r") as file:
    x = json.load(file)


# Load the data
@st.cache_data
def load_data():
    """Load the historical SBA loan dataset shown on the dashboard page."""
    file_path = "data/loan_data.csv"  # Update this with your file path
    return pd.read_csv(file_path)


df = load_data()

# Sidebar with radio buttons for navigation
sidebar_option = st.sidebar.radio("Select Page", ["Dashboard", "Prediction"])

if sidebar_option == "Dashboard":
    # Dashboard Page
    st.title("SBA Loans Dashboard")
    st.markdown("Explore loan trends, analyze defaults, and gain insights into SBA loan data.")

    # Sidebar Filters for Dashboard
    st.sidebar.header("Filters")
    states = st.sidebar.multiselect("Select State(s):", df["State"].unique(), default=df["State"].unique())
    industries = st.sidebar.multiselect("Select Industry(s):", df["Industry"].unique(), default=df["Industry"].unique())
    approval_year = st.sidebar.slider("Approval Fiscal Year Range:",
                                      int(df["ApprovalFY"].min()),
                                      int(df["ApprovalFY"].max()),
                                      (int(df["ApprovalFY"].min()), int(df["ApprovalFY"].max())))

    # Filter Data
    filtered_data = df[(df["State"].isin(states)) &
                       (df["Industry"].isin(industries)) &
                       (df["ApprovalFY"] >= approval_year[0]) &
                       (df["ApprovalFY"] <= approval_year[1])]

    # Overview Metrics
    st.header("Key Metrics")
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Total Loans", len(filtered_data))
    col2.metric("Default Rate", f"{(filtered_data['Default'].mean() * 100):.2f}%")
    col3.metric("Avg Loan Amount", f"${filtered_data['DisbursementGross'].mean():,.2f}")
    col4.metric("Franchise Loans", f"{filtered_data['IsFranchise'].sum()}")

    # Visualizations
    st.header("Visualizations")

    # Loan Distribution by State
    fig_state = px.bar(filtered_data.groupby("State").size().reset_index(name="Loans"),
                       x="State", y="Loans", title="Loan Distribution by State")
    st.plotly_chart(fig_state)

    # Loan Amount vs Default Rate
    fig_default = px.scatter(filtered_data,
                             x="DisbursementGross", y="Default",
                             color="Industry",
                             title="Loan Amount vs Default Rate",
                             size="GrAppv", hover_data=["State"])
    st.plotly_chart(fig_default)

    # Trends Over Time
    fig_trend = px.line(filtered_data.groupby("ApprovalFY").size().reset_index(name="Loans"),
                        x="ApprovalFY", y="Loans", title="Loan Trends Over Time")
    st.plotly_chart(fig_trend)

    # Default Rate by Industry
    fig_industry = px.bar(filtered_data.groupby("Industry")["Default"].mean().reset_index(name="Default Rate"),
                          x="Industry", y="Default Rate",
                          title="Default Rate by Industry", text_auto=".2f")
    st.plotly_chart(fig_industry)

elif sidebar_option == "Prediction":
    # Prediction Page
    st.title("Loan Default Prediction")
    st.markdown("Enter loan details to predict whether the loan will default.")

    # Creating the form fields
    with st.form("form1", clear_on_submit=False):
        state = st.selectbox("Enter your State", tuple(x['State'].keys()))
        category = st.selectbox("Enter your Bank State", tuple(x['BankState'].keys()))
        appY = st.selectbox("Select your Approval Year",
                            (1997, 1980, 2006, 1998, 1999, 2000, 2001, 1972, 2003, 2004, 1978,
                             1979, 1981, 2005, 1982, 1983, 1973, 1984, 2007, 1985, 1986, 1987,
                             2008, 1988, 2009, 1989, 1991, 1990, 1974, 2010, 1992, 1993, 2002,
                             1994, 1975, 1977, 1976, 1969, 1995, 1970, 1996, 1971))
        term = st.text_input("Term", "0")
        noemp = st.text_input("Number of Employees", "0")
        urban = st.selectbox("Select the Zone Type", ("Urban", "Rural", "Undefined"))
        rev = st.selectbox("Select Revolving Line of Credit", ("Yes", "No"))
        low = st.selectbox("Select LowDoc Loan Program", ("Yes", "No"))
        disb = st.text_input("Enter the Amount Disbursed", "0")
        merch_long = st.text_input("Enter Gross Amount of Loan Approved by Bank", "0")
        indus = st.selectbox("Enter your Industry Category", tuple(x['Industry'].keys()))
        fran = st.selectbox("Is it a Franchise?", ("Yes", "No"))
        busi = st.selectbox("Is it a New Business?", ("Yes", "No"))
        disY = st.selectbox("Select your Disbursement Year",
                            (1999, 1997, 1980, 1998, 2006, 2002, 2001, 2000, 2003, 1982, 2004,
                             2071, 2005, 2009, 2007, 2008, 1981, 2072, 1978, 1979, 1996, 2010,
                             1995, 2012, 1983, 1985, 1984, 2048, 1987, 2073, 1986, 2011, 1988,
                             1989, 2013, 1990, 1991, 2014, 1992, 1993, 1994, 2020, 1974, 2028,
                             1975, 1976, 1977, 2069, 2070))
        days_dis = st.text_input("Enter the Days to Disbursement", "0")
        sba = st.text_input("Enter SBA's Guaranteed Amount of Approved Loan", "0")
        appvD = st.selectbox("Is it AppvDisbursed?", ("Yes", "No"))
        realsta = st.selectbox("Is it Real Estate?", ("Yes", "No"))
        great = st.selectbox("During the Great Recession?", ("Yes", "No"))

        submit = st.form_submit_button("Submit this form")
        if submit:
            try:
                # Build the API payload.  BUG FIXES vs the original:
                # - State/BankState/Industry are sent as raw labels, because
                #   the backend's clean_data() performs the label->code
                #   encoding itself (sending pre-encoded ints made that
                #   mapping yield NaN);
                # - the zone label is mapped to its integer code, since the
                #   model declares UrbanRural as int ("Urban" failed
                #   validation);
                # - free-text numeric fields are converted explicitly so bad
                #   input surfaces as a readable error message below.
                dd = {
                    "State": state,
                    "BankState": category,
                    "ApprovalFY": appY,
                    "Term": int(term),
                    "NoEmp": int(noemp),
                    "UrbanRural": ZONE_CODES[urban],
                    "RevLineCr": gett(rev),
                    "LowDoc": gett(low),
                    "DisbursementGross": float(disb),
                    "GrAppv": float(merch_long),
                    "Industry": indus,
                    "IsFranchise": gett(fran),
                    "NewBusiness": gett(busi),
                    "DisbursementFY": disY,
                    "DaysToDisbursement": int(days_dis),
                    "SBA_AppvPct": float(sba),
                    "AppvDisbursed": gett(appvD),
                    "RealEstate": gett(realsta),
                    "GreatRecession": gett(great),
                }
                # requests' json= kwarg serializes and sets the Content-Type
                # header, unlike the old data=json.dumps(...) form.
                res = requests.post(f"{API_URL}/predict", json=dd)
                predictions = res.json().get("predictions")
                if predictions == [0]:
                    st.success("Paid In Full, The loan was successfully repaid. 😃")
                else:
                    st.error("Charged Off, The loan defaulted and was written off as a loss. 🚨")
            except Exception as e:
                st.error(f"Error: {e}")

    # File uploader for historical transactions
    st.subheader("Or Enter your Historical Transactions CSV File")
    data = st.file_uploader("Choose a CSV File")

    if data is not None:
        try:
            file = {"file": data.getvalue()}
            res = requests.post(f"{API_URL}/predict/csv", files=file)
            predictions = res.json().get("predictions")
            st.text(predictions)
        except Exception as e:
            st.error(f"Error: {e}")
backend/example_json/transaction_info.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field


class TransactionModel(BaseModel):
    """Request schema for a single loan posted to the /predict endpoint.

    State, BankState and Industry carry the raw labels (e.g. "AK",
    "Retail_trade"); the backend encodes them to integer codes via
    src/clean_data_json.clean_data before scoring — hence the permissive
    `object` annotation.  The Yes/No style flags arrive already encoded
    as 0/1 integers.
    """

    # Categorical labels, encoded server-side (see clean_data).
    State: object = Field(...)
    BankState: object = Field(...)
    ApprovalFY: int = Field(...)
    Term: int = Field(...)
    NoEmp: int = Field(...)
    # 0 = Undefined, 1 = Urban, 2 = Rural — assumed from the SBA dataset
    # convention; TODO confirm against the training pipeline.
    UrbanRural: int = Field(...)
    RevLineCr: int = Field(...)
    LowDoc: int = Field(...)
    DisbursementGross: float = Field(...)
    GrAppv: float = Field(...)
    Industry: object = Field(...)
    IsFranchise: int = Field(...)
    NewBusiness: int = Field(...)
    DisbursementFY: int = Field(...)
    DaysToDisbursement: int = Field(...)
    SBA_AppvPct: float = Field(...)
    AppvDisbursed: int = Field(...)
    RealEstate: int = Field(...)
    GreatRecession: int = Field(...)

    class Config:
        populate_by_name = True
        arbitrary_types_allowed = True
        # Example payload surfaced in the OpenAPI docs.
        json_schema_extra = {
            "example": {
                "State": "AK",
                "BankState": "AK",
                "ApprovalFY": 1994,
                "Term": 84,
                "NoEmp": 5,
                "UrbanRural": 0,
                "RevLineCr": 0,
                "LowDoc": 0,
                "DisbursementGross": 60000.0,
                "GrAppv": 60000.0,
                "Industry": "Retail_trade",
                "IsFranchise": 0,
                "NewBusiness": 1,
                "DisbursementFY": 1997,
                "DaysToDisbursement": 870,
                "SBA_AppvPct": 0.80,
                "AppvDisbursed": 1,
                "RealEstate": 0,
                "GreatRecession": 1
            }
        }
backend/main.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
import uvicorn
import mlflow
import mlflow.pyfunc
import mlflow.sklearn
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware

from src.clean_data_json import scaling, clean_data
from example_json.transaction_info import TransactionModel

# --- MLflow setup -----------------------------------------------------------
# Local file-based tracking store holding the trained models.  For remote
# (DagsHub) tracking, load credentials from ../backend/src/.env and set
# MLFLOW_TRACKING_USERNAME / MLFLOW_TRACKING_PASSWORD before the call.
mlflow.set_tracking_uri("file:///C:/Users/msi/Desktop/mlops/mlruns")
mlflow.set_experiment("loan_approval_prediction")
mlflow.sklearn.autolog(disable=True)

app = FastAPI()

# Allow any origin so the Streamlit frontend can call the API.
origins = ['*']

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serve a model pinned to an explicit run.  (The previous code also issued
# an mlflow.search_runs query whose result was discarded; removed.)
run_id = '508f98877c244ee58b2ee59373384b32'
logged_model = f'runs:/{run_id}/ML_models'

# Load model as a PyFuncModel.
model = mlflow.pyfunc.load_model(logged_model)


@app.get("/")
def read_root():
    """Liveness / landing endpoint."""
    return {"Hello": "to fraud detector app version 2"}


# this endpoint receives data in the form of csv file (historical transactions data)
@app.post("/predict/csv")
def return_predictions(file: UploadFile = File(...)):
    """Score every row of an uploaded historical-loans CSV.

    The CSV is expected to contain the pandas index column 'Unnamed: 0'
    and the ground-truth 'Default' label; both are dropped before scoring.
    """
    data = pd.read_csv(file.file)
    data = data.drop(columns=['Unnamed: 0', 'Default'])
    preprocessed_data = clean_data(data)
    scaled = scaling(preprocessed_data)
    predictions = model.predict(scaled)
    return {"predictions": predictions.tolist()}


# this endpoint receives data in the form of json (information about one transaction)
@app.post("/predict")
def predict(data: TransactionModel):
    """Score a single loan record posted as JSON (see TransactionModel)."""
    received = data.dict()
    df = pd.DataFrame(received, index=[0])
    scl = clean_data(df)
    preprocessed_data = scaling(scl)
    predictions = model.predict(preprocessed_data)
    return {"predictions": predictions.tolist()}


if __name__ == "__main__":
    # BUG FIX: the frontend posts to http://127.0.0.1:8000; serve on the
    # same port (was 8080, so the form never reached the API when the
    # service was started via this entry point).
    uvicorn.run("main:app", host="0.0.0.0", port=8000)
backend/src/clean_data_json.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
import pandas as pd
from sklearn.preprocessing import StandardScaler

# NOTE(review): this scaler is fit on whatever frame is passed at inference
# time (see `scaling` below), not on the training distribution.  For a
# single-row /predict request, fit_transform standardizes against that one
# row and yields all zeros — the scaler fitted during training should be
# persisted and reused instead.  TODO confirm and fix with the training
# artifact.
scale = StandardScaler()


# Label -> integer-code lookup tables (State / BankState / Industry)
# produced during training; path is relative to the process CWD.
with open("../backend/src/store.json", "r") as file:
    x = json.load(file)
def clean_data(df):
    # Encode the three categorical label columns in place using the
    # store.json tables; unknown labels become NaN (Series.map semantics).
    df['State'] = df['State'].map(x['State'])
    df['BankState'] = df['BankState'].map(x['BankState'])
    df['Industry'] = df['Industry'].map(x['Industry'])
    return df

# Function to scale data
def scaling(df):
    # Standard-scale the numeric columns in place and return the frame.
    # Only scale numerical columns
    num_cols = df.select_dtypes(include=['number']).columns
    df[num_cols] = scale.fit_transform(df[num_cols])
    return df
backend/src/data_preprocessing.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from sklearn.preprocessing import StandardScaler
|
| 4 |
+
import numpy as np
|
| 5 |
+
from sklearn.preprocessing import LabelEncoder
|
| 6 |
+
|
| 7 |
+
encoder = LabelEncoder()
|
| 8 |
+
scale = StandardScaler()
|
| 9 |
+
def convert_money(value):
    """Parse a currency value like "$1,234.50" into a float.

    Numeric inputs pass through as floats; any other type yields None.
    """
    if isinstance(value, (int, float)):
        return float(value)
    if isinstance(value, str):
        # Drop the leading dollar sign and thousands separators.
        return float(value[1:].replace(',', ''))
    return None  # Handle unexpected types gracefully
| 17 |
+
|
| 18 |
+
def clean_year(value):
    """Strip the 'A' marker from fiscal-year strings (e.g. "1997A" -> "1997").

    Non-string values are returned unchanged.
    """
    return value.replace('A', '') if isinstance(value, str) else value
| 22 |
+
|
| 23 |
+
# Label -> integer-code lookup tables (State / BankState / Industry) shared
# with the serving path; the relative path assumes the process is started
# from the repository root — TODO confirm working directory.
with open("../backend/src/store.json", "r") as file:
    x = json.load(file)
| 25 |
+
|
| 26 |
+
def transform_data(df):
    """Clean the raw SBA loan frame and split it into features and target.

    Returns (X, y) where y is the binary 'Default' column (1 = charged off,
    i.e. MIS_Status != 'P I F') and X is every other remaining column, with
    leftover object columns label-encoded via the module-level `encoder`.
    The input frame is mutated along the way.
    """
    # Currency columns arrive as strings like "$60,000.00"; convert to float.
    df[['DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv']] = df[['DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv']].applymap(convert_money)
    # Encode the categorical labels with the store.json lookup tables.
    # NOTE(review): 'Industry' is re-derived from NAICS a few lines below,
    # overwriting this mapping — confirm whether this first map is needed.
    df['State'] = df['State'].map(x['State'])
    df['BankState'] = df['BankState'].map(x['BankState'])
    df['Industry'] = df['Industry'].map(x['Industry'])
    # Fiscal years may carry a trailing 'A' marker; strip before casting.
    df['ApprovalFY'] = df['ApprovalFY'].apply(clean_year).astype('int64')
    # Drop rows missing any of the columns required downstream.
    df.dropna(subset=['Name', 'City', 'State', 'BankState', 'NewExist','RevLineCr', 'LowDoc', 'DisbursementDate', 'MIS_Status'], inplace=True)
    df = df.astype({'Zip': 'str', 'NewExist': 'int64', 'UrbanRural': 'str', 'DisbursementGross': 'float', 'BalanceGross': 'float',
                    'ChgOffPrinGr': 'float', 'GrAppv': 'float', 'SBA_Appv': 'float'})
    # Derive a coarse industry sector from the first two NAICS digits.
    df['Industry'] = df['NAICS'].astype('str').apply(lambda x: x[:2])
    df['Industry'] = df['Industry'].map({
        '0':'Unknown',
        '11': 'Ag/For/Fish/Hunt',
        '21': 'Min/Quar/Oil_Gas_ext',
        '22': 'Utilities',
        '23': 'Construction',
        '31': 'Manufacturing',
        '32': 'Manufacturing',
        '33': 'Manufacturing',
        '42': 'Wholesale_trade',
        '44': 'Retail_trade',
        '45': 'Retail_trade',
        '48': 'Trans/Ware',
        '49': 'Trans/Ware',
        '51': 'Information',
        '52': 'Finance/Insurance',
        '53': 'RE/Rental/Lease',
        '54': 'Prof/Science/Tech',
        '55': 'Mgmt_comp',
        '56': 'Admin_sup/Waste_Mgmt_Rem',
        '61': 'Educational',
        '62': 'Healthcare/Social_assist',
        '71': 'Arts/Entertain/Rec',
        '72': 'Accom/Food_serv',
        '81': 'Other_no_pub',
        '92': 'Public_Admin'
    })
    df.dropna(subset=['Industry'], inplace=True)
    # FranchiseCode values 0/1 both mean "not a franchise" in the SBA dump.
    df.loc[(df['FranchiseCode'] <= 1), 'IsFranchise'] = 0
    df.loc[(df['FranchiseCode'] > 1), 'IsFranchise'] = 1
    df = df.astype({'IsFranchise': 'int64'})
    # Keep only well-defined NewExist values (1 = existing, 2 = new).
    df = df[(df['NewExist'] == 1) | (df['NewExist'] == 2)]

    # Create NewBusiness field where 0 = Existing business and 1 = New business; based on NewExist field
    df.loc[(df['NewExist'] == 1), 'NewBusiness'] = 0
    df.loc[(df['NewExist'] == 2), 'NewBusiness'] = 1
    # Discard rows whose flags are anything other than an explicit Y/N.
    df = df[(df['RevLineCr'] == 'Y') | (df['RevLineCr'] == 'N')]
    df = df[(df['LowDoc'] == 'Y') | (df['LowDoc'] == 'N')]

    # RevLineCr and LowDoc: 0 = No, 1 = Yes
    df['RevLineCr'] = np.where(df['RevLineCr'] == 'N', 0, 1)
    df['LowDoc'] = np.where(df['LowDoc'] == 'N', 0, 1)
    # Target: 1 unless the loan was Paid In Full.
    df['Default'] = np.where(df['MIS_Status'] == 'P I F', 0, 1)
    df[['ApprovalDate', 'DisbursementDate']] = df[['ApprovalDate', 'DisbursementDate']].apply(pd.to_datetime)
    df['DisbursementFY'] = df['DisbursementDate'].map(lambda x: x.year)
    df['DaysToDisbursement'] = df['DisbursementDate'] - df['ApprovalDate']
    # Convert the Timedelta's "N days ..." string form to its integer day
    # count by slicing up to the 'd' of "days".
    df['DaysToDisbursement'] = df['DaysToDisbursement'].astype('str').apply(lambda x: x[:x.index('d') - 1]).astype('int64')
    # Fraction of the approved amount guaranteed by the SBA.
    df['SBA_AppvPct'] = df['SBA_Appv'] / df['GrAppv']
    df['AppvDisbursed'] = np.where(df['DisbursementGross'] == df['GrAppv'], 1, 0)
    df = df.astype({'IsFranchise': 'int64', 'NewBusiness': 'int64'})
    # Drop identifiers and leakage-prone columns before modelling.
    df.drop(columns=['LoanNr_ChkDgt', 'Name', 'City', 'Zip', 'Bank', 'NAICS', 'ApprovalDate', 'NewExist', 'FranchiseCode',
                     'ChgOffDate', 'DisbursementDate', 'BalanceGross', 'ChgOffPrinGr', 'SBA_Appv', 'MIS_Status','CreateJob','RetainedJob'], inplace=True)
    # Field for loans backed by Real Estate (loans with a term of at least 20 years)
    df['RealEstate'] = np.where(df['Term'] >= 240, 1, 0)

    # Field for loans active during the Great Recession (2007-2009)
    df['GreatRecession'] = np.where(((2007 <= df['DisbursementFY']) & (df['DisbursementFY'] <= 2009)) |
                                    ((df['DisbursementFY'] < 2007) & (df['DisbursementFY'] + (df['Term']/12) >= 2007)), 1, 0)
    df['DisbursedGreaterAppv'] = np.where(df['DisbursementGross'] > df['GrAppv'], 1, 0)
    for column in df.select_dtypes(include='object').columns:
        # Encode the column
        df[column] = encoder.fit_transform(df[column])
    y = df['Default']
    X = df.drop('Default', axis=1)
    return X,y
backend/src/store.json
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"State": {
|
| 3 |
+
"AK": 0,
|
| 4 |
+
"AL": 1,
|
| 5 |
+
"AR": 2,
|
| 6 |
+
"AZ": 3,
|
| 7 |
+
"CA": 4,
|
| 8 |
+
"CO": 5,
|
| 9 |
+
"CT": 6,
|
| 10 |
+
"DC": 7,
|
| 11 |
+
"DE": 8,
|
| 12 |
+
"FL": 9,
|
| 13 |
+
"GA": 10,
|
| 14 |
+
"HI": 11,
|
| 15 |
+
"IA": 12,
|
| 16 |
+
"ID": 13,
|
| 17 |
+
"IL": 14,
|
| 18 |
+
"IN": 15,
|
| 19 |
+
"KS": 16,
|
| 20 |
+
"KY": 17,
|
| 21 |
+
"LA": 18,
|
| 22 |
+
"MA": 19,
|
| 23 |
+
"MD": 20,
|
| 24 |
+
"ME": 21,
|
| 25 |
+
"MI": 22,
|
| 26 |
+
"MN": 23,
|
| 27 |
+
"MO": 24,
|
| 28 |
+
"MS": 25,
|
| 29 |
+
"MT": 26,
|
| 30 |
+
"NC": 27,
|
| 31 |
+
"ND": 28,
|
| 32 |
+
"NE": 29,
|
| 33 |
+
"NH": 30,
|
| 34 |
+
"NJ": 31,
|
| 35 |
+
"NM": 32,
|
| 36 |
+
"NV": 33,
|
| 37 |
+
"NY": 34,
|
| 38 |
+
"OH": 35,
|
| 39 |
+
"OK": 36,
|
| 40 |
+
"OR": 37,
|
| 41 |
+
"PA": 38,
|
| 42 |
+
"RI": 39,
|
| 43 |
+
"SC": 40,
|
| 44 |
+
"SD": 41,
|
| 45 |
+
"TN": 42,
|
| 46 |
+
"TX": 43,
|
| 47 |
+
"UT": 44,
|
| 48 |
+
"VA": 45,
|
| 49 |
+
"VT": 46,
|
| 50 |
+
"WA": 47,
|
| 51 |
+
"WI": 48,
|
| 52 |
+
"WV": 49,
|
| 53 |
+
"WY": 50
|
| 54 |
+
},
|
| 55 |
+
"BankState": {
|
| 56 |
+
"AK": 0,
|
| 57 |
+
"AL": 1,
|
| 58 |
+
"AR": 2,
|
| 59 |
+
"AZ": 3,
|
| 60 |
+
"CA": 4,
|
| 61 |
+
"CO": 5,
|
| 62 |
+
"CT": 6,
|
| 63 |
+
"DC": 7,
|
| 64 |
+
"DE": 8,
|
| 65 |
+
"EN": 9,
|
| 66 |
+
"FL": 10,
|
| 67 |
+
"GA": 11,
|
| 68 |
+
"GU": 12,
|
| 69 |
+
"HI": 13,
|
| 70 |
+
"IA": 14,
|
| 71 |
+
"ID": 15,
|
| 72 |
+
"IL": 16,
|
| 73 |
+
"IN": 17,
|
| 74 |
+
"KS": 18,
|
| 75 |
+
"KY": 19,
|
| 76 |
+
"LA": 20,
|
| 77 |
+
"MA": 21,
|
| 78 |
+
"MD": 22,
|
| 79 |
+
"ME": 23,
|
| 80 |
+
"MI": 24,
|
| 81 |
+
"MN": 25,
|
| 82 |
+
"MO": 26,
|
| 83 |
+
"MS": 27,
|
| 84 |
+
"MT": 28,
|
| 85 |
+
"NC": 29,
|
| 86 |
+
"ND": 30,
|
| 87 |
+
"NE": 31,
|
| 88 |
+
"NH": 32,
|
| 89 |
+
"NJ": 33,
|
| 90 |
+
"NM": 34,
|
| 91 |
+
"NV": 35,
|
| 92 |
+
"NY": 36,
|
| 93 |
+
"OH": 37,
|
| 94 |
+
"OK": 38,
|
| 95 |
+
"OR": 39,
|
| 96 |
+
"PA": 40,
|
| 97 |
+
"PR": 41,
|
| 98 |
+
"RI": 42,
|
| 99 |
+
"SC": 43,
|
| 100 |
+
"SD": 44,
|
| 101 |
+
"TN": 45,
|
| 102 |
+
"TX": 46,
|
| 103 |
+
"UT": 47,
|
| 104 |
+
"VA": 48,
|
| 105 |
+
"VT": 49,
|
| 106 |
+
"WA": 50,
|
| 107 |
+
"WI": 51,
|
| 108 |
+
"WV": 52,
|
| 109 |
+
"WY": 53
|
| 110 |
+
},
|
| 111 |
+
"Industry": {
|
| 112 |
+
"Accom/Food_serv": 0,
|
| 113 |
+
"Admin_sup/Waste_Mgmt_Rem": 1,
|
| 114 |
+
"Ag/For/Fish/Hunt": 2,
|
| 115 |
+
"Arts/Entertain/Rec": 3,
|
| 116 |
+
"Construction": 4,
|
| 117 |
+
"Educational": 5,
|
| 118 |
+
"Finance/Insurance": 6,
|
| 119 |
+
"Healthcare/Social_assist": 7,
|
| 120 |
+
"Information": 8,
|
| 121 |
+
"Manufacturing": 9,
|
| 122 |
+
"Mgmt_comp": 10,
|
| 123 |
+
"Min/Quar/Oil_Gas_ext": 11,
|
| 124 |
+
"Other_no_pub": 12,
|
| 125 |
+
"Prof/Science/Tech": 13,
|
| 126 |
+
"Public_Admin": 14,
|
| 127 |
+
"RE/Rental/Lease": 15,
|
| 128 |
+
"Retail_trade": 16,
|
| 129 |
+
"Trans/Ware": 17,
|
| 130 |
+
"Unknown": 18,
|
| 131 |
+
"Utilities": 19,
|
| 132 |
+
"Wholesale_trade": 20
|
| 133 |
+
}
|
| 134 |
+
}
|
backend/src/store.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{'State':{'AK': 0, 'AL': 1, 'AR': 2, 'AZ': 3, 'CA': 4, 'CO': 5, 'CT': 6, 'DC': 7, 'DE': 8, 'FL': 9, 'GA': 10, 'HI': 11, 'IA': 12, 'ID': 13, 'IL': 14, 'IN': 15, 'KS': 16, 'KY': 17, 'LA': 18, 'MA': 19, 'MD': 20, 'ME': 21, 'MI': 22, 'MN': 23, 'MO': 24, 'MS': 25, 'MT': 26, 'NC': 27, 'ND': 28, 'NE': 29, 'NH': 30, 'NJ': 31, 'NM': 32, 'NV': 33, 'NY': 34, 'OH': 35, 'OK': 36, 'OR': 37, 'PA': 38, 'RI': 39, 'SC': 40, 'SD': 41, 'TN': 42, 'TX': 43, 'UT': 44, 'VA': 45, 'VT': 46, 'WA': 47, 'WI': 48, 'WV': 49, 'WY': 50}}
|
| 2 |
+
{'BankState':{'AK': 0, 'AL': 1, 'AR': 2, 'AZ': 3, 'CA': 4, 'CO': 5, 'CT': 6, 'DC': 7, 'DE': 8, 'EN': 9, 'FL': 10, 'GA': 11, 'GU': 12, 'HI': 13, 'IA': 14, 'ID': 15, 'IL': 16, 'IN': 17, 'KS': 18, 'KY': 19, 'LA': 20, 'MA': 21, 'MD': 22, 'ME': 23, 'MI': 24, 'MN': 25, 'MO': 26, 'MS': 27, 'MT': 28, 'NC': 29, 'ND': 30, 'NE': 31, 'NH': 32, 'NJ': 33, 'NM': 34, 'NV': 35, 'NY': 36, 'OH': 37, 'OK': 38, 'OR': 39, 'PA': 40, 'PR': 41, 'RI': 42, 'SC': 43, 'SD': 44, 'TN': 45, 'TX': 46, 'UT': 47, 'VA': 48, 'VT': 49, 'WA': 50, 'WI': 51, 'WV': 52, 'WY': 53}}
|
| 3 |
+
{'Industry':{'Accom/Food_serv': 0, 'Admin_sup/Waste_Mgmt_Rem': 1, 'Ag/For/Fish/Hunt': 2, 'Arts/Entertain/Rec': 3, 'Construction': 4, 'Educational': 5, 'Finance/Insurance': 6, 'Healthcare/Social_assist': 7, 'Information': 8, 'Manufacturing': 9, 'Mgmt_comp': 10, 'Min/Quar/Oil_Gas_ext': 11, 'Other_no_pub': 12, 'Prof/Science/Tech': 13, 'Public_Admin': 14, 'RE/Rental/Lease': 15, 'Retail_trade': 16, 'Trans/Ware': 17, 'Unknown': 18, 'Utilities': 19, 'Wholesale_trade': 20}}
|
backend/test_app.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

import pandas as pd
import mlflow
from dotenv import load_dotenv

# BUG FIX: the helper module defines `clean_data`/`scaling` (there is no
# `clean_data_json` function) and the preprocessing module is named
# `data_preprocessing`; the old imports referenced names that do not exist.
from src.clean_data_json import clean_data, scaling
from src.data_preprocessing import transform_data

load_dotenv("../backend/src/.env")

DagsHub_username = os.getenv("DagsHub_username")
DagsHub_token = os.getenv("DagsHub_token")

# BUG FIX: only export the credentials when they are configured — assigning
# None into os.environ raises a TypeError and aborted collection.
if DagsHub_username and DagsHub_token:
    os.environ['MLFLOW_TRACKING_USERNAME'] = DagsHub_username
    os.environ["MLFLOW_TRACKING_PASSWORD"] = DagsHub_token

# setup mlflow
mlflow.set_tracking_uri('https://dagshub.com/.../....mlflow')  # your mlfow tracking uri


# tests if the model works as expected
def test_model_use():
    """Smoke test: load the best tracked model and score one loan record."""

    # let's call the model from the model registry ( in production stage) —
    # pick the run with the best test F1 score across all experiments.
    all_experiments = [exp.experiment_id for exp in mlflow.search_experiments()]
    df_mlflow = mlflow.search_runs(experiment_ids=all_experiments, filter_string="metrics.F1_score_test <1")
    run_id = df_mlflow.loc[df_mlflow['metrics.F1_score_test'].idxmax()]['run_id']

    logged_model = f'runs:/{run_id}/ML_models'

    # Load model as a PyFuncModel.
    model = mlflow.pyfunc.load_model(logged_model)

    # BUG FIX: the old sample row described a card-fraud transaction from a
    # different project; use a loan record matching TransactionModel's
    # documented example instead.
    d = {"State": "AK", "BankState": "AK", "ApprovalFY": 1994, "Term": 84,
         "NoEmp": 5, "UrbanRural": 0, "RevLineCr": 0, "LowDoc": 0,
         "DisbursementGross": 60000.0, "GrAppv": 60000.0,
         "Industry": "Retail_trade", "IsFranchise": 0, "NewBusiness": 1,
         "DisbursementFY": 1997, "DaysToDisbursement": 870,
         "SBA_AppvPct": 0.80, "AppvDisbursed": 1, "RealEstate": 0,
         "GreatRecession": 1}
    df = pd.DataFrame(data=d, index=[0])
    # Apply the same preprocessing the /predict endpoint uses.
    dd = scaling(clean_data(df))
    predict_result = model.predict(dd)
    print(predict_result[0])
    # The classifier emits the binary Default label.
    assert predict_result[0] in (0, 1)
requirements.txt
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Data Manipulation and Visualization
|
| 2 |
+
pandas==2.1.1
|
| 3 |
+
numpy==1.26.0
|
| 4 |
+
matplotlib==3.8.0
|
| 5 |
+
seaborn==0.12.2
|
| 6 |
+
|
| 7 |
+
# Machine Learning
|
| 8 |
+
scikit-learn==1.3.1
|
| 9 |
+
xgboost==1.7.6
|
| 10 |
+
imbalanced-learn==0.11.0
|
| 11 |
+
|
| 12 |
+
# Deployment
|
| 13 |
+
fastapi==0.103.0
|
| 14 |
+
uvicorn[standard]==0.23.2
|
| 15 |
+
|
| 16 |
+
# Dashboard
|
| 17 |
+
streamlit==1.25.0
|
| 18 |
+
|
| 19 |
+
# Experiment Tracking
|
| 20 |
+
mlflow==2.8.1
|
| 21 |
+
|
| 22 |
+
# Monitoring
|
| 23 |
+
arize-ai==1.12.0
|
| 24 |
+
|
| 25 |
+
# Additional Tools
|
| 26 |
+
joblib==1.4.1
|
| 27 |
+
pyyaml==6.0
|
| 28 |
+
|