Spaces:

MISSAOUI
/

Loan_Payment_Prediction

Sleeping

App Files Files Community

msi commited on Mar 22, 2025

Commit

d032558

1 Parent(s): c6f428a

Add application file

Browse files

Files changed (9) hide show

app.py.py +173 -0
backend/example_json/transaction_info.py +54 -0
backend/main.py +93 -0
backend/src/clean_data_json.py +21 -0
backend/src/data_preprocessing.py +102 -0
backend/src/store.json +134 -0
backend/src/store.txt +3 -0
backend/test_app.py +59 -0
requirements.txt +28 -0

app.py.py ADDED Viewed

	@@ -0,0 +1,173 @@

+import requests
+import streamlit as st
+import json
+import pandas as pd
+# Load configuration data from JSON
+with open("backend/src/store.json", "r") as file:
+    x = json.load(file)
+# Page Title
+import streamlit as st
+import pandas as pd
+import plotly.express as px
+import requests
+import json
+# Helper function to map Yes/No to 1/0
+def gett(choice):
+    return 1 if choice == "Yes" else 0
+# Load configuration data from JSON
+with open("backend/src/store.json", "r") as file:
+    x = json.load(file)
+# Load the data
+@st.cache_data
+def load_data():
+    file_path = "data/loan_data.csv"  # Update this with your file path
+    return pd.read_csv(file_path)
+df = load_data()
+# Sidebar with radio buttons for navigation
+sidebar_option = st.sidebar.radio("Select Page", ["Dashboard", "Prediction"])
+if sidebar_option == "Dashboard":
+    # Dashboard Page
+    st.title("SBA Loans Dashboard")
+    st.markdown("Explore loan trends, analyze defaults, and gain insights into SBA loan data.")
+    # Sidebar Filters for Dashboard
+    st.sidebar.header("Filters")
+    states = st.sidebar.multiselect("Select State(s):", df["State"].unique(), default=df["State"].unique())
+    industries = st.sidebar.multiselect("Select Industry(s):", df["Industry"].unique(), default=df["Industry"].unique())
+    approval_year = st.sidebar.slider("Approval Fiscal Year Range:",
+                                       int(df["ApprovalFY"].min()),
+                                       int(df["ApprovalFY"].max()),
+                                       (int(df["ApprovalFY"].min()), int(df["ApprovalFY"].max())))
+    # Filter Data
+    filtered_data = df[(df["State"].isin(states)) &
+                       (df["Industry"].isin(industries)) &
+                       (df["ApprovalFY"] >= approval_year[0]) &
+                       (df["ApprovalFY"] <= approval_year[1])]
+    # Overview Metrics
+    st.header("Key Metrics")
+    col1, col2, col3, col4 = st.columns(4)
+    col1.metric("Total Loans", len(filtered_data))
+    col2.metric("Default Rate", f"{(filtered_data['Default'].mean() * 100):.2f}%")
+    col3.metric("Avg Loan Amount", f"${filtered_data['DisbursementGross'].mean():,.2f}")
+    col4.metric("Franchise Loans", f"{filtered_data['IsFranchise'].sum()}")
+    # Visualizations
+    st.header("Visualizations")
+    # Loan Distribution by State
+    fig_state = px.bar(filtered_data.groupby("State").size().reset_index(name="Loans"),
+                       x="State", y="Loans", title="Loan Distribution by State")
+    st.plotly_chart(fig_state)
+    # Loan Amount vs Default Rate
+    fig_default = px.scatter(filtered_data,
+                             x="DisbursementGross", y="Default",
+                             color="Industry",
+                             title="Loan Amount vs Default Rate",
+                             size="GrAppv", hover_data=["State"])
+    st.plotly_chart(fig_default)
+    # Trends Over Time
+    fig_trend = px.line(filtered_data.groupby("ApprovalFY").size().reset_index(name="Loans"),
+                        x="ApprovalFY", y="Loans", title="Loan Trends Over Time")
+    st.plotly_chart(fig_trend)
+    # Default Rate by Industry
+    fig_industry = px.bar(filtered_data.groupby("Industry")["Default"].mean().reset_index(name="Default Rate"),
+                          x="Industry", y="Default Rate",
+                          title="Default Rate by Industry", text_auto=".2f")
+    st.plotly_chart(fig_industry)
+elif sidebar_option == "Prediction":
+    # Prediction Page
+    st.title("Loan Default Prediction")
+    st.markdown("Enter loan details to predict whether the loan will default.")
+    # Creating the form fields
+    with st.form("form1", clear_on_submit=False):
+        state = st.selectbox("Enter your State", tuple(x['State'].keys()))
+        category = st.selectbox("Enter your Bank State", tuple(x['BankState'].keys()))
+        appY = st.selectbox("Select your Approval Year",
+                            (1997, 1980, 2006, 1998, 1999, 2000, 2001, 1972, 2003, 2004, 1978,
+                             1979, 1981, 2005, 1982, 1983, 1973, 1984, 2007, 1985, 1986, 1987,
+                             2008, 1988, 2009, 1989, 1991, 1990, 1974, 2010, 1992, 1993, 2002,
+                             1994, 1975, 1977, 1976, 1969, 1995, 1970, 1996, 1971))
+        term = st.text_input("Term", "0")
+        noemp = st.text_input("Number of Employees", "0")
+        urban = st.selectbox("Select the Zone Type", ("Urban", "Rural", "Undefined"))
+        rev = st.selectbox("Select Revolving Line of Credit", ("Yes", "No"))
+        low = st.selectbox("Select LowDoc Loan Program", ("Yes", "No"))
+        disb = st.text_input("Enter the Amount Disbursed", "0")
+        merch_long = st.text_input("Enter Gross Amount of Loan Approved by Bank", "0")
+        indus = st.selectbox("Enter your Industry Category", tuple(x['Industry'].keys()))
+        fran = st.selectbox("Is it a Franchise?", ("Yes", "No"))
+        busi = st.selectbox("Is it a New Business?", ("Yes", "No"))
+        disY = st.selectbox("Select your Disbursement Year",
+                            (1999, 1997, 1980, 1998, 2006, 2002, 2001, 2000, 2003, 1982, 2004,
+                             2071, 2005, 2009, 2007, 2008, 1981, 2072, 1978, 1979, 1996, 2010,
+                             1995, 2012, 1983, 1985, 1984, 2048, 1987, 2073, 1986, 2011, 1988,
+                             1989, 2013, 1990, 1991, 2014, 1992, 1993, 1994, 2020, 1974, 2028,
+                             1975, 1976, 1977, 2069, 2070))
+        days_dis = st.text_input("Enter the Days to Disbursement", "0")
+        sba = st.text_input("Enter SBA's Guaranteed Amount of Approved Loan", "0")
+        appvD = st.selectbox("Is it AppvDisbursed?", ("Yes", "No"))
+        realsta = st.selectbox("Is it Real Estate?", ("Yes", "No"))
+        great = st.selectbox("During the Great Recession?", ("Yes", "No"))
+        # Prepare data for API
+        dd = {
+            "State": x['State'][state],
+            "BankState": x['BankState'][category],
+            "ApprovalFY": appY,
+            "Term": term,
+            "NoEmp": noemp,
+            "UrbanRural": urban,
+            "RevLineCr": gett(rev),
+            "LowDoc": gett(low),
+            "DisbursementGross": disb,
+            "GrAppv": merch_long,
+            "Industry": x['Industry'][indus],
+            "IsFranchise": gett(fran),
+            "NewBusiness": gett(busi),
+            "DisbursementFY": disY,
+            "DaysToDisbursement": days_dis,
+            "SBA_AppvPct": sba,
+            "AppvDisbursed": gett(appvD),
+            "RealEstate": gett(realsta),
+            "GreatRecession": gett(great),
+        }
+        submit = st.form_submit_button("Submit this form")
+        if submit:
+            try:
+                res = requests.post("http://127.0.0.1:8000/predict", data=json.dumps(dd))
+                predictions = res.json().get("predictions")
+                if predictions == [0]:
+                    st.success("Paid In Full, The loan was successfully repaid. 😃")
+                else:
+                    st.error("Charged Off, The loan defaulted and was written off as a loss. 🚨")
+            except Exception as e:
+                st.error(f"Error: {e}")
+    # File uploader for historical transactions
+    st.subheader("Or Enter your Historical Transactions CSV File")
+    data = st.file_uploader("Choose a CSV File")
+    if data is not None:
+        try:
+            file = {"file": data.getvalue()}
+            res = requests.post("http://127.0.0.1:8000/predict/csv", files=file)
+            predictions = res.json().get("predictions")
+            st.text(predictions)
+        except Exception as e:
+            st.error(f"Error: {e}")

backend/example_json/transaction_info.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from pydantic import BaseModel, Field
+from pydantic import BaseModel
+import datetime
+class TransactionModel(BaseModel):
+    State: object = Field(...)
+    BankState: object = Field(...)
+    ApprovalFY: int = Field(...)
+    Term: int = Field(...)
+    NoEmp: int = Field(...)
+    UrbanRural: int = Field(...)
+    RevLineCr: int = Field(...)
+    LowDoc: int = Field(...)
+    DisbursementGross: float = Field(...)
+    GrAppv: float = Field(...)
+    Industry: object = Field(...)
+    IsFranchise: int = Field(...)
+    NewBusiness: int = Field(...)
+    DisbursementFY: int = Field(...)
+    DaysToDisbursement: int = Field(...)
+    SBA_AppvPct: float = Field(...)
+    AppvDisbursed: int = Field(...)
+    RealEstate: int = Field(...)
+    GreatRecession: int = Field(...)
+    class Config:
+       populate_by_name = True
+       arbitrary_types_allowed = True
+       json_schema_extra = {
+           "example": {
+               "State": "AK",
+                "BankState" : "AK",
+                "ApprovalFY": 1994,
+                "Term": 84,
+                "NoEmp": 5,
+                "UrbanRural": 0 ,
+                "RevLineCr": 0,
+                "LowDoc":0,
+                "DisbursementGross":60000.0	,
+                'GrAppv':60000.0,
+                'Industry':"Retail_trade"	,
+                'IsFranchise':0,
+                'NewBusiness':1,
+                'DisbursementFY':1997,
+                'DaysToDisbursement':870,
+                'SBA_AppvPct':0.80,
+                'AppvDisbursed':1,
+                'RealEstate':0,
+                'GreatRecession':1
+           }
+       }

backend/main.py ADDED Viewed

	@@ -0,0 +1,93 @@

+from operator import index
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from datetime import datetime
+import sklearn
+from fastapi import FastAPI, File, UploadFile
+import uvicorn
+import sys
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+import mlflow
+from src.clean_data_json import scaling,clean_data
+#from src.clean_data_json import clean_data_json
+from example_json.transaction_info import TransactionModel
+import mlflow.pyfunc
+"""
+from dotenv import load_dotenv
+import os
+load_dotenv("../backend/src/.env")
+DagsHub_username = os.getenv("DagsHub_username")
+DagsHub_token=os.getenv("DagsHub_token")
+os.environ['MLFLOW_TRACKING_USERNAME']= DagsHub_username
+os.environ["MLFLOW_TRACKING_PASSWORD"] = DagsHub_token
+"""
+#setup mlflow
+mlflow.set_tracking_uri("file:///C:/Users/msi/Desktop/mlops/mlruns")
+mlflow.set_experiment("loan_approval_prediction")
+mlflow.sklearn.autolog(disable=True)
+app = FastAPI()
+origins = ['*']
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+#let's call the model from the model registry ( in production stage)
+df_mlflow=mlflow.search_runs(filter_string="metrics.F1_score_test < 1")
+run_id = '508f98877c244ee58b2ee59373384b32'
+logged_model = f'runs:/{run_id}/ML_models'
+# Load model as a PyFuncModel.
+model = mlflow.pyfunc.load_model(logged_model)
+@app.get("/")
+def read_root():
+    return {"Hello": "to fraud detector app version 2"}
+# this endpoint receives data in the form of csv file (histotical transactions data)
+@app.post("/predict/csv")
+def return_predictions(file: UploadFile = File(...)):
+    data = pd.read_csv(file.file)
+    data=data.drop(columns=['Unnamed: 0','Default'])
+    preprocessed_data = clean_data(data)
+    scaled=scaling(preprocessed_data)
+    predictions = model.predict(scaled)
+    return {"predictions": predictions.tolist()}
+# this endpoint receives data in the form of json (informations about one transaction)
+@app.post("/predict")
+def predict(data : TransactionModel):
+    received = data.dict()
+    df =  pd.DataFrame(received,index=[0])
+    scl=clean_data(df)
+    preprocessed_data = scaling(scl)
+    print(preprocessed_data)
+    predictions = model.predict(preprocessed_data)
+    return {"predictions": predictions.tolist()}
+if __name__ == "__main__":
+    uvicorn.run("main:app", host="0.0.0.0", port=8080)

backend/src/clean_data_json.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import json
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+scale = StandardScaler()
+with open("../backend/src/store.json", "r") as file:
+    x = json.load(file)
+def clean_data(df):
+    df['State'] = df['State'].map(x['State'])
+    df['BankState'] = df['BankState'].map(x['BankState'])
+    df['Industry'] = df['Industry'].map(x['Industry'])
+    return df
+# Function to scale data
+def scaling(df):
+    # Only scale numerical columns
+    num_cols = df.select_dtypes(include=['number']).columns
+    df[num_cols] = scale.fit_transform(df[num_cols])
+    return df

backend/src/data_preprocessing.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import json
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+import numpy as np
+from sklearn.preprocessing import LabelEncoder
+encoder = LabelEncoder()
+scale = StandardScaler()
+def convert_money(x):
+    if isinstance(x, str):  # Check if x is a string
+        x = x[1:].replace(',', '')  # Remove the dollar sign and commas
+        return float(x)
+    elif isinstance(x, (int, float)):  # If already numeric, return as is
+        return float(x)
+    else:
+        return None  # Handle unexpected types gracefully
+def clean_year(x):
+    if isinstance(x, str):
+        return x.replace('A', '')
+    return x
+with open("../backend/src/store.json", "r") as file:
+    x = json.load(file)
+def transform_data(df):
+    df[['DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv']] = df[['DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv']].applymap(convert_money)
+    df['State'] = df['State'].map(x['State'])
+    df['BankState'] = df['BankState'].map(x['BankState'])
+    df['Industry'] = df['Industry'].map(x['Industry'])
+    df['ApprovalFY'] = df['ApprovalFY'].apply(clean_year).astype('int64')
+    df.dropna(subset=['Name', 'City', 'State', 'BankState', 'NewExist','RevLineCr', 'LowDoc', 'DisbursementDate', 'MIS_Status'], inplace=True)
+    df = df.astype({'Zip': 'str', 'NewExist': 'int64', 'UrbanRural': 'str', 'DisbursementGross': 'float', 'BalanceGross': 'float',
+                          'ChgOffPrinGr': 'float', 'GrAppv': 'float', 'SBA_Appv': 'float'})
+    df['Industry'] = df['NAICS'].astype('str').apply(lambda x: x[:2])
+    df['Industry'] = df['Industry'].map({
+    '0':'Unknown',
+    '11': 'Ag/For/Fish/Hunt',
+    '21': 'Min/Quar/Oil_Gas_ext',
+    '22': 'Utilities',
+    '23': 'Construction',
+    '31': 'Manufacturing',
+    '32': 'Manufacturing',
+    '33': 'Manufacturing',
+    '42': 'Wholesale_trade',
+    '44': 'Retail_trade',
+    '45': 'Retail_trade',
+    '48': 'Trans/Ware',
+    '49': 'Trans/Ware',
+    '51': 'Information',
+    '52': 'Finance/Insurance',
+    '53': 'RE/Rental/Lease',
+    '54': 'Prof/Science/Tech',
+    '55': 'Mgmt_comp',
+    '56': 'Admin_sup/Waste_Mgmt_Rem',
+    '61': 'Educational',
+    '62': 'Healthcare/Social_assist',
+    '71': 'Arts/Entertain/Rec',
+    '72': 'Accom/Food_serv',
+    '81': 'Other_no_pub',
+    '92': 'Public_Admin'
+})
+    df.dropna(subset=['Industry'], inplace=True)
+    df.loc[(df['FranchiseCode'] <= 1), 'IsFranchise'] = 0
+    df.loc[(df['FranchiseCode'] > 1), 'IsFranchise'] = 1
+    df = df.astype({'IsFranchise': 'int64'})
+    df = df[(df['NewExist'] == 1) | (df['NewExist'] == 2)]
+# Create NewBusiness field where 0 = Existing business and 1 = New business; based on NewExist field
+    df.loc[(df['NewExist'] == 1), 'NewBusiness'] = 0
+    df.loc[(df['NewExist'] == 2), 'NewBusiness'] = 1
+    df = df[(df['RevLineCr'] == 'Y') | (df['RevLineCr'] == 'N')]
+    df = df[(df['LowDoc'] == 'Y') | (df['LowDoc'] == 'N')]
+# RevLineCr and LowDoc: 0 = No, 1 = Yes
+    df['RevLineCr'] = np.where(df['RevLineCr'] == 'N', 0, 1)
+    df['LowDoc'] = np.where(df['LowDoc'] == 'N', 0, 1)
+    df['Default'] = np.where(df['MIS_Status'] == 'P I F', 0, 1)
+    df[['ApprovalDate', 'DisbursementDate']] = df[['ApprovalDate', 'DisbursementDate']].apply(pd.to_datetime)
+    df['DisbursementFY'] = df['DisbursementDate'].map(lambda x: x.year)
+    df['DaysToDisbursement'] = df['DisbursementDate'] - df['ApprovalDate']
+    df['DaysToDisbursement'] = df['DaysToDisbursement'].astype('str').apply(lambda x: x[:x.index('d') - 1]).astype('int64')
+    df['SBA_AppvPct'] = df['SBA_Appv'] / df['GrAppv']
+    df['AppvDisbursed'] = np.where(df['DisbursementGross'] == df['GrAppv'], 1, 0)
+    df = df.astype({'IsFranchise': 'int64', 'NewBusiness': 'int64'})
+    df.drop(columns=['LoanNr_ChkDgt', 'Name', 'City', 'Zip', 'Bank', 'NAICS', 'ApprovalDate', 'NewExist', 'FranchiseCode',
+                      'ChgOffDate', 'DisbursementDate', 'BalanceGross', 'ChgOffPrinGr', 'SBA_Appv', 'MIS_Status','CreateJob','RetainedJob'], inplace=True)
+    # Field for loans backed by Real Estate (loans with a term of at least 20 years)
+    df['RealEstate'] = np.where(df['Term'] >= 240, 1, 0)
+# Field for loans active during the Great Recession (2007-2009)
+    df['GreatRecession'] = np.where(((2007 <= df['DisbursementFY']) & (df['DisbursementFY'] <= 2009)) |
+                                     ((df['DisbursementFY'] < 2007) & (df['DisbursementFY'] + (df['Term']/12) >= 2007)), 1, 0)
+    df['DisbursedGreaterAppv'] = np.where(df['DisbursementGross'] > df['GrAppv'], 1, 0)
+    for column in df.select_dtypes(include='object').columns:
+    # Encode the column
+        df[column] = encoder.fit_transform(df[column])
+    y = df['Default']
+    X = df.drop('Default', axis=1)
+    return X,y

backend/src/store.json ADDED Viewed

	@@ -0,0 +1,134 @@

+{
+    "State": {
+        "AK": 0,
+        "AL": 1,
+        "AR": 2,
+        "AZ": 3,
+        "CA": 4,
+        "CO": 5,
+        "CT": 6,
+        "DC": 7,
+        "DE": 8,
+        "FL": 9,
+        "GA": 10,
+        "HI": 11,
+        "IA": 12,
+        "ID": 13,
+        "IL": 14,
+        "IN": 15,
+        "KS": 16,
+        "KY": 17,
+        "LA": 18,
+        "MA": 19,
+        "MD": 20,
+        "ME": 21,
+        "MI": 22,
+        "MN": 23,
+        "MO": 24,
+        "MS": 25,
+        "MT": 26,
+        "NC": 27,
+        "ND": 28,
+        "NE": 29,
+        "NH": 30,
+        "NJ": 31,
+        "NM": 32,
+        "NV": 33,
+        "NY": 34,
+        "OH": 35,
+        "OK": 36,
+        "OR": 37,
+        "PA": 38,
+        "RI": 39,
+        "SC": 40,
+        "SD": 41,
+        "TN": 42,
+        "TX": 43,
+        "UT": 44,
+        "VA": 45,
+        "VT": 46,
+        "WA": 47,
+        "WI": 48,
+        "WV": 49,
+        "WY": 50
+    },
+    "BankState": {
+        "AK": 0,
+        "AL": 1,
+        "AR": 2,
+        "AZ": 3,
+        "CA": 4,
+        "CO": 5,
+        "CT": 6,
+        "DC": 7,
+        "DE": 8,
+        "EN": 9,
+        "FL": 10,
+        "GA": 11,
+        "GU": 12,
+        "HI": 13,
+        "IA": 14,
+        "ID": 15,
+        "IL": 16,
+        "IN": 17,
+        "KS": 18,
+        "KY": 19,
+        "LA": 20,
+        "MA": 21,
+        "MD": 22,
+        "ME": 23,
+        "MI": 24,
+        "MN": 25,
+        "MO": 26,
+        "MS": 27,
+        "MT": 28,
+        "NC": 29,
+        "ND": 30,
+        "NE": 31,
+        "NH": 32,
+        "NJ": 33,
+        "NM": 34,
+        "NV": 35,
+        "NY": 36,
+        "OH": 37,
+        "OK": 38,
+        "OR": 39,
+        "PA": 40,
+        "PR": 41,
+        "RI": 42,
+        "SC": 43,
+        "SD": 44,
+        "TN": 45,
+        "TX": 46,
+        "UT": 47,
+        "VA": 48,
+        "VT": 49,
+        "WA": 50,
+        "WI": 51,
+        "WV": 52,
+        "WY": 53
+    },
+    "Industry": {
+        "Accom/Food_serv": 0,
+        "Admin_sup/Waste_Mgmt_Rem": 1,
+        "Ag/For/Fish/Hunt": 2,
+        "Arts/Entertain/Rec": 3,
+        "Construction": 4,
+        "Educational": 5,
+        "Finance/Insurance": 6,
+        "Healthcare/Social_assist": 7,
+        "Information": 8,
+        "Manufacturing": 9,
+        "Mgmt_comp": 10,
+        "Min/Quar/Oil_Gas_ext": 11,
+        "Other_no_pub": 12,
+        "Prof/Science/Tech": 13,
+        "Public_Admin": 14,
+        "RE/Rental/Lease": 15,
+        "Retail_trade": 16,
+        "Trans/Ware": 17,
+        "Unknown": 18,
+        "Utilities": 19,
+        "Wholesale_trade": 20
+    }
+}

backend/src/store.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+{'State':{'AK': 0, 'AL': 1, 'AR': 2, 'AZ': 3, 'CA': 4, 'CO': 5, 'CT': 6, 'DC': 7, 'DE': 8, 'FL': 9, 'GA': 10, 'HI': 11, 'IA': 12, 'ID': 13, 'IL': 14, 'IN': 15, 'KS': 16, 'KY': 17, 'LA': 18, 'MA': 19, 'MD': 20, 'ME': 21, 'MI': 22, 'MN': 23, 'MO': 24, 'MS': 25, 'MT': 26, 'NC': 27, 'ND': 28, 'NE': 29, 'NH': 30, 'NJ': 31, 'NM': 32, 'NV': 33, 'NY': 34, 'OH': 35, 'OK': 36, 'OR': 37, 'PA': 38, 'RI': 39, 'SC': 40, 'SD': 41, 'TN': 42, 'TX': 43, 'UT': 44, 'VA': 45, 'VT': 46, 'WA': 47, 'WI': 48, 'WV': 49, 'WY': 50}}
+{'BankState':{'AK': 0, 'AL': 1, 'AR': 2, 'AZ': 3, 'CA': 4, 'CO': 5, 'CT': 6, 'DC': 7, 'DE': 8, 'EN': 9, 'FL': 10, 'GA': 11, 'GU': 12, 'HI': 13, 'IA': 14, 'ID': 15, 'IL': 16, 'IN': 17, 'KS': 18, 'KY': 19, 'LA': 20, 'MA': 21, 'MD': 22, 'ME': 23, 'MI': 24, 'MN': 25, 'MO': 26, 'MS': 27, 'MT': 28, 'NC': 29, 'ND': 30, 'NE': 31, 'NH': 32, 'NJ': 33, 'NM': 34, 'NV': 35, 'NY': 36, 'OH': 37, 'OK': 38, 'OR': 39, 'PA': 40, 'PR': 41, 'RI': 42, 'SC': 43, 'SD': 44, 'TN': 45, 'TX': 46, 'UT': 47, 'VA': 48, 'VT': 49, 'WA': 50, 'WI': 51, 'WV': 52, 'WY': 53}}
+{'Industry':{'Accom/Food_serv': 0, 'Admin_sup/Waste_Mgmt_Rem': 1, 'Ag/For/Fish/Hunt': 2, 'Arts/Entertain/Rec': 3, 'Construction': 4, 'Educational': 5, 'Finance/Insurance': 6, 'Healthcare/Social_assist': 7, 'Information': 8, 'Manufacturing': 9, 'Mgmt_comp': 10, 'Min/Quar/Oil_Gas_ext': 11, 'Other_no_pub': 12, 'Prof/Science/Tech': 13, 'Public_Admin': 14, 'RE/Rental/Lease': 15, 'Retail_trade': 16, 'Trans/Ware': 17, 'Unknown': 18, 'Utilities': 19, 'Wholesale_trade': 20}}

backend/test_app.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from pathlib import Path
+import pickle
+import pandas as pd
+import sys
+from src.clean_data_json import clean_data_json
+from src.data_preprocessing_training import transform_data
+import mlflow
+#from dotenv import load_dotenv
+import os
+from dotenv import load_dotenv
+load_dotenv("../backend/src/.env")
+DagsHub_username = os.getenv("DagsHub_username")
+DagsHub_token=os.getenv("DagsHub_token")
+os.environ['MLFLOW_TRACKING_USERNAME']= DagsHub_username
+os.environ["MLFLOW_TRACKING_PASSWORD"] = DagsHub_token
+"""
+os.environ['MLFLOW_TRACKING_USERNAME']= "..."
+os.environ["MLFLOW_TRACKING_PASSWORD"] = "..."
+"""
+#setup mlflow
+mlflow.set_tracking_uri('https://dagshub.com/.../....mlflow') #your mlfow tracking uri
+#tests if the model works as expected
+def test_model_use():
+    #let's call the model from the model registry ( in production stage)
+    all_experiments = [exp.experiment_id for exp in mlflow.search_experiments()]
+    df_mlflow = mlflow.search_runs(experiment_ids=all_experiments,filter_string="metrics.F1_score_test <1")
+    run_id = df_mlflow.loc[df_mlflow['metrics.F1_score_test'].idxmax()]['run_id']
+    logged_model = f'runs:/{run_id}/ML_models'
+    # Load model as a PyFuncModel.
+    model = mlflow.pyfunc.load_model(logged_model)
+    d = {'trans_date_trans_time': "2019-01-18 23:20:16", "category" : "shopping_net",
+                "amt": 1334.07,
+                "gender": 'F',
+                "zip": 29438,
+                "lat": 32.5486 ,
+                "long": -80.307,
+                "dob": "1997-07-05",
+                "merch_lat": 31.615611,
+                "merch_long": -79.702908}
+    df = pd.DataFrame(data=d,index=[0])
+    dd = clean_data_json(df)
+    predict_result = model.predict(dd)
+    print(predict_result[0])
+    assert predict_result[0] == 1

requirements.txt ADDED Viewed

	@@ -0,0 +1,28 @@

+# Data Manipulation and Visualization
+pandas==2.1.1
+numpy==1.26.0
+matplotlib==3.8.0
+seaborn==0.12.2
+# Machine Learning
+scikit-learn==1.3.1
+xgboost==1.7.6
+imbalanced-learn==0.11.0
+# Deployment
+fastapi==0.103.0
+uvicorn[standard]==0.23.2
+# Dashboard
+streamlit==1.25.0
+# Experiment Tracking
+mlflow==2.8.1
+# Monitoring
+arize-ai==1.12.0
+# Additional Tools
+joblib==1.4.1
+pyyaml==6.0