import json
import logging
import os
import shutil
import time
from datetime import datetime, timedelta

import gradio as gr
import joblib
import lightgbm
import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download
from huggingface_hub import upload_file
from sklearn.ensemble import RandomForestRegressor

logger = logging.getLogger(__name__)

# The token is required for every Hub call below, so fail fast at import time.
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("HF_TOKEN not set. Add it to Space Secrets.")

DATASET_REPO = "GFT-Poland/synthetic-pricing-dataset"
MODEL_REPO = "GFT-Poland/basic-pricing-model"

DATASET_FILE = "synthetic_pricing_dataset.csv"
MODEL_FILE = "model.joblib"

# Column order matters: it is the order in which features are fed to the model.
FEATURES = [
    "age",
    "monthly_income",
    "living_costs",
    "existing_liabilities",
    "credit_limits",
    "ltv",
    "interest_rate",
    "loan_term_years",
    "installment_type",
]

TARGET = "max_loan_amount"


# =============================
# GENERATE DATASET
# =============================
def _installment_to_loan(m, rate, years, itype):
    """Convert a monthly installment into the loan principal it can service.

    Args:
        m: Monthly installment amount.
        rate: Annual interest rate as a fraction (e.g. 0.065).
        years: Loan term in years.
        itype: Installment type code; the UI labels these
            0 = fixed, 1 = decreasing, 2 = variable.

    Returns:
        The loan amount as a float; 0.0 when ``m`` is not positive.

    Raises:
        ValueError: If ``itype`` is not 0, 1 or 2.
    """
    if m <= 0:
        return 0.0

    payments = years * 12

    if itype == 1:
        # Flat 0.82 factor on the total of payments.
        # NOTE(review): origin of the 0.82 constant is not documented — confirm.
        return m * payments * 0.82

    if itype == 0:
        monthly_rate = rate / 12
    elif itype == 2:
        # Type 2 is priced with a 5 percentage point add-on to the annual rate.
        monthly_rate = (rate + 0.05) / 12
    else:
        # Previously fell through and returned None, which only surfaced later
        # as a TypeError inside min().
        raise ValueError(f"Unknown installment_type: {itype!r}")

    if monthly_rate == 0:
        # Limit of the annuity formula as the rate goes to zero.
        return m * payments

    # Present value of an annuity of `payments` installments of size `m`.
    return m * ((1 - (1 + monthly_rate) ** -payments) / monthly_rate)


def generate_dataset():
    """Generate the synthetic pricing dataset, save it as CSV and upload it.

    Draws 1,000,000 synthetic applicants with a fixed NumPy seed, derives the
    ``max_loan_amount`` target from rule-based affordability logic, writes the
    result to ``DATASET_FILE`` and uploads that file to ``DATASET_REPO``.

    Returns:
        A status message containing the number of generated rows.
    """
    n = 1_000_000
    seed = 42

    # Seeds NumPy's global RNG; the order of the draws below is kept fixed so
    # that the generated data is reproducible for this seed.
    np.random.seed(seed)

    rows = []

    for _ in range(n):
        age = np.random.randint(21, 65)
        monthly_income = np.random.uniform(2000, 50000)

        living_costs = (
            900
            + 0.35 * (monthly_income ** 0.85)
            + np.random.uniform(-300, 300)
        )

        existing_liabilities = monthly_income * np.random.uniform(0.05, 0.30)
        credit_limits = monthly_income * np.random.uniform(3, 9)

        ltv = np.random.uniform(0.4, 0.95)
        interest_rate = np.random.uniform(0.045, 0.095)

        # Term is 10..35 years, additionally capped so that age + term <= 75.
        loan_term_years = np.random.randint(10, min(35, 75 - age) + 1)

        installment_type = np.random.choice([0, 1, 2], p=[0.5, 0.3, 0.2])

        disposable_income = (
            monthly_income - living_costs - existing_liabilities
        )

        if disposable_income <= 0:
            max_loan = 0
        else:
            # Base share of disposable income usable for installments
            # (presumably a debt-service ratio), rising with income.
            dsr_base = 0.28 + 0.18 * np.tanh(monthly_income / 12000)

            # Penalties for riskier loan characteristics.
            dsr_adj = 0
            if ltv > 0.85:
                dsr_adj -= 0.10
            if interest_rate > 0.07:
                dsr_adj -= 0.07
            if installment_type == 2:
                dsr_adj -= 0.05
            if credit_limits > monthly_income * 6:
                dsr_adj -= 0.05

            dsr_limit = np.clip(dsr_base + dsr_adj, 0.25, 0.55)
            max_installment = disposable_income * dsr_limit

            loan_from_installment = _installment_to_loan(
                max_installment,
                interest_rate,
                loan_term_years,
                installment_type,
            )

            # The affordable loan is further capped by income and credit limits.
            max_loan = min(
                loan_from_installment,
                monthly_income * 240,
                credit_limits * 12,
            )

        rows.append({
            "age": age,
            "monthly_income": round(monthly_income, 2),
            "living_costs": round(living_costs, 2),
            "existing_liabilities": round(existing_liabilities, 2),
            "credit_limits": round(credit_limits, 2),
            "ltv": round(ltv, 2),
            "interest_rate": round(interest_rate, 4),
            "loan_term_years": loan_term_years,
            "installment_type": installment_type,
            "max_loan_amount": round(max_loan, 2),
        })

    df = pd.DataFrame(rows)
    # Same generation timestamp on every row.
    df["update_ts"] = time.time()
    df.to_csv(DATASET_FILE, index=False)

    upload_file(
        path_or_fileobj=DATASET_FILE,
        path_in_repo=DATASET_FILE,
        repo_id=DATASET_REPO,
        repo_type="dataset",
        token=HF_TOKEN,
    )

    return f"Creditworthiness dataset generated ({len(df)} rows)"


# =============================
# HELPER: DOWNLOAD IF NEWER
# =============================
def download_if_newer(repo_id, filename, local_path, repo_type="dataset"):
    """Refresh ``local_path`` from the Hub when the fetched copy is newer.

    Best-effort: any failure while fetching is logged and reported in the
    returned message instead of being raised.

    Args:
        repo_id: Hub repository to download from.
        filename: Name of the file inside the repository.
        local_path: Destination path of the working copy.
        repo_type: Hub repository type (``"dataset"`` or ``"model"``).

    Returns:
        A human-readable status message describing what was done.
    """
    try:
        hf_file = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type=repo_type,
            token=HF_TOKEN,
        )
        # NOTE(review): this is the mtime of the local file returned by
        # hf_hub_download, which may not reflect the upload time — confirm.
        repo_mtime = os.path.getmtime(hf_file)
    except Exception:
        # Keep the best-effort contract, but record the real cause (auth,
        # network, missing file, ...) instead of swallowing it silently.
        logger.warning(
            "Could not fetch %s from %s (%s)",
            filename,
            repo_id,
            repo_type,
            exc_info=True,
        )
        return f"No remote file found, using local {local_path}"

    if not os.path.exists(local_path):
        shutil.copy2(hf_file, local_path)
        return f"Downloaded {local_path} from repo."

    if repo_mtime > os.path.getmtime(local_path):
        # copy2 preserves the source mtime, so the next call compares equal.
        shutil.copy2(hf_file, local_path)
        return f"Updated local {local_path} from repo."

    return f"Local {local_path} is up-to-date."
# =============================
# TRAIN MODEL
# =============================
def train_model():
    """Train the LightGBM regressor on the dataset and upload the model.

    Refreshes ``DATASET_FILE`` from ``DATASET_REPO`` when possible, fits an
    ``LGBMRegressor`` on ``FEATURES`` -> ``TARGET``, saves it to ``MODEL_FILE``
    with joblib and uploads that file to ``MODEL_REPO``.

    Returns:
        A status message.
    """
    download_if_newer(
        DATASET_REPO,
        DATASET_FILE,
        DATASET_FILE,
        repo_type="dataset",
    )

    df = pd.read_csv(DATASET_FILE)

    X = df[FEATURES]
    y = df[TARGET]

    # Constraints are keyed by feature name and expanded in FEATURES order, so
    # reordering FEATURES cannot silently misalign them. A feature without an
    # entry raises KeyError instead of training with a wrong constraint.
    monotone_by_feature = {
        "age": 0,
        "monthly_income": +1,
        "living_costs": -1,
        "existing_liabilities": -1,
        # NOTE(review): generate_dataset also caps max_loan at
        # credit_limits * 12, which grows with credit_limits — confirm that
        # -1 is intended here.
        "credit_limits": -1,
        "ltv": -1,
        "interest_rate": -1,
        "loan_term_years": +1,
        # NOTE(review): treats the 0/1/2 code as ordered risk; verify against
        # the per-type formulas used when generating the data.
        "installment_type": -1,
    }
    monotone_constraints = [monotone_by_feature[name] for name in FEATURES]

    model = lightgbm.LGBMRegressor(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=7,
        num_leaves=40,
        subsample=0.85,
        # Row subsampling is only applied when subsample_freq > 0; with the
        # default of 0 the subsample value above is ignored by LightGBM.
        subsample_freq=1,
        colsample_bytree=0.85,
        monotone_constraints=monotone_constraints,
        random_state=42,
        n_jobs=-1,
    )

    model.fit(X, y)

    joblib.dump(model, MODEL_FILE)

    upload_file(
        path_or_fileobj=MODEL_FILE,
        path_in_repo=MODEL_FILE,
        repo_id=MODEL_REPO,
        repo_type="model",
        token=HF_TOKEN,
    )

    return "Creditworthiness model trained and uploaded"


# =============================
# INVOKE MODEL
# =============================
def invoke_model(json_input):
    """Predict the maximum loan amount for one applicant given as JSON.

    Args:
        json_input: JSON text, normally an object with one entry per name in
            ``FEATURES``.

    Returns:
        ``{"max_loan_amount": <float>, "info": <download status>}`` on
        success, or ``{"error": <message>}`` if anything fails.
    """
    try:
        # Parse and validate first so bad input fails before any download.
        data = json.loads(json_input)

        if isinstance(data, dict):
            # Missing keys would otherwise become NaN columns and still yield
            # a prediction without any warning.
            missing = [name for name in FEATURES if name not in data]
            if missing:
                raise ValueError(
                    f"Missing required feature(s): {', '.join(missing)}"
                )

        msg = download_if_newer(
            MODEL_REPO, MODEL_FILE, MODEL_FILE, repo_type="model"
        )

        # SECURITY(review): joblib.load unpickles the file and can execute
        # arbitrary code; only safe while MODEL_REPO is fully trusted.
        model = joblib.load(MODEL_FILE)

        X = pd.DataFrame([data], columns=FEATURES)
        prediction = model.predict(X)[0]
        return {"max_loan_amount": round(float(prediction), 2), "info": msg}
    except Exception as e:
        # UI boundary: report the failure in the JSON result panel.
        return {"error": str(e)}


# =============================
# UI
# =============================
with gr.Blocks() as app:
    gr.Markdown("# HF Pricing Engine – MVP")

    with gr.Row():
        gen_btn = gr.Button("Generate dataset")
        train_btn = gr.Button("Train model")

    status = gr.Textbox(label="Status")

    # Both long-running actions report into the same status box.
    gen_btn.click(generate_dataset, outputs=status)
    train_btn.click(train_model, outputs=status)

    gr.Markdown("## Invoke model, for intallment typ: 0 = fixed 1 = decreasing 2 = variable")

    json_input = gr.Textbox(
        label="Input JSON",
        lines=8,
        value=json.dumps({
            "age": 34,
            "monthly_income": 10000,
            "living_costs": 1000,
            "existing_liabilities": 1000,
            "credit_limits": 50000,
            "ltv": 0.8,
            "interest_rate": 0.065,
            "loan_term_years": 30,
            "installment_type": 0
        }, indent=2),
    )

    invoke_btn = gr.Button("Invoke")
    result = gr.JSON(label="Result")

    invoke_btn.click(invoke_model, inputs=json_input, outputs=result)

app.launch()