"""Gradio front end for the customer-segmentation pipeline.

Uploads a customer CSV, runs cleaning / feature engineering / text
embedding / clustering, and shows a short summary plus a few sample rows
per cluster.
"""

import os
import shutil

import gradio as gr
import numpy as np
import pandas as pd

from clustering import reduce_and_cluster
from data_prep import load_data, basic_clean, feature_engineer, prepare_features
from embed import build_text_for_embedding, embed_texts


def _resolve_upload_path(uploaded_csv):
    """Return the on-disk path of the file handed over by ``gr.File``.

    Depending on the Gradio version/configuration the component passes
    either a path string or a tempfile-like object exposing ``.name``;
    neither offers a ``.save()`` method, so the path is resolved here.

    Raises:
        gr.Error: if nothing was uploaded or the value has no usable path.
    """
    if uploaded_csv is None:
        raise gr.Error("Please upload a customer CSV before running.")
    if isinstance(uploaded_csv, (str, os.PathLike)):
        return os.fspath(uploaded_csv)
    path = getattr(uploaded_csv, "name", None)
    if not path:
        raise gr.Error("Could not determine the path of the uploaded file.")
    return path


def run_pipeline(uploaded_csv, k=6, use_hdbscan=False):
    """Run the segmentation pipeline on an uploaded CSV.

    Args:
        uploaded_csv: Value produced by the ``gr.File`` input (a path string
            or a tempfile-like object with a ``.name`` attribute).
        k: Number of clusters forwarded to ``reduce_and_cluster`` (cast to
            ``int`` because the slider may deliver a float).
        use_hdbscan: Forwarded to ``reduce_and_cluster`` to select HDBSCAN
            instead of KMeans.

    Returns:
        A ``(summary_text, sample_df)`` tuple: a one-line summary with the
        number of distinct labels, and up to three rows from each cluster.

    Raises:
        gr.Error: if no file was uploaded.
    """
    source_path = _resolve_upload_path(uploaded_csv)

    # Keep a copy of the upload under data/ so the pipeline reads from a
    # stable location.
    os.makedirs("data", exist_ok=True)
    csv_path = "data/uploaded.csv"
    if os.path.abspath(source_path) != os.path.abspath(csv_path):
        shutil.copyfile(source_path, csv_path)

    # Load & preprocess.
    df = load_data(csv_path)
    df = basic_clean(df)
    df = feature_engineer(df)
    # NOTE(review): the return value is not used downstream; the call is
    # kept in case prepare_features validates or has side effects — confirm
    # and remove if it is pure.
    prepare_features(df)

    # Text embedding.
    texts = build_text_for_embedding(df)
    embs = embed_texts(texts)

    # Clustering; the second return value is not needed here.
    labels, _artifacts = reduce_and_cluster(
        embs,
        k=int(k),
        use_hdbscan=use_hdbscan,
    )
    df["cluster"] = labels

    # NOTE(review): if reduce_and_cluster marks noise points with a special
    # label (as HDBSCAN conventionally does with -1), that label is counted
    # here as a cluster — confirm.
    summary_text = f"Clusters created: {len(set(labels))}"

    # First three rows of every cluster as a sample.
    sample_df = df.groupby("cluster").head(3)

    return summary_text, sample_df


def main():
    """Build the Gradio Blocks UI, wire up the pipeline, and launch it."""
    with gr.Blocks() as demo:
        gr.Markdown("# Customer Segmentation — Hugging Face Space")

        with gr.Row():
            csv_in = gr.File(label="Upload Customer CSV (required)")
            k = gr.Slider(2, 20, value=6, step=1, label="K (for KMeans)")
            use_hdbscan = gr.Checkbox(label="Use HDBSCAN instead of KMeans")

        out_text = gr.Textbox(label="Output Summary")
        out_table = gr.Dataframe(label="Sample Clustered Rows")

        run_btn = gr.Button("Run Segmentation")
        run_btn.click(
            fn=run_pipeline,
            inputs=[csv_in, k, use_hdbscan],
            outputs=[out_text, out_table],
        )

    demo.launch()


if __name__ == "__main__":
    main()