File size: 1,774 Bytes
afca748
 
 
 
 
 
 
 
 
 
 
 
3ff843f
 
 
 
 
 
afca748
3ff843f
 
 
 
 
afca748
3ff843f
 
 
afca748
3ff843f
 
 
 
 
 
 
 
afca748
3ff843f
 
afca748
3ff843f
 
afca748
3ff843f
afca748
 
3ff843f
 
afca748
3ff843f
afca748
3ff843f
 
 
afca748
3ff843f
afca748
3ff843f
 
afca748
3ff843f
afca748
3ff843f
 
 
 
 
afca748
3ff843f
afca748
 
3ff843f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os
import shutil

import gradio as gr
import numpy as np
import pandas as pd

from data_prep import load_data, basic_clean, feature_engineer, prepare_features
from embed import build_text_for_embedding, embed_texts
from clustering import reduce_and_cluster


def run_pipeline(uploaded_csv, k=6, use_hdbscan=False):
    """Cluster the customers in an uploaded CSV and summarise the result.

    The upload is copied to ``data/uploaded.csv``, run through the
    ``data_prep`` helpers, turned into text embeddings and clustered. The
    resulting labels are stored in a new ``cluster`` column of the frame.

    Args:
        uploaded_csv: Value delivered by the ``gr.File`` input. Depending on
            the Gradio version/configuration this is either a path string or
            a temp-file wrapper that exposes the path as ``.name``.
        k: Cluster count; converted with ``int()`` before being forwarded to
            ``reduce_and_cluster``.
        use_hdbscan: Forwarded unchanged to ``reduce_and_cluster``.

    Returns:
        A ``(summary_text, sample_df)`` tuple: a one-line summary with the
        number of distinct labels, and up to three rows per cluster.

    Raises:
        ValueError: If no file was uploaded.
    """
    if uploaded_csv is None:
        raise ValueError("Please upload a customer CSV file before running.")

    # Gradio never hands over an object with a ``.save()`` method: it is
    # either the temp-file path itself or a wrapper whose ``.name`` is the path.
    if isinstance(uploaded_csv, (str, os.PathLike)):
        source_path = uploaded_csv
    else:
        source_path = uploaded_csv.name

    # create data folder and keep a copy of the upload at a fixed location
    os.makedirs("data", exist_ok=True)
    csv_path = "data/uploaded.csv"
    shutil.copyfile(source_path, csv_path)

    # load & preprocess
    df = load_data(csv_path)
    df = basic_clean(df)
    df = feature_engineer(df)
    # NOTE(review): the returned feature matrix is not used below; the call is
    # kept in case prepare_features validates or mutates df -- TODO confirm.
    prepare_features(df)

    # text embedding
    texts = build_text_for_embedding(df)
    embs = embed_texts(texts)

    # clustering (the second return value, the artifacts, is not needed here)
    labels, _arts = reduce_and_cluster(
        embs,
        k=int(k),
        use_hdbscan=use_hdbscan,
    )

    df["cluster"] = labels

    # summary
    # NOTE(review): if reduce_and_cluster marks HDBSCAN noise with -1, that
    # label is counted as a cluster here -- confirm against clustering.py.
    summary_text = f"Clusters created: {len(set(labels))}"

    # sample customers: first three rows of every cluster
    sample_df = df.groupby("cluster").head(3)

    return summary_text, sample_df


def main():
    """Assemble the Gradio Blocks interface and start serving it."""
    with gr.Blocks() as app:
        gr.Markdown(value="# Customer Segmentation — Hugging Face Space")

        # The upload widget and the cluster-count slider share one row.
        with gr.Row():
            upload_box = gr.File(label="Upload Customer CSV (required)")
            k_slider = gr.Slider(
                minimum=2,
                maximum=20,
                value=6,
                step=1,
                label="K (for KMeans)",
            )

        hdbscan_toggle = gr.Checkbox(label="Use HDBSCAN instead of KMeans")

        summary_box = gr.Textbox(label="Output Summary")
        sample_grid = gr.Dataframe(label="Sample Clustered Rows")

        trigger = gr.Button(value="Run Segmentation")

        # Wire the button to the pipeline: three inputs in, two outputs back.
        pipeline_inputs = [upload_box, k_slider, hdbscan_toggle]
        pipeline_outputs = [summary_box, sample_grid]
        trigger.click(
            fn=run_pipeline,
            inputs=pipeline_inputs,
            outputs=pipeline_outputs,
        )

    app.launch()


# Start the UI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()