File size: 6,576 Bytes
f3a6f24
 
 
 
 
5e03423
99592de
 
 
 
f3a6f24
af7396c
 
 
ddfcd41
 
 
af7396c
 
 
ddfcd41
af7396c
 
 
ddfcd41
 
 
 
 
 
99592de
 
ddfcd41
 
 
 
 
 
5e03423
ddfcd41
 
 
5e03423
ddfcd41
 
 
 
5e03423
ddfcd41
 
 
 
 
 
 
 
 
 
 
 
 
f3a6f24
 
 
 
af7396c
f3a6f24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import os
import streamlit as st
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import requests
import urllib3

# Silence urllib3's InsecureRequestWarning process-wide. The dataset download
# in this module may issue a request with certificate verification disabled
# (a workaround for local SSL certificate issues), which would otherwise emit
# this warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Data directory: the "data" folder one level above this module's directory.
DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "data")
# Local CSV copy of the dataset; its presence is what skips the download.
DATA_PATH = os.path.join(DATA_DIR, "online-retail.csv")

# Direct download URL from the UCI Machine Learning Repository.
# This is a ZIP archive containing the dataset as an Excel workbook.
DATA_URL = "https://archive.ics.uci.edu/static/public/352/online+retail.zip"


def download_data_if_needed():
    """Download the online retail dataset from UCI if it doesn't exist locally.

    Does nothing when ``DATA_PATH`` already exists. Otherwise the ZIP archive
    at ``DATA_URL`` is fetched, the Excel workbook inside it is converted to
    CSV, and the result is stored at ``DATA_PATH``. Progress and errors are
    reported through Streamlit widgets.

    Raises:
        Exception: if the download, extraction or conversion fails. The
            original error is attached as ``__cause__``.
    """
    if os.path.exists(DATA_PATH):
        return

    import io
    import zipfile

    os.makedirs(DATA_DIR, exist_ok=True)

    # The ZIP contains "Online Retail.xlsx"
    excel_filename = "Online Retail.xlsx"
    # Write to a temporary name first so an interrupted conversion can never
    # leave a truncated CSV at DATA_PATH (which later runs would trust).
    tmp_csv_path = DATA_PATH + ".part"

    with st.spinner("📥 Downloading Online Retail dataset from UCI (22MB ZIP, one-time only)..."):
        try:
            try:
                # Verify the server certificate by default.
                response = requests.get(DATA_URL, timeout=120)
            except requests.exceptions.SSLError:
                # SECURITY: unverified fallback kept only for machines with
                # broken local certificate stores; the payload is then not
                # authenticated.
                st.warning(
                    "SSL certificate verification failed; retrying without "
                    "verification."
                )
                response = requests.get(DATA_URL, timeout=120, verify=False)
            response.raise_for_status()

            # Read the workbook straight out of the in-memory ZIP so no
            # intermediate .xlsx file is left on disk if conversion fails.
            with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
                excel_bytes = zip_file.read(excel_filename)

            st.info("Converting Excel to CSV...")
            df = pd.read_excel(io.BytesIO(excel_bytes))
            df.to_csv(tmp_csv_path, index=False)
            # Publish the finished file in a single rename step.
            os.replace(tmp_csv_path, DATA_PATH)

            st.success("✅ Dataset downloaded and converted successfully!")

        except Exception as e:
            # Drop any partially written temporary file before reporting.
            if os.path.exists(tmp_csv_path):
                os.remove(tmp_csv_path)
            st.error(f"Failed to download dataset: {str(e)}")
            st.info(
                "**Alternative:** Download manually from "
                "https://archive.ics.uci.edu/dataset/352/online+retail "
                "and place 'online-retail.csv' in the data/ folder."
            )
            raise Exception(f"Could not download dataset: {e}") from e


@st.cache_data
def load_raw_transactions() -> pd.DataFrame:
    """Load the raw transaction table from the local CSV.

    Ensures the dataset is present (downloading it if needed), then reads
    ``DATA_PATH`` and normalises two columns: ``InvoiceDate`` is parsed to
    datetimes and ``CustomerID`` is cast to the nullable ``Int64`` dtype.

    Returns:
        The unfiltered transactions as a DataFrame.
    """
    download_data_if_needed()
    # The CSV produced by download_data_if_needed() is written by
    # DataFrame.to_csv with its default encoding (UTF-8), so try that first.
    # A manually supplied file may not be valid UTF-8; latin-1 accepts any
    # byte sequence and is kept as the fallback.
    try:
        df = pd.read_csv(DATA_PATH, encoding="utf-8")
    except UnicodeDecodeError:
        df = pd.read_csv(DATA_PATH, encoding="latin-1")
    df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])
    df["CustomerID"] = df["CustomerID"].astype("Int64")
    return df


@st.cache_data
def load_clean_transactions() -> pd.DataFrame:
    """Return the transactions that represent real, attributable purchases.

    Rows are kept only when all of the following hold:
      * ``CustomerID`` is present,
      * ``InvoiceNo`` does not start with "C",
      * ``Quantity`` is positive,
      * ``UnitPrice`` is positive.

    A ``TotalPrice`` column (``Quantity * UnitPrice``) is added and the index
    is reset to a fresh 0..n-1 range.
    """
    df = load_raw_transactions()

    keep = (
        df["CustomerID"].notna()
        & ~df["InvoiceNo"].astype(str).str.startswith("C")
        & (df["Quantity"] > 0)
        & (df["UnitPrice"] > 0)
    )

    # .assign builds a new frame, so the new column is never written into a
    # filtered slice (which raises SettingWithCopyWarning on pandas versions
    # without copy-on-write).
    cleaned = df.loc[keep].assign(
        TotalPrice=lambda frame: frame["Quantity"] * frame["UnitPrice"]
    )
    return cleaned.reset_index(drop=True)


@st.cache_data
def build_interaction_matrix():
    """Build the user-item interaction matrix from the cleaned purchase data.

    Each cell holds the summed ``Quantity`` a customer bought of a stock code.

    Returns:
        A 5-tuple ``(interactions, user_map, item_map, item_desc,
        interaction_sparse)``:

        interactions: dense DataFrame (users x items), indexed by CustomerID
            with StockCode columns
        user_map: dict mapping row position -> CustomerID
        item_map: dict mapping column position -> StockCode
        item_desc: dict mapping StockCode -> Description
        interaction_sparse: the same matrix as a float32 ``csr_matrix``
    """
    transactions = load_clean_transactions()

    # One row per (customer, item) pair with the total quantity bought and
    # the first description seen for that pair.
    grouped = transactions.groupby(["CustomerID", "StockCode"])
    user_item = grouped.agg(
        purchase_count=("Quantity", "sum"),
        description=("Description", "first"),
    ).reset_index()

    item_desc = dict(zip(user_item["StockCode"], user_item["description"]))

    users = user_item["CustomerID"].unique()
    items = user_item["StockCode"].unique()
    n_users, n_items = len(users), len(items)

    # Position -> label maps, plus their inverses for building the matrix.
    user_map = dict(enumerate(users))
    item_map = dict(enumerate(items))
    user_to_idx = {customer: pos for pos, customer in user_map.items()}
    item_to_idx = {code: pos for pos, code in item_map.items()}

    row_idx = user_item["CustomerID"].map(user_to_idx).values
    col_idx = user_item["StockCode"].map(item_to_idx).values
    quantities = user_item["purchase_count"].values.astype(np.float32)

    interaction_sparse = csr_matrix(
        (quantities, (row_idx, col_idx)),
        shape=(n_users, n_items),
    )

    interactions = pd.DataFrame(
        interaction_sparse.toarray(),
        index=list(user_map.values()),
        columns=list(item_map.values()),
    )
    interactions.index.name = "CustomerID"

    return interactions, user_map, item_map, item_desc, interaction_sparse


def build_matrix_from_transactions(df, all_users, all_items):
    """Build a sparse interaction matrix for a subset of transactions.

    The matrix always has shape ``(len(all_users), len(all_items))``; row and
    column positions follow the order of ``all_users`` and ``all_items``.
    Quantities are summed per (CustomerID, StockCode) pair, and pairs whose
    customer or item is outside the given universe are ignored.

    Args:
        df: transactions with ``CustomerID``, ``StockCode`` and ``Quantity``
            columns.
        all_users: sequence of customer ids defining the rows.
        all_items: sequence of stock codes defining the columns.

    Returns:
        A float32 ``csr_matrix`` of summed quantities.
    """
    n_users, n_items = len(all_users), len(all_items)
    user_pos = dict(zip(all_users, range(n_users)))
    item_pos = dict(zip(all_items, range(n_items)))

    totals = df.groupby(["CustomerID", "StockCode"])["Quantity"].sum().reset_index()

    # Keep only pairs that fall inside the fixed user/item universe.
    known_user = totals["CustomerID"].isin(user_pos)
    known_item = totals["StockCode"].isin(item_pos)
    totals = totals[known_user & known_item]

    data = totals["Quantity"].values.astype(np.float32)
    row_idx = totals["CustomerID"].map(user_pos).values
    col_idx = totals["StockCode"].map(item_pos).values

    return csr_matrix((data, (row_idx, col_idx)), shape=(n_users, n_items))


@st.cache_data
def get_rec_train_test(test_ratio: float = 0.2, random_state: int = 42):
    """Split the interaction matrix into train/test by masking interactions.

    For every user with at least 5 purchased items, a fraction ``test_ratio``
    of those items (at least one) is sampled without replacement, recorded as
    test entries, and zeroed in the training matrix. Users with fewer than 5
    purchased items stay entirely in the training data.

    Args:
        test_ratio: fraction of each eligible user's items to hold out.
        random_state: seed for the ``numpy.random.RandomState`` used to sample.

    Returns:
        A 6-tuple ``(train, train_sparse, test_df, user_map, item_map,
        item_desc)``: the masked dense DataFrame, the same data as a float32
        ``csr_matrix``, a DataFrame of held-out entries with columns
        ``CustomerID``, ``StockCode`` and ``score``, and the three lookup
        dicts from ``build_interaction_matrix``.
    """
    interactions, user_map, item_map, item_desc, _ = build_interaction_matrix()
    rng = np.random.RandomState(random_state)

    customer_ids = interactions.index
    stock_codes = interactions.columns
    full_values = interactions.to_numpy()
    # Explicit copy: the held-out cells are zeroed here while the original
    # scores are still read from full_values.
    train_values = full_values.copy()
    test_entries = []

    for user_pos in range(len(customer_ids)):
        # Sample column positions rather than labels: passing labels to
        # rng.choice would coerce them into a NumPy array (e.g. mixed int/str
        # stock codes all become strings) and break the lookup afterwards.
        purchased = np.flatnonzero(full_values[user_pos] > 0)
        if purchased.size < 5:
            continue
        n_test = max(1, int(purchased.size * test_ratio))
        held_out = rng.choice(purchased, size=n_test, replace=False)

        customer_id = customer_ids[user_pos]
        for col_pos in held_out:
            test_entries.append(
                (customer_id, stock_codes[col_pos], full_values[user_pos, col_pos])
            )
        # One vectorised write per user instead of one .loc write per cell.
        train_values[user_pos, held_out] = 0

    test_df = pd.DataFrame(test_entries, columns=["CustomerID", "StockCode", "score"])

    train = pd.DataFrame(train_values, index=customer_ids, columns=stock_codes)
    train_sparse = csr_matrix(train_values.astype(np.float32))
    return train, train_sparse, test_df, user_map, item_map, item_desc