File size: 6,430 Bytes
914d711
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0a396a
914d711
 
 
 
 
 
 
c0a396a
914d711
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import os
import numpy as np
import pandas as pd
import streamlit as st
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors


# Page-level setup: wide layout, then the heading and caption at the top.
st.set_page_config(
    layout="wide",
    page_title="Amazon Recommender",
)
st.title("Amazon Recommendation System")
st.caption(
    "Simple item-based and user-based recommendations built from ratings data."
)


@st.cache_data(show_spinner=False)
def load_data() -> pd.DataFrame:
    """Load the ratings CSV that sits next to this script and filter it.

    The file is read without a header row and its columns are named
    ``user_id``, ``product_id``, ``rating`` and ``timestamp``. IDs are cast
    to strings, ratings are coerced to numbers (rows whose rating cannot be
    parsed are dropped), and only rows belonging to users with more than
    100 ratings are kept.

    Returns:
        The filtered ratings with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If ``ratings_Electronics.csv`` is not present in
            this script's directory.
    """
    script_dir = os.path.dirname(__file__)
    csv_path = os.path.join(script_dir, 'ratings_Electronics.csv')
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"File not found: {csv_path}")

    # The CSV carries no header line, so column names are supplied here.
    column_names = ['user_id', 'product_id', 'rating', 'timestamp']
    ratings = pd.read_csv(csv_path, header=None, names=column_names)

    # Normalise column types; unparseable ratings become NaN and are removed.
    for id_column in ('user_id', 'product_id'):
        ratings[id_column] = ratings[id_column].astype(str)
    ratings['rating'] = pd.to_numeric(ratings['rating'], errors='coerce')
    ratings = ratings.dropna(subset=['rating']).reset_index(drop=True)

    # Retain only rows from users who contributed more than 100 ratings.
    ratings_per_user = ratings['user_id'].map(ratings['user_id'].value_counts())
    is_heavy_rater = ratings_per_user > 100
    return ratings[is_heavy_rater].reset_index(drop=True)


@st.cache_resource(show_spinner=False)
def build_models(df: pd.DataFrame):
    """Build the sparse user-item matrix, both KNN models and ID lookups.

    Args:
        df: Ratings frame with ``user_id``, ``product_id`` and ``rating``
            columns.

    Returns:
        A dict with the keys ``uim`` (users x items CSR matrix of float32
        ratings), ``item_knn`` and ``user_knn`` (cosine brute-force
        ``NearestNeighbors`` fitted on the item rows and user rows
        respectively), the four ID/index lookup dicts, ``num_users``,
        ``num_items`` and ``density`` (percentage of stored cells).
    """
    # Categorical encoding gives each distinct ID a dense integer code.
    user_series = df['user_id'].astype('category')
    item_series = df['product_id'].astype('category')
    user_labels = user_series.cat.categories
    item_labels = item_series.cat.categories

    num_users = user_labels.size
    num_items = item_labels.size

    coordinates = (user_series.cat.codes.values, item_series.cat.codes.values)
    rating_data = df['rating'].astype(np.float32).values
    uim_sparse = csr_matrix((rating_data, coordinates), shape=(num_users, num_items))

    def _fit_cosine_knn(samples):
        # Brute-force cosine neighbours over the rows of ``samples``.
        knn = NearestNeighbors(metric='cosine', algorithm='brute')
        knn.fit(samples)
        return knn

    # Items are the rows of the transpose; users are the rows of the matrix.
    item_knn = _fit_cosine_knn(uim_sparse.T)
    user_knn = _fit_cosine_knn(uim_sparse)

    # Lookups between external IDs and matrix positions, in both directions.
    col_to_product_id = dict(enumerate(item_labels))
    product_id_to_col = {pid: col for col, pid in col_to_product_id.items()}
    row_to_user_id = dict(enumerate(user_labels))
    user_id_to_row = {uid: row for row, uid in row_to_user_id.items()}

    # Share of matrix cells holding a stored rating, as a percentage.
    cell_count = uim_sparse.shape[0] * uim_sparse.shape[1]
    density = (uim_sparse.nnz / cell_count) * 100

    return {
        'uim': uim_sparse,
        'item_knn': item_knn,
        'user_knn': user_knn,
        'product_id_to_col': product_id_to_col,
        'col_to_product_id': col_to_product_id,
        'user_id_to_row': user_id_to_row,
        'row_to_user_id': row_to_user_id,
        'num_users': num_users,
        'num_items': num_items,
        'density': density,
    }


def recommend_similar_products(models, query_product_id, top_k=10) -> pd.DataFrame:
    """Return up to ``top_k`` products most similar to ``query_product_id``.

    Neighbours come from ``models['item_knn'].kneighbors`` queried with the
    product's row of the transposed user-item matrix; similarity is reported
    as ``1 - distance``.

    Args:
        models: Mapping providing ``item_knn``, ``uim``,
            ``product_id_to_col`` and ``col_to_product_id``.
        query_product_id: Product ID to find neighbours for.
        top_k: Maximum number of similar products to return.

    Returns:
        A DataFrame with ``product_id`` and ``similarity`` columns, ordered
        as returned by the neighbour search. It is empty when the product is
        unknown, and never contains the query product itself.
    """
    product_id_to_col = models['product_id_to_col']
    if query_product_id not in product_id_to_col:
        return pd.DataFrame(columns=['product_id', 'similarity'])

    item_knn = models['item_knn']
    uim = models['uim']
    col_to_product_id = models['col_to_product_id']

    qcol = product_id_to_col[query_product_id]

    # Request one extra neighbour to leave room for the query item, but never
    # more than the number of items, which the neighbour search would reject.
    n_neighbors = min(top_k + 1, uim.shape[1])
    distances, indices = item_knn.kneighbors(uim.T[qcol], n_neighbors=n_neighbors)
    indices = indices.ravel()
    distances = distances.ravel()

    # Remove the query by index rather than by position: an item tied at
    # distance 0 may be ranked ahead of the query, so slicing off the first
    # entry could discard a real neighbour and keep the query itself.
    keep = indices != qcol
    neighbor_cols = indices[keep][:top_k]
    neighbor_distances = distances[keep][:top_k]

    return pd.DataFrame({
        'product_id': [col_to_product_id[i] for i in neighbor_cols],
        'similarity': 1.0 - neighbor_distances
    })


def recommend_products_for_user(models, query_user_id, top_k=10, neighbor_k=50) -> pd.DataFrame:
    """Recommend unrated products to a user from similar users' ratings.

    The user's nearest neighbours are taken from ``models['user_knn']``; each
    product is scored by the sum of neighbour ratings weighted by
    ``1 - distance``. Products the user has already rated (rating > 0) are
    excluded from the result.

    Args:
        models: Mapping providing ``user_knn``, ``uim``, ``user_id_to_row``
            and ``col_to_product_id``.
        query_user_id: User ID to recommend for.
        top_k: Maximum number of products to return.
        neighbor_k: Maximum number of neighbouring users to aggregate.

    Returns:
        A DataFrame with ``product_id`` and ``score`` columns, sorted by
        descending score. It is empty when the user is unknown, ``top_k`` is
        not positive, no neighbours are found, or no candidate remains.
    """
    user_id_to_row = models['user_id_to_row']
    if query_user_id not in user_id_to_row or top_k <= 0:
        return pd.DataFrame(columns=['product_id', 'score'])

    user_knn = models['user_knn']
    uim = models['uim']
    col_to_product_id = models['col_to_product_id']

    qrow = user_id_to_row[query_user_id]
    # One extra neighbour leaves room for the query user, capped at the
    # number of users available.
    n_neighbors = min(neighbor_k + 1, uim.shape[0])
    distances, indices = user_knn.kneighbors(uim[qrow], n_neighbors=n_neighbors)
    indices = indices.ravel()
    distances = distances.ravel()

    # Drop the query user by index; it is not guaranteed to be ranked first.
    not_self = indices != qrow
    neighbor_indices = indices[not_self][:neighbor_k]
    similarities = 1.0 - distances[not_self][:neighbor_k]

    if neighbor_indices.size == 0:
        return pd.DataFrame(columns=['product_id', 'score'])

    # Weighted sum of neighbour ratings per product. The product stays sparse
    # so the neighbours-by-items block is never densified.
    neighbor_ratings = uim[neighbor_indices]
    weighted_scores = np.asarray(
        neighbor_ratings.T.dot(similarities), dtype=np.float64
    ).ravel()

    # Products the user already rated must never be recommended.
    user_rated_mask = uim[qrow].toarray().ravel() > 0
    weighted_scores[user_rated_mask] = -np.inf

    # Rank only finite-scored candidates, so excluded products cannot leak
    # into the result when fewer than top_k candidates exist.
    candidate_cols = np.flatnonzero(np.isfinite(weighted_scores))
    if candidate_cols.size == 0:
        return pd.DataFrame(columns=['product_id', 'score'])

    candidate_scores = weighted_scores[candidate_cols]
    n_results = min(top_k, candidate_cols.size)
    if n_results < candidate_cols.size:
        # Partial selection of the best n_results before the final sort.
        chosen = np.argpartition(-candidate_scores, kth=n_results - 1)[:n_results]
    else:
        chosen = np.arange(candidate_cols.size)
    chosen = chosen[np.argsort(-candidate_scores[chosen], kind='stable')]
    top_cols = candidate_cols[chosen]

    return pd.DataFrame({
        'product_id': [col_to_product_id[c] for c in top_cols],
        'score': weighted_scores[top_cols]
    })


# Sidebar controls; top_k and neighbor_k are read when a recommendation
# button is pressed further down the script.
with st.sidebar:
    st.subheader("Configuration")
    top_k = st.number_input(
        "Results (top_k)",
        value=10,
        min_value=1,
        max_value=50,
        step=1,
    )
    neighbor_k = st.number_input(
        "Neighbors (user-based)",
        value=50,
        min_value=5,
        max_value=200,
        step=5,
    )


# Main page: load data, build the models, then render one tab per
# recommendation mode. Any failure is shown as an error message on the page.
try:
    df = load_data()
    models = build_models(df)
    st.success(
        f"Loaded {len(df)} ratings"
        f" | Users: {models['num_users']}"
        f" | Items: {models['num_items']}"
        f" | Density: {models['density']:.4f}%"
    )

    product_tab, user_tab = st.tabs(["Recommend by Product", "Recommend for User"])

    with product_tab:
        st.subheader("Product → Similar Products")
        # Offer only the first 5000 distinct product IDs in the dropdown.
        product_choices = df['product_id'].drop_duplicates().head(5000).tolist()
        chosen_product = st.selectbox("Select a product_id", product_choices)
        product_clicked = st.button("Find similar products", key="by_product")
        if product_clicked:
            similar_products = recommend_similar_products(
                models, chosen_product, top_k=int(top_k)
            )
            st.dataframe(similar_products, width='stretch')

    with user_tab:
        st.subheader("User → Recommended Products")
        # Offer only the first 5000 distinct user IDs in the dropdown.
        user_choices = df['user_id'].drop_duplicates().head(5000).tolist()
        chosen_user = st.selectbox("Select a user_id", user_choices)
        user_clicked = st.button("Recommend for user", key="by_user")
        if user_clicked:
            user_recommendations = recommend_products_for_user(
                models, chosen_user, top_k=int(top_k), neighbor_k=int(neighbor_k)
            )
            st.dataframe(user_recommendations, width='stretch')

except Exception as err:
    st.error(str(err))