# File size: 6,430 Bytes
import os
import numpy as np
import pandas as pd
import streamlit as st
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
# Page-level setup: browser tab title, wide layout, then the visible header.
st.set_page_config(
    layout="wide",
    page_title="Amazon Recommender",
)
st.title("Amazon Recommendation System")
st.caption(
    "Simple item-based and user-based recommendations built from ratings data."
)
@st.cache_data(show_spinner=False)
def load_data() -> pd.DataFrame:
    """Load the ratings CSV that sits next to this script and filter it.

    The file is read without a header row and its four columns are named
    ``user_id``, ``product_id``, ``rating`` and ``timestamp``. IDs are
    converted to strings, ratings to numbers (rows whose rating cannot be
    parsed are discarded), and only users with more than 100 ratings are kept.

    Returns:
        The filtered ratings with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: if ``ratings_Electronics.csv`` is not found in the
            directory of this script.
    """
    here = os.path.dirname(__file__)
    csv_path = os.path.join(here, 'ratings_Electronics.csv')
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"File not found: {csv_path}")

    # The file has no header row, so column names are supplied explicitly.
    column_names = ['user_id', 'product_id', 'rating', 'timestamp']
    ratings = pd.read_csv(csv_path, header=None, names=column_names)

    # Normalise types: IDs as strings, ratings as numbers (bad values -> NaN).
    for id_column in ('user_id', 'product_id'):
        ratings[id_column] = ratings[id_column].astype(str)
    ratings['rating'] = pd.to_numeric(ratings['rating'], errors='coerce')
    ratings = ratings[ratings['rating'].notna()]

    # Keep only the users who have rated more than 100 products.
    ratings_per_user = ratings['user_id'].value_counts()
    heavy_users = ratings_per_user.index[ratings_per_user > 100]
    return ratings[ratings['user_id'].isin(heavy_users)].reset_index(drop=True)
@st.cache_resource(show_spinner=False)
def build_models(df: pd.DataFrame):
    """Build the sparse user-item matrix, two cosine KNN models and ID lookups.

    Args:
        df: Ratings with ``user_id``, ``product_id`` and ``rating`` columns.

    Returns:
        A dict with the keys ``'uim'`` (users x items CSR matrix),
        ``'item_knn'`` (fitted on item columns), ``'user_knn'`` (fitted on
        user rows), the four ID <-> index mappings, ``'num_users'``,
        ``'num_items'`` and ``'density'`` (percentage of stored entries).
    """
    # Categorical encoding gives every distinct ID a stable integer code.
    user_cat = df['user_id'].astype('category').cat
    product_cat = df['product_id'].astype('category').cat
    user_ids = user_cat.categories
    product_ids = product_cat.categories
    num_users, num_items = user_ids.size, product_ids.size

    # NOTE(review): scipy's coordinate-style constructor sums entries that
    # share a (row, col) pair -- confirm each user rates a product only once.
    uim_sparse = csr_matrix(
        (
            df['rating'].astype(np.float32).to_numpy(),
            (user_cat.codes.to_numpy(), product_cat.codes.to_numpy()),
        ),
        shape=(num_users, num_items),
    )

    def _fit_cosine_knn(samples):
        # Brute-force cosine neighbours over the rows of ``samples``.
        return NearestNeighbors(metric='cosine', algorithm='brute').fit(samples)

    item_knn = _fit_cosine_knn(uim_sparse.T)  # one row per product
    user_knn = _fit_cosine_knn(uim_sparse)    # one row per user

    # Lookup tables between external IDs and matrix positions.
    col_to_product_id = dict(enumerate(product_ids))
    product_id_to_col = {pid: col for col, pid in col_to_product_id.items()}
    row_to_user_id = dict(enumerate(user_ids))
    user_id_to_row = {uid: row for row, uid in row_to_user_id.items()}

    # Share of matrix cells that hold a stored rating, as a percentage.
    rows, cols = uim_sparse.shape
    density = (uim_sparse.nnz / (rows * cols)) * 100

    return {
        'uim': uim_sparse,
        'item_knn': item_knn,
        'user_knn': user_knn,
        'product_id_to_col': product_id_to_col,
        'col_to_product_id': col_to_product_id,
        'user_id_to_row': user_id_to_row,
        'row_to_user_id': row_to_user_id,
        'num_users': num_users,
        'num_items': num_items,
        'density': density,
    }
def recommend_similar_products(models, query_product_id, top_k=10) -> pd.DataFrame:
    """Return the products whose rating columns are nearest to a query product.

    Args:
        models: Dict providing ``'item_knn'`` (object with a ``kneighbors``
            method fitted on item rows), ``'uim'`` (users x items sparse
            matrix), ``'product_id_to_col'`` and ``'col_to_product_id'``,
            as produced by ``build_models``.
        query_product_id: Product to find neighbours for.
        top_k: Maximum number of similar products to return.

    Returns:
        DataFrame with columns ``product_id`` and ``similarity``
        (``1 - distance`` as reported by the KNN model), nearest first.
        Empty when the product is unknown or ``top_k`` is not positive.
        The query product itself is never included.
    """
    item_knn = models['item_knn']
    uim = models['uim']
    product_id_to_col = models['product_id_to_col']
    col_to_product_id = models['col_to_product_id']

    if query_product_id not in product_id_to_col or top_k <= 0:
        return pd.DataFrame(columns=['product_id', 'similarity'])

    qcol = product_id_to_col[query_product_id]

    # Ask for one extra neighbour because the query item is normally returned
    # as its own match; clamp to the number of items so a large top_k on a
    # small catalogue does not request more neighbours than were fitted.
    n_neighbors = min(top_k + 1, uim.shape[1])
    distances, indices = item_knn.kneighbors(uim.T[qcol], n_neighbors=n_neighbors)
    indices = indices.ravel()
    distances = distances.ravel()

    # Remove the query by index rather than by position: when other items are
    # at the same distance (identical rating columns) the query is not
    # guaranteed to come back first.
    not_query = indices != qcol
    neighbor_cols = indices[not_query][:top_k]
    neighbor_distances = distances[not_query][:top_k]

    return pd.DataFrame({
        'product_id': [col_to_product_id[i] for i in neighbor_cols],
        'similarity': 1.0 - neighbor_distances,
    })
def recommend_products_for_user(models, query_user_id, top_k=10, neighbor_k=50) -> pd.DataFrame:
    """Recommend unrated products to a user from their nearest neighbours.

    Each candidate product is scored by the sum of the neighbours' ratings
    for it, weighted by the neighbour's similarity to the query user
    (``1 - distance`` as reported by the KNN model).

    Args:
        models: Dict providing ``'user_knn'`` (object with a ``kneighbors``
            method fitted on user rows), ``'uim'`` (users x items sparse
            matrix), ``'user_id_to_row'`` and ``'col_to_product_id'``, as
            produced by ``build_models``.
        query_user_id: User to recommend for.
        top_k: Maximum number of products to return.
        neighbor_k: Maximum number of neighbouring users to aggregate over.

    Returns:
        DataFrame with columns ``product_id`` and ``score``, best first.
        Products the user has already rated are never returned, so fewer
        than ``top_k`` rows come back when there are not enough candidates.
        Empty when the user is unknown, has no neighbours, or when ``top_k``
        or ``neighbor_k`` is not positive.
    """
    user_knn = models['user_knn']
    uim = models['uim']
    user_id_to_row = models['user_id_to_row']
    col_to_product_id = models['col_to_product_id']
    empty = pd.DataFrame(columns=['product_id', 'score'])

    if query_user_id not in user_id_to_row or top_k <= 0 or neighbor_k <= 0:
        return empty

    qrow = user_id_to_row[query_user_id]

    # One extra neighbour to allow for the query user matching themselves,
    # clamped to the number of users available.
    n_neighbors = min(neighbor_k + 1, uim.shape[0])
    distances, indices = user_knn.kneighbors(uim[qrow], n_neighbors=n_neighbors)
    indices = indices.ravel()
    distances = distances.ravel()

    not_self = indices != qrow
    neighbor_rows = indices[not_self][:neighbor_k]
    similarities = 1.0 - distances[not_self][:neighbor_k]
    if neighbor_rows.size == 0:
        return empty

    # Similarity-weighted sum of neighbour ratings per item. Multiplying the
    # sparse block directly avoids building a dense neighbours x items array.
    weighted_scores = np.asarray(
        uim[neighbor_rows].T @ similarities, dtype=np.float64
    ).ravel()

    # Products the user already rated must not be recommended again.
    already_rated = uim[qrow].toarray().ravel() > 0
    weighted_scores[already_rated] = -np.inf

    # Rank only real candidates; excluded products (-inf) previously leaked
    # into the result whenever fewer than top_k candidates were available.
    candidates = np.flatnonzero(np.isfinite(weighted_scores))
    if candidates.size == 0:
        return empty
    candidate_scores = weighted_scores[candidates]

    if top_k < candidates.size:
        # Partial selection of the top_k best before the final ordering.
        best = np.argpartition(-candidate_scores, top_k - 1)[:top_k]
    else:
        best = np.arange(candidates.size)
    best = best[np.argsort(-candidate_scores[best], kind='stable')]
    top_cols = candidates[best]

    return pd.DataFrame({
        'product_id': [col_to_product_id[c] for c in top_cols],
        'score': weighted_scores[top_cols],
    })
# Sidebar: result-count and neighbourhood-size controls used by both tabs.
with st.sidebar:
    st.subheader("Configuration")
    top_k = st.number_input(
        "Results (top_k)",
        min_value=1,
        max_value=50,
        value=10,
        step=1,
    )
    neighbor_k = st.number_input(
        "Neighbors (user-based)",
        min_value=5,
        max_value=200,
        value=50,
        step=5,
    )

# Main page. Any exception raised while loading, fitting or rendering is
# reported on the page through st.error.
try:
    df = load_data()
    models = build_models(df)
    summary = f"Loaded {len(df)} ratings | Users: {models['num_users']} | Items: {models['num_items']} | Density: {models['density']:.4f}%"
    st.success(summary)

    product_tab, user_tab = st.tabs(["Recommend by Product", "Recommend for User"])

    with product_tab:
        st.subheader("Product → Similar Products")
        # Only the first 5000 distinct product IDs are offered in the dropdown.
        product_choices = df['product_id'].drop_duplicates().head(5000).tolist()
        pid = st.selectbox("Select a product_id", product_choices)
        find_clicked = st.button("Find similar products", key="by_product")
        if find_clicked:
            recs = recommend_similar_products(models, pid, top_k=int(top_k))
            st.dataframe(recs, width='stretch')

    with user_tab:
        st.subheader("User → Recommended Products")
        # Only the first 5000 distinct user IDs are offered in the dropdown.
        user_choices = df['user_id'].drop_duplicates().head(5000).tolist()
        uid = st.selectbox("Select a user_id", user_choices)
        recommend_clicked = st.button("Recommend for user", key="by_user")
        if recommend_clicked:
            recs = recommend_products_for_user(
                models,
                uid,
                top_k=int(top_k),
                neighbor_k=int(neighbor_k),
            )
            st.dataframe(recs, width='stretch')
except Exception as e:
    st.error(str(e))
|