import os

import numpy as np
import pandas as pd
import streamlit as st
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

st.set_page_config(page_title="Amazon Recommender", layout="wide")
st.title("Amazon Recommendation System")
st.caption("Simple item-based and user-based recommendations built from ratings data.")


@st.cache_data(show_spinner=False)
def load_data() -> pd.DataFrame:
    """Load the ratings CSV that sits next to this script and filter it.

    The file has no header row, so column names are assigned explicitly.
    Rows whose rating cannot be parsed as a number are dropped, and only
    users with more than 100 ratings are kept.

    Returns:
        A DataFrame with columns ``user_id``, ``product_id``, ``rating``
        and ``timestamp``.

    Raises:
        FileNotFoundError: If ``ratings_Electronics.csv`` is missing.
    """
    csv_path = os.path.join(os.path.dirname(__file__), 'ratings_Electronics.csv')
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"File not found: {csv_path}")

    # File has no header; assign names explicitly to match the notebook.
    df = pd.read_csv(
        csv_path,
        header=None,
        names=['user_id', 'product_id', 'rating', 'timestamp'],
    )

    # Ensure types.
    df['user_id'] = df['user_id'].astype(str)
    df['product_id'] = df['product_id'].astype(str)
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    df = df.dropna(subset=['rating']).reset_index(drop=True)

    # Keep only users with > 100 ratings (same as notebook).
    user_counts = df['user_id'].value_counts()
    df = df[df['user_id'].map(user_counts) > 100].reset_index(drop=True)
    return df


@st.cache_resource(show_spinner=False)
def build_models(df: pd.DataFrame) -> dict:
    """Build the sparse user-item matrix, both KNN models and ID mappings.

    Args:
        df: Ratings frame with ``user_id``, ``product_id`` and ``rating``
            columns.

    Returns:
        A dict with keys ``uim`` (users x items CSR matrix), ``item_knn``,
        ``user_knn``, ``product_id_to_col``, ``col_to_product_id``,
        ``user_id_to_row``, ``row_to_user_id``, ``num_users``,
        ``num_items`` and ``density`` (percentage of stored entries).

    Raises:
        ValueError: If ``df`` contains no ratings.
    """
    if df.empty:
        # Fail early with a readable message instead of an opaque error from
        # the matrix/KNN construction or a division by zero in the density.
        raise ValueError("No ratings available; cannot build recommendation models.")

    # Encode IDs to categorical integer indices.
    user_cats = df['user_id'].astype('category')
    product_cats = df['product_id'].astype('category')
    user_index = user_cats.cat.codes.values
    product_index = product_cats.cat.codes.values
    ratings_values = df['rating'].astype(np.float32).values

    num_users = user_cats.cat.categories.size
    num_items = product_cats.cat.categories.size

    # NOTE: duplicate (user, product) pairs are summed by this constructor.
    uim_sparse = csr_matrix(
        (ratings_values, (user_index, product_index)),
        shape=(num_users, num_items),
    )

    # Fit KNN models: items are the columns (hence the transpose), users the rows.
    item_knn = NearestNeighbors(metric='cosine', algorithm='brute')
    item_knn.fit(uim_sparse.T)
    user_knn = NearestNeighbors(metric='cosine', algorithm='brute')
    user_knn.fit(uim_sparse)

    # Mappings between raw IDs and matrix positions.
    col_to_product_id = dict(enumerate(product_cats.cat.categories))
    product_id_to_col = {pid: idx for idx, pid in col_to_product_id.items()}
    row_to_user_id = dict(enumerate(user_cats.cat.categories))
    user_id_to_row = {uid: idx for idx, uid in row_to_user_id.items()}

    # Density for info.
    density = (uim_sparse.nnz / (uim_sparse.shape[0] * uim_sparse.shape[1])) * 100

    return {
        'uim': uim_sparse,
        'item_knn': item_knn,
        'user_knn': user_knn,
        'product_id_to_col': product_id_to_col,
        'col_to_product_id': col_to_product_id,
        'user_id_to_row': user_id_to_row,
        'row_to_user_id': row_to_user_id,
        'num_users': num_users,
        'num_items': num_items,
        'density': density,
    }


def recommend_similar_products(models, query_product_id, top_k=10) -> pd.DataFrame:
    """Return the ``top_k`` products most similar to ``query_product_id``.

    Similarity is ``1 - cosine distance`` between item rating vectors.

    Args:
        models: Dict produced by ``build_models``.
        query_product_id: Product ID to look up.
        top_k: Maximum number of similar products to return.

    Returns:
        A DataFrame with columns ``product_id`` and ``similarity``, ordered
        from most to least similar. Empty when the product is unknown or
        ``top_k`` is not positive.
    """
    item_knn = models['item_knn']
    uim = models['uim']
    product_id_to_col = models['product_id_to_col']
    col_to_product_id = models['col_to_product_id']

    if query_product_id not in product_id_to_col or top_k < 1:
        return pd.DataFrame(columns=['product_id', 'similarity'])

    qcol = product_id_to_col[query_product_id]

    # Ask for one extra neighbour (the query itself), but never more than
    # the number of fitted items, which kneighbors would reject.
    n_neighbors = min(top_k + 1, uim.shape[1])
    distances, indices = item_knn.kneighbors(uim.T[qcol], n_neighbors=n_neighbors)
    indices = indices.ravel()
    distances = distances.ravel()

    # Drop the query item by index rather than by position: items with an
    # identical rating vector tie at distance 0, so the query is not
    # guaranteed to come back first.
    keep = indices != qcol
    neighbor_cols = indices[keep][:top_k]
    neighbor_distances = distances[keep][:top_k]

    return pd.DataFrame({
        'product_id': [col_to_product_id[i] for i in neighbor_cols],
        'similarity': 1.0 - neighbor_distances,
    })


def recommend_products_for_user(models, query_user_id, top_k=10, neighbor_k=50) -> pd.DataFrame:
    """Recommend products for a user from the ratings of similar users.

    Each candidate product is scored by the similarity-weighted sum of the
    ratings given by the user's nearest neighbours (cosine similarity on
    rating vectors). Products the user has already rated are never returned.

    Args:
        models: Dict produced by ``build_models``.
        query_user_id: User ID to recommend for.
        top_k: Maximum number of products to return.
        neighbor_k: Number of nearest users to aggregate over.

    Returns:
        A DataFrame with columns ``product_id`` and ``score``, ordered by
        descending score. Empty when the user is unknown, has no
        neighbours, has no unrated products, or ``top_k`` / ``neighbor_k``
        is not positive.
    """
    user_knn = models['user_knn']
    uim = models['uim']
    user_id_to_row = models['user_id_to_row']
    col_to_product_id = models['col_to_product_id']

    if query_user_id not in user_id_to_row or top_k < 1 or neighbor_k < 1:
        return pd.DataFrame(columns=['product_id', 'score'])

    qrow = user_id_to_row[query_user_id]

    # One extra neighbour to account for the user themself, clamped to the
    # number of fitted users.
    n_neighbors = min(neighbor_k + 1, uim.shape[0])
    distances, indices = user_knn.kneighbors(uim[qrow], n_neighbors=n_neighbors)
    indices = indices.ravel()
    distances = distances.ravel()

    not_self = indices != qrow
    neighbor_indices = indices[not_self][:neighbor_k]
    similarities = 1.0 - distances[not_self][:neighbor_k]

    if neighbor_indices.size == 0:
        return pd.DataFrame(columns=['product_id', 'score'])

    # Sparse mat-vec product: (items x neighbours) @ (neighbours,) gives a
    # dense per-item score vector without densifying the neighbour block.
    neighbor_ratings = uim[neighbor_indices]
    weighted_scores = np.asarray(neighbor_ratings.T @ similarities, dtype=np.float64).ravel()

    # Rank only products the user has not rated yet, so an already-rated
    # product can never leak into the result when candidates are scarce.
    user_rated_mask = uim[qrow].toarray().ravel() > 0
    candidate_cols = np.flatnonzero(~user_rated_mask)
    if candidate_cols.size == 0:
        return pd.DataFrame(columns=['product_id', 'score'])

    candidate_scores = weighted_scores[candidate_cols]
    k = min(top_k, candidate_cols.size)
    if k < candidate_cols.size:
        # Partial selection of the k best, then an exact sort of just those.
        best = np.argpartition(-candidate_scores, kth=k - 1)[:k]
    else:
        best = np.arange(candidate_cols.size)
    best = best[np.argsort(-candidate_scores[best])]
    top_cols = candidate_cols[best]

    return pd.DataFrame({
        'product_id': [col_to_product_id[c] for c in top_cols],
        'score': weighted_scores[top_cols],
    })


with st.sidebar:
    st.subheader("Configuration")
    top_k = st.number_input("Results (top_k)", min_value=1, max_value=50, value=10, step=1)
    neighbor_k = st.number_input("Neighbors (user-based)", min_value=5, max_value=200, value=50, step=5)

# Top-level boundary: surface any failure in the UI instead of a raw traceback.
try:
    df = load_data()
    models = build_models(df)
    st.success(f"Loaded {len(df)} ratings | Users: {models['num_users']} | Items: {models['num_items']} | Density: {models['density']:.4f}%")

    tab1, tab2 = st.tabs(["Recommend by Product", "Recommend for User"])

    with tab1:
        st.subheader("Product → Similar Products")
        # Cap the dropdown so the widget stays responsive.
        product_ids = df['product_id'].drop_duplicates().head(5000).tolist()
        pid = st.selectbox("Select a product_id", product_ids)
        if st.button("Find similar products", key="by_product"):
            recs = recommend_similar_products(models, pid, top_k=int(top_k))
            st.dataframe(recs, width='stretch')

    with tab2:
        st.subheader("User → Recommended Products")
        user_ids = df['user_id'].drop_duplicates().head(5000).tolist()
        uid = st.selectbox("Select a user_id", user_ids)
        if st.button("Recommend for user", key="by_user"):
            recs = recommend_products_for_user(models, uid, top_k=int(top_k), neighbor_k=int(neighbor_k))
            st.dataframe(recs, width='stretch')
except Exception as e:
    st.error(str(e))