DanielKiani's picture
Initial commit of recommender system project
38ae75d
import pandas as pd
import numpy as np
import torch
import datetime
def calculate_metrics(recommendations_dict, ground_truth_dict, k):
"""
Calculates Precision@k, Recall@k, and HitRate@k.
args:
----------
recommendations_dict : {user_id: [recommended_item_ids]}
ground_truth_dict : {user_id: set of ground truth item_ids}
k : int
Returns
-------
dict with mean precision, recall, and hit rate
"""
all_precisions, all_recalls, all_hits = [], [], []
for user_id, true_items in ground_truth_dict.items():
recs = recommendations_dict.get(user_id, [])[:k]
if not true_items:
continue
hits = len(set(recs) & true_items)
precision = hits / k if k > 0 else 0
recall = hits / len(true_items)
hit_rate = 1.0 if hits > 0 else 0.0
all_precisions.append(precision)
all_recalls.append(recall)
all_hits.append(hit_rate)
if not all_precisions:
return {"mean_precision@k": 0, "mean_recall@k": 0, "mean_hitrate@k": 0}
return {
"mean_precision@k": np.mean(all_precisions),
"mean_recall@k": np.mean(all_recalls),
"mean_hitrate@k": np.mean(all_hits)
}
def prepare_ground_truth(df, mode="purchase", event_weights=None):
"""
Prepares ground truth dictionaries for evaluation.
Parameters
----------
df : pd.DataFrame
Test dataframe containing at least ['visitorid', 'itemid', 'event'].
mode : str, default="purchase"
- "purchase" : Only use transactions as ground truth.
- "all" : Use all events. Optionally weight them.
event_weights : dict, optional
Example: {"view": 1, "addtocart": 3, "transaction": 5}.
Used only if mode == "all".
Returns
-------
dict : {user_id: set of item_ids}
"""
if mode == "purchase":
df_filtered = df[df["event"] == "transaction"]
ground_truth = df_filtered.groupby("visitorid")["itemid"].apply(set).to_dict()
elif mode == "all":
if event_weights is None:
# Default: treat all events equally
ground_truth = df.groupby("visitorid")["itemid"].apply(set).to_dict()
else:
# Weighted ground truth (for more advanced eval)
ground_truth = {}
for uid, user_df in df.groupby("visitorid"):
weighted_items = []
for _, row in user_df.iterrows():
weight = event_weights.get(row["event"], 1)
weighted_items.extend([row["itemid"]] * weight)
ground_truth[uid] = set(weighted_items)
else:
raise ValueError("mode must be 'purchase' or 'all'")
return ground_truth
def load_item_properties(data_folder='data/'):
"""
Loads item properties and creates a mapping from item ID to its category ID.
Handles both a single properties file or two split parts.
Args:
data_folder (str): The path to the folder containing item property files.
Returns:
dict: A dictionary mapping {itemid: categoryid}.
"""
print("Loading item properties...")
try:
# First, try to load the two separate parts and combine them.
props_df_part1 = pd.read_csv(data_folder + 'item_properties_part1.csv')
props_df_part2 = pd.read_csv(data_folder + 'item_properties_part2.csv')
props_df = pd.concat([props_df_part1, props_df_part2], ignore_index=True)
print("Successfully loaded and combined item_properties_part1.csv and item_properties_part2.csv.")
except FileNotFoundError:
try:
# If the parts are not found, try to load a single combined file.
props_df = pd.read_csv(data_folder + 'item_properties.csv')
print("Successfully loaded a single item_properties.csv.")
except FileNotFoundError:
print(f"Warning: No item properties files found. Cannot display category information.")
return {}
category_df = props_df[props_df['property'] == 'categoryid'].copy()
category_df['value'] = pd.to_numeric(category_df['value'], errors='coerce').astype('Int64')
item_to_category_map = category_df.set_index('itemid')['value'].to_dict()
print("Item to category mapping created successfully.")
return item_to_category_map
def load_category_tree(data_folder='data/'):
"""
Loads the category tree to map categories to their parent categories.
Args:
data_folder (str): The path to the folder containing category_tree.csv.
Returns:
dict: A dictionary mapping {categoryid: parentid}.
"""
print("Loading category tree...")
try:
tree_df = pd.read_csv(data_folder + 'category_tree.csv')
category_parent_map = tree_df.set_index('categoryid')['parentid'].to_dict()
print("Category tree loaded successfully.")
return category_parent_map
except FileNotFoundError:
print("Warning: 'category_tree.csv' not found. Cannot display parent category information.")
return {}
def get_popular_items(train_df, k=10):
"""
Calculates the top-k most popular items based on transaction count.
"""
purchase_counts = train_df[train_df['event'] == 'transaction']['itemid'].value_counts()
return purchase_counts.head(k).index.tolist()
def show_user_recommendations(visitor_id, model, datamodule, popular_items, item_category_map, category_parent_map, k=10):
"""
Displays recommendations for a user, including category and parent category information.
"""
print(f"\n--- Recommendations for Visitor ID: {visitor_id} ---")
model.eval()
def format_item_with_category(item_id):
category_id = item_category_map.get(item_id, 'N/A')
parent_id = category_parent_map.get(category_id, 'N/A') if category_id != 'N/A' else 'N/A'
return f"Item: {item_id} (Category: {category_id}, Parent: {parent_id})"
user_history_ids = datamodule.user_history.get(visitor_id)
if user_history_ids is None:
print(f"User {visitor_id} not found in training history. Providing popularity-based recommendations.")
print(f"\nTop {k} Popular Items (Fallback):")
recs_with_cats = [format_item_with_category(item_id) for item_id in popular_items]
print(recs_with_cats)
print("-------------------------------------------------")
return
history_with_cats = [format_item_with_category(item_id) for item_id in user_history_ids]
print(f"User's Historical Interactions:")
print(history_with_cats)
history_indices = [datamodule.item_map[i] for i in user_history_ids if i in datamodule.item_map]
if not history_indices:
print("None of the user's historical items are in the model's vocabulary.")
return
max_len = datamodule.max_len
input_seq = history_indices[-max_len:]
padded_input = np.zeros(max_len, dtype=np.int64)
padded_input[-len(input_seq):] = input_seq
input_tensor = torch.LongTensor(np.array([padded_input]))
input_tensor = input_tensor.to(model.device)
with torch.no_grad():
logits = model(input_tensor)
last_item_logits = logits[0, -1, :]
top_indices = torch.topk(last_item_logits, k).indices.tolist()
recommended_item_ids = [datamodule.inverse_item_map[idx] for idx in top_indices if idx in datamodule.inverse_item_map]
print(f"\nTop {k} Recommended Items:")
recs_with_cats = [format_item_with_category(item_id) for item_id in recommended_item_ids]
print(recs_with_cats)
print("-------------------------------------------------")