MARS: Multi-scale Adaptive Recurrence with State compression

2319f81 verified 22 days ago

13.5 kB

	"""
	Data pipeline for MARS sequential recommendation.

	Supports:
	1. Amazon Reviews 2023 (Books, Movies_and_TV, etc.) — filtered for power users
	2. MovieLens-1M
	3. Synthetic data for testing

	All data is converted to a unified format:
	- user_id: int
	- item_ids: List[int] (chronologically sorted)
	- timestamps: List[float] (in seconds)
	"""

	import os
	import random
	import numpy as np
	import torch
	from torch.utils.data import Dataset, DataLoader
	from typing import Dict, List, Tuple, Optional
	from collections import defaultdict
	import json


	def download_movielens_1m(data_dir: str = './data/ml-1m') -> str:
	"""Download and extract MovieLens-1M dataset."""
	import urllib.request
	import zipfile

	os.makedirs(data_dir, exist_ok=True)
	ratings_path = os.path.join(data_dir, 'ratings.dat')

	if not os.path.exists(ratings_path):
	url = 'https://files.grouplens.org/datasets/movielens/ml-1m.zip'
	zip_path = os.path.join(data_dir, 'ml-1m.zip')
	print(f"Downloading MovieLens-1M from {url}...")
	urllib.request.urlretrieve(url, zip_path)

	with zipfile.ZipFile(zip_path, 'r') as z:
	z.extractall(data_dir)

	# Move files up one level
	inner_dir = os.path.join(data_dir, 'ml-1m')
	if os.path.exists(inner_dir):
	for f in os.listdir(inner_dir):
	os.rename(os.path.join(inner_dir, f), os.path.join(data_dir, f))
	os.rmdir(inner_dir)

	os.remove(zip_path)

	return ratings_path


	def load_movielens_1m(data_dir: str = './data/ml-1m', min_interactions: int = 5):
	"""Load MovieLens-1M and return user sequences."""
	ratings_path = download_movielens_1m(data_dir)

	# Parse ratings.dat
	user_interactions = defaultdict(list)

	with open(ratings_path, 'r') as f:
	for line in f:
	parts = line.strip().split('::')
	user_id = int(parts[0])
	item_id = int(parts[1])
	rating = float(parts[2])
	timestamp = int(parts[3])

	# Keep all ratings (implicit feedback style)
	user_interactions[user_id].append((item_id, timestamp))

	# Sort by timestamp, filter short sequences
	sequences = {}
	for uid, interactions in user_interactions.items():
	interactions.sort(key=lambda x: x[1])
	if len(interactions) >= min_interactions:
	sequences[uid] = {
	'item_ids': [x[0] for x in interactions],
	'timestamps': [float(x[1]) for x in interactions]
	}

	return sequences


	def load_amazon_reviews(
	category: str = 'Movies_and_TV',
	min_interactions: int = 20,
	max_users: int = 50000,
	data_dir: str = './data/amazon'
	):
	"""
	Load Amazon Reviews 2023 dataset from HF Hub.
	Filters to users with min_interactions+ for long-sequence modeling.
	"""
	try:
	from datasets import load_dataset

	print(f"Loading Amazon Reviews 2023 - {category}...")
	# Try benchmark format first
	try:
	ds = load_dataset(
	"McAuley-Lab/Amazon-Reviews-2023",
	f"0core_rating_only_{category}",
	trust_remote_code=True,
	split="train"
	)
	except Exception:
	# Fallback to raw reviews
	ds = load_dataset(
	"McAuley-Lab/Amazon-Reviews-2023",
	f"raw_review_{category}",
	trust_remote_code=True,
	split="full"
	)

	# Build user sequences
	user_interactions = defaultdict(list)
	for row in ds:
	uid = row.get('user_id', row.get('reviewerID'))
	iid = row.get('parent_asin', row.get('asin'))
	ts = row.get('timestamp', row.get('unixReviewTime', 0))
	if uid and iid:
	user_interactions[uid].append((iid, float(ts) / 1000 if ts > 1e12 else float(ts)))

	# Filter and sort
	sequences = {}
	for uid, interactions in user_interactions.items():
	interactions.sort(key=lambda x: x[1])
	if len(interactions) >= min_interactions:
	sequences[uid] = {
	'item_ids': [x[0] for x in interactions],
	'timestamps': [x[1] for x in interactions]
	}

	# Limit users
	if len(sequences) > max_users:
	keys = random.sample(list(sequences.keys()), max_users)
	sequences = {k: sequences[k] for k in keys}

	return sequences

	except Exception as e:
	print(f"Failed to load Amazon Reviews: {e}")
	return {}


	def generate_synthetic_data(
	num_users: int = 5000,
	num_items: int = 10000,
	min_seq_len: int = 50,
	max_seq_len: int = 1000,
	seed: int = 42
	) -> Dict:
	"""
	Generate synthetic sequential interaction data for testing.
	Simulates realistic patterns:
	- Power law item popularity
	- Temporal patterns (daily/weekly)
	- User interest drift over time
	"""
	rng = np.random.RandomState(seed)

	# Power law item popularity
	item_popularity = rng.power(0.8, num_items)
	item_popularity /= item_popularity.sum()

	sequences = {}
	base_time = 1600000000 # ~Sep 2020

	for uid in range(num_users):
	seq_len = rng.randint(min_seq_len, max_seq_len + 1)

	# User has a few interest clusters
	num_clusters = rng.randint(2, 6)
	cluster_centers = rng.choice(num_items, num_clusters, replace=False)
	cluster_weights = rng.dirichlet(np.ones(num_clusters))

	items = []
	timestamps = []
	current_time = base_time + rng.randint(0, 86400 * 365) # Random start

	for t in range(seq_len):
	# Interest drift: cluster weights shift over time
	drift = rng.dirichlet(np.ones(num_clusters) * 5)
	current_weights = 0.8 * cluster_weights + 0.2 * drift

	# Select cluster, then item near cluster center
	cluster = rng.choice(num_clusters, p=current_weights / current_weights.sum())
	center = cluster_centers[cluster]

	# Items near cluster center (with some randomness)
	local_items = np.arange(
	max(0, center - 50),
	min(num_items, center + 50)
	)
	local_probs = item_popularity[local_items]
	local_probs /= local_probs.sum()

	item = local_items[rng.choice(len(local_items), p=local_probs)]
	items.append(int(item) + 1) # 1-indexed (0 = padding)

	# Time gap: exponential with daily/weekly patterns
	gap = rng.exponential(3600) # avg 1 hour
	# Add daily pattern
	hour = (current_time % 86400) / 3600
	if 2 < hour < 8: # Less activity at night
	gap *= 3

	current_time += gap
	timestamps.append(current_time)

	sequences[uid] = {
	'item_ids': items,
	'timestamps': timestamps
	}

	return sequences


	class ReindexedData:
	"""Reindex items to contiguous integers and provide train/val/test splits."""

	def __init__(
	self,
	sequences: Dict,
	max_seq_len: int = 512,
	val_ratio: float = 0.1,
	test_ratio: float = 0.1,
	):
	self.max_seq_len = max_seq_len

	# Collect all items and reindex
	all_items = set()
	for uid, data in sequences.items():
	all_items.update(data['item_ids'])

	self.item2idx = {item: idx + 1 for idx, item in enumerate(sorted(all_items))}
	self.idx2item = {idx: item for item, idx in self.item2idx.items()}
	self.num_items = len(self.item2idx)

	print(f"Total users: {len(sequences)}, Total items: {self.num_items}")

	# Reindex and split
	self.train_data = []
	self.val_data = []
	self.test_data = []

	seq_lens = []
	for uid, data in sequences.items():
	item_ids = [self.item2idx[i] for i in data['item_ids']]
	timestamps = data['timestamps']

	# Truncate to max_seq_len
	if len(item_ids) > max_seq_len:
	item_ids = item_ids[-max_seq_len:]
	timestamps = timestamps[-max_seq_len:]

	seq_lens.append(len(item_ids))

	if len(item_ids) < 3:
	continue

	# Leave-one-out split
	self.train_data.append({
	'user_id': uid,
	'item_ids': item_ids[:-2],
	'timestamps': timestamps[:-2],
	'next_item': item_ids[-2],
	})
	self.val_data.append({
	'user_id': uid,
	'item_ids': item_ids[:-1],
	'timestamps': timestamps[:-1],
	'next_item': item_ids[-1],
	})
	self.test_data.append({
	'user_id': uid,
	'item_ids': item_ids[:-1],
	'timestamps': timestamps[:-1],
	'next_item': item_ids[-1],
	})

	seq_lens = np.array(seq_lens)
	print(f"Sequence length stats: mean={seq_lens.mean():.1f}, "
	f"median={np.median(seq_lens):.1f}, "
	f"max={seq_lens.max()}, min={seq_lens.min()}")
	print(f"Users with 100+ interactions: {(seq_lens >= 100).sum()}")
	print(f"Users with 200+ interactions: {(seq_lens >= 200).sum()}")
	print(f"Train: {len(self.train_data)}, Val: {len(self.val_data)}, "
	f"Test: {len(self.test_data)}")


	class SeqRecDataset(Dataset):
	"""Sequential recommendation dataset with negative sampling."""

	def __init__(
	self,
	data: List[Dict],
	num_items: int,
	max_seq_len: int = 512,
	num_negatives: int = 1,
	is_training: bool = True,
	):
	self.data = data
	self.num_items = num_items
	self.max_seq_len = max_seq_len
	self.num_negatives = num_negatives
	self.is_training = is_training

	def __len__(self):
	return len(self.data)

	def __getitem__(self, idx):
	sample = self.data[idx]

	item_ids = sample['item_ids'][-self.max_seq_len:]
	timestamps = sample['timestamps'][-self.max_seq_len:]
	next_item = sample['next_item']

	# Padding
	seq_len = len(item_ids)
	pad_len = self.max_seq_len - seq_len

	# Right-padding (needed for causal attention to work correctly)
	padded_items = item_ids + [0] * pad_len
	padded_timestamps = timestamps + [0.0] * pad_len
	mask = [True] * seq_len + [False] * pad_len

	# Negative sampling
	item_set = set(item_ids)
	negatives = []
	for _ in range(self.num_negatives):
	neg = random.randint(1, self.num_items)
	while neg in item_set:
	neg = random.randint(1, self.num_items)
	negatives.append(neg)

	return {
	'item_ids': torch.tensor(padded_items, dtype=torch.long),
	'timestamps': torch.tensor(padded_timestamps, dtype=torch.float32),
	'mask': torch.tensor(mask, dtype=torch.bool),
	'positive_ids': torch.tensor(next_item, dtype=torch.long),
	'negative_ids': torch.tensor(negatives, dtype=torch.long),
	}


	def create_dataloaders(
	data: ReindexedData,
	max_seq_len: int = 512,
	batch_size: int = 128,
	num_negatives: int = 4,
	eval_negatives: int = 999,
	num_workers: int = 2,
	) -> Tuple[DataLoader, DataLoader, DataLoader]:
	"""Create train/val/test dataloaders.

	Uses 999 negatives for evaluation (standard SASRec protocol).
	"""

	train_dataset = SeqRecDataset(
	data.train_data, data.num_items, max_seq_len,
	num_negatives=num_negatives, is_training=True
	)
	val_dataset = SeqRecDataset(
	data.val_data, data.num_items, max_seq_len,
	num_negatives=eval_negatives, is_training=False
	)
	test_dataset = SeqRecDataset(
	data.test_data, data.num_items, max_seq_len,
	num_negatives=eval_negatives, is_training=False
	)

	train_loader = DataLoader(
	train_dataset, batch_size=batch_size, shuffle=True,
	num_workers=num_workers, pin_memory=True, drop_last=True,
	)
	val_loader = DataLoader(
	val_dataset, batch_size=batch_size, shuffle=False,
	num_workers=num_workers, pin_memory=True,
	)
	test_loader = DataLoader(
	test_dataset, batch_size=batch_size, shuffle=False,
	num_workers=num_workers, pin_memory=True,
	)

	return train_loader, val_loader, test_loader


	def save_data_config(data: ReindexedData, path: str):
	"""Save data configuration for model loading."""
	config = {
	'num_items': data.num_items,
	'num_train': len(data.train_data),
	'num_val': len(data.val_data),
	'num_test': len(data.test_data),
	}
	os.makedirs(os.path.dirname(path) if os.path.dirname(path) else '.', exist_ok=True)
	with open(path, 'w') as f:
	json.dump(config, f, indent=2)
	return config