"""Build per-user next-item records with popularity-based negative candidates.

Reads whitespace-separated sequences (one per line) from ``INPUT_FILE``,
keeps those with at least two tokens, and writes a JSON mapping to
``OUTPUT_FILE``.  Each entry is keyed by the zero-based index of the kept
sequence (as a string) and holds:

* ``history``    - every token except the last,
* ``next_item``  - the last token,
* ``candidates`` - ``next_item`` followed by the most popular items that do
  not occur anywhere in that sequence.
"""

import json
import os
from collections import Counter
from itertools import islice

try:
    from tqdm import tqdm
except ImportError:  # The progress bar is cosmetic; keep working without it.
    def tqdm(iterable, **_kwargs):
        """Return *iterable* unchanged when tqdm is not installed."""
        return iterable

# Configuration
DATASET = "beauty"
INPUT_FILE = f"data/processed/{DATASET}/cove/sequential_data.txt"
OUTPUT_FILE = f"data/processed/{DATASET}/user_seq.json"
NUM_POPULAR_ITEMS = 500  # size of the popularity pool negatives are drawn from
NUM_NEGATIVES = 100  # maximum number of negatives per user


def _read_sequences(path):
    """Read sequences from *path* and count token occurrences.

    Lines with fewer than two whitespace-separated tokens are skipped and do
    not contribute to the counts.

    Returns:
        A ``(sequences, counts)`` tuple: the kept token lists in file order
        and a ``Counter`` of token occurrences across them.
    """
    # NOTE(review): every token on a line is treated as an item.  If the first
    # token of each line is actually a user id, it would be counted as an item
    # and appear in ``history`` - confirm against the input format.
    sequences = []
    counts = Counter()
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            items = line.split()
            if len(items) >= 2:
                sequences.append(items)
                counts.update(items)
    return sequences, counts


def _build_user_seq(user_sequences, popular_items, num_negatives=NUM_NEGATIVES):
    """Build the ``{user_id: record}`` mapping for *user_sequences*.

    Negatives are NOT randomly sampled: they are the first *num_negatives*
    entries of *popular_items* (in the given order) that do not occur in the
    user's sequence, so fewer may be returned when the pool is exhausted.
    """
    user_seq_dict = {}
    for user_id, seq in enumerate(tqdm(user_sequences, desc="Building user_seq")):
        next_item = seq[-1]
        # A set makes each membership test O(1) instead of scanning the list.
        seen = set(seq)
        unseen_popular = (item for item in popular_items if item not in seen)
        # islice stops filtering as soon as enough negatives were collected.
        negatives = list(islice(unseen_popular, num_negatives))
        user_seq_dict[str(user_id)] = {
            "history": seq[:-1],
            "next_item": next_item,
            # The ground-truth item is always the first candidate.
            "candidates": [next_item] + negatives,
        }
    return user_seq_dict


def main(
    input_file=INPUT_FILE,
    output_file=OUTPUT_FILE,
    num_popular=NUM_POPULAR_ITEMS,
    num_negatives=NUM_NEGATIVES,
):
    """Run the full pipeline and return the mapping written to *output_file*.

    Args:
        input_file: Path of the text file with one sequence per line.
        output_file: Path of the JSON file to create (parent directories are
            created when missing).
        num_popular: Number of most frequent items kept as the negative pool.
        num_negatives: Maximum number of negatives per user.

    Returns:
        The dictionary that was serialized to *output_file*.
    """
    # Ensure directory exists (dirname is empty for a bare file name).
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Step 1: Read all sequences
    user_sequences, item_counts = _read_sequences(input_file)

    # Step 2: Get the most popular items, most frequent first
    popular_items = [item for item, _ in item_counts.most_common(num_popular)]

    # Step 3: Build user sequence dict
    user_seq_dict = _build_user_seq(user_sequences, popular_items, num_negatives)

    # Step 4: Save to JSON
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(user_seq_dict, f, indent=2)

    print(f"[INFO] Created {output_file} with {len(user_seq_dict)} users.")
    return user_seq_dict


if __name__ == "__main__":
    main()