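# NOTE: "Spaces: Sleeping" below appears to be page-status text captured along
# with this file when it was copied from a web viewer; it is not part of the script.
# Spaces: Sleeping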
import json
import os
from collections import Counter
from typing import Dict, Iterable, List, Tuple

from tqdm import tqdm
# Configuration
DATASET = "beauty"
INPUT_FILE = f"data/processed/{DATASET}/cove/sequential_data.txt"
OUTPUT_FILE = f"data/processed/{DATASET}/user_seq.json"
NUM_POPULAR_ITEMS = 500  # size of the popularity pool negatives are taken from
NUM_NEGATIVES = 100  # upper bound on negative candidates per user


def read_sequences(path: str) -> Tuple[List[List[str]], Counter]:
    """Read whitespace-separated sequences from *path*, one per line.

    Lines with fewer than two tokens are skipped and do not contribute to the
    counts.

    NOTE(review): every token on a line is treated as an item, including the
    first one. If the input format stores a user id as the first token, that id
    ends up in the history and in the popularity counts -- confirm against the
    producer of the input file.

    Returns:
        A pair ``(user_sequences, item_counts)``: the kept token lists in file
        order, and a Counter of how often each token occurs across them.
    """
    user_sequences: List[List[str]] = []
    item_counts: Counter = Counter()
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            items = line.split()
            if len(items) >= 2:
                user_sequences.append(items)
                item_counts.update(items)
    return user_sequences, item_counts


def build_user_seq_dict(
    user_sequences: Iterable[List[str]],
    popular_items: List[str],
    num_negatives: int = NUM_NEGATIVES,
) -> Dict[str, Dict[str, object]]:
    """Build the per-user history / next-item / candidate mapping.

    For each sequence the last element is the ground-truth next item and the
    rest is the history. Negatives are NOT randomly sampled: they are the first
    *num_negatives* entries of *popular_items* (in the given order) that do not
    occur anywhere in the user's sequence. When the pool holds fewer unseen
    items than that, the candidate list is correspondingly shorter.

    Args:
        user_sequences: Sequences with at least one element each; users are
            numbered by their position in this iterable.
        popular_items: Candidate pool, most preferred first.
        num_negatives: Maximum number of negatives per user.

    Returns:
        A dict keyed by the stringified user index. Each value has the keys
        ``"history"``, ``"next_item"`` and ``"candidates"``; the ground-truth
        item is always the first candidate.
    """
    user_seq_dict: Dict[str, Dict[str, object]] = {}
    for user_id, seq in enumerate(user_sequences):
        history = seq[:-1]
        next_item = seq[-1]
        # Set membership keeps the filter linear in the pool size instead of
        # rescanning the whole sequence for every pool item.
        seen = set(seq)
        negatives = [item for item in popular_items if item not in seen]
        negatives = negatives[:num_negatives]
        user_seq_dict[str(user_id)] = {
            "history": history,
            "next_item": next_item,
            "candidates": [next_item] + negatives,
        }
    return user_seq_dict


def save_user_seq(user_seq_dict: Dict[str, Dict[str, object]], path: str) -> None:
    """Write *user_seq_dict* to *path* as indented JSON, creating parent dirs."""
    output_dir = os.path.dirname(path)
    # dirname is "" for a bare filename, and os.makedirs("") would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(user_seq_dict, f, indent=2)


def main() -> None:
    """Convert INPUT_FILE into the candidate-annotated OUTPUT_FILE."""
    # Step 1: Read all sequences
    user_sequences, item_counts = read_sequences(INPUT_FILE)

    # Step 2: Get the most popular items
    popular_items = [item for item, _ in item_counts.most_common(NUM_POPULAR_ITEMS)]

    # Step 3: Build user sequence dict (tqdm only adds a progress bar)
    user_seq_dict = build_user_seq_dict(
        tqdm(user_sequences, desc="Building user_seq"),
        popular_items,
    )

    # Step 4: Save to JSON
    save_user_seq(user_seq_dict, OUTPUT_FILE)
    print(f"[INFO] Created {OUTPUT_FILE} with {len(user_seq_dict)} users.")


if __name__ == "__main__":
    main()