# Page-scrape residue (not part of the script), kept for provenance:
#   Spaces: Sleeping
#   File size: 1,422 Bytes
#   8d8bf0e
import json
import os
from collections import Counter
from tqdm import tqdm
# Configuration
DATASET = "beauty"
INPUT_FILE = f"data/processed/{DATASET}/cove/sequential_data.txt"
OUTPUT_FILE = f"data/processed/{DATASET}/user_seq.json"
NUM_POPULAR_ITEMS = 500  # size of the popularity pool negatives are taken from
NUM_NEGATIVES = 100  # maximum number of negatives kept per user


def read_sequences(path):
    """Read interaction sequences from *path* and count token occurrences.

    Each line is split on whitespace. Lines with fewer than two tokens are
    skipped, because a sequence needs at least one history entry plus a
    next item.

    Args:
        path: Text file with one whitespace-separated sequence per line.

    Returns:
        A ``(user_sequences, item_counts)`` tuple: the kept sequences as
        lists of string tokens in file order, and a ``Counter`` of how often
        each token occurs across the kept sequences.
    """
    user_sequences = []
    item_counts = Counter()
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # NOTE(review): every token on the line is treated as an item ID.
            # If the first column of the input is actually a user ID, it ends
            # up in the history and in the popularity counts -- confirm the
            # input format.
            items = line.split()
            if len(items) >= 2:
                user_sequences.append(items)
                # NOTE(review): assumes skipped (too short) lines are not
                # counted; the indentation of the original was lost -- confirm.
                item_counts.update(items)
    return user_sequences, item_counts


def build_user_seq(user_sequences, popular_items, num_negatives=NUM_NEGATIVES):
    """Build the per-user record of history, target item and candidates.

    Args:
        user_sequences: Iterable of non-empty token lists. The last token of
            each list is the ground-truth next item, the rest is the history.
        popular_items: Items ordered from most to least popular; negatives
            are taken from the front of this list.
        num_negatives: Maximum number of negatives per user.

    Returns:
        A dict keyed by the stringified 0-based position of each sequence,
        whose values hold ``"history"``, ``"next_item"`` and ``"candidates"``.
        ``"candidates"`` starts with the ground-truth item, followed by the
        most popular items that do not occur anywhere in that user's
        sequence. It may hold fewer than ``num_negatives`` negatives when the
        popularity pool is too small.
    """
    user_seq_dict = {}
    for user_id, seq in enumerate(user_sequences):
        history = seq[:-1]
        next_item = seq[-1]
        # A set makes each membership test O(1) instead of rescanning the
        # user's whole sequence for every popular item.
        seen = set(seq)
        # Deterministic selection (not random sampling): the first
        # `num_negatives` popular items the user has not interacted with.
        negatives = [item for item in popular_items if item not in seen]
        negatives = negatives[:num_negatives]
        user_seq_dict[str(user_id)] = {
            "history": history,
            "next_item": next_item,
            # The ground-truth item is always the first candidate.
            "candidates": [next_item] + negatives,
        }
    return user_seq_dict


def main():
    """Convert INPUT_FILE into the OUTPUT_FILE JSON of per-user records."""
    # Ensure the output directory exists before doing any work.
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

    # Step 1: read all sequences and count item occurrences.
    user_sequences, item_counts = read_sequences(INPUT_FILE)

    # Step 2: popularity pool, most frequent first.
    popular_items = [
        item for item, _ in item_counts.most_common(NUM_POPULAR_ITEMS)
    ]

    # Step 3: build the user sequence dict (tqdm only adds a progress bar).
    user_seq_dict = build_user_seq(
        tqdm(user_sequences, desc="Building user_seq"), popular_items
    )

    # Step 4: save to JSON.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(user_seq_dict, f, indent=2)
    print(f"[INFO] Created {OUTPUT_FILE} with {len(user_seq_dict)} users.")


if __name__ == "__main__":
    main()