File size: 1,422 Bytes
8d8bf0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import json
import os
from collections import Counter
from tqdm import tqdm

# Configuration
DATASET = "beauty"
INPUT_FILE = f"data/processed/{DATASET}/cove/sequential_data.txt"
OUTPUT_FILE = f"data/processed/{DATASET}/user_seq.json"

# Ensure the output directory exists before any work is done, so a missing
# path surfaces immediately rather than after the whole dataset is processed.
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

# Step 1: Read all sequences
# Each input line is split on whitespace into tokens. Lines with fewer than
# two tokens are dropped, because a sequence needs at least one history
# entry plus a final "next" entry. item_counts tallies token occurrences
# over the kept sequences only.
# NOTE(review): every token on a line is treated as an item. If the first
# token is actually a user ID, it is currently stored in the history and
# counted as an item -- confirm against the input file format.
user_sequences = []
item_counts = Counter()

# Explicit encoding: the default for open() depends on the platform locale.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        # split() with no arguments already ignores leading/trailing
        # whitespace (including the newline), so no strip() is needed.
        items = line.split()
        if len(items) >= 2:
            user_sequences.append(items)
            item_counts.update(items)

# Step 2: Get top 500 most popular items (most frequent first)
popular_items = [item for item, _ in item_counts.most_common(500)]

# Step 3: Build user sequence dict
# Keys are the stringified 0-based position of each kept sequence in
# user_sequences. Each value holds the history (all but the last token),
# the held-out next item (the last token), and a candidate list.
user_seq_dict = {}
for user_id, seq in enumerate(tqdm(user_sequences, desc="Building user_seq")):
    history = seq[:-1]
    next_item = seq[-1]

    # Build the membership set once per user so each of the (up to 500)
    # popularity checks is O(1) instead of a linear scan over the sequence.
    seen = set(seq)

    # Negatives are chosen deterministically (no random sampling): the first
    # 100 popular items, in popularity order, that appear neither in the
    # history nor as the next item. Fewer than 100 are returned when the
    # popular pool is exhausted.
    negatives = [item for item in popular_items if item not in seen][:100]

    # Include ground-truth as the first item in candidates
    candidates = [next_item] + negatives

    user_seq_dict[str(user_id)] = {
        "history": history,
        "next_item": next_item,
        "candidates": candidates,
    }

# Step 4: Save to JSON
# The whole mapping is serialized with two-space indentation and written to
# OUTPUT_FILE, replacing any previous contents.
with open(OUTPUT_FILE, "w") as out_fh:
    out_fh.write(json.dumps(user_seq_dict, indent=2))

# Report the output location and how many user entries were written.
print(f"[INFO] Created {OUTPUT_FILE} with {len(user_seq_dict)} users.")