File size: 3,651 Bytes
7964128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import json
import torch
from sentence_transformers import SentenceTransformer
import random
import faiss

# Synthetic-data generation parameters.
NUM_USERS = 10000                 # number of synthetic users to generate
MIN_SEQUENCE_LENGTH = 5           # users with fewer interactions are dropped
MAX_SEQUENCE_LENGTH = 50          # sequences are truncated to this length
DATA_DIR = Path("data")
CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
OUTPUT_DIR = DATA_DIR / "synthetic"
MODEL_NAME = "all-MiniLM-L6-v2"   # sentence-transformers teacher model for book embeddings

def _load_or_encode_embeddings(content_to_encode):
    """Return book embeddings as a float numpy array, caching them on disk.

    On the first run the texts are encoded with the sentence-transformers
    teacher model and saved to ``data/embeddings_cache.npy``; later runs
    load that file instead of re-encoding.

    NOTE(review): the cache is keyed only by file path — if the catalog or
    MODEL_NAME changes, the stale cache must be deleted manually.
    """
    cache_path = DATA_DIR / "embeddings_cache.npy"

    if cache_path.exists():
        print(f"Loading cached embeddings from {cache_path}...")
        emb_np = np.load(cache_path)
        print("Embeddings loaded.")
        return emb_np

    print(f"Loading Teacher Model ({MODEL_NAME})...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(MODEL_NAME, device=device)

    print("Encoding books (Title + Author + Genre + Desc)...")
    embeddings = model.encode(content_to_encode, show_progress_bar=True, convert_to_tensor=True)
    emb_np = embeddings.cpu().numpy()

    print(f"Saving embeddings to {cache_path}...")
    np.save(cache_path, emb_np)
    return emb_np


def _build_index(emb_np):
    """Build a cosine-similarity FAISS index over the embeddings.

    Uses inner product over L2-normalized vectors, which is equivalent to
    cosine similarity. WARNING: ``faiss.normalize_L2`` normalizes
    ``emb_np`` in place; callers querying the index must use the same
    (normalized) array.
    """
    index = faiss.IndexFlatIP(emb_np.shape[1])
    faiss.normalize_L2(emb_np)
    index.add(emb_np)
    return index


def _generate_users(index, emb_np, titles):
    """Synthesize user reading sequences by sampling around random anchors.

    Each user gets 1-3 "interest" anchors; for each anchor, 5-15 books are
    drawn without replacement from its 50 nearest neighbors (which include
    the anchor itself). Sequences are capped at MAX_SEQUENCE_LENGTH and
    users shorter than MIN_SEQUENCE_LENGTH are discarded. Returns a list
    of user record dicts.
    """
    users = []
    for user_id in tqdm(range(NUM_USERS)):
        sequence = []

        # Weighted so that single-interest users are the most common case.
        num_interests = random.choice([1, 1, 2, 3])

        for _ in range(num_interests):
            anchor_idx = random.randint(0, len(titles) - 1)

            query = emb_np[anchor_idx].reshape(1, -1)
            _, neighbor_ids = index.search(query, 50)
            neighbors = neighbor_ids[0]

            num_to_read = random.randint(5, 15)
            picked = np.random.choice(neighbors, size=min(len(neighbors), num_to_read), replace=False)
            sequence.extend(titles[idx] for idx in picked)

        sequence = sequence[:MAX_SEQUENCE_LENGTH]
        if len(sequence) >= MIN_SEQUENCE_LENGTH:
            users.append({
                'user_id': user_id,
                'book_sequence': sequence,
                'sequence_length': len(sequence),
                'persona': 'semantic_explorer',
                'metadata': {'generated': True}
            })
    return users


def main():
    """Generate synthetic user journeys from the book catalog.

    Pipeline: load catalog -> embed book content (disk-cached) -> build a
    FAISS cosine index -> synthesize NUM_USERS sequences -> write a
    parquet of sequences and a JSON stats file under OUTPUT_DIR.
    """
    print("Loading catalog...")
    df = pd.read_csv(CATALOG_PATH)

    # Concatenate the fields that define a book's semantic identity;
    # descriptions are truncated to 300 chars to keep encoding cheap.
    df['rich_content'] = (
        "Title: " + df['title'].fillna("") +
        "; Author: " + df['authors'].fillna("Unknown") +
        "; Genres: " + df['genres'].fillna("") +
        "; Description: " + df['description'].fillna("").astype(str).str.slice(0, 300)
    )

    titles = df['title'].tolist()
    emb_np = _load_or_encode_embeddings(df['rich_content'].tolist())

    print(f"Generating {NUM_USERS} semantic user journeys...")
    index = _build_index(emb_np)
    users = _generate_users(index, emb_np, titles)

    users_df = pd.DataFrame(users)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    output_path = OUTPUT_DIR / "user_sequences.parquet"
    users_df.to_parquet(output_path, index=False)

    stats = {
        'num_users': len(users_df),
        'avg_sequence_length': float(users_df['sequence_length'].mean()),
        'generated_via': "semantic_clustering"
    }

    with open(OUTPUT_DIR / "user_metadata.json", 'w') as f:
        json.dump(stats, f, indent=2)

    print(f"\n Generated {len(users_df)} semantic users")
    print(f"   Output: {output_path}")

# Script entry point: run the full generation pipeline when executed directly.
if __name__ == "__main__":
    main()