File size: 3,651 Bytes
7964128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import json
import torch
from sentence_transformers import SentenceTransformer
import random
import faiss

# Synthetic-data generation parameters.
NUM_USERS = 10000                 # number of synthetic users to generate
MIN_SEQUENCE_LENGTH = 5           # users with fewer interactions are dropped
MAX_SEQUENCE_LENGTH = 50          # sequences are truncated to this length
DATA_DIR = Path("data")
CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
OUTPUT_DIR = DATA_DIR / "synthetic"
MODEL_NAME = "all-MiniLM-L6-v2"   # sentence-transformers teacher model for book embeddings

def _load_or_encode_embeddings(content_to_encode):
    """Return book embeddings as a float numpy array, caching them on disk.

    On the first run the texts are encoded with the sentence-transformers
    teacher model and saved to ``data/embeddings_cache.npy``; later runs
    load that file instead of re-encoding.

    NOTE(review): the cache is keyed only by file path — if the catalog or
    MODEL_NAME changes, the stale cache must be deleted manually.
    """
    cache_path = DATA_DIR / "embeddings_cache.npy"

    if cache_path.exists():
        print(f"Loading cached embeddings from {cache_path}...")
        emb_np = np.load(cache_path)
        print("Embeddings loaded.")
        return emb_np

    print(f"Loading Teacher Model ({MODEL_NAME})...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(MODEL_NAME, device=device)

    print("Encoding books (Title + Author + Genre + Desc)...")
    embeddings = model.encode(content_to_encode, show_progress_bar=True, convert_to_tensor=True)
    emb_np = embeddings.cpu().numpy()

    print(f"Saving embeddings to {cache_path}...")
    np.save(cache_path, emb_np)
    return emb_np


def _build_index(emb_np):
    """Build a cosine-similarity FAISS index over the embeddings.

    Uses inner product over L2-normalized vectors, which is equivalent to
    cosine similarity. WARNING: ``faiss.normalize_L2`` normalizes
    ``emb_np`` in place; callers querying the index must use the same
    (normalized) array.
    """
    index = faiss.IndexFlatIP(emb_np.shape[1])
    faiss.normalize_L2(emb_np)
    index.add(emb_np)
    return index


def _generate_users(index, emb_np, titles):
    """Synthesize user reading sequences by sampling around random anchors.

    Each user gets 1-3 "interest" anchors; for each anchor, 5-15 books are
    drawn without replacement from its 50 nearest neighbors (which include
    the anchor itself). Sequences are capped at MAX_SEQUENCE_LENGTH and
    users shorter than MIN_SEQUENCE_LENGTH are discarded. Returns a list
    of user record dicts.
    """
    users = []
    for user_id in tqdm(range(NUM_USERS)):
        sequence = []

        # Weighted so that single-interest users are the most common case.
        num_interests = random.choice([1, 1, 2, 3])

        for _ in range(num_interests):
            anchor_idx = random.randint(0, len(titles) - 1)

            query = emb_np[anchor_idx].reshape(1, -1)
            _, neighbor_ids = index.search(query, 50)
            neighbors = neighbor_ids[0]

            num_to_read = random.randint(5, 15)
            picked = np.random.choice(neighbors, size=min(len(neighbors), num_to_read), replace=False)
            sequence.extend(titles[idx] for idx in picked)

        sequence = sequence[:MAX_SEQUENCE_LENGTH]
        if len(sequence) >= MIN_SEQUENCE_LENGTH:
            users.append({
                'user_id': user_id,
                'book_sequence': sequence,
                'sequence_length': len(sequence),
                'persona': 'semantic_explorer',
                'metadata': {'generated': True}
            })
    return users


def main():
    """Generate synthetic user journeys from the book catalog.

    Pipeline: load catalog -> embed book content (disk-cached) -> build a
    FAISS cosine index -> synthesize NUM_USERS sequences -> write a
    parquet of sequences and a JSON stats file under OUTPUT_DIR.
    """
    print("Loading catalog...")
    df = pd.read_csv(CATALOG_PATH)

    # Concatenate the fields that define a book's semantic identity;
    # descriptions are truncated to 300 chars to keep encoding cheap.
    df['rich_content'] = (
        "Title: " + df['title'].fillna("") +
        "; Author: " + df['authors'].fillna("Unknown") +
        "; Genres: " + df['genres'].fillna("") +
        "; Description: " + df['description'].fillna("").astype(str).str.slice(0, 300)
    )

    titles = df['title'].tolist()
    emb_np = _load_or_encode_embeddings(df['rich_content'].tolist())

    print(f"Generating {NUM_USERS} semantic user journeys...")
    index = _build_index(emb_np)
    users = _generate_users(index, emb_np, titles)

    users_df = pd.DataFrame(users)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    output_path = OUTPUT_DIR / "user_sequences.parquet"
    users_df.to_parquet(output_path, index=False)

    stats = {
        'num_users': len(users_df),
        'avg_sequence_length': float(users_df['sequence_length'].mean()),
        'generated_via': "semantic_clustering"
    }

    with open(OUTPUT_DIR / "user_metadata.json", 'w') as f:
        json.dump(stats, f, indent=2)

    print(f"\n Generated {len(users_df)} semantic users")
    print(f"   Output: {output_path}")

# Script entry point: run the full generation pipeline when executed directly.
if __name__ == "__main__":
    main()