File size: 4,073 Bytes
8d8bf0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# src/utils/io_utils.py

import json
import pickle
import numpy as np
from pathlib import Path
from typing import List
import faiss
import os

def save_faiss_index(index, dataset, mode, item_ids):
    """Persist a FAISS index and its item ID list under index/{dataset}/{mode}/."""
    out_dir = Path("index") / dataset / mode
    out_dir.mkdir(parents=True, exist_ok=True)

    # Serialize the FAISS index itself.
    faiss.write_index(index, str(out_dir / "index.faiss"))

    # Persist the parallel item-id lookup next to the index.
    (out_dir / "item_ids.json").write_text(json.dumps(item_ids))
        
def save_default_config(weights_dict, dataset, mode, config_name="default_cove.json"):
    """
    Save the default weights for CoVE FAISS combinations to defaults.json.
    """
    target = Path("defaults") / dataset / config_name
    target.parent.mkdir(parents=True, exist_ok=True)

    # Load any existing config first so weights for other modes survive.
    config = json.loads(target.read_text()) if target.exists() else {}

    config[mode] = weights_dict

    target.write_text(json.dumps(config, indent=4))

    print(f"✓ Saved default weights to {target}")
    
def load_sequences(dataset: str) -> List[List[str]]:
    """
    Load interaction sequences for *dataset*.

    Expected path: data/processed/{dataset}/sequences.json

    Raises:
        FileNotFoundError: if the JSON file does not exist.
    """
    # Plain string, not an f-string: the original f"data/processed" had no
    # placeholders to interpolate.
    path = Path("data/processed") / dataset / "sequences.json"
    if not path.exists():
        raise FileNotFoundError(f"[✗] sequences.json not found at {path}")
    with path.open("r") as f:
        return json.load(f)


def load_item_ids(dataset: str) -> List[str]:
    """
    Loads item ID list from a dataset.
    Expected path: data/processed/{dataset}/item_ids.json
    """
    ids_path = Path("data/processed") / dataset / "item_ids.json"
    if not ids_path.exists():
        raise FileNotFoundError(f"[✗] item_ids.json not found at {ids_path}")
    return json.loads(ids_path.read_text())
    
def load_embeddings(dataset: str, suffix: str) -> np.ndarray:
    """
    Load embeddings as float32, trying several on-disk layouts in order:

    1. data/processed/{dataset}/{suffix}.npy
    2. data/processed/{dataset}/{suffix}.parquet
    3. Legacy: data/processed/{dataset}/embeddings_{suffix}.npy

    Raises:
        FileNotFoundError: if none of the candidate files exist.
    """
    base_path = f"data/processed/{dataset}/"

    # Option 1: Newer convention - {suffix}.npy
    npy_path = os.path.join(base_path, f"{suffix}.npy")
    if os.path.exists(npy_path):
        return np.load(npy_path).astype(np.float32)

    # Option 2: Newer convention - {suffix}.parquet
    parquet_path = os.path.join(base_path, f"{suffix}.parquet")
    if os.path.exists(parquet_path):
        # Imported lazily so the .npy code paths work without pandas installed.
        # (Replaces the duplicated module-level `import pandas as pd`.)
        import pandas as pd

        df = pd.read_parquet(parquet_path)

        # Unpack the 'vector' column (which is a list/array in each row).
        if "vector" in df.columns:
            embeds = np.stack(df["vector"].values)
        else:
            # Fallback: treat every numeric column as one embedding dimension.
            df_numeric = df.select_dtypes(include=["number"])
            embeds = df_numeric.values
        print(f"[✓] Loaded {embeds.shape[0]} embeddings of dim {embeds.shape[1]} from {parquet_path}")
        return embeds.astype(np.float32)

    # Option 3: Legacy naming - embeddings_{suffix}.npy
    legacy_path = os.path.join(base_path, f"embeddings_{suffix}.npy")
    if os.path.exists(legacy_path):
        return np.load(legacy_path).astype(np.float32)

    raise FileNotFoundError(f"[✗] Embedding file not found: {npy_path}, {parquet_path}, or {legacy_path}")

def write_defaults_json(dataset: str, mode: str, embed_sources: List[str]):
    """
    Updates defaults/default_cove.json with mode and embedding sources.
    """
    cfg_path = Path("defaults/default_cove.json")
    cfg_path.parent.mkdir(parents=True, exist_ok=True)

    # Start from the existing defaults so entries for other datasets survive.
    defaults = json.loads(cfg_path.read_text()) if cfg_path.exists() else {}

    defaults[dataset] = {"mode": mode, "embed_sources": embed_sources}

    cfg_path.write_text(json.dumps(defaults, indent=2))