# src/utils/io_utils.py
"""I/O helpers: FAISS index persistence, embedding/sequence loading, and
default-config bookkeeping for CoVE experiments."""
import json
import os
import pickle
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd

# faiss is only needed by save_faiss_index; import it optionally so the
# pure file-I/O loaders in this module work without faiss installed.
try:
    import faiss
except ImportError:
    faiss = None


def save_faiss_index(index, dataset, mode, item_ids):
    """Persist a FAISS index and its item-id mapping under index/{dataset}/{mode}/.

    Writes two files: "index.faiss" (the serialized index) and
    "item_ids.json" (row-to-item-id mapping).

    Raises:
        ImportError: if faiss is not installed.
    """
    if faiss is None:
        raise ImportError("faiss is required for save_faiss_index")

    index_dir = Path("index") / dataset / mode
    index_dir.mkdir(parents=True, exist_ok=True)

    # Save FAISS index (faiss wants a plain string path, not a Path)
    faiss.write_index(index, str(index_dir / "index.faiss"))

    # Save item_ids
    with open(index_dir / "item_ids.json", "w") as f:
        json.dump(item_ids, f)


def save_default_config(weights_dict, dataset, mode, config_name="default_cove.json"):
    """
    Save the default weights for CoVE FAISS combinations to
    defaults/{dataset}/{config_name}, keyed by mode.

    Merges into any existing config file so weights stored under other
    modes are preserved; only the entry for `mode` is overwritten.
    """
    path = os.path.join("defaults", dataset, config_name)
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # Read-modify-write so sibling modes survive.
    if os.path.exists(path):
        with open(path, "r") as f:
            config = json.load(f)
    else:
        config = {}

    config[mode] = weights_dict

    with open(path, "w") as f:
        json.dump(config, f, indent=4)
    print(f"✓ Saved default weights to {path}")


def load_sequences(dataset: str) -> List[List[str]]:
    """
    Loads interaction sequences from a dataset.
    Looks for JSON file at: data/processed/{dataset}/sequences.json

    Raises:
        FileNotFoundError: if the file does not exist.
    """
    path = Path("data/processed") / dataset / "sequences.json"
    if not path.exists():
        raise FileNotFoundError(f"[✗] sequences.json not found at {path}")
    with path.open("r") as f:
        return json.load(f)


def load_item_ids(dataset: str) -> List[str]:
    """
    Loads item ID list from a dataset.
    Expected path: data/processed/{dataset}/item_ids.json

    Raises:
        FileNotFoundError: if the file does not exist.
    """
    path = Path(f"data/processed/{dataset}/item_ids.json")
    if not path.exists():
        raise FileNotFoundError(f"[✗] item_ids.json not found at {path}")
    with path.open("r") as f:
        return json.load(f)


def load_embeddings(dataset: str, suffix: str) -> np.ndarray:
    """
    Loads embeddings (as float32) from .npy or .parquet formats.
    - Tries data/processed/{dataset}/{suffix}.npy
    - Then tries data/processed/{dataset}/{suffix}.parquet
    - Then tries legacy path: data/processed/{dataset}/embeddings_{suffix}.npy

    Raises:
        FileNotFoundError: if none of the candidate files exist.
    """
    base_path = f"data/processed/{dataset}/"

    # Option 1: Newer convention - {suffix}.npy
    npy_path = os.path.join(base_path, f"{suffix}.npy")
    if os.path.exists(npy_path):
        return np.load(npy_path).astype(np.float32)

    # Option 2: Newer convention - {suffix}.parquet
    parquet_path = os.path.join(base_path, f"{suffix}.parquet")
    if os.path.exists(parquet_path):
        df = pd.read_parquet(parquet_path)
        # Unpack the 'vector' column (which is a list/array in each row)
        if "vector" in df.columns:
            embeds = np.stack(df["vector"].values)
        else:
            # Fallback to numeric filtering if 'vector' is missing
            df_numeric = df.select_dtypes(include=["number"])
            embeds = df_numeric.values
        print(f"[✓] Loaded {embeds.shape[0]} embeddings of dim {embeds.shape[1]} from {parquet_path}")
        return embeds.astype(np.float32)

    # Option 3: Legacy naming - embeddings_{suffix}.npy
    legacy_path = os.path.join(base_path, f"embeddings_{suffix}.npy")
    if os.path.exists(legacy_path):
        return np.load(legacy_path).astype(np.float32)

    raise FileNotFoundError(f"[✗] Embedding file not found: {npy_path}, {parquet_path}, or {legacy_path}")


def write_defaults_json(dataset: str, mode: str, embed_sources: List[str]):
    """
    Updates defaults/default_cove.json with mode and embedding sources
    for `dataset`, preserving entries recorded for other datasets.
    """
    default_path = Path("defaults/default_cove.json")
    default_path.parent.mkdir(parents=True, exist_ok=True)

    # Read-modify-write so other datasets' defaults survive.
    if default_path.exists():
        with open(default_path, "r") as f:
            defaults = json.load(f)
    else:
        defaults = {}

    defaults[dataset] = {
        "mode": mode,
        "embed_sources": embed_sources
    }

    with open(default_path, "w") as f:
        json.dump(defaults, f, indent=2)