# src/utils/io_utils.py
import json
import os
import pickle
from pathlib import Path
from typing import List

import faiss
import numpy as np
import pandas as pd
def save_faiss_index(index, dataset, mode, item_ids):
    """
    Save a FAISS index and its item-ID mapping under index/{dataset}/{mode}/.
    """
    index_dir = Path("index") / dataset / mode
    index_dir.mkdir(parents=True, exist_ok=True)
    # Save FAISS index
    faiss.write_index(index, str(index_dir / "index.faiss"))
    # Save item_ids (row i of the index corresponds to item_ids[i])
    with open(index_dir / "item_ids.json", "w") as f:
        json.dump(item_ids, f)
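# Usage sketch for save_faiss_index (illustrative only): the dataset name, mode
# label, and embedding dimension below are hypothetical, not fixed by this module.
#
#   vecs = np.random.rand(100, 64).astype(np.float32)
#   index = faiss.IndexFlatIP(64)
#   index.add(vecs)
#   save_faiss_index(index, dataset="beauty", mode="flat_ip",
#                    item_ids=[f"item_{i}" for i in range(100)])
#   # -> index/beauty/flat_ip/index.faiss and index/beauty/flat_ip/item_ids.json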
| def save_default_config(weights_dict, dataset, mode, config_name="default_cove.json"): | |
| """ | |
| Save the default weights for CoVE FAISS combinations to defaults.json. | |
| """ | |
| path = os.path.join("defaults", dataset, config_name) | |
| os.makedirs(os.path.dirname(path), exist_ok=True) | |
| if os.path.exists(path): | |
| with open(path, "r") as f: | |
| config = json.load(f) | |
| else: | |
| config = {} | |
| config[mode] = weights_dict | |
| with open(path, "w") as f: | |
| json.dump(config, f, indent=4) | |
| print(f"β Saved default weights to {path}") | |
def load_sequences(dataset: str) -> List[List[str]]:
    """
    Load interaction sequences for a dataset.
    Expected path: data/processed/{dataset}/sequences.json
    """
    path = Path("data/processed") / dataset / "sequences.json"
    if not path.exists():
        raise FileNotFoundError(f"sequences.json not found at {path}")
    with path.open("r") as f:
        return json.load(f)
def load_item_ids(dataset: str) -> List[str]:
    """
    Load the item ID list for a dataset.
    Expected path: data/processed/{dataset}/item_ids.json
    """
    path = Path("data/processed") / dataset / "item_ids.json"
    if not path.exists():
        raise FileNotFoundError(f"item_ids.json not found at {path}")
    with path.open("r") as f:
        return json.load(f)
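# Expected on-disk shapes for the two loaders above, inferred from their type
# hints (the item IDs shown are illustrative):
#
#   data/processed/{dataset}/sequences.json -> [["item_1", "item_7"], ["item_3", "item_2"], ...]
#   data/processed/{dataset}/item_ids.json  -> ["item_1", "item_2", "item_3", ...]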
def load_embeddings(dataset: str, suffix: str) -> np.ndarray:
    """
    Load embeddings from .npy or .parquet formats, trying in order:
    - data/processed/{dataset}/{suffix}.npy
    - data/processed/{dataset}/{suffix}.parquet
    - legacy path: data/processed/{dataset}/embeddings_{suffix}.npy
    """
    base_path = f"data/processed/{dataset}"
    # Option 1: newer convention - {suffix}.npy
    npy_path = os.path.join(base_path, f"{suffix}.npy")
    if os.path.exists(npy_path):
        return np.load(npy_path).astype(np.float32)
    # Option 2: newer convention - {suffix}.parquet
    parquet_path = os.path.join(base_path, f"{suffix}.parquet")
    if os.path.exists(parquet_path):
        df = pd.read_parquet(parquet_path)
        # Unpack the 'vector' column (a list/array in each row)
        if "vector" in df.columns:
            embeds = np.stack(df["vector"].values)
        else:
            # Fall back to numeric columns if 'vector' is missing
            df_numeric = df.select_dtypes(include=["number"])
            embeds = df_numeric.values
        print(f"Loaded {embeds.shape[0]} embeddings of dim {embeds.shape[1]} from {parquet_path}")
        return embeds.astype(np.float32)
    # Option 3: legacy naming - embeddings_{suffix}.npy
    legacy_path = os.path.join(base_path, f"embeddings_{suffix}.npy")
    if os.path.exists(legacy_path):
        return np.load(legacy_path).astype(np.float32)
    raise FileNotFoundError(
        f"Embedding file not found: {npy_path}, {parquet_path}, or {legacy_path}"
    )
def write_defaults_json(dataset: str, mode: str, embed_sources: List[str]):
    """
    Updates defaults/default_cove.json with the mode and embedding sources for a dataset.
    """
    default_path = Path("defaults/default_cove.json")
    default_path.parent.mkdir(parents=True, exist_ok=True)
    if default_path.exists():
        with open(default_path, "r") as f:
            defaults = json.load(f)
    else:
        defaults = {}
    defaults[dataset] = {
        "mode": mode,
        "embed_sources": embed_sources
    }
    with open(default_path, "w") as f:
        json.dump(defaults, f, indent=2)
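# Example of the resulting defaults/default_cove.json (mode and sources are illustrative):
#
#   write_defaults_json("beauty", mode="flat_ip", embed_sources=["text", "image"])
#   # {
#   #     "beauty": {
#   #         "mode": "flat_ip",
#   #         "embed_sources": ["text", "image"]
#   #     }
#   # }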