# src/utils/io_utils.py
import json
from pathlib import Path
from typing import List

import faiss
import numpy as np
import pandas as pd


def save_faiss_index(index, dataset, mode, item_ids):
    """
    Persist a FAISS index and its item-ID mapping under index/{dataset}/{mode}/.
    """
    index_dir = Path("index") / dataset / mode
    index_dir.mkdir(parents=True, exist_ok=True)

    # Save the FAISS index itself
    faiss.write_index(index, str(index_dir / "index.faiss"))

    # Save the item IDs so index rows can be mapped back to items
    with open(index_dir / "item_ids.json", "w") as f:
        json.dump(item_ids, f)
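
# Usage sketch (illustrative only; the dimensionality and the dataset/mode
# names below are assumptions, not part of this module):
#
#   dim = 128
#   index = faiss.IndexFlatIP(dim)
#   index.add(np.zeros((10, dim), dtype=np.float32))
#   save_faiss_index(index, dataset="beauty", mode="cove",
#                    item_ids=[f"item_{i}" for i in range(10)])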


def save_default_config(weights_dict, dataset, mode, config_name="default_cove.json"):
    """
    Save the default weights for CoVE FAISS combinations to
    defaults/{dataset}/{config_name}, merging with any existing config.
    """
    path = Path("defaults") / dataset / config_name
    path.parent.mkdir(parents=True, exist_ok=True)

    # Load the existing config (if any) so other modes are preserved
    if path.exists():
        with open(path, "r") as f:
            config = json.load(f)
    else:
        config = {}

    config[mode] = weights_dict
    with open(path, "w") as f:
        json.dump(config, f, indent=4)
    print(f"✓ Saved default weights to {path}")


def load_sequences(dataset: str) -> List[List[str]]:
    """
    Load interaction sequences for a dataset.

    Expected path: data/processed/{dataset}/sequences.json
    """
    path = Path("data/processed") / dataset / "sequences.json"
    if not path.exists():
        raise FileNotFoundError(f"[✗] sequences.json not found at {path}")
    with path.open("r") as f:
        return json.load(f)


def load_item_ids(dataset: str) -> List[str]:
    """
    Load the item-ID list for a dataset.

    Expected path: data/processed/{dataset}/item_ids.json
    """
    path = Path("data/processed") / dataset / "item_ids.json"
    if not path.exists():
        raise FileNotFoundError(f"[✗] item_ids.json not found at {path}")
    with path.open("r") as f:
        return json.load(f)
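
# Usage sketch for the two loaders above (assumes the files exist under
# data/processed/beauty/; "beauty" and the item IDs are hypothetical):
#
#   sequences = load_sequences("beauty")   # e.g. [["item_1", "item_7"], ["item_2"]]
#   item_ids = load_item_ids("beauty")     # e.g. ["item_1", "item_2", ...]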


def load_embeddings(dataset: str, suffix: str) -> np.ndarray:
    """
    Load embeddings as a float32 array, trying several on-disk formats:
      1. data/processed/{dataset}/{suffix}.npy
      2. data/processed/{dataset}/{suffix}.parquet
      3. Legacy path: data/processed/{dataset}/embeddings_{suffix}.npy
    """
    base_path = Path("data/processed") / dataset

    # Option 1: newer convention - {suffix}.npy
    npy_path = base_path / f"{suffix}.npy"
    if npy_path.exists():
        return np.load(npy_path).astype(np.float32)

    # Option 2: newer convention - {suffix}.parquet
    parquet_path = base_path / f"{suffix}.parquet"
    if parquet_path.exists():
        df = pd.read_parquet(parquet_path)
        if "vector" in df.columns:
            # Each row holds a list/array in the 'vector' column; stack into a matrix
            embeds = np.stack(df["vector"].values)
        else:
            # Fall back to treating all numeric columns as embedding dimensions
            embeds = df.select_dtypes(include=["number"]).values
        print(f"[✓] Loaded {embeds.shape[0]} embeddings of dim {embeds.shape[1]} from {parquet_path}")
        return embeds.astype(np.float32)

    # Option 3: legacy naming - embeddings_{suffix}.npy
    legacy_path = base_path / f"embeddings_{suffix}.npy"
    if legacy_path.exists():
        return np.load(legacy_path).astype(np.float32)

    raise FileNotFoundError(f"[✗] Embedding file not found: {npy_path}, {parquet_path}, or {legacy_path}")
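
# Usage sketch showing the 'vector'-column parquet convention this loader
# expects (illustrative; requires a parquet engine such as pyarrow, and the
# dataset/suffix names are assumptions):
#
#   out_dir = Path("data/processed/beauty")
#   out_dir.mkdir(parents=True, exist_ok=True)
#   df = pd.DataFrame({"vector": [np.random.rand(64) for _ in range(3)]})
#   df.to_parquet(out_dir / "text.parquet")
#   vecs = load_embeddings("beauty", "text")  # -> float32 array of shape (3, 64)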


def write_defaults_json(dataset: str, mode: str, embed_sources: List[str]):
    """
    Update the global defaults/default_cove.json, recording the mode and
    embedding sources to use for a dataset.
    """
    default_path = Path("defaults/default_cove.json")
    default_path.parent.mkdir(parents=True, exist_ok=True)

    # Merge with existing defaults so other datasets are preserved
    if default_path.exists():
        with open(default_path, "r") as f:
            defaults = json.load(f)
    else:
        defaults = {}

    defaults[dataset] = {
        "mode": mode,
        "embed_sources": embed_sources
    }

    with open(default_path, "w") as f:
        json.dump(defaults, f, indent=2)
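

# Minimal smoke test (illustrative sketch; the "demo" dataset and embedding
# sources are hypothetical): writes a defaults file and echoes it back.
if __name__ == "__main__":
    write_defaults_json("demo", mode="cove", embed_sources=["text", "image"])
    with open("defaults/default_cove.json") as f:
        print(json.dumps(json.load(f), indent=2))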