# src/utils/io_utils.py
import json
from pathlib import Path
from typing import List

import faiss
import numpy as np
import pandas as pd


def save_faiss_index(index, dataset, mode, item_ids):
    """
    Persist a FAISS index and its item-ID mapping under index/{dataset}/{mode}/.
    """
    index_dir = Path("index") / dataset / mode
    index_dir.mkdir(parents=True, exist_ok=True)

    # Save the FAISS index itself
    faiss.write_index(index, str(index_dir / "index.faiss"))

    # Save the item IDs so index rows can be mapped back to items
    with open(index_dir / "item_ids.json", "w") as f:
        json.dump(item_ids, f)
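
# Usage sketch (illustrative only; the dimensionality and the dataset/mode
# names below are assumptions, not part of this module):
#
#   dim = 128
#   index = faiss.IndexFlatIP(dim)
#   index.add(np.zeros((10, dim), dtype=np.float32))
#   save_faiss_index(index, dataset="beauty", mode="cove",
#                    item_ids=[f"item_{i}" for i in range(10)])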


def save_default_config(weights_dict, dataset, mode, config_name="default_cove.json"):
    """
    Save the default weights for CoVE FAISS combinations to
    defaults/{dataset}/{config_name}, merging with any existing config.
    """
    path = Path("defaults") / dataset / config_name
    path.parent.mkdir(parents=True, exist_ok=True)

    # Load the existing config (if any) so other modes are preserved
    if path.exists():
        with open(path, "r") as f:
            config = json.load(f)
    else:
        config = {}

    config[mode] = weights_dict
    with open(path, "w") as f:
        json.dump(config, f, indent=4)
    print(f"✓ Saved default weights to {path}")


def load_sequences(dataset: str) -> List[List[str]]:
    """
    Load interaction sequences for a dataset.

    Expected path: data/processed/{dataset}/sequences.json
    """
    path = Path("data/processed") / dataset / "sequences.json"
    if not path.exists():
        raise FileNotFoundError(f"[✗] sequences.json not found at {path}")
    with path.open("r") as f:
        return json.load(f)


def load_item_ids(dataset: str) -> List[str]:
    """
    Load the item-ID list for a dataset.

    Expected path: data/processed/{dataset}/item_ids.json
    """
    path = Path("data/processed") / dataset / "item_ids.json"
    if not path.exists():
        raise FileNotFoundError(f"[✗] item_ids.json not found at {path}")
    with path.open("r") as f:
        return json.load(f)
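
# Usage sketch for the two loaders above (assumes the files exist under
# data/processed/beauty/; "beauty" and the item IDs are hypothetical):
#
#   sequences = load_sequences("beauty")   # e.g. [["item_1", "item_7"], ["item_2"]]
#   item_ids = load_item_ids("beauty")     # e.g. ["item_1", "item_2", ...]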


def load_embeddings(dataset: str, suffix: str) -> np.ndarray:
    """
    Load embeddings as a float32 array, trying several on-disk formats:
      1. data/processed/{dataset}/{suffix}.npy
      2. data/processed/{dataset}/{suffix}.parquet
      3. Legacy path: data/processed/{dataset}/embeddings_{suffix}.npy
    """
    base_path = Path("data/processed") / dataset

    # Option 1: newer convention - {suffix}.npy
    npy_path = base_path / f"{suffix}.npy"
    if npy_path.exists():
        return np.load(npy_path).astype(np.float32)

    # Option 2: newer convention - {suffix}.parquet
    parquet_path = base_path / f"{suffix}.parquet"
    if parquet_path.exists():
        df = pd.read_parquet(parquet_path)
        if "vector" in df.columns:
            # Each row holds a list/array in the 'vector' column; stack into a matrix
            embeds = np.stack(df["vector"].values)
        else:
            # Fall back to treating all numeric columns as embedding dimensions
            embeds = df.select_dtypes(include=["number"]).values
        print(f"[✓] Loaded {embeds.shape[0]} embeddings of dim {embeds.shape[1]} from {parquet_path}")
        return embeds.astype(np.float32)

    # Option 3: legacy naming - embeddings_{suffix}.npy
    legacy_path = base_path / f"embeddings_{suffix}.npy"
    if legacy_path.exists():
        return np.load(legacy_path).astype(np.float32)

    raise FileNotFoundError(f"[✗] Embedding file not found: {npy_path}, {parquet_path}, or {legacy_path}")
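
# Usage sketch showing the 'vector'-column parquet convention this loader
# expects (illustrative; requires a parquet engine such as pyarrow, and the
# dataset/suffix names are assumptions):
#
#   out_dir = Path("data/processed/beauty")
#   out_dir.mkdir(parents=True, exist_ok=True)
#   df = pd.DataFrame({"vector": [np.random.rand(64) for _ in range(3)]})
#   df.to_parquet(out_dir / "text.parquet")
#   vecs = load_embeddings("beauty", "text")  # -> float32 array of shape (3, 64)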


def write_defaults_json(dataset: str, mode: str, embed_sources: List[str]):
    """
    Update the global defaults/default_cove.json, recording the mode and
    embedding sources to use for a dataset.
    """
    default_path = Path("defaults/default_cove.json")
    default_path.parent.mkdir(parents=True, exist_ok=True)

    # Merge with existing defaults so other datasets are preserved
    if default_path.exists():
        with open(default_path, "r") as f:
            defaults = json.load(f)
    else:
        defaults = {}

    defaults[dataset] = {
        "mode": mode,
        "embed_sources": embed_sources
    }

    with open(default_path, "w") as f:
        json.dump(defaults, f, indent=2)
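

# Minimal smoke test (illustrative sketch; the "demo" dataset and embedding
# sources are hypothetical): writes a defaults file and echoes it back.
if __name__ == "__main__":
    write_defaults_json("demo", mode="cove", embed_sources=["text", "image"])
    with open("defaults/default_cove.json") as f:
        print(json.dumps(json.load(f), indent=2))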