# NOTE(review): removed non-source page scaffolding ("Spaces / Sleeping /
# File size / line-number gutter") captured with this file during extraction;
# it was not part of the module and is not valid Python.
"""
Embeddings + FAISS index build/save/load.
"""
from typing import List
from pathlib import Path
import os
import platform
# On macOS, FAISS and PyTorch both ship libomp and loading both copies without
# telling LibOMP they're duplicates aborts the interpreter. Setting this flag
# before importing either library prevents the crash when building embeddings.
if platform.system() == "Darwin":
    os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
import numpy as np
# Import FAISS before torch/sentence-transformers so libomp loads in a safe order on macOS.
import faiss
from sentence_transformers import SentenceTransformer
import pandas as pd
def embed_texts(texts: List[str], model_name: str):
    """
    Embed a list of texts with a SentenceTransformer model.

    Args:
        texts: Strings to embed.
        model_name: sentence-transformers model identifier to load.

    Returns:
        Tuple of (embeddings, model): embeddings is a float32 numpy array of
        shape (n, d) with L2-normalized rows; model is the loaded
        SentenceTransformer instance, returned so callers can reuse it.
    """
    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode(
        texts,
        normalize_embeddings=True,
        show_progress_bar=True,
    )
    # FAISS requires float32 input; coerce whatever encode() returned.
    vectors = np.array(vectors, dtype=np.float32)
    return vectors, encoder
def build_faiss_index(embeddings):
    """
    Construct a FAISS inner-product index from embedding vectors.

    The input is copied into a fresh float32 buffer and L2-normalized, so
    inner-product search over the index is equivalent to cosine similarity
    and the caller's array is never mutated.

    Args:
        embeddings: Array-like of shape (n, d).

    Returns:
        A faiss.IndexFlatIP containing all n vectors, ready for search().
    """
    # np.array always allocates a new float32 buffer here, which makes the
    # in-place normalize_L2 below safe regardless of what the caller passed.
    vectors = np.array(embeddings, dtype=np.float32)
    # Idempotent if embed_texts already normalized the rows.
    faiss.normalize_L2(vectors)
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    return index
def save_index(index, meta_rows, out_dir: str):
    """
    Persist a FAISS index plus its per-vector metadata to a directory.

    Writes two files into out_dir (created if missing):
      - index.faiss: the serialized FAISS index
      - metadata.parquet: one row per vector (chunk IDs, source info)

    Args:
        index: FAISS index to save.
        meta_rows: List of dicts or a pandas DataFrame with metadata.
        out_dir: Output directory path.

    Raises:
        ValueError: If meta_rows is neither a list nor a DataFrame.
    """
    out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    # Save FAISS index (write_index needs a str path, not a Path).
    index_path = out_path / 'index.faiss'
    faiss.write_index(index, str(index_path))

    # Normalize metadata to a DataFrame.
    if isinstance(meta_rows, list):
        meta_df = pd.DataFrame(meta_rows)
    elif isinstance(meta_rows, pd.DataFrame):
        meta_df = meta_rows
    else:
        raise ValueError("meta_rows must be a list of dicts or a pandas DataFrame")

    metadata_path = out_path / 'metadata.parquet'
    meta_df.to_parquet(metadata_path, index=False)

    # Fixed: the original messages contained a mis-encoded emoji that split
    # each f-string across two lines (a SyntaxError).
    print(f"Saved index to: {index_path}")
    print(f"Saved metadata to: {metadata_path}")
    print(f"  Index size: {index.ntotal} vectors")
    print(f"  Metadata rows: {len(meta_df)}")
def load_index(in_dir: str):
    """
    Load a FAISS index and its metadata from a directory.

    Expects the layout written by save_index: index.faiss and
    metadata.parquet side by side in in_dir.

    Args:
        in_dir: Input directory path containing index.faiss and metadata.parquet.

    Returns:
        Tuple of (index, metadata_df).

    Raises:
        FileNotFoundError: If either expected file is missing.
        ValueError: If the index vector count and metadata row count disagree.
    """
    in_path = Path(in_dir)

    index_path = in_path / 'index.faiss'
    if not index_path.exists():
        raise FileNotFoundError(f"Index file not found: {index_path}")
    index = faiss.read_index(str(index_path))

    metadata_path = in_path / 'metadata.parquet'
    if not metadata_path.exists():
        raise FileNotFoundError(f"Metadata file not found: {metadata_path}")
    meta_df = pd.read_parquet(metadata_path)

    # Sanity check: every vector must have exactly one metadata row.
    if index.ntotal != len(meta_df):
        raise ValueError(
            f"Mismatch: index has {index.ntotal} vectors but metadata has {len(meta_df)} rows"
        )

    # Fixed: the original messages contained a mis-encoded emoji that split
    # each f-string across two lines (a SyntaxError).
    print(f"Loaded index: {index.ntotal} vectors, dimension {index.d}")
    print(f"Loaded metadata: {len(meta_df)} rows")
    return index, meta_df
|