"""
Embeddings + FAISS index build/save/load.
"""
from typing import List
from pathlib import Path
import os
import platform

# On macOS, FAISS and PyTorch both ship libomp, and loading both copies without
# telling libomp they are duplicates aborts the interpreter. Setting this flag
# before importing either library prevents the crash when building embeddings.
if platform.system() == "Darwin":
    os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")

import numpy as np
# Import FAISS before torch/sentence-transformers so libomp loads in a safe order on macOS.
import faiss
from sentence_transformers import SentenceTransformer
import pandas as pd


def embed_texts(texts: List[str], model_name: str):
    """
    Return matrix of embeddings for texts.

    # TODO hints:
    # - Load SentenceTransformer by name; encode with normalize_embeddings=True if available.
    # - Batch encode; return numpy array (n, d).

    # Acceptance:
    # - Returns embeddings and model reference (if needed).
    """
    model = SentenceTransformer(model_name)
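    # Note: encode() batches internally (sentence-transformers defaults to
    # batch_size=32); passing batch_size explicitly trades memory for
    # throughput on large corpora.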
    embeddings = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
    # Ensure numpy array and float32 for FAISS compatibility
    embeddings = np.array(embeddings, dtype=np.float32)
    return embeddings, model


def build_faiss_index(embeddings):
    """
    Build a FAISS index and return it.

    # TODO hints:
    # - Use IndexFlatIP or L2; ensure vectors are normalized if using IP.

    # Acceptance:
    # - Returns a FAISS index ready for add/search.
    """
    # Ensure embeddings are numpy array and float32
    if not isinstance(embeddings, np.ndarray):
        embeddings = np.array(embeddings, dtype=np.float32)
    if embeddings.dtype != np.float32:
        embeddings = embeddings.astype(np.float32)
    
    # Make a copy before normalizing to avoid in-place modification issues
    # (normalize_L2 modifies the array in-place)
    embeddings = embeddings.copy()
    
    # Ensure embeddings are normalized for IndexFlatIP (inner product = cosine similarity)
    # Note: embeddings should already be normalized from embed_texts, but normalize_L2 is idempotent
    faiss.normalize_L2(embeddings)
    
    # Create IndexFlatIP (Inner Product) for normalized vectors
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)
    
    return index
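

# Usage sketch (not part of the module's required API). `_example_search` is an
# illustrative helper name showing how an index from build_faiss_index is
# queried with the same SentenceTransformer model returned by embed_texts.
def _example_search(index, model, query: str, k: int = 5):
    """Return (scores, ids) for the top-k neighbours of one query string.

    Scores are inner products; since corpus and query vectors are both
    L2-normalized, they equal cosine similarities.
    """
    query_vec = np.asarray(
        model.encode([query], normalize_embeddings=True), dtype=np.float32
    )
    scores, ids = index.search(query_vec, k)
    return scores[0], ids[0]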


def save_index(index, meta_rows, out_dir: str):
    """
    Persist FAISS index + metadata (CSV/Parquet) to data/index/.

    Args:
        index: FAISS index to save
        meta_rows: List of dicts or DataFrame with metadata (chunk IDs, source info)
        out_dir: Output directory path

    # TODO hints:
    # - Write index to .faiss and metadata to .parquet with chunk IDs and source info.

    # Acceptance:
    # - Files exist in data/index/.
    """
    out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    
    # Save FAISS index
    index_path = out_path / 'index.faiss'
    faiss.write_index(index, str(index_path))
    
    # Convert meta_rows to DataFrame if it's a list
    if isinstance(meta_rows, list):
        meta_df = pd.DataFrame(meta_rows)
    elif isinstance(meta_rows, pd.DataFrame):
        meta_df = meta_rows
    else:
        raise ValueError("meta_rows must be a list of dicts or a pandas DataFrame")
    
    # Save metadata
    metadata_path = out_path / 'metadata.parquet'
    meta_df.to_parquet(metadata_path, index=False)
    
    print(f"βœ… Saved index to: {index_path}")
    print(f"βœ… Saved metadata to: {metadata_path}")
    print(f"   Index size: {index.ntotal} vectors")
    print(f"   Metadata rows: {len(meta_df)}")


def load_index(in_dir: str):
    """
    Load FAISS index + metadata.

    Args:
        in_dir: Input directory path containing index.faiss and metadata.parquet

    # TODO hints:
    # - Read index and matching metadata frame; sanity-check row counts.

    # Acceptance:
    # - Returns (index, metadata_df).
    """
    in_path = Path(in_dir)
    
    # Load FAISS index
    index_path = in_path / 'index.faiss'
    if not index_path.exists():
        raise FileNotFoundError(f"Index file not found: {index_path}")
    index = faiss.read_index(str(index_path))
    
    # Load metadata
    metadata_path = in_path / 'metadata.parquet'
    if not metadata_path.exists():
        raise FileNotFoundError(f"Metadata file not found: {metadata_path}")
    meta_df = pd.read_parquet(metadata_path)
    
    # Sanity check: row counts should match
    if index.ntotal != len(meta_df):
        raise ValueError(
            f"Mismatch: index has {index.ntotal} vectors but metadata has {len(meta_df)} rows"
        )
    
    print(f"βœ… Loaded index: {index.ntotal} vectors, dimension {index.d}")
    print(f"βœ… Loaded metadata: {len(meta_df)} rows")
    
    return index, meta_df
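

if __name__ == "__main__":
    # End-to-end sketch under illustrative assumptions: the model name and the
    # data/index output directory are examples, not fixed by this module.
    corpus = [
        "FAISS indexes dense vectors for nearest-neighbour search.",
        "Sentence embeddings map text into a shared vector space.",
    ]
    vecs, st_model = embed_texts(corpus, "sentence-transformers/all-MiniLM-L6-v2")
    idx = build_faiss_index(vecs)
    save_index(
        idx,
        [{"chunk_id": i, "text": t} for i, t in enumerate(corpus)],
        "data/index",
    )
    idx2, meta = load_index("data/index")
    scores, ids = _example_search(idx2, st_model, "vector search", k=2)
    print(meta.iloc[ids].assign(score=scores))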