Spaces:
Sleeping
Sleeping
File size: 5,300 Bytes
"""Inspect and query the FAISS index and metadata created by `index_faiss.py`.
Usage examples (PowerShell):
# List first 10 ids
C:/PF/Projects/Rs_mini_project/myenv/Scripts/python.exe backend/query_faiss_fixed.py --list 10
# Show vector info for id 5
C:/PF/Projects/Rs_mini_project/myenv/Scripts/python.exe backend/query_faiss_fixed.py --show-id 5
# Query by text (nearest neighbors)
C:/PF/Projects/Rs_mini_project/myenv/Scripts/python.exe backend/query_faiss_fixed.py --query-text "python data science" --k 5
# Query by existing id (find neighbors of vector with id 0)
C:/PF/Projects/Rs_mini_project/myenv/Scripts/python.exe backend/query_faiss_fixed.py --query-id 0 --k 5
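By default the script looks for the FAISS index at `faiss_index.faiss` and the
metadata at `faiss_metadata.json`, both resolved relative to the current working
directory; pass `--index` / `--meta` to point elsewhere (for example
`backend/faiss_index.faiss` and `backend/faiss_metadata.json`). Text queries are
encoded with the same embedding model (`all-MiniLM-L6-v2`).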
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Optional
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
# Default on-disk locations of the FAISS index and its JSON metadata.
# Both are relative paths, so they resolve against the current working
# directory unless overridden with --index / --meta.
DEFAULT_INDEX = Path("faiss_index.faiss")
DEFAULT_META = Path("faiss_metadata.json")

# Sentence-transformers model id used to embed text queries.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
def load_index_and_meta(index_path: Path = DEFAULT_INDEX, meta_path: Path = DEFAULT_META):
    """Load the FAISS index and the JSON metadata that accompanies it.

    Args:
        index_path: Path of the serialized FAISS index file.
        meta_path: Path of the UTF-8 encoded JSON metadata file.

    Returns:
        A ``(index, metadata)`` tuple: the object returned by
        ``faiss.read_index`` and the decoded JSON document.

    Raises:
        FileNotFoundError: If either file does not exist. The index path is
            checked first.
    """
    # Verify both inputs up front so a missing file is reported with a clear
    # message before anything is read.
    required_files = (
        (index_path, f'FAISS index not found: {index_path}'),
        (meta_path, f'Metadata file not found: {meta_path}'),
    )
    for candidate, problem in required_files:
        if not candidate.exists():
            raise FileNotFoundError(problem)

    # faiss.read_index is given a plain string path.
    loaded_index = faiss.read_index(str(index_path))
    loaded_meta = json.loads(meta_path.read_text(encoding='utf-8'))
    return loaded_index, loaded_meta
def list_ids(index, metadata, n: int = 20):
    """Print the first ``n`` metadata entries in ascending numeric id order.

    Args:
        index: Unused; accepted so the function shares the
            ``(index, metadata, ...)`` signature of the other actions.
        metadata: Mapping whose keys are strings holding integer ids.
        n: Maximum number of entries to print. Zero or a negative value
            prints nothing.

    Raises:
        ValueError: If a metadata key cannot be parsed as an integer.
    """
    # Sort the original keys by their numeric value and look entries up with
    # the key exactly as stored. Converting to int and back to str would miss
    # non-canonical keys such as "01" and print an empty title for them.
    ordered_keys = sorted(metadata, key=int)

    # Clamp the limit: a negative slice bound would otherwise mean
    # "everything except the last |n| entries".
    limit = max(n, 0)
    for key in ordered_keys[:limit]:
        print(f'id={int(key)} -> Title: {metadata[key]}')
def show_vector(index, idx: int, top_dims: int = 8):
    """Print the norm and the leading components of the vector stored under ``idx``.

    The vector is obtained with ``index.reconstruct``. This is best-effort:
    any exception raised while reconstructing or formatting the vector is
    caught and reported on stdout instead of propagating.

    Args:
        index: Object exposing ``reconstruct(int)``.
        idx: Id of the vector to display.
        top_dims: Number of leading components to print.
    """
    try:
        raw = index.reconstruct(int(idx))
        vector = np.array(raw).astype('float32')
        length = np.linalg.norm(vector)
        leading = vector[:top_dims].tolist()
        print(f'id={idx} vector norm={length:.4f} first {top_dims} dims: {leading}')
    except Exception as err:
        # Reconstruction is not guaranteed to work for every index, so report
        # the failure rather than crash.
        print(f'Could not reconstruct vector for id {idx}: {err}')
def query_by_text(index, metadata, query: str, k: int = 5, model_name: str = MODEL_NAME):
    """Embed ``query`` and print the nearest entries found in ``index``.

    Args:
        index: Object exposing ``search(vectors, k)``.
        metadata: Mapping from stringified id to the value printed as title.
        query: Free-text query to embed.
        k: Number of neighbours requested from the index.
        model_name: Sentence-transformers model used to embed the query.
    """
    # The model is loaded on CPU for every call.
    encoder = SentenceTransformer(model_name, device='cpu')
    query_vec = encoder.encode([query], convert_to_numpy=True).astype('float32')
    # L2-normalise the query row in place before searching.
    faiss.normalize_L2(query_vec)

    all_scores, all_labels = index.search(query_vec, k)
    # A single query was submitted, so only the first result row is relevant.
    row_scores, row_labels = all_scores[0], all_labels[0]

    print(f'Query: "{query}" -> top {k} results:')
    for score, label in zip(row_scores, row_labels):
        # Label -1 is skipped (presumably FAISS's "no result" placeholder).
        if label != -1:
            hit_id = int(label)
            title = metadata.get(str(hit_id), '')
            print(f' id={hit_id} score={float(score):.4f} title={title}')
def query_by_id(index, metadata, qid: int, k: int = 5):
    """Print the nearest entries to the vector already stored under ``qid``.

    The stored vector is fetched with ``index.reconstruct`` and then used as
    the query. If reconstruction fails for any reason, a message is printed
    and the function returns without searching.

    Args:
        index: Object exposing ``reconstruct(int)`` and ``search(vectors, k)``.
        metadata: Mapping from stringified id to the value printed as title.
        qid: Id of the stored vector to use as the query.
        k: Number of neighbours requested from the index.
    """
    try:
        stored = index.reconstruct(int(qid)).astype('float32')
    except Exception as err:
        print(f'Could not reconstruct vector for id {qid}: {err}')
        return

    # Add a leading axis so the vector is passed as a one-row batch, then
    # L2-normalise it in place before searching.
    batch = np.expand_dims(stored, axis=0)
    faiss.normalize_L2(batch)

    all_scores, all_labels = index.search(batch, k)
    row_scores, row_labels = all_scores[0], all_labels[0]

    print(f'Neighbors for id={qid} ->')
    for score, label in zip(row_scores, row_labels):
        # Label -1 is skipped (presumably FAISS's "no result" placeholder).
        if label != -1:
            hit_id = int(label)
            title = metadata.get(str(hit_id), '')
            print(f' id={hit_id} score={float(score):.4f} title={title}')
def _parse_args(argv=None):
    """Parse the command-line options for the inspection script.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` with attributes ``index``, ``meta``,
        ``list``, ``show_id``, ``query_text``, ``query_id`` and ``k``.
    """
    parser = argparse.ArgumentParser(description='Inspect/query FAISS index')

    # (flags, keyword options) for each argument, registered in this order.
    option_specs = (
        (('--index', '-x'), {'default': str(DEFAULT_INDEX)}),
        (('--meta', '-m'), {'default': str(DEFAULT_META)}),
        # A bare --list with no value falls back to const (20).
        (('--list', '-l'), {'nargs': '?', 'const': 20, 'type': int,
                            'help': 'List first N ids (default 20)'}),
        (('--show-id',), {'type': int, 'help': 'Show vector info for given id'}),
        (('--query-text',), {'type': str, 'help': 'Query by text'}),
        (('--query-id',), {'type': int,
                           'help': 'Query by existing id (find neighbors)'}),
        (('--k',), {'type': int, 'default': 5,
                    'help': 'Number of neighbors to return'}),
    )
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    return parser.parse_args(argv)
def main(argv=None):
    """Run the CLI: parse the arguments and dispatch exactly one action.

    When several actions are given, only the first in this order runs:
    ``--list``, ``--show-id``, ``--query-text``, ``--query-id``. If no action
    is given, a usage hint is printed and nothing is loaded.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv``.

    Raises:
        FileNotFoundError: If an action was requested but the index or
            metadata file does not exist.
    """
    args = _parse_args(argv)

    # Decide whether there is anything to do before touching the disk.
    # Otherwise a run without any action would load the whole index for
    # nothing, or fail with FileNotFoundError instead of showing the hint.
    # An empty --query-text counts as "no action", as it always has.
    action_requested = (
        args.list is not None
        or args.show_id is not None
        or bool(args.query_text)
        or args.query_id is not None
    )
    if not action_requested:
        print('No action specified. Use --help for usage examples.')
        return

    index, metadata = load_index_and_meta(Path(args.index), Path(args.meta))

    if args.list is not None:
        list_ids(index, metadata, args.list)
    elif args.show_id is not None:
        show_vector(index, args.show_id)
    elif args.query_text:
        query_by_text(index, metadata, args.query_text, k=args.k)
    elif args.query_id is not None:
        query_by_id(index, metadata, args.query_id, k=args.k)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|