File size: 5,300 Bytes
6518a94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""Inspect and query the FAISS index and metadata created by `index_faiss.py`.



Usage examples (PowerShell):

  # List first 10 ids

  C:/PF/Projects/Rs_mini_project/myenv/Scripts/python.exe backend/query_faiss_fixed.py --list 10



  # Show vector info for id 5

  C:/PF/Projects/Rs_mini_project/myenv/Scripts/python.exe backend/query_faiss_fixed.py --show-id 5



  # Query by text (nearest neighbors)

  C:/PF/Projects/Rs_mini_project/myenv/Scripts/python.exe backend/query_faiss_fixed.py --query-text "python data science" --k 5



  # Query by existing id (find neighbors of vector with id 0)

  C:/PF/Projects/Rs_mini_project/myenv/Scripts/python.exe backend/query_faiss_fixed.py --query-id 0 --k 5



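By default the script looks for the FAISS index at `faiss_index.faiss` and the
metadata at `faiss_metadata.json`. Both defaults are relative paths, so they are
resolved against the current working directory; pass `--index` / `--meta` to
point at files elsewhere (for example `backend/faiss_index.faiss`). It uses the
same embedding model (`all-MiniLM-L6-v2`) to encode text queries.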

"""

from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Optional

import numpy as np
from sentence_transformers import SentenceTransformer
import faiss


# Default locations of the FAISS index and its JSON metadata. Both are relative
# paths, so they resolve against the current working directory.
DEFAULT_INDEX = Path('faiss_index.faiss')
DEFAULT_META = Path('faiss_metadata.json')
# Model identifier handed to SentenceTransformer when encoding text queries.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'


def load_index_and_meta(index_path: Path = DEFAULT_INDEX, meta_path: Path = DEFAULT_META):
    """Load the FAISS index and its JSON metadata from disk.

    Args:
        index_path: Location of the serialized FAISS index.
        meta_path: Location of the UTF-8 encoded JSON metadata file.

    Returns:
        A ``(index, metadata)`` tuple: the object returned by
        ``faiss.read_index`` and the decoded JSON document.

    Raises:
        FileNotFoundError: If either file is missing. The index path is
            checked first, and nothing is read when either check fails.
    """
    index_present = index_path.exists()
    if not index_present:
        raise FileNotFoundError(f'FAISS index not found: {index_path}')

    meta_present = meta_path.exists()
    if not meta_present:
        raise FileNotFoundError(f'Metadata file not found: {meta_path}')

    # faiss.read_index is given a plain string path, so convert the Path first.
    loaded_index = faiss.read_index(str(index_path))
    raw_meta = meta_path.read_text(encoding='utf-8')
    return loaded_index, json.loads(raw_meta)


def list_ids(index, metadata, n: int = 20):
    """Print the first ``n`` metadata entries in ascending numeric id order.

    Args:
        index: Not used by this function.
        metadata: Mapping whose keys are integer ids encoded as strings.
        n: Upper bound on the number of entries printed (applied as a slice).
    """
    # Keys are strings, so convert to int to get numeric rather than
    # lexicographic ordering.
    numeric_ids = sorted(map(int, metadata.keys()))
    for entry_id in numeric_ids[:n]:
        entry = metadata.get(str(entry_id), '')
        print('id={} -> Title: {}'.format(entry_id, entry))


def show_vector(index, idx: int, top_dims: int = 8):
    """Print the norm and leading components of the vector stored under ``idx``.

    Args:
        index: Object exposing ``reconstruct(id)``; not every FAISS index type
            supports it, in which case the failure is reported instead.
        idx: Id of the stored vector to inspect.
        top_dims: Number of leading components to include in the output.

    Any exception raised while reconstructing or formatting the vector is
    caught and printed as a one-line message rather than propagated.
    """
    try:
        stored = np.array(index.reconstruct(int(idx))).astype('float32')
        magnitude = np.linalg.norm(stored)
        leading = stored[:top_dims].tolist()
        print('id={} vector norm={:.4f} first {} dims: {}'.format(idx, magnitude, top_dims, leading))
    except Exception as e:
        print('Could not reconstruct vector for id {}: {}'.format(idx, e))


def query_by_text(index, metadata, query: str, k: int = 5, model_name: str = MODEL_NAME):
    """Encode ``query`` with a sentence-transformers model and print its nearest neighbors.

    Args:
        index: FAISS index to search.
        metadata: Mapping from stringified vector id to the value printed as
            the result title.
        query: Free-text query to embed.
        k: Number of neighbors requested from the index.
        model_name: Model identifier passed to ``SentenceTransformer``.

    The embedding model is loaded on the CPU on every call.
    """
    encoder = SentenceTransformer(model_name, device='cpu')
    query_vec = encoder.encode([query], convert_to_numpy=True).astype('float32')
    # normalize_L2 rescales the array in place.
    # NOTE(review): presumably the indexed vectors were normalized the same
    # way when the index was built -- confirm against index_faiss.py.
    faiss.normalize_L2(query_vec)

    all_scores, all_ids = index.search(query_vec, k)
    # A single query was submitted, so only the first result row is relevant.
    row_scores, row_ids = all_scores[0], all_ids[0]

    print('Query: "{}" -> top {} results:'.format(query, k))
    for score, hit in zip(row_scores, row_ids):
        # -1 entries are skipped (FAISS uses -1 for empty result slots).
        if hit != -1:
            hit_id = int(hit)
            title = metadata.get(str(hit_id), '')
            print('  id={} score={:.4f} title={}'.format(hit_id, float(score), title))


def query_by_id(index, metadata, qid: int, k: int = 5):
    """Print the nearest neighbors of the vector already stored under ``qid``.

    Args:
        index: FAISS index exposing ``reconstruct`` and ``search``.
        metadata: Mapping from stringified vector id to the value printed as
            the result title.
        qid: Id of the stored vector to use as the query.
        k: Number of neighbors requested from the index.

    Returns:
        None. If the vector cannot be reconstructed, a message is printed and
        the function returns without searching.
    """
    try:
        stored = index.reconstruct(int(qid)).astype('float32')
    except Exception as e:
        print('Could not reconstruct vector for id {}: {}'.format(qid, e))
        return

    # index.search takes a 2-D batch, so add a leading batch axis of size 1.
    probe = np.expand_dims(stored, axis=0)
    faiss.normalize_L2(probe)  # rescales the array in place

    all_scores, all_ids = index.search(probe, k)
    row_scores, row_ids = all_scores[0], all_ids[0]

    print('Neighbors for id={} ->'.format(qid))
    for score, hit in zip(row_scores, row_ids):
        # -1 entries are skipped (FAISS uses -1 for empty result slots).
        if hit != -1:
            hit_id = int(hit)
            title = metadata.get(str(hit_id), '')
            print('  id={} score={:.4f} title={}'.format(hit_id, float(score), title))


def _parse_args(argv=None):
    """Build the command-line parser and parse ``argv``.

    Args:
        argv: Argument list handed to ``ArgumentParser.parse_args``; ``None``
            makes argparse fall back to ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace`` with attributes ``index``, ``meta``,
        ``list``, ``show_id``, ``query_text``, ``query_id`` and ``k``.
    """
    parser = argparse.ArgumentParser(description='Inspect/query FAISS index')

    # Input file locations.
    parser.add_argument('--index', '-x', default=str(DEFAULT_INDEX))
    parser.add_argument('--meta', '-m', default=str(DEFAULT_META))

    # Actions. A bare ``--list`` (no value) yields 20 through ``const``.
    parser.add_argument(
        '--list', '-l',
        nargs='?',
        const=20,
        type=int,
        help='List first N ids (default 20)',
    )
    parser.add_argument('--show-id', type=int, help='Show vector info for given id')
    parser.add_argument('--query-text', type=str, help='Query by text')
    parser.add_argument('--query-id', type=int, help='Query by existing id (find neighbors)')

    # Shared option for both query actions.
    parser.add_argument('--k', type=int, default=5, help='Number of neighbors to return')

    parsed = parser.parse_args(argv)
    return parsed


def main(argv=None):
    """CLI entry point: parse arguments and run the first requested action.

    Actions are checked in a fixed order (``--list``, ``--show-id``,
    ``--query-text``, ``--query-id``) and only the first one present runs.
    When no action is requested, a hint is printed and the index and metadata
    files are never opened, so a bare invocation neither pays the load cost
    nor fails on missing files.

    Args:
        argv: Argument list forwarded to ``_parse_args``; ``None`` lets
            argparse read the process arguments.

    Raises:
        FileNotFoundError: Propagated from ``load_index_and_meta`` when an
            action is requested but the index or metadata file is missing.
    """
    args = _parse_args(argv)

    wants_list = args.list is not None
    wants_show = args.show_id is not None
    # Truthiness test on purpose: an empty --query-text counts as "not given".
    wants_text = bool(args.query_text)
    wants_neighbors = args.query_id is not None

    # Decide before touching the disk: loading the index is only worthwhile
    # when there is something to do with it.
    if not (wants_list or wants_show or wants_text or wants_neighbors):
        print('No action specified. Use --help for usage examples.')
        return

    index, metadata = load_index_and_meta(Path(args.index), Path(args.meta))

    if wants_list:
        list_ids(index, metadata, args.list)
        return

    if wants_show:
        show_vector(index, args.show_id)
        return

    if wants_text:
        query_by_text(index, metadata, args.query_text, k=args.k)
        return

    # Only --query-id remains at this point.
    query_by_id(index, metadata, args.query_id, k=args.k)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()