File size: 1,589 Bytes
8d25d65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import json
import faiss
import numpy as np
import re
from sentence_transformers import SentenceTransformer

# Matches typical email addresses; used to scrub PII from chunk text before it is
# returned to callers. NOTE(review): re.IGNORECASE is redundant here — the character
# classes already list both cases — but it is harmless and kept for safety.
EMAIL_PATTERN = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}", re.IGNORECASE)

def load_index_and_meta(meta_path, chunks_path, index_path, model_name="all-MiniLM-L6-v2"):
    """Load all retrieval artifacts needed by ``retrieve_top_k``.

    Args:
        meta_path: Path to a JSONL file; each line is one metadata record
            (expected to carry at least ``page`` and ``chunk_id`` keys).
        chunks_path: Path to a JSONL file; each line is an object with
            ``page``, ``chunk_id`` and ``text`` fields.
        index_path: Path to a serialized FAISS index readable by
            ``faiss.read_index``.
        model_name: SentenceTransformer model to load for query embedding.
            Defaults to the previously hard-coded "all-MiniLM-L6-v2", so
            existing callers are unaffected.

    Returns:
        Tuple ``(meta_list, mapping, index, embed_model)`` where ``mapping``
        maps ``(page, chunk_id)`` to the chunk text with email addresses
        already replaced by "[REDACTED_EMAIL]".

    Raises:
        OSError / json.JSONDecodeError on unreadable or malformed input files.
    """
    # One JSON object per line (JSONL).
    with open(meta_path, "r", encoding="utf-8") as f:
        meta_list = [json.loads(line) for line in f]

    mapping = {}
    with open(chunks_path, "r", encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            # Redact emails at load time so raw addresses never sit in memory
            # past this point.
            mapping[(obj["page"], obj["chunk_id"])] = EMAIL_PATTERN.sub(
                "[REDACTED_EMAIL]", obj["text"]
            )

    index = faiss.read_index(index_path)
    embed_model = SentenceTransformer(model_name)

    return meta_list, mapping, index, embed_model


def retrieve_top_k(query, top_k, min_score, index, embed_model, meta_list, mapping):
    """Embed *query*, search *index*, and return the matching chunks.

    Args:
        query: Free-text query string.
        top_k: Number of nearest neighbours to request from FAISS.
        min_score: Similarity threshold; hits scoring below it are dropped.
        index: FAISS index whose row ids align with ``meta_list``.
        embed_model: SentenceTransformer used to encode the query.
        meta_list: Per-row metadata records with ``page`` and ``chunk_id``.
        mapping: ``(page, chunk_id) -> text`` lookup built at load time.

    Returns:
        List of dicts with keys ``score``, ``page``, ``chunk_id``, ``text``,
        in the order FAISS returned them. Email addresses in ``text`` are
        replaced with "[REDACTED_EMAIL]".
    """
    query_vec = embed_model.encode([query], convert_to_numpy=True).astype("float32")
    # L2-normalize so inner-product scores behave as cosine similarity.
    faiss.normalize_L2(query_vec)

    scores, row_ids = index.search(query_vec, top_k)

    hits = []
    for score, row in zip(scores[0], row_ids[0]):
        # Drop low-similarity hits, FAISS padding ids (-1), and anything
        # outside the metadata table.
        if score < min_score or not (0 <= row < len(meta_list)):
            continue

        record = meta_list[row]
        page_no = record["page"]
        cid = record["chunk_id"]
        # Redact again defensively in case the mapping text was not scrubbed.
        chunk_text = EMAIL_PATTERN.sub(
            "[REDACTED_EMAIL]", mapping.get((page_no, cid), "")
        )

        hits.append(
            {
                "score": float(score),
                "page": page_no,
                "chunk_id": cid,
                "text": chunk_text,
            }
        )

    return hits