File size: 2,692 Bytes
f866820
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# RAG-document-assistant/scripts/search_documents.py
"""
Local similarity search (cosine) over embeddings.jsonl.

Purpose:
    Performs local similarity search using cosine similarity over pre-generated embeddings.
    Useful for testing and debugging search functionality without connecting to a vector database.

Inputs:
    embeddings_path (str): Path to embeddings.jsonl file
    query (str): Search query text
    k (int, optional): Number of results to return (default: 3)
    dim (int, optional): Embedding dimension (default: 64)

Outputs:
    Prints top-k results with id, filename, chunk_id, and similarity score

Usage:
    python scripts/search_documents.py /path/to/embeddings.jsonl "query text" [k] [dim]

Example:
    python scripts/search_documents.py ./data/chunks.jsonl "what is GDPR" 5 384
"""

import sys
import json
import math
from pathlib import Path
from typing import List
from src.ingestion.embeddings import get_embedding

def load_embeddings(path: str):
    """Read a JSONL file and return its records as a list.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of aborting the whole load.

    Args:
        path: Location of the JSONL file.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON; the
            message is prefixed with the file path and 1-based line number.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(path)
    items = []
    with path.open("r", encoding="utf-8") as fh:
        for lineno, raw in enumerate(fh, start=1):
            text = raw.strip()
            if not text:
                # An empty line is not a record; json.loads("") would raise.
                continue
            try:
                items.append(json.loads(text))
            except json.JSONDecodeError as err:
                # Keep the exception type callers may already catch, but say
                # which line of which file is malformed.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {err.msg}", err.doc, err.pos
                ) from err
    return items

def dot(a: List[float], b: List[float]) -> float:
    """Return the sum of pairwise products of ``a`` and ``b``.

    Elements are paired positionally; pairing stops at the end of the
    shorter sequence.
    """
    total = 0
    for left, right in zip(a, b):
        total += left * right
    return total

def norm(a: List[float]) -> float:
    """Return the Euclidean (L2) length of the vector ``a``."""
    squares = 0
    for component in a:
        squares += component * component
    return math.sqrt(squares)

def cosine_sim(a: List[float], b: List[float]) -> float:
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Args:
        a: First vector.
        b: Second vector; must have the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``, or ``0.0`` when either vector
        has zero length (the similarity is undefined there).

    Raises:
        ValueError: If the vectors have different lengths. The dot product
            pairs elements with ``zip``, which would otherwise silently drop
            the extra components and produce a meaningless score.
    """
    if len(a) != len(b):
        raise ValueError(
            f"vector length mismatch: {len(a)} != {len(b)}"
        )
    na = norm(a)
    nb = norm(b)
    if na == 0 or nb == 0:
        # Avoid dividing by zero for all-zero (or empty) vectors.
        return 0.0
    return dot(a, b) / (na * nb)

def search(embeddings_path: str, query: str, k: int = 3, dim: int = 64):
    """Rank stored records by cosine similarity to ``query``.

    Args:
        embeddings_path: Path to the JSONL file read by ``load_embeddings``.
        query: Text that is embedded with the ``"local"`` provider.
        k: Maximum number of hits to return. Zero or a negative value
            yields an empty list.
        dim: Dimension passed to ``get_embedding`` for the query vector.

    Returns:
        Up to ``k`` ``(score, record)`` tuples, highest score first. Records
        whose ``"embedding"`` entry is missing or empty are ignored.
    """
    items = load_embeddings(embeddings_path)
    qvec = get_embedding(query, provider="local", dim=dim)
    scored = [
        (cosine_sim(qvec, it["embedding"]), it)
        for it in items
        if it.get("embedding")
    ]
    # Sort on the score only, so records themselves are never compared.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    # A negative bound would slice from the end (dropping the worst |k| hits
    # instead of returning none), so clamp it at zero.
    return scored[:max(k, 0)]

def print_results(results):
    """Print a fixed-width table of search hits to stdout.

    ``results`` is an iterable of ``(score, record)`` pairs; each record
    must provide ``id``, ``filename`` and ``chunk_id``. The id and filename
    columns are cut to 60 and 40 characters respectively.
    """
    header = f"{'SCORE':>8}  {'ID':60}  {'FILENAME':40}  {'CHUNK_ID':>7}"
    print(header)
    print("-"*130)
    for score, record in results:
        ident = record['id'][:60]
        fname = record['filename'][:40]
        chunk = record['chunk_id']
        print(f"{score:8.4f}  {ident:60}  {fname:40}  {chunk:7d}")

if __name__ == "__main__":
    # Two positional arguments are required: the embeddings file and the query.
    if not len(sys.argv) >= 3:
        print("Usage: python3 scripts/search_documents.py /path/to/embeddings.jsonl \"query text\" [k] [dim]")
        sys.exit(1)
    emb_path, query = sys.argv[1:3]
    # Optional trailing arguments fall back to the defaults used by search().
    k = 3 if len(sys.argv) <= 3 else int(sys.argv[3])
    dim = 64 if len(sys.argv) <= 4 else int(sys.argv[4])

    results = search(emb_path, query, k=k, dim=dim)
    print_results(results)