Spaces:
Sleeping
Sleeping
File size: 2,692 Bytes
f866820 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
# RAG-document-assistant/scripts/search_documents.py
"""
Local similarity search (cosine) over embeddings.jsonl.
Purpose:
Performs local similarity search using cosine similarity over pre-generated embeddings.
Useful for testing and debugging search functionality without connecting to a vector database.
Inputs:
embeddings_path (str): Path to embeddings.jsonl file
query (str): Search query text
k (int, optional): Number of results to return (default: 3)
dim (int, optional): Dimension of the query embedding; it should match the dimension of the embeddings stored in the file (default: 64)
Outputs:
Prints top-k results with id, filename, chunk_id, and similarity score
Usage:
python scripts/search_documents.py /path/to/embeddings.jsonl "query text" [k] [dim]
Example:
python scripts/search_documents.py ./data/chunks.jsonl "what is GDPR" 5 384
"""
import sys
import json
import math
from pathlib import Path
from typing import List
from src.ingestion.embeddings import get_embedding
def load_embeddings(path: str) -> List[dict]:
    """Load every record from a JSON Lines file.

    Args:
        path: Location of the ``.jsonl`` file; each non-blank line must
            hold one JSON document.

    Returns:
        The decoded records, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    file_path = Path(path)
    if not file_path.exists():
        raise FileNotFoundError(file_path)
    items = []
    with file_path.open("r", encoding="utf-8") as fh:
        for line in fh:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no record and would make json.loads raise.
            if not line.strip():
                continue
            items.append(json.loads(line))
    return items
def dot(a: List[float], b: List[float]) -> float:
    """Return the dot product of *a* and *b*.

    Components are paired positionally; if the vectors differ in length the
    surplus components of the longer one are ignored.
    """
    products = [left * right for left, right in zip(a, b)]
    return sum(products)
def norm(a: List[float]) -> float:
    """Return the Euclidean (L2) length of vector *a*."""
    squares = [component * component for component in a]
    return math.sqrt(sum(squares))
def cosine_sim(a: List[float], b: List[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``. When either vector has
    zero length the ratio is undefined, so ``0.0`` is returned instead.
    """
    length_a, length_b = norm(a), norm(b)
    if length_a != 0 and length_b != 0:
        return dot(a, b) / (length_a * length_b)
    return 0.0
def search(embeddings_path: str, query: str, k: int = 3, dim: int = 64):
    """Rank the stored records by cosine similarity to *query*.

    Args:
        embeddings_path: Path to the JSON Lines file read by
            ``load_embeddings``.
        query: Text to embed with the "local" provider and compare against
            each record's ``"embedding"`` value.
        k: Maximum number of results to return; must not be negative.
        dim: Dimension requested for the query embedding.

    Returns:
        A list of at most ``k`` ``(score, record)`` tuples, highest score
        first. Records with a missing or empty ``"embedding"`` are skipped.

    Raises:
        ValueError: If ``k`` is negative, or if a stored embedding's length
            differs from the query embedding's length.
    """
    # A negative k would make the final slice drop results from the tail
    # instead of limiting the count.
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    items = load_embeddings(embeddings_path)
    qvec = get_embedding(query, provider="local", dim=dim)
    query_len = len(qvec)

    scored = []
    for it in items:
        emb = it.get("embedding")
        if not emb:
            continue
        # The dot product pairs components positionally and ignores any
        # surplus, so mismatched lengths would give a meaningless score.
        if len(emb) != query_len:
            raise ValueError(
                f"embedding dimension mismatch for record {it.get('id')!r}: "
                f"stored vector has {len(emb)} components, "
                f"query vector has {query_len} (dim={dim})"
            )
        scored.append((cosine_sim(qvec, emb), it))

    # Sorting on the score alone keeps file order among ties and avoids
    # comparing the record dicts themselves.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored[:k]
def print_results(results):
    """Print a fixed-width table of search results to standard output.

    Args:
        results: Iterable of ``(score, record)`` pairs. Each record must
            provide string ``'id'`` and ``'filename'`` values (shown
            truncated to 60 and 40 characters) and an integer ``'chunk_id'``.
    """
    header = f"{'SCORE':>8} {'ID':60} {'FILENAME':40} {'CHUNK_ID':>7}"
    print(header)
    print("-" * 130)
    for similarity, record in results:
        # Cells are formatted left to right, one column at a time.
        cells = [
            format(similarity, "8.4f"),
            format(record['id'][:60], "60"),
            format(record['filename'][:40], "40"),
            format(record['chunk_id'], "7d"),
        ]
        print(" ".join(cells))
if __name__ == "__main__":
    # Command-line entry point: <embeddings.jsonl> <query> [k] [dim].
    usage = "Usage: python3 scripts/search_documents.py /path/to/embeddings.jsonl \"query text\" [k] [dim]"
    if len(sys.argv) < 3:
        print(usage)
        raise SystemExit(1)
    emb_path = sys.argv[1]
    query = sys.argv[2]
    try:
        k = int(sys.argv[3]) if len(sys.argv) > 3 else 3
        dim = int(sys.argv[4]) if len(sys.argv) > 4 else 64
    except ValueError:
        # A non-numeric k or dim is a usage error, not a crash.
        print("k and dim must be integers", file=sys.stderr)
        print(usage)
        raise SystemExit(1) from None
    results = search(emb_path, query, k=k, dim=dim)
    print_results(results)