cryogenic22 committed on
Commit
ea88b9e
·
verified ·
1 Parent(s): d178ae1

Create utils/vector_store.py

Browse files
Files changed (1) hide show
  1. utils/vector_store.py +58 -0
utils/vector_store.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils/vector_store.py
2
+ import chromadb
3
+ from chromadb.config import Settings
4
+ from typing import List, Dict, Optional
5
+ import numpy as np
6
+
7
class VectorStore:
    """Small wrapper around a Chroma collection that stores document chunks.

    The store keeps a Chroma client and a single collection named
    ``legal_docs``; chunks are added with caller-supplied embeddings and
    retrieved either by text query or in bulk.
    """

    def __init__(self, persist_directory: str = "./data/chroma"):
        """Create the Chroma client and open (or create) ``legal_docs``.

        Args:
            persist_directory: Directory handed to Chroma's ``Settings`` as
                its ``persist_directory``.
        """
        # NOTE(review): ``chroma_db_impl="duckdb+parquet"`` is the legacy
        # Settings-based configuration; confirm it matches the pinned
        # chromadb version before upgrading the dependency.
        self.client = chromadb.Client(Settings(
            chroma_db_impl="duckdb+parquet",
            persist_directory=persist_directory,
        ))
        self.collection = self.client.get_or_create_collection("legal_docs")

    def add_documents(self, chunks: List[Dict], metadata: Optional[Dict] = None):
        """Add document chunks to the vector store, one record per chunk.

        Args:
            chunks: Chunk dicts. Each must provide ``"text"``,
                ``"embeddings"`` (an object exposing ``.tolist()``) and a
                ``"metadata"`` dict with ``"length"`` and ``"token_count"``.
            metadata: Optional extra metadata merged into every chunk's
                metadata (its keys win on conflict). Its ``"filename"``
                entry, when present, prefixes the record ids; otherwise
                the prefix is ``"doc"``.

        Raises:
            KeyError: If a chunk lacks one of the required keys.
        """
        # ``metadata`` defaults to None, so it must not be dereferenced
        # directly when building ids; resolve the prefix once up front.
        id_prefix = (metadata or {}).get("filename", "doc")

        for i, chunk in enumerate(chunks):
            chunk_metadata = {
                "chunk_id": i,
                "length": chunk["metadata"]["length"],
                "token_count": chunk["metadata"]["token_count"],
            }
            if metadata:
                chunk_metadata.update(metadata)

            self.collection.add(
                embeddings=[chunk["embeddings"].tolist()],
                documents=[chunk["text"]],
                metadatas=[chunk_metadata],
                ids=[f"{id_prefix}_{i}"],
            )

    def search(self, query: str, n_results: int = 5) -> List[Dict]:
        """Search for stored chunks similar to ``query``.

        Args:
            query: Query text, passed to the collection as ``query_texts``.
            n_results: Maximum number of matches requested.

        Returns:
            One dict per match with the keys ``"text"``, ``"metadata"`` and
            ``"distance"``, in the order the collection returned them.
        """
        # NOTE(review): the query is embedded by the collection itself,
        # while add_documents stores caller-supplied embeddings; presumably
        # both come from the same model -- verify against the caller.
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results,
            include=["documents", "metadatas", "distances"],
        )

        # Only one query text is sent, so only the first result row is read.
        return [
            {
                "text": doc,
                "metadata": meta,
                "distance": dist,
            }
            for doc, meta, dist in zip(
                results["documents"][0],
                results["metadatas"][0],
                results["distances"][0],
            )
        ]

    def get_all_documents(self) -> Dict:
        """Get all stored documents.

        Returns:
            Whatever ``collection.get`` returns when asked to include
            documents, metadatas and embeddings, passed through unchanged.
            NOTE(review): this is presumably a single dict of parallel
            lists rather than a list of per-document dicts -- confirm
            against the Chroma version in use.
        """
        return self.collection.get(
            include=["documents", "metadatas", "embeddings"]
        )