Al1Abdullah committed on
Commit
5f9784f
·
verified ·
1 Parent(s): 602e279

Create vectorstore.py

Browse files
Files changed (1) hide show
  1. src/vectorstore.py +86 -0
src/vectorstore.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pinecone import Pinecone, ServerlessSpec, PodSpec
2
+ from langchain_pinecone import PineconeVectorStore
3
+ from langchain_chroma import Chroma
4
+ from langchain_community.embeddings import OllamaEmbeddings
5
+ from langchain.indexes import SQLRecordManager, index
6
+
7
+ from src.pdf_handler import extract_pdf, load_pdf_directory, split_pdf
8
+
9
+ import os
10
+ import shutil
11
+ from dotenv import load_dotenv
12
+
13
+ load_dotenv()
14
+
15
+
16
def setup_pinecone(index_name, embedding_model, embedding_dim, metric='cosine', use_serverless=True):
    """Create a fresh Pinecone index and return a vector store bound to it.

    If an index called ``index_name`` already exists it is deleted before the
    new one is created, so every call starts from an empty index.

    Args:
        index_name: Name of the Pinecone index to (re)create.
        embedding_model: Embedding object handed to ``PineconeVectorStore``.
        embedding_dim: Vector dimension used when creating the index.
        metric: Similarity metric passed to ``create_index``.
        use_serverless: When true, use a serverless spec on AWS ``us-east-1``;
            otherwise use a pod-based spec whose environment is read from the
            ``PINECONE_ENVIRONMENT`` environment variable.

    Returns:
        The ``PineconeVectorStore`` connected to the newly created index.

    Raises:
        ValueError: If ``use_serverless`` is false and
            ``PINECONE_ENVIRONMENT`` is not set.
    """
    # Build the spec first so that configuration errors surface before any
    # client is constructed or any existing index is deleted.
    if use_serverless:
        spec = ServerlessSpec(cloud='aws', region='us-east-1')
    else:
        # PodSpec cannot be built without an environment; a bare PodSpec()
        # raises TypeError, so fail early with an actionable message instead.
        environment = os.environ.get('PINECONE_ENVIRONMENT')
        if not environment:
            raise ValueError(
                'PINECONE_ENVIRONMENT must be set to create a pod-based Pinecone index'
            )
        spec = PodSpec(environment=environment)

    pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

    # Drop any previous index with the same name before recreating it.
    if index_name in pc.list_indexes().names():
        pc.delete_index(index_name)

    pc.create_index(
        index_name,
        dimension=embedding_dim,
        metric=metric,
        spec=spec,
    )

    return PineconeVectorStore(index_name=index_name, embedding=embedding_model)
35
+
36
+
37
def setup_chroma(index_name, embedding_model, persist_directory=None):
    """Open a Chroma store persisted on local disk.

    Args:
        index_name: Passed to ``Chroma`` as its first positional argument.
        embedding_model: Embedding object used as the store's
            ``embedding_function``.
        persist_directory: Directory where Chroma persists its data. A falsy
            value selects ``./.cache/database``.

    Returns:
        The constructed ``Chroma`` instance.
    """
    storage_path = persist_directory or './.cache/database'

    # Ensure the persistence directory exists before Chroma is pointed at it.
    os.makedirs(storage_path, exist_ok=True)

    return Chroma(
        index_name,
        embedding_function=embedding_model,
        persist_directory=storage_path,
    )
45
+
46
+
47
class VectorDB:
    """Vector store plus a record manager for incremental PDF indexing.

    The backing store is Pinecone when ``db_name == 'pinecone'`` and Chroma
    otherwise. A SQLite-backed ``SQLRecordManager`` kept inside ``cache_dir``
    tracks what has already been indexed. The whole ``cache_dir`` is removed
    when the instance is finalized.
    """

    # Ollama model used to embed documents.
    EMBEDDING_MODEL = 'nomic-embed-text:latest'
    # Vector dimension used when creating the Pinecone index; it must match
    # the output size of EMBEDDING_MODEL.
    EMBEDDING_DIM = 768

    def __init__(self, db_name, index_name, cache_dir=None):
        """Set up the embedding model, the vector store and the record manager.

        Args:
            db_name: ``'pinecone'`` selects Pinecone; any other value selects
                Chroma.
            index_name: Index / collection name in the chosen store.
            cache_dir: Directory for local state. A falsy value selects
                ``./.cache/database``.
        """
        embedding = OllamaEmbeddings(model=self.EMBEDDING_MODEL, num_gpu=1)

        # Assign cache_dir before anything that can fail, so the finalizer
        # always knows what to clean up.
        self.cache_dir = cache_dir or './.cache/database'
        os.makedirs(self.cache_dir, exist_ok=True)

        if db_name == 'pinecone':
            self.vectorstore = setup_pinecone(index_name, embedding, self.EMBEDDING_DIM, 'cosine')
        else:
            self.vectorstore = setup_chroma(index_name, embedding, self.cache_dir)

        # The namespace keeps records of different stores/indexes separate
        # inside the shared SQLite file.
        namespace = f'{db_name}/{index_name}'
        self.record_manager = SQLRecordManager(
            namespace,
            db_url=f'sqlite:///{self.cache_dir}/record_manager_cache.sql',
        )
        self.record_manager.create_schema()

    def index(self, uploaded_file):
        """Extract, split and index an uploaded PDF into the vector store.

        Indexing runs with ``cleanup='full'`` and uses the ``source`` metadata
        key as the source id. The files in the extraction directory are
        removed afterwards, even when loading, splitting or indexing fails.

        Args:
            uploaded_file: The upload object accepted by ``extract_pdf``.
        """
        directory = extract_pdf(uploaded_file)
        try:
            docs = load_pdf_directory(directory)
            chunks = split_pdf(docs)

            # `index` here resolves to the module-level LangChain function,
            # not to this method.
            index(
                docs_source=chunks,
                record_manager=self.record_manager,
                vector_store=self.vectorstore,
                cleanup='full',
                source_id_key='source',
            )
        finally:
            # Always clear the extracted files so failed runs do not leave
            # stale PDFs behind for the next call.
            for entry in os.listdir(directory):
                os.remove(os.path.join(directory, entry))

    def as_retriever(self):
        """Return the underlying vector store's retriever."""
        return self.vectorstore.as_retriever()

    def __del__(self):
        """Remove the cache directory; never raise from the finalizer."""
        # cache_dir is missing when __init__ failed before setting it, and the
        # directory may already be gone (e.g. removed by another instance that
        # shared it), so both cases are tolerated.
        cache_dir = getattr(self, 'cache_dir', None)
        if cache_dir:
            shutil.rmtree(cache_dir, ignore_errors=True)