Deevyankar committed on
Commit
3c9300b
·
1 Parent(s): 5c2bf1e

Create ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +61 -0
ingest.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import chromadb
3
+
4
+ from llama_index.core import VectorStoreIndex, StorageContext, SimpleDirectoryReader
5
+ from llama_index.core.node_parser import SentenceSplitter
6
+ from llama_index.vector_stores.chroma import ChromaVectorStore
7
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
8
+
9
# Name of the Chroma collection that ingestion writes into.
COLLECTION_NAME = "neuro_course"

# Directory the source documents are read from.
TEXT_DIR = "processed/chapters"

# Where the Chroma database is persisted: under /data when that path exists
# on this machine, otherwise in a directory relative to the working directory.
if os.path.exists("/data"):
    PERSIST_DIR = "/data/chroma"
else:
    PERSIST_DIR = "storage/chroma"
12
+
13
+
14
def load_docs():
    """Read every document found under ``TEXT_DIR``.

    Returns:
        The non-empty result of ``SimpleDirectoryReader(TEXT_DIR).load_data()``.

    Raises:
        FileNotFoundError: if ``TEXT_DIR`` does not exist.
        ValueError: if the reader yields no documents.
    """
    if os.path.exists(TEXT_DIR):
        reader = SimpleDirectoryReader(TEXT_DIR)
        documents = reader.load_data()
        if documents:
            return documents
        raise ValueError("No text documents found for ingestion")

    raise FileNotFoundError(f"{TEXT_DIR} does not exist")
22
+
23
+
24
def main():
    """Rebuild the Chroma vector collection from the source documents.

    Steps:
      1. Ensure ``PERSIST_DIR`` exists.
      2. Load the documents with ``load_docs()``.
      3. Reset the ``COLLECTION_NAME`` collection so it starts empty.
      4. Split the documents with a ``SentenceSplitter`` (chunk_size=700,
         chunk_overlap=100), embed them with
         ``intfloat/multilingual-e5-base`` and store them in the collection.

    Raises:
        FileNotFoundError: propagated from ``load_docs()`` when the input
            directory is missing.
        ValueError: propagated from ``load_docs()`` when no documents are
            found.
    """
    os.makedirs(PERSIST_DIR, exist_ok=True)

    # Load the input before touching the stored collection, so a missing or
    # empty input directory never wipes existing data.
    docs = load_docs()

    splitter = SentenceSplitter(
        chunk_size=700,
        chunk_overlap=100,
    )

    # NOTE(review): the E5 model card recommends "query: " / "passage: " text
    # prefixes; none are configured here -- confirm how the retrieval side
    # embeds queries before changing this.
    embed_model = HuggingFaceEmbedding(
        model_name="intfloat/multilingual-e5-base",
    )

    client = chromadb.PersistentClient(path=PERSIST_DIR)

    # Reset the collection without a catch-all ``except``: guaranteeing the
    # collection exists first means delete_collection cannot fail merely
    # because it is missing, so any exception raised here is a real storage
    # problem and is allowed to propagate instead of being swallowed.
    client.get_or_create_collection(COLLECTION_NAME)
    client.delete_collection(COLLECTION_NAME)
    collection = client.get_or_create_collection(COLLECTION_NAME)

    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Building the index writes the embedded chunks into the vector store as a
    # side effect; the returned index object is not needed afterwards.
    VectorStoreIndex.from_documents(
        docs,
        storage_context=storage_context,
        embed_model=embed_model,
        transformations=[splitter],
    )

    print("Vector database created successfully.")


if __name__ == "__main__":
    main()