Spaces:

Deevyankar
/

BrainChat

Sleeping

App Files Files Community

Deevyankar committed on Mar 15

Commit

3c9300b

1 Parent(s): 5c2bf1e

Create ingest.py

Browse files

Files changed (1) hide show

ingest.py +61 -0

ingest.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import os
+import chromadb
+from llama_index.core import VectorStoreIndex, StorageContext, SimpleDirectoryReader
+from llama_index.core.node_parser import SentenceSplitter
+from llama_index.vector_stores.chroma import ChromaVectorStore
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+COLLECTION_NAME = "neuro_course"
+TEXT_DIR = "processed/chapters"
+PERSIST_DIR = "/data/chroma" if os.path.exists("/data") else "storage/chroma"
+def load_docs():
+    if not os.path.exists(TEXT_DIR):
+        raise FileNotFoundError(f"{TEXT_DIR} does not exist")
+    docs = SimpleDirectoryReader(TEXT_DIR).load_data()
+    if not docs:
+        raise ValueError("No text documents found for ingestion")
+    return docs
+def main():
+    os.makedirs(PERSIST_DIR, exist_ok=True)
+    docs = load_docs()
+    splitter = SentenceSplitter(
+        chunk_size=700,
+        chunk_overlap=100
+    )
+    embed_model = HuggingFaceEmbedding(
+        model_name="intfloat/multilingual-e5-base"
+    )
+    client = chromadb.PersistentClient(path=PERSIST_DIR)
+    try:
+        client.delete_collection(COLLECTION_NAME)
+    except Exception:
+        pass
+    collection = client.get_or_create_collection(COLLECTION_NAME)
+    vector_store = ChromaVectorStore(chroma_collection=collection)
+    storage_context = StorageContext.from_defaults(vector_store=vector_store)
+    VectorStoreIndex.from_documents(
+        docs,
+        storage_context=storage_context,
+        embed_model=embed_model,
+        transformations=[splitter]
+    )
+    print("Vector database created successfully.")
+# Run the ingestion only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()