Spaces:

DrishtiSharma
/

chat-w-docs-via-speech-or-text

Sleeping

DrishtiSharma committed on Dec 9, 2024

Commit

a72c887

verified ·

1 Parent(s): d2af2e8

Create vectorize_documents.py

Files changed (1) hide show

vectorize_documents.py ADDED Viewed

+from langchain_community.document_loaders import UnstructuredFileLoader
+from langchain_community.document_loaders import DirectoryLoader
+from langchain_text_splitters import CharacterTextSplitter
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_chroma import Chroma
+# loaidng the embedding model
+embeddings = HuggingFaceEmbeddings()
+loader = DirectoryLoader(path="data",
+                         glob="./*.pdf",
+                         loader_cls=UnstructuredFileLoader)
+documents = loader.load()
+text_splitter = CharacterTextSplitter(chunk_size=2000,
+                                      chunk_overlap=500)
+text_chunks = text_splitter.split_documents(documents)
+vectordb = Chroma.from_documents(
+    documents=text_chunks,
+    embedding=embeddings,
+    persist_directory="vector_db_dir"
+)
+print("Documents Vectorized")