Spaces:
Sleeping
Sleeping
| from langchain.document_loaders import TextLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.embeddings import OpenAIEmbeddings | |
| from langchain.vectorstores import Chroma | |
# Chunk size handed to RecursiveCharacterTextSplitter as ``chunk_size`` in
# get_text_chunk. NOTE(review): presumably measured in characters (the
# splitter's default length function) - confirm against the LangChain docs.
CHUNK_SIZE = 500
def get_documents(filename: str):
    """Load the text file at ``filename`` using ``TextLoader``.

    Args:
        filename: Path of the text file handed to ``TextLoader``.

    Returns:
        Whatever ``TextLoader(filename).load()`` produces for that file.
    """
    return TextLoader(filename).load()
def get_text_chunk(docs, *, chunk_size=None, chunk_overlap=0):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split (e.g. the result of ``get_documents``).
        chunk_size: Value passed to the splitter as ``chunk_size``. ``None``
            (the default) falls back to the module-level ``CHUNK_SIZE``, so
            existing callers keep the previous behaviour.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 0, matching the previously hard-coded value.

    Returns:
        The result of ``split_documents(docs)`` on the configured splitter.
    """
    if chunk_size is None:
        # Resolved at call time rather than as a default argument so that a
        # later change to CHUNK_SIZE is still honoured.
        chunk_size = CHUNK_SIZE
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)
def store(texts, dir_name):
    """Embed ``texts`` and persist them in a Chroma store under ``dir_name``.

    Args:
        texts: Document chunks passed to ``Chroma.from_documents``.
        dir_name: Directory passed to Chroma as ``persist_directory``.

    Returns:
        None. The store is built and ``persist()`` is called on it.
    """
    vector_db = Chroma.from_documents(
        texts,
        OpenAIEmbeddings(),
        persist_directory=dir_name,
    )
    vector_db.persist()
def main(filename: str, dir_name: str):
    """Run the full pipeline: load ``filename``, chunk it, store the chunks.

    Args:
        filename: Text file passed to ``get_documents``.
        dir_name: Directory passed to ``store`` for the persisted database.
    """
    # Load -> split -> store, each stage feeding the next.
    store(get_text_chunk(get_documents(filename)), dir_name)
if __name__ == "__main__":
    import sys

    # Exactly two positional arguments are required: the text file to ingest
    # and the directory in which the vector store is persisted.
    if len(sys.argv) != 3:
        # sys.exit() with a string writes it to stderr and exits with status 1,
        # so callers and shell scripts can detect the usage error.
        sys.exit(f"Usage: {sys.argv[0]} <text_filename> <persist_dir_name>")

    main(sys.argv[1], sys.argv[2])