# Source: Hugging Face Space (status when captured: Sleeping)
| from langchain.schema import Document | |
| import pickle | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_chroma import Chroma | |
| from langchain.retrievers import ParentDocumentRetriever | |
| from langchain.storage import InMemoryStore | |
| import os | |
| from typing import Iterable | |
| import json | |
| from tqdm import tqdm | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
# Hugging Face model id handed to HuggingFaceEmbeddings; the resulting
# embedding object is passed to parent_retriever() further down.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
embedding = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
)
def parent_retriever(chroma_path, embeddings):
    '''
    Assemble a ParentDocumentRetriever on top of a persisted Chroma collection.

    chroma_path: directory passed to Chroma as ``persist_directory``.
    embeddings: embedding function used by the "full_documents" collection.

    Returns the configured retriever. Its docstore is a new InMemoryStore,
    so the parent chunks are not persisted by this function.
    '''
    # Parent chunks: chunk_size=2000 with an overlap of 500.
    splitter_for_parents = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=500,
    )
    # Child chunks: chunk_size=300 with an overlap of 50 (the small chunks).
    splitter_for_children = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50,
    )

    # Storage layer for the parent chunks.
    parent_store = InMemoryStore()
    # Vector store handed to the retriever; persisted under chroma_path.
    child_index = Chroma(
        collection_name="full_documents",
        embedding_function=embeddings,
        persist_directory=chroma_path,
    )

    return ParentDocumentRetriever(
        vectorstore=child_index,
        docstore=parent_store,
        child_splitter=splitter_for_children,
        parent_splitter=splitter_for_parents,
        search_kwargs={"k": 5},
    )
def save_to_pickle(obj, filename):
    '''
    Serialize ``obj`` into ``filename`` using pickle's highest protocol.

    obj: any picklable object (the script passes the retriever's docstore
        contents).
    filename: path of the output file; it is opened in binary write mode,
        so an existing file is overwritten.

    Returns None.
    '''
    with open(filename, "wb") as sink:
        # Same effect as pickle.dump(obj, sink, pickle.HIGHEST_PROTOCOL).
        writer = pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL)
        writer.dump(obj)
# Module-level retriever used by the indexing loop below; the vector store
# persists under the directory named here.
retriever_repos = parent_retriever(
    chroma_path='ohw_proj_chorma_db',
    embeddings=embedding,
)
def load_docs_from_jsonl(file_path) -> Iterable[Document]:
    '''
    Read a JSON Lines file and build one Document per non-blank line.

    file_path: path of the file to read; every non-blank line must hold a
        JSON object whose keys are accepted as keyword arguments by Document.

    Returns a list of Document objects in file order (empty for an empty
    file).
    Raises json.JSONDecodeError when a non-blank line is not valid JSON.
    '''
    # Decode explicitly as UTF-8 instead of relying on the platform locale,
    # which can mis-decode or reject non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        return [
            Document(**json.loads(line))
            for line in jsonl_file
            # Blank lines (e.g. a trailing newline at EOF) carry no record.
            if line.strip()
        ]
# Load the documents from the JSON Lines file and feed them to the
# retriever one at a time, so tqdm advances once per document.
documents = load_docs_from_jsonl('project_readmes.json')
# Iterate the documents directly: no index arithmetic, and no dependency on
# len(), which the loader's Iterable return annotation does not promise.
for document in tqdm(documents):
    retriever_repos.add_documents([document])

# The retriever's docstore was created as an InMemoryStore, so write its
# `store` attribute to disk to keep the parent chunks after the script exits.
save_to_pickle(retriever_repos.docstore.store, 'ohw_proj_chorma_db.pcl')