Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PyMuPDFLoader | |
| from langchain_community.document_loaders import TextLoader | |
| from langchain_community.embeddings.sentence_transformer import ( | |
| SentenceTransformerEmbeddings, | |
| ) | |
| import os | |
| from langchain.storage import InMemoryStore | |
| from langchain_community.document_loaders import TextLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain.retrievers import ParentDocumentRetriever | |
| from langchain_community.vectorstores import Chroma | |
| from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter | |
# ---------------------------------------------------------------------------
# Knowledge-base setup for the RAG pipeline.
# Background reading:
#   https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0
#
# NOTE: spreadsheet ingestion is currently disabled. It previously read the
# sheets "98_row_Mental_Health_FAQs", "Counsellor_Chats" and
# "99_rows_Human_&_Therapist" from
# "/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx"
# via pd.read_excel. Only the PDF below is indexed.
# ---------------------------------------------------------------------------

# Resolve the data file relative to this script so the working directory
# does not matter.
script_dir = os.path.dirname(os.path.abspath(__file__))
_pdf_path = os.path.join(script_dir, 'Data', 'PDFs', 'DepressionGuide-web.pdf')
loader = PyMuPDFLoader(_pdf_path)
documents = loader.load()

# Open-source sentence-embedding model.
# Model card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Parent/child splitting scheme, see
# https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever
# Parent chunks are the larger pieces (chunk_size=2000).
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
# Child chunks must be smaller than their parents (chunk_size=400); these are
# the pieces that get embedded and indexed.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# Vector store that indexes the child chunks.
vectorstore = Chroma(
    collection_name="split_parents",
    embedding_function=embedding_function,
)

# Storage layer holding the parent documents.
store = InMemoryStore()
# Process-wide cache so the PDF is indexed into the shared stores only once.
_rag_retriever = None


def instantiate_rag():
    """Return the parent-document retriever backed by the module-level stores.

    The first call builds a ``ParentDocumentRetriever`` over the module-level
    ``vectorstore`` / ``store`` using ``child_splitter`` and
    ``parent_splitter``, and indexes ``documents`` into it. Later calls return
    that same retriever without indexing again.

    Returns:
        The shared ``ParentDocumentRetriever`` instance.
    """
    global _rag_retriever
    if _rag_retriever is None:
        retriever = ParentDocumentRetriever(
            vectorstore=vectorstore,
            docstore=store,
            child_splitter=child_splitter,
            parent_splitter=parent_splitter,
        )
        # add_documents() is called without explicit ids, so repeating it
        # against the same shared stores would index the same content again
        # and produce duplicate hits. Guarding with the cache keeps the
        # function idempotent.
        retriever.add_documents(documents)
        # Cache only after indexing succeeded, so a failed first attempt does
        # not leave an empty retriever behind.
        _rag_retriever = retriever
    return _rag_retriever