| | import os |
| | import shutil |
| | from langchain_community.document_loaders import TextLoader, PyPDFLoader |
| | from langchain_text_splitters import RecursiveCharacterTextSplitter |
| | from langchain_openai import OpenAIEmbeddings |
| | from langchain_community.vectorstores import FAISS |
| | from langchain_core.documents import Document |
| | from getpass import getpass |
| |
|
| | |
# Prompt for the key only when the environment does not already provide one,
# so a key exported in the shell is respected and non-interactive runs
# (CI, cron) are not blocked waiting on a password prompt.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Provide OpenAI API Key:")
| |
|
| | |
def create_combined_summary_vector_store():
    """Build one FAISS vector store from all Markdown summaries and save it.

    Reads every ``.md`` file in ``./CAPS_Summaries``, splits the text into
    overlapping chunks (3000 characters, 500 overlap), embeds the chunks with
    the ``text-embedding-3-large`` model and saves the index to
    ``./Combined_Summary_Vectorstore``.

    Raises:
        FileNotFoundError: If ``./CAPS_Summaries`` does not exist, or if it
            contains no ``.md`` file with any text in it.
    """
    directory_path = "./CAPS_Summaries"

    # os.listdir() returns entries in arbitrary order; sorting keeps the chunk
    # order inside the saved index reproducible from one run to the next.
    md_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.md'))

    documents = []
    for file_name in md_files:
        file_path = os.path.join(directory_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # A blank file contributes no chunks, so leave it out entirely.
        if not content.strip():
            print(f"Skipping {file_name}: the file is empty.")
            continue

        # Record the originating file so retrieved chunks can be traced back
        # to it, matching the "source" metadata that PDF loaders attach.
        documents.append(Document(page_content=content, metadata={"source": file_path}))
        print(f"Successfully added {file_name} to the combined vector store.")

    # Building a FAISS index from zero chunks fails with an opaque error deep
    # inside the library; fail early with a message that names the cause.
    if not documents:
        raise FileNotFoundError(
            f"No non-empty .md files found in '{directory_path}'; "
            "cannot build the combined vector store."
        )

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    splits = text_splitter.split_documents(documents)

    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vector_store = FAISS.from_documents(documents=splits, embedding=embeddings)

    vector_store.save_local("./Combined_Summary_Vectorstore")
    print("Combined summary vector store creation complete and saved as 'Combined_Summary_Vectorstore'.")
| |
|
| | |
def create_individual_summary_vector_stores():
    """Build and save one FAISS vector store per Markdown summary.

    For every ``.md`` file in ``./CAPS_Summaries`` the text is split into
    overlapping chunks (3000 characters, 500 overlap), embedded with the
    ``text-embedding-3-large`` model and saved under
    ``./Individual_Summary_Vectorstores/<file stem>_vectorstore``.
    The output directory is created if it does not exist. Blank files are
    skipped with a message instead of aborting the run.

    Raises:
        FileNotFoundError: If ``./CAPS_Summaries`` does not exist.
    """
    directory_path = "./CAPS_Summaries"
    save_directory = "./Individual_Summary_Vectorstores"

    os.makedirs(save_directory, exist_ok=True)

    # os.listdir() returns entries in arbitrary order; sort for a
    # reproducible processing (and log) order.
    md_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.md'))

    # Read everything first so blank files are filtered out before any
    # embedding client is created.
    loaded = []
    for file_name in md_files:
        file_path = os.path.join(directory_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # A blank file yields no chunks, and an index cannot be built from
        # nothing; skip it rather than let it stop the remaining files.
        if not content.strip():
            print(f"Skipping {file_name}: the file is empty.")
            continue

        loaded.append((file_name, file_path, content))
        print(f"Successfully loaded {file_name}.")

    if not loaded:
        print(f"No non-empty .md files found in '{directory_path}'; no vector stores created.")
        return

    # Neither object depends on the file being processed, so build them once
    # and reuse them for every summary.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

    for file_name, file_path, content in loaded:
        # Record the originating file so retrieved chunks can be traced back.
        document = Document(page_content=content, metadata={"source": file_path})
        splits = text_splitter.split_documents([document])

        vector_store = FAISS.from_documents(documents=splits, embedding=embeddings)

        vector_store_name = os.path.join(save_directory, f"{os.path.splitext(file_name)[0]}_vectorstore")
        vector_store.save_local(vector_store_name)
        print(f"Vector store for {file_name} created and saved as '{vector_store_name}'.")

    print("All Individual Summary Vectorstores created.")
| |
|
| | |
def create_individual_vector_stores_for_all_documents():
    """Collect per-document vector stores for summaries and full PDFs.

    Two steps, both writing into ``./Individual_All_Vectorstores`` (created
    if missing):

    1. For every ``.md`` file in ``./CAPS_Summaries``, copy the matching
       store from ``./Individual_Summary_Vectorstores``. A summary whose
       store does not exist is skipped with a message.
    2. For every ``.pdf`` file in ``./CAPS``, load it with ``PyPDFLoader``,
       split it into overlapping chunks (3000 characters, 500 overlap), embed
       the chunks with ``text-embedding-3-large`` and save the index as
       ``<file stem>_vectorstore``. A PDF that yields no chunks is skipped.

    Raises:
        FileNotFoundError: If ``./CAPS_Summaries`` or ``./CAPS`` is missing.
    """
    summary_directory = "./CAPS_Summaries"
    caps_directory = "./CAPS"
    summary_store_directory = "./Individual_Summary_Vectorstores"
    save_directory = "./Individual_All_Vectorstores"

    os.makedirs(save_directory, exist_ok=True)

    # os.listdir() returns entries in arbitrary order; sort for a
    # reproducible processing (and log) order.
    summary_files = sorted(f for f in os.listdir(summary_directory) if f.endswith('.md'))
    caps_files = sorted(f for f in os.listdir(caps_directory) if f.endswith('.pdf'))

    # Step 1: reuse the already-built summary stores instead of re-embedding.
    copied_destinations = set()
    for file_name in summary_files:
        store_name = f"{os.path.splitext(file_name)[0]}_vectorstore"
        source_vector_store_name = os.path.join(summary_store_directory, store_name)
        destination_vector_store_name = os.path.join(save_directory, store_name)

        # The summary stores are produced by a separate step; if one is
        # missing, report it and carry on rather than abort the whole run.
        if not os.path.isdir(source_vector_store_name):
            print(f"Skipping {file_name}: no summary vector store found at '{source_vector_store_name}'.")
            continue

        shutil.copytree(source_vector_store_name, destination_vector_store_name, dirs_exist_ok=True)
        copied_destinations.add(destination_vector_store_name)
        print(f"Copied vector store for {file_name} to '{destination_vector_store_name}'.")

    # Step 2: embed the full PDFs.
    if not caps_files:
        print(f"No .pdf files found in '{caps_directory}'.")
    else:
        # Neither object depends on the file being processed, so build them
        # once; they are not built at all when there is nothing to embed.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
        embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

        for file_name in caps_files:
            file_path = os.path.join(caps_directory, file_name)
            loader = PyPDFLoader(file_path)
            documents = loader.load()
            print(f"Successfully loaded {file_name} from CAPS.")

            splits = text_splitter.split_documents(documents)

            # PDFs without extractable text (e.g. image-only scans) produce
            # no chunks, and an index cannot be built from nothing.
            if not splits:
                print(f"Skipping {file_name}: no text could be extracted.")
                continue

            vector_store_name = os.path.join(save_directory, f"{os.path.splitext(file_name)[0]}_vectorstore")

            # NOTE(review): a PDF and a summary sharing the same stem map to
            # the same store directory, so the PDF store replaces the summary
            # copy made in step 1. Names are kept as-is for compatibility;
            # the collision is only reported here.
            if vector_store_name in copied_destinations:
                print(f"Warning: '{vector_store_name}' already holds a copied summary store and will be overwritten.")

            vector_store = FAISS.from_documents(documents=splits, embedding=embeddings)
            vector_store.save_local(vector_store_name)
            print(f"Vector store for {file_name} created and saved as '{vector_store_name}'.")

    print("All Individual Vectorstores for complete and summary plans created.")
| |
|
| | |
if __name__ == "__main__":
    # Run the build stages in order when executed as a script. Order matters:
    # the last stage copies stores out of ./Individual_Summary_Vectorstores,
    # which the second stage writes.
    for build_stage in (
        create_combined_summary_vector_store,
        create_individual_summary_vector_stores,
        create_individual_vector_stores_for_all_documents,
    ):
        build_stage()
| |
|