File size: 1,515 Bytes
88a1870
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from loguru import logger

# Module-level vector store shared by the functions below. It starts out as
# None and is rebound by initialize_vector_store().
vector_store: "InMemoryVectorStore | None" = None


def load_web_content_to_vector_store(web_post_url: str) -> None:
    """Load a web post, split it into chunks and add them to the vector store.

    Only HTML elements whose class is ``post-title``, ``post-header`` or
    ``post-content`` are parsed. The loaded document is split with a
    ``RecursiveCharacterTextSplitter`` (``chunk_size=1000``,
    ``chunk_overlap=200``, ``add_start_index=True``) and the resulting
    sub-documents are added to the module-level ``vector_store``.

    Args:
        web_post_url: URL of the web page to load.

    Raises:
        RuntimeError: If the module-level ``vector_store`` is still ``None``.
        ValueError: If the loader does not return exactly one document.
    """
    # Fail before any network work with a clear message, rather than hitting
    # an AttributeError on None after the page has already been fetched.
    if vector_store is None:
        raise RuntimeError(
            "Vector store is not initialized; call initialize_vector_store() first."
        )

    bs4_strainer = bs4.SoupStrainer(
        class_=("post-title", "post-header", "post-content")
    )
    loader = WebBaseLoader(
        web_paths=(web_post_url,),
        bs_kwargs={"parse_only": bs4_strainer},
    )
    docs = loader.load()

    # Explicit check instead of `assert`, which is removed under `python -O`.
    if len(docs) != 1:
        raise ValueError(
            f"Expected exactly 1 document from {web_post_url!r}, got {len(docs)}."
        )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200, add_start_index=True
    )
    all_splits = text_splitter.split_documents(docs)
    logger.info(f"Split blog post into {len(all_splits)} sub-documents.")

    document_ids = vector_store.add_documents(documents=all_splits)
    logger.info(f"Loaded {len(document_ids)} to vector store.")


def initialize_vector_store(web_post_url: str) -> None:
    """Create the module-level vector store and populate it from a web post.

    Builds an ``InMemoryVectorStore`` using ``OpenAIEmbeddings`` with the
    ``text-embedding-3-large`` model, binds it to the module-level
    ``vector_store`` and then loads the page at ``web_post_url`` into it via
    ``load_web_content_to_vector_store``.

    Args:
        web_post_url: URL of the post to index, e.g.
            "https://lilianweng.github.io/posts/2023-06-23-agent/".
    """
    global vector_store

    logger.info("Loading Vector Store.....")
    vector_store = InMemoryVectorStore(
        OpenAIEmbeddings(model="text-embedding-3-large")
    )
    logger.info("Initialized InMemoryVectorStore")

    # The store must be bound to the global before this call, because the
    # loader function writes into the module-level ``vector_store``.
    load_web_content_to_vector_store(web_post_url)
    logger.info("Loaded Web Content to the vector store...")


def get_vector_store() -> "InMemoryVectorStore | None":
    """Return the module-level vector store.

    The value is whatever ``vector_store`` is currently bound to, which is
    ``None`` until ``initialize_vector_store`` has rebound it.
    """
    current_store = vector_store
    return current_store