|
|
import bs4 |
|
|
from langchain_community.document_loaders import WebBaseLoader |
|
|
from langchain_core.vectorstores import InMemoryVectorStore |
|
|
from langchain_openai import OpenAIEmbeddings |
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
|
from loguru import logger |
|
|
|
|
|
# Module-level handle to the shared in-memory vector store.
# It starts out as None and is rebound by initialize_vector_store(); the
# annotation therefore has to admit None. The annotation is quoted so it is
# not evaluated at import time and does not depend on runtime support for
# the `X | None` union syntax.
vector_store: "InMemoryVectorStore | None" = None
|
|
|
|
|
|
|
|
def load_web_content_to_vector_store(web_post_url):
    """Load one web page, split it into chunks, and add them to the vector store.

    The page is fetched with ``WebBaseLoader``, restricted to elements whose
    CSS class is ``post-title``, ``post-header`` or ``post-content``. The
    resulting document is split with ``RecursiveCharacterTextSplitter`` and
    the chunks are added to the module-level ``vector_store``.

    Args:
        web_post_url: URL of the page to load.

    Raises:
        RuntimeError: If the module-level ``vector_store`` is still ``None``.
        ValueError: If the loader does not return exactly one document.
    """
    # Fail fast: without a store there is nowhere to put the chunks, so do
    # not spend a network round-trip before reporting the problem.
    if vector_store is None:
        raise RuntimeError(
            "Vector store is not initialized; call initialize_vector_store() first."
        )

    # Only parse the parts of the page that carry these CSS classes.
    bs4_strainer = bs4.SoupStrainer(class_=("post-title", "post-header", "post-content"))
    loader = WebBaseLoader(
        web_paths=(web_post_url,),
        bs_kwargs={"parse_only": bs4_strainer},
    )
    docs = loader.load()

    # Explicit check instead of `assert`, which is removed under `python -O`.
    if len(docs) != 1:
        raise ValueError(
            f"Expected exactly 1 document from {web_post_url!r}, got {len(docs)}."
        )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,
    )
    all_splits = text_splitter.split_documents(docs)
    logger.info(f"Split blog post into {len(all_splits)} sub-documents.")

    document_ids = vector_store.add_documents(documents=all_splits)
    logger.info(f"Loaded {len(document_ids)} to vector store.")
|
|
|
|
|
|
|
|
def initialize_vector_store(web_post_url):
    """Build the module-level vector store and populate it from a web page.

    Rebinds the global ``vector_store`` to a new ``InMemoryVectorStore``
    backed by ``OpenAIEmbeddings`` (model ``text-embedding-3-large``), then
    passes ``web_post_url`` to ``load_web_content_to_vector_store``.
    Progress is logged at each stage.

    Args:
        web_post_url: URL of the page whose content should be indexed.
    """
    global vector_store

    logger.info("Loading Vector Store.....")
    vector_store = InMemoryVectorStore(
        OpenAIEmbeddings(model="text-embedding-3-large")
    )
    logger.info("Initialized InMemoryVectorStore")

    # The loader writes into the global that was just rebound above.
    load_web_content_to_vector_store(web_post_url)
    logger.info("Loaded Web Content to the vector store...")
|
|
|
|
|
|
|
|
def get_vector_store():
    """Return whatever is currently bound to the module-level ``vector_store``.

    No initialization is performed here; the global is handed back as-is.
    """
    current_store = vector_store
    return current_store
|
|
|