from langchain_core.documents import Document

import configs.config as config
from processing.documents import load_documents, split_documents
from processing.texts import clean_text


class Service:
    """Scrapes web pages and stores their embeddings in a vector store."""

    def __init__(self, store):
        # `store` is expected to expose store_embeddings(documents) —
        # NOTE(review): interface not visible here, confirm against caller.
        self.store = store

    def scrape_and_get_store_vector_retriever(self, urls: list[str]) -> None:
        """
        Scrape website content from the given URLs, clean it, and store the
        resulting document chunks' embeddings in the vector store.

        Args:
            urls: Web page URLs to scrape.

        Raises:
            RuntimeError: If loading or cleaning any URL fails. The original
                exception is preserved as ``__cause__``.
        """
        documents: list[Document] = []
        for url in urls:
            try:
                website_documents = load_documents(url)
                for doc in website_documents:
                    doc.page_content = clean_text(doc.page_content)
                    # Tag each document with its origin URL for traceability.
                    doc.metadata["source"] = url
                    documents.append(doc)
            except Exception as e:
                # Chain the cause (`from e`) so the original traceback is not
                # lost; RuntimeError is still caught by callers that handled
                # the previous bare Exception.
                raise RuntimeError(f"Error processing {url}: {e}") from e

        self.store.store_embeddings(
            split_documents(
                documents,
                chunk_size=config.CHUNK_SIZE,
                chunk_overlap=config.CHUNK_OVERLAP,
            )
        )