Spaces:
Sleeping
Sleeping
File size: 1,039 Bytes
a856301 e1cda2e bcf9d83 e1cda2e bcf9d83 e1cda2e bcf9d83 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | from langchain_core.documents import Document
import configs.config as config
from processing.documents import load_documents, split_documents
from processing.texts import clean_text
class Service:
    """Load web documents, clean them, and hand their chunks to an embedding store."""

    def __init__(self, store):
        # `store` must expose `store_embeddings(...)`; it is the only method
        # this class calls on it.
        self.store = store

    def scrape_and_get_store_vector_retriever(self, urls: list[str]) -> None:
        """
        Load the documents behind each URL and store their embeddings.

        For every URL, the documents returned by ``load_documents`` have their
        ``page_content`` passed through ``clean_text`` and their
        ``metadata["source"]`` set to that URL. Once all URLs are processed,
        the collected documents are split with ``split_documents`` (using
        ``config.CHUNK_SIZE`` and ``config.CHUNK_OVERLAP``) and the result is
        passed to ``self.store.store_embeddings``.

        NOTE(review): despite the method name, nothing is returned here; the
        only visible effect is the call on ``self.store``.

        Args:
            urls: URLs to load documents from.

        Raises:
            Exception: If loading or cleaning fails for any URL. The message
                names the URL, and the original error is attached as the
                cause. Processing stops at the first failing URL, so nothing
                is stored in that case.
        """
        documents: list[Document] = []

        for url in urls:
            try:
                for doc in load_documents(url):
                    doc.page_content = clean_text(doc.page_content)
                    # Record where the content came from, overwriting any
                    # source the loader may have set.
                    doc.metadata["source"] = url
                    documents.append(doc)
            except Exception as e:
                # Chain explicitly so the original traceback is kept as the
                # cause of the wrapped error.
                raise Exception(f"Error processing {url}: {e}") from e

        chunks = split_documents(
            documents,
            chunk_size=config.CHUNK_SIZE,
            chunk_overlap=config.CHUNK_OVERLAP,
        )
        self.store.store_embeddings(chunks)
|