File size: 1,039 Bytes
a856301
e1cda2e
bcf9d83
 
e1cda2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bcf9d83
 
 
 
e1cda2e
 
 
bcf9d83
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from langchain_core.documents import Document

import configs.config as config
from processing.documents import load_documents, split_documents
from processing.texts import clean_text


class Service:
    """Load web pages, clean and chunk them, and hand the chunks to a store.

    ``store`` is any object exposing a ``store_embeddings(documents)`` method;
    this class does not otherwise inspect it.
    """

    def __init__(self, store):
        """Keep a reference to the store that will receive the chunks."""
        self.store = store

    def scrape_and_get_store_vector_retriever(self, urls: list[str]) -> None:
        """Load, clean, split and store the documents found at ``urls``.

        For every URL the documents returned by ``load_documents`` have their
        ``page_content`` passed through ``clean_text`` and their
        ``metadata["source"]`` set to the URL. All documents are then split
        with ``config.CHUNK_SIZE`` / ``config.CHUNK_OVERLAP`` and passed to
        ``self.store.store_embeddings`` in a single call.

        NOTE(review): despite its name, this method returns nothing; callers
        wanting a retriever presumably obtain it from the store afterwards --
        confirm against the call sites.

        Args:
            urls: URLs to load documents from.

        Raises:
            Exception: if loading or cleaning fails for any URL. The message
                names the URL and the original error is attached as
                ``__cause__``. Processing stops at the first failure, so
                nothing is stored in that case.
        """
        documents: list[Document] = []

        for url in urls:
            try:
                website_documents = load_documents(url)
                for doc in website_documents:
                    doc.page_content = clean_text(doc.page_content)
                    # Overwrite the loader-provided source so every chunk
                    # points back at the URL that was requested.
                    doc.metadata["source"] = url
                    documents.append(doc)
            except Exception as e:
                # Chain explicitly so the original traceback is reported as
                # the direct cause rather than as incidental context.
                raise Exception(f"Error processing {url}: {e}") from e

        self.store.store_embeddings(
            split_documents(documents, chunk_size=config.CHUNK_SIZE, chunk_overlap=config.CHUNK_OVERLAP)
        )