Spaces:
Running
Running
| from langchain_core.documents import Document | |
| import configs.config as config | |
| from processing.documents import load_documents, split_documents | |
| from processing.texts import clean_text | |
class Service:
    """Load web pages, clean their text, and push the chunks into a store."""

    def __init__(self, store):
        """Keep a reference to the store that receives the embeddings.

        Args:
            store: Object exposing ``store_embeddings(documents)``. Its
                concrete type is not visible from this module.
        """
        self.store = store

    def scrape_and_get_store_vector_retriever(self, urls: list[str]):
        """Load, clean, split, and store the documents found at ``urls``.

        Each URL is loaded with ``load_documents``. Every resulting document
        has its ``page_content`` passed through ``clean_text`` and its
        ``metadata["source"]`` set to the originating URL. After all URLs
        have been processed, the collected documents are split with
        ``config.CHUNK_SIZE`` / ``config.CHUNK_OVERLAP`` and handed to
        ``self.store.store_embeddings``.

        NOTE(review): despite the method name, the code shown here does not
        return a retriever (or anything else) -- confirm whether callers
        obtain the retriever from the store separately.

        Args:
            urls: Addresses of the pages to load.

        Raises:
            Exception: If loading or cleaning fails for any URL. The message
                names the URL and the original error is attached as
                ``__cause__``. Nothing is stored in that case, because the
                failure aborts the call before ``store_embeddings`` runs.
        """
        documents: list[Document] = []
        for url in urls:
            try:
                website_documents = load_documents(url)
                for doc in website_documents:
                    doc.page_content = clean_text(doc.page_content)
                    # Record provenance so stored chunks can be traced back
                    # to the page they came from.
                    doc.metadata["source"] = url
                    documents.append(doc)
            except Exception as e:
                # Chain explicitly so the root cause is kept on the new error.
                raise Exception(f"Error processing {url}: {e}") from e

        self.store.store_embeddings(
            split_documents(
                documents,
                chunk_size=config.CHUNK_SIZE,
                chunk_overlap=config.CHUNK_OVERLAP,
            )
        )