# Web-page loading and chunk-splitting utilities (Hugging Face Space source;
# scraped page-status header removed).
# Standard library
import logging
import uuid
from typing import List

# Third-party: LangChain document loading and splitting
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
# Module-level logger; basicConfig is a no-op if the root logger is already
# configured by the embedding application.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Separators tried in priority order by the recursive splitter: markdown
# headings, code fences and horizontal rules first, then paragraph, line,
# word and character boundaries — so chunks break at the most meaningful
# point available. Hoisted to module level so the list is built once,
# not on every call.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]


def load_and_split_docs(urls: List[str]):
    """Download web pages and split them into overlapping chunks.

    Args:
        urls: Web page URLs to fetch.

    Returns:
        A list of LangChain ``Document`` chunks. Each chunk's metadata
        gains a random 4-hex-char ``id`` and ``chunk-id`` tag, plus the
        ``start_index`` added by the splitter.
    """
    logger.info("Fetching documents with web loader...")
    loader = WebBaseLoader(urls)
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512,  # max TOKENS per chunk (tiktoken encoder), not characters
        chunk_overlap=50,  # tokens shared between consecutive chunks
        add_start_index=True,  # record each chunk's start offset in metadata
        strip_whitespace=True,  # trim whitespace at both ends of every chunk
        separators=MARKDOWN_SEPARATORS,
    )

    logger.info("Splitting documents...")
    docs_split = text_splitter.split_documents(docs)

    # Tag each chunk with short random identifiers. NOTE(review): 'id' is
    # regenerated per chunk, so it does NOT group chunks of the same source
    # document — if a shared per-document id was intended, generate it once
    # per input document instead. Behavior kept as-is.
    for doc in docs_split:
        doc.metadata['id'] = str(uuid.uuid4())[:4]
        doc.metadata['chunk-id'] = str(uuid.uuid4())[-4:]
    return docs_split