from typing import List

import PyPDF2
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


def text_from_pdf(pdf_file) -> str:
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: The PDF source, passed straight to ``PyPDF2.PdfReader``
            (presumably a path or a binary file-like object -- confirm
            against the callers).

    Returns:
        The text of all pages joined in page order with no separator.
        Pages that yield no text contribute an empty string.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # ``or ""`` keeps the join safe if a page yields a falsy result
    # (e.g. None) instead of a string.
    return "".join(page.extract_text() or "" for page in reader.pages)


def chunk_text(
    text: str, chunk_size: int = 800, chunk_overlap: int = 150
) -> List[str]:
    """Split *text* into overlapping chunks.

    Args:
        text: The text to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The chunks produced by ``splitter.split_text(text)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)


def scrap_website(url: str) -> str:
    """Load a web page and return its extracted text content.

    Args:
        url: Address of the page to load.

    Returns:
        The ``page_content`` of the first document the loader returns, or
        an empty string when the loader returns no documents.
    """
    loader = UnstructuredURLLoader(urls=[url], ssl_verify=True)
    documents = loader.load()
    if not documents:
        # An empty result previously surfaced as a bare IndexError on
        # ``data[0]``; return an empty string instead.
        return ""
    return documents[0].page_content