Spaces:
Running
Running
| import PyPDF2 | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import UnstructuredURLLoader | |
def text_from_pdf(pdf_file) -> str:
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: The PDF source handed straight to ``PyPDF2.PdfReader``
            (presumably a path or a binary file-like object -- confirm
            against the callers).

    Returns:
        The text extracted from each page, concatenated in page order.
        No separator is inserted between consecutive pages.
    """
    pdf = PyPDF2.PdfReader(pdf_file)
    # Join once instead of growing a string page by page.
    return "".join(pg.extract_text() for pg in pdf.pages)
def chunk_text(text: str, chunk_size: int = 800, chunk_overlap: int = 150):
    """Split ``text`` into chunks with a recursive character splitter.

    Args:
        text: The text to split.
        chunk_size: Forwarded as ``chunk_size`` to
            ``RecursiveCharacterTextSplitter``. Defaults to 800.
        chunk_overlap: Forwarded as ``chunk_overlap`` to
            ``RecursiveCharacterTextSplitter``. Defaults to 150.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` (the chunks produced by the splitter).
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)
def scrap_website(url: str) -> str:
    """Fetch a web page and return its extracted text content.

    The page is loaded with ``UnstructuredURLLoader`` (SSL verification
    enabled) and the content of the first loaded document is returned.

    Args:
        url: Address of the page to load.

    Returns:
        The ``page_content`` of the first document produced by the loader.

    Raises:
        IndexError: If the loader produced no documents for ``url``. The
            exception type matches what indexing the empty result would
            have raised, so existing handlers keep working; only the
            message is made explicit.
    """
    documents = UnstructuredURLLoader(urls=[url], ssl_verify=True).load()
    # NOTE(review): the loader appears to log and skip URLs it fails to
    # fetch by default (continue_on_failure), which yields an empty list
    # rather than an exception -- confirm against the installed version.
    if not documents:
        raise IndexError(f"No content could be loaded from {url!r}")
    return documents[0].page_content