Study-Mate / materials /text_utils.py
Hamdy005's picture
refactor: restructure project into modular src architecture and implement database store layer
962a395
raw
history blame contribute delete
713 Bytes
import PyPDF2
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader
def text_from_pdf(pdf_file) -> str:
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_file: The PDF source, handed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages joined in page order with no separator. The
        result is an empty string when the reader exposes no pages or no
        page yields any text.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # A single join avoids rebuilding the accumulated string once per page.
    # ``or ""`` keeps the join safe should a page yield None instead of a
    # str (presumably possible for pages without a text layer).
    return "".join(page.extract_text() or "" for page in reader.pages)
def chunk_text(text: str, chunk_size: int = 800, chunk_overlap: int = 150):
    """Split ``text`` into chunks with a ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` under the given settings.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)
def scrap_website(url: str) -> str:
    """Load a web page and return the text content of its first document.

    Args:
        url: Address of the page to load; passed to
            ``UnstructuredURLLoader`` as its only URL, with SSL
            verification enabled.

    Returns:
        ``page_content`` of the first document the loader produced, or an
        empty string when the loader produced no documents.
    """
    loader = UnstructuredURLLoader(urls=[url], ssl_verify=True)
    documents = loader.load()
    if not documents:
        # Indexing an empty result would raise a bare IndexError that says
        # nothing about the URL; report "no content" as an empty string.
        # NOTE(review): presumably the loader returns an empty list when
        # the fetch or parse fails - confirm against the loader's docs.
        return ""
    return documents[0].page_content