import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pdfplumber


# html functions to load data from any web url
def get_html_content(url, timeout=30):
    """Download a web page and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds before the request is aborted. New defaulted
            parameter (backward compatible) — the original call had no
            timeout and could hang indefinitely.

    Returns:
        The raw response content as bytes.

    Raises:
        requests.HTTPError: if the server responded with a 4xx/5xx status,
            so callers don't silently parse an error page as content.
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content


# get the text from html content
def get_text_from_html(html_content):
    """Return the visible text of an HTML document, tags stripped."""
    soup = BeautifulSoup(html_content, 'html.parser')
    return soup.get_text()


# recursive text splitter
def split_docs(text, chunk_size=1000, chunk_overlap=20):
    """Split a plain string into overlapping chunks.

    Args:
        text: The string to split.
        chunk_size: Target maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        A list of chunk strings.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_text(text)


# get text chunks from web url func
def scrape_text(url, max_chars=1000):
    """Fetch *url*, strip HTML, and return its text in ~max_chars chunks."""
    html_content = get_html_content(str(url))
    text = get_text_from_html(html_content)
    return split_docs(text, chunk_size=max_chars)


# pdf loader using langchain pypdf loader
def pdf_loader(pdf):
    """Load a PDF (path or URL) and return langchain Document chunks."""
    loader = PyPDFLoader(pdf)
    return loader.load_and_split()  # load and split to get chunks


# split pdf docs
def split_documents(text, chunk_size=1000, chunk_overlap=20):
    """Split a list of langchain Document objects into smaller Documents.

    NOTE(review): despite the name, *text* is expected to be a sequence of
    Document objects (as produced by pdf_loader), not a string.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(text)


# split pdf texts
def split_texts(text, chunk_size=1000, chunk_overlap=20):
    """Split a plain string into overlapping chunks.

    Kept as a backward-compatible alias: the original body was a
    byte-for-byte duplicate of split_docs.
    """
    return split_docs(text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)


# get text from pdf url
def pdf_text(pdf, max_chars=1000):
    """Return the text of each ~max_chars chunk of a PDF.

    Bug fix: the original extracted text via ``list(p)[0][1]``, which relied
    on the incidental field order of the pydantic Document model. Use the
    public ``page_content`` attribute instead.
    """
    pages = pdf_loader(pdf)
    chunks = split_documents(pages, chunk_size=max_chars)
    return [chunk.page_content for chunk in chunks]


# extract data from pdf file object
def extract_data(feed, max_chars=1000):
    """Extract all text from a PDF file object and return it in chunks.

    Args:
        feed: A path or binary file-like object accepted by pdfplumber.open.
        max_chars: Target maximum characters per chunk.

    Returns:
        A list of chunk strings.
    """
    data = []
    with pdfplumber.open(feed) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for pages without a text layer
            # (e.g. scanned images); the original appended it unchecked and
            # crashed in ' '.join().
            page_text = page.extract_text()
            if page_text:
                data.append(page_text)
    string = ' '.join(data)
    return split_texts(string, chunk_size=max_chars)