import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pdfplumber
# html functions to load data from any web url
def get_html_content(url, timeout=30):
    """Fetch the raw body of a web page.

    Args:
        url: The URL to request.
        timeout: Seconds to wait for the server before giving up
            (default 30). The original call had no timeout, which can
            hang the caller indefinitely on an unresponsive host.

    Returns:
        The response body as bytes (``response.content``).
    """
    response = requests.get(url, timeout=timeout)
    return response.content
# get the text from html content
def get_text_from_html(html_content):
    """Strip all markup from an HTML document and return the visible text.

    Args:
        html_content: Raw HTML as bytes or str.

    Returns:
        The concatenated text content of the document.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    return parsed.get_text()
# recursive text splitter
def split_docs(text, chunk_size=1000, chunk_overlap=20):
    """Split a plain-text string into overlapping chunks.

    Args:
        text: The text to split.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between adjacent chunks.

    Returns:
        A list of chunk strings.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
# get text chunks from web url func
def scrape_text(url, max_chars=1000):
    """Download a web page and return its visible text as chunks.

    Args:
        url: Page URL (coerced to str before the request).
        max_chars: Maximum characters per returned chunk.

    Returns:
        A list of text chunks.
    """
    raw_html = get_html_content(str(url))
    page_text = get_text_from_html(raw_html)
    return split_docs(page_text, chunk_size=max_chars)
# pdf loader using langchain pypdf loader
def pdf_loader(pdf):
    """Load a PDF with LangChain's PyPDFLoader and split it into chunks.

    Args:
        pdf: Path or URL of the PDF file.

    Returns:
        A list of Document chunks from ``load_and_split()``.
    """
    return PyPDFLoader(pdf).load_and_split()
# split pdf docs
def split_documents(text, chunk_size=1000, chunk_overlap=20):
    """Split a list of LangChain Documents into smaller Document chunks.

    Args:
        text: An iterable of Document objects.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between adjacent chunks.

    Returns:
        A list of Document chunks.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(text)
# split pdf texts
def split_texts(text, chunk_size=1000, chunk_overlap=20):
    """Split a plain-text string into overlapping chunks.

    NOTE(review): behaviorally identical to ``split_docs`` above; kept as
    a separate entry point so existing callers are untouched.

    Args:
        text: The text to split.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between adjacent chunks.

    Returns:
        A list of chunk strings.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
# get text from pdf url
def pdf_text(pdf, max_chars=1000):
    """Load a PDF and return its text as a list of plain strings.

    Args:
        pdf: Path or URL of the PDF file.
        max_chars: Maximum characters per returned chunk.

    Returns:
        A list of chunk strings, one per split Document.
    """
    pages = pdf_loader(pdf)
    chunks = split_documents(pages, chunk_size=max_chars)
    # Use the public page_content attribute rather than the original
    # list(p)[0][1] trick, which depended on the positional order of the
    # Document dataclass fields and silently breaks if that order changes.
    return [doc.page_content for doc in chunks]
# extract data from pdf file object
def extract_data(feed, max_chars=1000):
    """Extract text from a PDF file object and split it into chunks.

    Args:
        feed: A file path or binary file-like object readable by pdfplumber.
        max_chars: Maximum characters per returned chunk.

    Returns:
        A list of text chunks.
    """
    with pdfplumber.open(feed) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # extract_text() returns None for pages with no extractable text
    # (e.g. scanned images); the original ' '.join(data) raised TypeError
    # on such pages, so filter falsy entries before joining.
    full_text = ' '.join(t for t in page_texts if t)
    return split_texts(full_text, chunk_size=max_chars)