File size: 2,213 Bytes
ccd7971
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pdfplumber


# html functions to load data from any web url
# html functions to load data from any web url
def get_html_content(url, timeout=30):
    """Fetch a URL and return the raw response body as bytes.

    Args:
        url: Web address to download.
        timeout: Seconds to wait for connect/read before giving up.
            Added because ``requests.get`` without a timeout can block
            forever on an unresponsive server.

    Returns:
        The raw HTML content (``bytes``) of the response.
    """
    response = requests.get(url, timeout=timeout)
    return response.content

# get the text from html content
def get_text_from_html(html_content):
    """Parse raw HTML and return only its visible text (markup stripped)."""
    return BeautifulSoup(html_content, 'html.parser').get_text()

# recursive text splitter
def split_docs(text, chunk_size=1000, chunk_overlap=20):
    """Split a long string into overlapping chunks.

    Args:
        text: The raw text to split.
        chunk_size: Target maximum characters per chunk.
        chunk_overlap: Characters shared between adjacent chunks.

    Returns:
        A list of chunk strings.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

# get text chunks from web url func
def scrape_text(url, max_chars=1000):
    """Download a web page and return its visible text as chunks.

    Args:
        url: Page address (coerced to ``str`` before fetching).
        max_chars: Target maximum characters per returned chunk.

    Returns:
        A list of text chunk strings extracted from the page.
    """
    raw_html = get_html_content(str(url))
    page_text = get_text_from_html(raw_html)
    return split_docs(page_text, chunk_size=max_chars)

# pdf loader using langchain pypdf loader
def pdf_loader(pdf):
    """Load a PDF with langchain's PyPDFLoader and return its split pages.

    Args:
        pdf: Path or URL of the PDF, as accepted by ``PyPDFLoader``.

    Returns:
        Page-level Document objects produced by ``load_and_split``.
    """
    return PyPDFLoader(pdf).load_and_split()

# split pdf docs
def split_documents(text, chunk_size=1000, chunk_overlap=20):
    """Re-split a list of Document objects into overlapping chunks.

    Args:
        text: Sequence of langchain Document objects to split.
        chunk_size: Target maximum characters per chunk.
        chunk_overlap: Characters shared between adjacent chunks.

    Returns:
        A list of Document chunks.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(text)

# split pdf texts
def split_texts(text, chunk_size=1000, chunk_overlap=20):
    """Split a plain string into overlapping chunks.

    NOTE(review): identical in behavior to ``split_docs`` above; kept as a
    separate entry point so existing PDF-path callers are unaffected.

    Args:
        text: The raw text to split.
        chunk_size: Target maximum characters per chunk.
        chunk_overlap: Characters shared between adjacent chunks.

    Returns:
        A list of chunk strings.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

# get text from pdf url
def pdf_text(pdf, max_chars=1000):
    """Load a PDF and return its text as a list of plain strings.

    Args:
        pdf: Path or URL of the PDF, passed through to ``pdf_loader``.
        max_chars: Target maximum characters per chunk.

    Returns:
        A list of chunk strings (the ``page_content`` of each chunk).
    """
    pages = pdf_loader(pdf)
    chunks = split_documents(pages, chunk_size=max_chars)
    # Read page_content by name. The previous list(p)[0][1] relied on the
    # pydantic field-iteration order of Document ('page_content' first),
    # which is fragile and breaks if the model's field order changes.
    return [doc.page_content for doc in chunks]

# extract data from pdf file object
def extract_data(feed, max_chars=1000):
    """Extract text from a PDF file object with pdfplumber and chunk it.

    Args:
        feed: A file path or file-like object accepted by ``pdfplumber.open``.
        max_chars: Target maximum characters per chunk.

    Returns:
        A list of chunk strings covering the PDF's extractable text.
    """
    page_texts = []
    with pdfplumber.open(feed) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # extract_text() returns None for pages with no text layer
            # (e.g. scanned images); joining None would raise TypeError.
            if text:
                page_texts.append(text)
    combined = ' '.join(page_texts)
    return split_texts(combined, chunk_size=max_chars)