# Source-viewer metadata (not part of the program): file size 1,926 bytes; revisions 1503fe5, fdafa82.
import os
from dotenv import load_dotenv
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.llms import OpenAI


def get_text(pdf):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        pdf: The PDF source handed straight to ``PdfReader`` (``main``
            passes the object returned by the Streamlit file uploader).

    Returns:
        str: The text of all pages joined in page order, with no separator
        inserted between pages. Pages that yield no text contribute nothing.
    """
    pdfreader = PdfReader(pdf)
    # `or ''` keeps a page without extractable text (falsy result) from
    # breaking the concatenation; join avoids repeated string rebuilding.
    return ''.join(page.extract_text() or '' for page in pdfreader.pages)

def get_chunks(text, *, chunk_size=1000, chunk_overlap=200):
    """Split text into overlapping chunks on newline boundaries.

    Args:
        text: The full text to split.
        chunk_size: Target maximum chunk length, measured with ``len``
            (characters). Defaults to 1000.
        chunk_overlap: Number of characters shared between consecutive
            chunks. Defaults to 200.

    Returns:
        The list of chunk strings produced by
        ``CharacterTextSplitter.split_text``.
    """
    text_splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)


def main():
    """Run the Streamlit app: upload a PDF in the sidebar and ask questions about it.

    The OpenAI API key is read from the ``OPENAI_API_KEY`` environment
    variable (optionally populated from a ``.env`` file by ``load_dotenv``).
    The question-answering pipeline only runs once both a PDF and a query
    have been provided; the answer (or an empty string) is written to the page.
    """
    load_dotenv()

    # configure streamlit -- set_page_config has to precede every other
    # Streamlit call, including the error message below.
    st.set_page_config(page_title="Query Your PDF", page_icon=':books:')
    st.title("Query Your PDF")

    # Secrets must never be hard-coded in source; take the key from the
    # environment instead.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        st.error(
            "OPENAI_API_KEY is not set. "
            "Add it to your environment or to a .env file."
        )
        return

    llm = OpenAI(openai_api_key=api_key)

    query = st.text_input("Ask your query about the pdf", value=None)

    with st.sidebar:
        st.subheader("Upload your PDF here")
        pdf_file = st.file_uploader("Upload", type=['pdf'])

    response = ""
    # Both inputs are required: without a query there is nothing to search
    # for, and running the chain anyway would use an unassigned `docs`.
    if pdf_file and query:
        text = get_text(pdf_file)
        chunks = get_chunks(text)

        if not chunks:
            # Nothing to index (e.g. the PDF produced no text); building a
            # vector store from an empty list would fail.
            st.warning("No extractable text was found in the uploaded PDF.")
        else:
            embedding = OpenAIEmbeddings()
            database = FAISS.from_texts(chunks, embedding)
            docs = database.similarity_search(query)

            chain = load_qa_chain(
                llm=llm,
                chain_type='stuff',
            )
            response = chain.run(input_documents=docs, question=query)

    st.write(response)
    
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()