File size: 2,426 Bytes
315125b
 
 
 
f96ce6a
315125b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f96ce6a
315125b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
### Chat With PDF ###

import os
from dotenv import load_dotenv
import streamlit as st
import cassio
from langchain_community.vectorstores import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_community.llms import OpenAI
from langchain_openai import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from PyPDF2 import PdfReader

# Load variables from a .env file (if any) into the process environment first,
# so the Astra DB credentials below can come from either source. Each value is
# None when its variable is not set.
load_dotenv()
ASTRADB_APP_TOKEN = os.environ.get("ASTRA_DB_TOKEN")
ASTRADB_ID = os.environ.get("ASTRA_DB_ID")

def read_file_and_chunk(pdf):
    """Extract the text of a PDF and split it into overlapping chunks.

    Args:
        pdf: The PDF source, handed straight to ``PdfReader`` (the app passes
            the file object returned by the Streamlit uploader).

    Returns:
        The list of text chunks produced by ``CharacterTextSplitter`` using a
        newline separator, ``chunk_size=400`` and ``chunk_overlap=100``
        (lengths measured with ``len``). Pages that yield no text are skipped.
    """
    reader = PdfReader(pdf)
    extracted = (page.extract_text() for page in reader.pages)
    page_texts = [text for text in extracted if text]

    # Join pages with the splitter's own separator: without it, the last line
    # of one page is glued to the first line of the next page and the two can
    # never be split apart again.
    raw_text = "\n".join(page_texts)

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=400,
        chunk_overlap=100,
        length_function=len,
    )
    return text_splitter.split_text(raw_text)

def initialize_database():
    """Initialise cassio with the Astra credentials and build the vector store.

    Reads the module-level ``ASTRADB_APP_TOKEN`` / ``ASTRADB_ID`` values and the
    module-level ``embed`` embeddings object, so it must only be called after
    ``embed`` has been assigned.

    Returns:
        A ``Cassandra`` vector store bound to the ``pdf_chat`` table.
    """
    cassio.init(token=ASTRADB_APP_TOKEN, database_id=ASTRADB_ID)

    # session/keyspace are left as None; presumably the store then falls back
    # to the connection configured by cassio.init() above -- TODO confirm.
    return Cassandra(
        embedding=embed,
        table_name="pdf_chat",
        session=None,
        keyspace=None,
    )

def load_to_db(texts, vector_store):
    """Insert ``texts`` into ``vector_store`` and wrap it for querying.

    Args:
        texts: The text chunks to add to the store.
        vector_store: The vector store that receives the chunks.

    Returns:
        A ``VectorStoreIndexWrapper`` around ``vector_store``.
    """
    vector_store.add_texts(texts)
    return VectorStoreIndexWrapper(vectorstore=vector_store)
    

# Initialize Streamlit app

st.set_page_config(page_title="Chat With PDF")
st.header("Ask Questions About Your Documents")

# Prefer the key typed into the page; fall back to the environment so a key
# supplied through .env keeps working when the field is left blank.
OPENAI_API_KEY = st.text_input("OpenAI API Key: ", type="password") or os.getenv(
    "OPENAI_API_KEY"
)
if not OPENAI_API_KEY:
    # Don't build the OpenAI clients (or show the rest of the page) until a
    # key is available.
    st.info("Enter your OpenAI API key to get started.")
    st.stop()

llm = OpenAI(openai_api_key=OPENAI_API_KEY)
embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

uploaded_file = st.file_uploader("Upload your PDF file")
if uploaded_file is not None:
    # Streamlit re-runs this whole script on every interaction (including the
    # "Ask" button). Index each upload only once per session so a question
    # does not re-embed the PDF and insert duplicate chunks into the table.
    file_signature = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("indexed_file") != file_signature:
        with st.spinner("Reading and indexing your PDF, this may take a moment..."):
            try:
                chunks = read_file_and_chunk(uploaded_file)
                if not chunks:
                    raise ValueError("no extractable text was found in the PDF")
                astra_vector_store = initialize_database()
                st.session_state["vector_index"] = load_to_db(
                    chunks, astra_vector_store
                )
            except Exception as e:
                # Top-level UI boundary: report the failure and halt this run,
                # otherwise the query section below would use a missing index.
                st.session_state.pop("indexed_file", None)
                st.session_state.pop("vector_index", None)
                st.error(f"Could not index the PDF: {e}")
                st.stop()
        st.session_state["indexed_file"] = file_signature

    astra_vector_index = st.session_state["vector_index"]

    # The widget key must be a string; the original passed the builtin `input`.
    user_query = st.text_input("Query: ", key="user_query")
    submit = st.button("Ask")
    if submit:
        if not user_query.strip():
            st.warning("Please enter a question first.")
        else:
            answer = astra_vector_index.query(user_query, llm=llm).strip()
            st.subheader("Answer:")
            st.write(answer)