### Chat With PDF ###
import os

from dotenv import load_dotenv
import streamlit as st
import cassio
from langchain_community.vectorstores import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_community.llms import OpenAI
from langchain_openai import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from PyPDF2 import PdfReader

load_dotenv()

# Astra DB credentials, read from the environment (optionally via a .env file).
ASTRADB_APP_TOKEN = os.getenv("ASTRA_DB_TOKEN")
ASTRADB_ID = os.getenv("ASTRA_DB_ID")


def read_file_and_chunk(pdf):
    """Extract the text of a PDF and split it into overlapping chunks.

    Args:
        pdf: Anything ``PdfReader`` accepts; the script passes the object
            returned by ``st.file_uploader``.

    Returns:
        The list produced by ``CharacterTextSplitter.split_text``: chunks
        split on newlines with ``chunk_size=400`` and ``chunk_overlap=100``,
        measured with ``len``.
    """
    reader = PdfReader(pdf)
    # Pages for which extract_text() returns nothing contribute no text.
    raw_text = "".join(page.extract_text() or "" for page in reader.pages)

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=400,
        chunk_overlap=100,
        length_function=len,
    )
    return text_splitter.split_text(raw_text)


def initialize_database():
    """Initialise cassio and return a ``Cassandra`` store on table ``pdf_chat``.

    Reads the module-level ``embed`` object as the embedding, so ``embed``
    must be assigned before this function is called.

    Returns:
        The ``Cassandra`` vector store instance.
    """
    cassio.init(
        token=ASTRADB_APP_TOKEN,
        database_id=ASTRADB_ID,
    )
    # NOTE(review): session/keyspace are None; presumably the store then uses
    # whatever cassio.init() configured -- confirm against the cassio docs.
    astra_vector_store = Cassandra(
        embedding=embed,
        table_name="pdf_chat",
        session=None,
        keyspace=None,
    )
    return astra_vector_store


def load_to_db(texts, vector_store):
    """Add ``texts`` to ``vector_store`` and wrap the store for querying.

    Args:
        texts: Text chunks passed to ``vector_store.add_texts``.
        vector_store: The vector store to load and wrap.

    Returns:
        A ``VectorStoreIndexWrapper`` around ``vector_store``.
    """
    vector_store.add_texts(texts)
    return VectorStoreIndexWrapper(vectorstore=vector_store)


# Initialize Streamlit app
st.set_page_config(page_title="Chat With PDF")
st.header("Ask Questions About Your Documents")

OPENAI_API_KEY = st.text_input("OpenAI API Key: ", type="password")
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

uploaded_file = st.file_uploader("Upload your PDF file")

# Stays None until a PDF has been indexed successfully, so the query step
# below can tell "nothing to query yet" apart from a usable index.
astra_vector_index = None

if uploaded_file is not None:
    # Streamlit re-executes this script on every widget interaction. Remember
    # which upload was already indexed so the same chunks are not re-read and
    # re-added to the table on each rerun.
    file_signature = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("indexed_file") == file_signature:
        astra_vector_index = st.session_state.get("vector_index")
    else:
        st.write("Reading and indexing your PDF, this may take a moment...")
        try:
            chunks = read_file_and_chunk(uploaded_file)
            astra_vector_store = initialize_database()
            astra_vector_index = load_to_db(chunks, astra_vector_store)
        except Exception as e:
            # Top-level UI boundary: report the failure instead of crashing.
            astra_vector_index = None
            st.error(str(e))
        else:
            st.session_state["indexed_file"] = file_signature
            st.session_state["vector_index"] = astra_vector_index

# The widget key is a string; the original passed the builtin ``input``.
user_query = st.text_input("Query: ", key="input")
submit = st.button("Ask")

if submit:
    if astra_vector_index is None:
        st.warning("Upload a PDF and wait for indexing to finish before asking.")
    elif not user_query.strip():
        st.warning("Enter a question first.")
    else:
        answer = astra_vector_index.query(user_query, llm=llm).strip()
        st.subheader("Answer:")
        st.write(answer)