# app.py — "Chat with PDF using Gemini" Streamlit app
# (update by Rakshitjan, commit 7f5c7bd, verified)
# import streamlit as st
# from PyPDF2 import PdfReader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# import os
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
# import google.generativeai as genai
# from langchain.vectorstores import FAISS
# from langchain_google_genai import ChatGoogleGenerativeAI
# from langchain.chains.question_answering import load_qa_chain
# from langchain.prompts import PromptTemplate
# from dotenv import load_dotenv
# load_dotenv()
# os.getenv("GOOGLE_API_KEY")
# genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
# def get_pdf_text(pdf_docs):
# text=""
# for pdf in pdf_docs:
# pdf_reader= PdfReader(pdf)
# for page in pdf_reader.pages:
# text+= page.extract_text()
# return text
# def get_text_chunks(text):
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
# chunks = text_splitter.split_text(text)
# return chunks
# def get_vector_store(text_chunks):
# embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")
# vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
# vector_store.save_local("faiss_index")
# def get_conversational_chain():
# prompt_template = """
# Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
# provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
# Context:\n {context}?\n
# Question: \n{question}\n
# Answer:
# """
# model = ChatGoogleGenerativeAI(model="gemini-pro",
# temperature=0.3)
# prompt = PromptTemplate(template = prompt_template, input_variables = ["context", "question"])
# chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
# return chain
# def user_input(user_question):
# embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")
# new_db = FAISS.load_local("faiss_index", embeddings,allow_dangerous_deserialization=True)
# docs = new_db.similarity_search(user_question)
# chain = get_conversational_chain()
# response = chain(
# {"input_documents":docs, "question": user_question}
# , return_only_outputs=True)
# print(response)
# st.write("Reply: ", response["output_text"])
# def main():
# st.set_page_config("Chat PDF")
# st.header("Chat with PDF using Gemini💁")
# user_question = st.text_input("Ask a Question from the PDF Files")
# if user_question:
# user_input(user_question)
# with st.sidebar:
# st.title("Menu:")
# pdf_docs = st.file_uploader("Upload your PDF Files and Click on the Submit & Process Button", accept_multiple_files=True)
# if st.button("Submit & Process"):
# with st.spinner("Processing..."):
# raw_text = get_pdf_text(pdf_docs)
# text_chunks = get_text_chunks(raw_text)
# get_vector_store(text_chunks)
# st.success("Done")
# if __name__ == "__main__":
# main()
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import os
# Load environment variables (expects GOOGLE_API_KEY in a .env file or the environment).
load_dotenv()
# NOTE(review): genai_key is never referenced below — the langchain_google_genai
# classes read GOOGLE_API_KEY from the environment themselves. Confirm before removing.
genai_key = os.getenv("GOOGLE_API_KEY")
# Constants for cost calculation.
# NOTE(review): hard-coded per-1000-token USD rates; verify against current
# Google pricing before trusting the reported costs.
EMBEDDING_COST_PER_1000_TOKENS = 0.0002 # USD
LM_COST_PER_1000_TOKENS = 0.0001 # USD
def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page in the uploaded PDFs.

    Args:
        pdf_docs: Iterable of file-like objects (e.g. Streamlit UploadedFile).

    Returns:
        One string with the extracted text of all pages, in upload/page order.
        Pages without a text layer (e.g. scanned images) contribute nothing.
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() may return None for image-only pages; guard so
            # the concatenation does not raise TypeError.
            text += page.extract_text() or ""
    return text
def get_text_chunks(text):
    """Split *text* into overlapping chunks sized for the embedding model."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    return splitter.split_text(text)
def calculate_cost(tokens, rate_per_1000):
    """Convert a token count into a USD cost at the given per-1000-token rate."""
    thousands = tokens / 1000
    return thousands * rate_per_1000
def get_vector_store(text_chunks):
    """Embed *text_chunks*, persist a FAISS index to ./faiss_index, and
    return the estimated embedding cost in USD.

    Token counts are approximated by whitespace word count — TODO confirm
    against the real embedding tokenizer if precise billing matters.
    """
    embedder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    index = FAISS.from_texts(text_chunks, embedding=embedder)
    index.save_local("faiss_index")
    # Approximate tokens as whitespace-separated words across all chunks.
    word_total = sum(len(chunk.split()) for chunk in text_chunks)
    return calculate_cost(word_total, EMBEDDING_COST_PER_1000_TOKENS)
def get_conversational_chain():
    """Build a 'stuff'-type QA chain backed by Gemini, with a prompt that
    forbids answers not grounded in the retrieved context."""
    prompt_template = """
    Answer the question as detailed as possible from the provided context. If the answer is not in
    the context, respond with "answer is not available in the context".\n\n
    Context:\n {context}?\n
    Question:\n {question}\n
    Answer:
    """
    qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.3)
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)
def process_user_question(user_question):
    """Answer *user_question* from the persisted FAISS index and report cost estimates.

    Side effects: writes the model reply and a USD cost breakdown to the
    Streamlit page. Token counts are approximated by whitespace word count —
    TODO confirm against the real tokenizer if precise billing is needed.
    """
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    # allow_dangerous_deserialization is required to reload a pickled local
    # FAISS index; acceptable here because the index was written by this app.
    vector_store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    docs = vector_store.similarity_search(user_question)

    # Approximate token count of the retrieved context (whitespace words).
    retrieval_tokens = sum(len(doc.page_content.split()) for doc in docs)

    chain = get_conversational_chain()
    response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)

    # Reuse the retrieved-context count instead of re-summing the same documents.
    input_tokens = retrieval_tokens + len(user_question.split())
    output_tokens = len(response["output_text"].split())

    # Cost breakdown: retrieval priced at the embedding rate, generation at the LM rate.
    retrieval_cost = calculate_cost(retrieval_tokens, EMBEDDING_COST_PER_1000_TOKENS)
    inference_cost = calculate_cost(input_tokens + output_tokens, LM_COST_PER_1000_TOKENS)
    total_cost = retrieval_cost + inference_cost

    st.write("Response:", response["output_text"])
    st.write(f"Embedding Cost: ${retrieval_cost:.4f}")
    st.write(f"Language Model Cost: ${inference_cost:.4f}")
    st.write(f"Total Query Cost: ${total_cost:.4f}")
def main():
    """Streamlit entry point: a question box plus a sidebar PDF-ingestion workflow."""
    st.set_page_config("Chat PDF Cost Calculator")
    st.header("Chat with PDF using Gemini 💁 (Cost Included)")

    user_question = st.text_input("Ask a Question from the PDF Files")
    if user_question:
        process_user_question(user_question)

    with st.sidebar:
        st.title("Menu:")
        pdf_docs = st.file_uploader("Upload your PDF Files and Click Submit & Process", accept_multiple_files=True)
        if st.button("Submit & Process"):
            if not pdf_docs:
                # Guard: without files, FAISS.from_texts would fail on empty input.
                st.error("Please upload at least one PDF file before processing.")
            else:
                with st.spinner("Processing..."):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    embedding_cost = get_vector_store(text_chunks)
                    st.success(f"Processing Done! Embedding Cost: ${embedding_cost:.4f}")

if __name__ == "__main__":
    main()