Spaces:

Emerging-Tech
/

docreader

Runtime error

App Files Files Community

docreader / app.py

CosmoAI

Update app.py

6ae72bf verified almost 2 years ago

raw

history blame

6.81 kB

	import os

	import langchain
	import PyPDF2
	import streamlit as st
	from langchain import OpenAI, VectorDBQA
	from langchain.chains import RetrievalQAWithSourcesChain
	from langchain.embeddings.openai import OpenAIEmbeddings
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.vectorstores import Chroma

	# OpenAI API key, read from the environment at import time; a missing
	# OPENAI_API_KEY variable raises KeyError here, before any UI is drawn.
	api_key = os.environ["OPENAI_API_KEY"]

	def read_and_textify(files):
	    """Extract text from uploaded documents, one entry per PDF page or text file.

	    Args:
	        files: Iterable of file-like objects exposing a ``name`` attribute.
	            Names ending in ``.txt`` (case-insensitive) are read as plain
	            text; everything else is parsed as a PDF with PyPDF2.

	    Returns:
	        A two-item list ``[text_list, sources_list]``. ``text_list`` holds the
	        extracted text chunks and ``sources_list`` holds the matching labels
	        of the form ``"<file name>_page_<index>"`` (zero-based page index; a
	        text file is reported as a single page 0). Both lists have the same
	        length and order.
	    """
	    text_list = []
	    sources_list = []
	    for file in files:
	        # The uploader also accepts .txt files, which PdfReader cannot parse.
	        if file.name.lower().endswith(".txt"):
	            raw = file.read()
	            if isinstance(raw, bytes):
	                # Undecodable bytes are replaced rather than aborting the upload.
	                raw = raw.decode("utf-8", errors="replace")
	            text_list.append(raw)
	            sources_list.append(f"{file.name}_page_0")
	            continue

	        pdf_reader = PyPDF2.PdfReader(file)
	        for page_index, page in enumerate(pdf_reader.pages):
	            # Guard against a page yielding no text object at all.
	            text = page.extract_text() or ""
	            # Empty the page dictionary once its text has been captured
	            # (presumably to release memory on large PDFs).
	            page.clear()
	            text_list.append(text)
	            sources_list.append(f"{file.name}_page_{page_index}")
	    return [text_list, sources_list]

	# --- Streamlit page: upload documents, index them, answer questions ---
	st.set_page_config(layout="centered", page_title="Multidoc_QnA")
	st.header("Multidoc_QnA")
	st.write("---")

	# File uploader: several .txt/.pdf documents may be selected at once.
	uploaded_files = st.file_uploader(
	    "Upload documents", accept_multiple_files=True, type=["txt", "pdf"]
	)
	st.write("---")

	# With accept_multiple_files=True the uploader yields an empty list (not
	# None) while nothing is uploaded, so test truthiness to show the hint.
	if not uploaded_files:
	    st.info("Upload files to analyse")
	else:
	    st.write(str(len(uploaded_files)) + " document(s) loaded..")

	    # Parallel lists: extracted text chunks and their "<file>_page_<n>" labels.
	    documents, sources = read_and_textify(uploaded_files)

	    # Embed every chunk and keep its source label as metadata so answers
	    # can cite the page they came from.
	    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
	    vStore = Chroma.from_texts(
	        documents, embeddings, metadatas=[{"source": s} for s in sources]
	    )

	    # Model selection.
	    model_name = "gpt-3.5-turbo"
	    # model_name = "gpt-4"

	    # Retrieve the two most similar chunks per question.
	    retriever = vStore.as_retriever()
	    retriever.search_kwargs = {"k": 2}

	    # NOTE(review): gpt-3.5-turbo is a chat model; confirm whether the
	    # completion-style OpenAI wrapper is still the intended class here.
	    llm = OpenAI(model_name=model_name, openai_api_key=api_key, streaming=True)
	    model = RetrievalQAWithSourcesChain.from_chain_type(
	        llm=llm, chain_type="stuff", retriever=retriever
	    )

	    st.header("Ask your data")
	    user_q = st.text_area("Enter your questions here")

	    if st.button("Get Response"):
	        if not user_q.strip():
	            # Avoid spending an API call on an empty prompt.
	            st.warning("Please enter a question first.")
	        else:
	            try:
	                with st.spinner("Model is working on it..."):
	                    result = model({"question": user_q}, return_only_outputs=True)
	                    st.subheader('Your response:')
	                    st.write(result['answer'])
	                    st.subheader('Source pages:')
	                    st.write(result['sources'])
	            except Exception as e:
	                # Top-level UI boundary: report the failure instead of crashing the app.
	                st.error(f"An error occurred: {e}")
	                st.error('Oops, the GPT response resulted in an error :( Please try again with a different question.')




































	# import gradio as gr
	# import streamlit as st
	# from langchain.embeddings.openai import OpenAIEmbeddings
	# from langchain.text_splitter import CharacterTextSplitter
	# from langchain.vectorstores import Chroma
	# from langchain.chains import ConversationalRetrievalChain
	# from langchain.chat_models import ChatOpenAI
	# from langchain.document_loaders import PyPDFLoader
	# import os
	# import fitz
	# from PIL import Image


	# # Global variables
	# COUNT, N = 0, 0
	# chat_history = []
	# chain = None # Initialize chain as None

	# # Function to set the OpenAI API key

	# api_key = os.environ['OPENAI_API_KEY']

	# st.write(api_key)


	# # Function to enable the API key input box
	# def enable_api_box():
	# return enable_box

	# # Function to add text to the chat history
	# def add_text(history, text):
	# if not text:
	# raise gr.Error('Enter text')
	# history = history + [(text, '')]
	# return history

	# # Function to process the PDF file and create a conversation chain
	# def process_file(file):
	# global chain
	# if 'OPENAI_API_KEY' not in os.environ:
	# raise gr.Error('Upload your OpenAI API key')

	# # Replace with your actual PDF processing logic
	# loader = PyPDFLoader(file.name)
	# documents = loader.load()
	# embeddings = OpenAIEmbeddings()
	# pdfsearch = Chroma.from_documents(documents, embeddings)

	# chain = ConversationalRetrievalChain.from_llm(ChatOpenAI(temperature=0.3),
	# retriever=pdfsearch.as_retriever(search_kwargs={"k": 1}),
	# return_source_documents=True)
	# return chain

	# # Function to generate a response based on the chat history and query
	# def generate_response(history, query, pdf_upload):
	# global COUNT, N, chat_history, chain
	# if not pdf_upload:
	# raise gr.Error(message='Upload a PDF')

	# if COUNT == 0:
	# chain = process_file(pdf_upload)
	# COUNT += 1

	# # Replace with your LangChain logic to generate a response
	# result = chain({"question": query, 'chat_history': chat_history}, return_only_outputs=True)
	# chat_history += [(query, result["answer"])]
	# N = list(result['source_documents'][0])[1][1]['page'] # Adjust as needed

	# for char in result['answer']:
	# history[-1][-1] += char
	# return history, ''

	# # Function to render a specific page of a PDF file as an image
	# def render_file(file):
	# global N
	# doc = fitz.open(file.name)
	# page = doc[N]
	# pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
	# image = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
	# return image

	# # Function to render initial content from the PDF
	# def render_first(pdf_file):
	# # Replace with logic to process the PDF and generate an initial image
	# image = Image.new('RGB', (600, 400), color = 'white') # Placeholder
	# return image

	# # Streamlit & Gradio Interface

	# st.title("PDF-Powered Chatbot")

	# with st.container():
	# gr.Markdown("""
	# <style>
	# .image-container { height: 680px; }
	# </style>
	# """)

	# with gr.Blocks() as demo:
	# pdf_upload1 = gr.UploadButton("📁 Upload PDF 1", file_types=[".pdf"]) # Define pdf_upload1

	# # ... (rest of your interface creation)

	# txt = gr.Textbox(label="Enter your query", placeholder="Ask a question...")
	# submit_btn = gr.Button('Submit')

	# @submit_btn.click()
	# def on_submit():
	# add_text(chatbot, txt)
	# generate_response(chatbot, txt, pdf_upload1) # Use pdf_upload1 here
	# render_file(pdf_upload1) # Use pdf_upload1 here

	# if __name__ == "__main__":
	# gr.Interface(
	# fn=generate_response,
	# inputs=[
	# "file", # Define pdf_upload1
	# "text", # Define chatbot output
	# "text" # Define txt
	# ],
	# outputs=[
	# "image", # Define show_img
	# "text", # Define chatbot output
	# "text" # Define txt
	# ],
	# title="PDF-Powered Chatbot"
	# ).launch()