from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.document_loaders import PyPDFLoader
import os
import chainlit as cl
from langchain.prompts import PromptTemplate

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
system_template = """Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.
The "SOURCES" part should be a reference to the source of the document from which you got your answer.
Example of your response should be:
```
The answer is foo
SOURCES: xyz
```
Begin!
----------------
{summaries}"""

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
prompt = ChatPromptTemplate.from_messages(messages)
# NOTE: this chat prompt is built but never passed to the chain below,
# so the ConversationalRetrievalChain falls back to its default QA prompt.
chain_type_kwargs = {"prompt": prompt}
@cl.on_chat_start  # assumed Chainlit lifecycle hook for sending the avatar at chat start
async def start():
    # Show a custom avatar for the ChatPDF assistant in the UI
    await cl.Avatar(
        name="ChatPDF",
        url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
        # path = r'assets/ChatPDFAvatar.jpg'
    ).send()
@cl.langchain_factory(use_async=True)  # assumed Chainlit wiring: the factory returns the chain used for each message
async def init():
    files = None
    # Wait for the user to upload a file
    while files is None:
        files = await cl.AskFileMessage(
            content="Hey, Welcome to ChatPDF!\n\nChatPDF is a smart, user-friendly tool that integrates state-of-the-art AI models with text extraction and embedding capabilities to create a unique, conversational interaction with your PDF documents.\n\nSimply upload your PDF, ask your questions, and ChatPDF will deliver the most relevant answers directly from your document.\n\nPlease upload a PDF file to begin!",
            max_size_mb=100,
            accept=["application/pdf"],
        ).send()
    file = files[0]

    msg = cl.Message(content=f'Processing "{file.name}"...')
    await msg.send()
    # Persist the uploaded PDF to disk so PyPDFLoader can read it from a path
    with open(os.path.join(file.name), "wb") as f:
        f.write(file.content)
    print(file.name)

    loader = PyPDFLoader(file.name)
    pages = loader.load_and_split()
    # Add page split info:
    # initialize a dictionary to keep track of duplicate page numbers
    page_counts = {}
    for document in pages:
        page_number = document.metadata['page']
        # If this is the first occurrence of this page number, initialize its count to 1;
        # otherwise, increment the count for this page number
        page_counts[page_number] = page_counts.get(page_number, 0) + 1
        # Create the page split info string (e.g. "Page-3.2")
        page_split_info = f"Page-{page_number+1}.{page_counts[page_number]}"
        # Add the page split info to the document's metadata
        document.metadata['page_split_info'] = page_split_info
    # Create a Chroma vector store from the page chunks
    embeddings = OpenAIEmbeddings()
    docsearch = await cl.make_async(Chroma.from_documents)(
        pages, embeddings
    )

    # Define memory: keep the last 5 exchanges of chat history
    memory = ConversationBufferWindowMemory(
        k=5,
        memory_key='chat_history',
        return_messages=True,
        output_key='answer'
    )
    # Create a chain that uses the Chroma vector store
    chain = ConversationalRetrievalChain.from_llm(
        ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k", streaming=True),
        chain_type="stuff",
        retriever=docsearch.as_retriever(search_kwargs={'k': 10}),
        memory=memory,
        return_source_documents=True,
    )

    # Save the page texts in the user session
    # cl.user_session.set("metadatas", metadatas)
    cl.user_session.set("texts", pages)

    # Let the user know that the system is ready
    await msg.update(content=f'"{file.name}" processed. You can now ask questions!')
    return chain
@cl.langchain_postprocess  # assumed Chainlit wiring for formatting the chain's output
async def process_response(res):
    answer = res["answer"]
    source_documents = res['source_documents']

    # Build one side-panel text element per retrieved chunk, named after
    # its page split info (e.g. "Page-3.2")
    content = [doc.page_content for doc in source_documents]
    name = [doc.metadata['page_split_info'] for doc in source_documents]
    source_elements = [
        cl.Text(content=content[i], name=name[i]) for i in range(len(source_documents))
    ]

    if source_documents:
        answer += f"\n\nSources: {', '.join(name)}"
    else:
        answer += "\n\nNo sources found"

    await cl.Message(content=answer, elements=source_elements).send()
    # await cl.Message(content=answer).send()
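
# --- Usage sketch (assumptions: this file is saved as app.py and an OpenAI key is available) ---
# The Chainlit CLI serves the app and calls the decorated callbacks above, e.g.:
#   pip install chainlit langchain chromadb pypdf openai tiktoken
#   export OPENAI_API_KEY=<your-key>
#   chainlit run app.py -w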