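"""Gradio app for chatting with an uploaded PDF via a LangChain retrieval pipeline (extract, clean, chunk, index, answer)."""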
import gradio as gr
import fitz # PyMuPDF
import re
import os
from pathlib import Path
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI
from langchain_experimental.text_splitter import SemanticChunker
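# Likely dependencies, inferred from the imports above: gradio, pymupdf, langchain,
# langchain-openai, langchain-chroma, langchain-experimental, chromadb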
openai_api_key = "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"
def extract_text_from_pdf(pdf_file):
    # Open the uploaded PDF with PyMuPDF and concatenate the text of every page
    document = fitz.open(pdf_file)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    document.close()
    return text
def clean_text(text):
    # Collapse whitespace, squeeze runs of 3+ repeated characters, and drop consecutive duplicate words
    cleaned_text = re.sub(r'\s+', ' ', text)
    cleaned_text = re.sub(r'(.)\1{2,}', r'\1', cleaned_text)
    cleaned_text = re.sub(r'\b(\w+)\b(?:\s+\1\b)+', r'\1', cleaned_text)
    return cleaned_text.strip()
def initialize_chatbot(cleaned_text, openai_api_key):
    # Split the cleaned text into semantic chunks and index them in an in-memory Chroma store
    embeddings = OpenAIEmbeddings(api_key=openai_api_key)
    text_splitter = SemanticChunker(embeddings)
    docs = text_splitter.create_documents([cleaned_text])
    vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)
    # Wrap the vector store in a multi-query retriever and a conversational chain with memory
    llm = ChatOpenAI(api_key=openai_api_key, temperature=0.5, model="gpt-4o", verbose=True)
    retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    qa = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)
    return qa
def answer_query(pdf_file, question):
    # Rebuild the retrieval chain for each question: the PDF is re-extracted and re-indexed on every call
    extracted_text = extract_text_from_pdf(pdf_file)
    cleaned_text = clean_text(extracted_text)
    qa = initialize_chatbot(cleaned_text, openai_api_key)
    result = qa.invoke({"question": question})
    return result['answer']
def process_pdf_and_question(pdf_file, question, chat_history):
    # Validate the inputs, then append the (question, answer) pair to the chat history
    if pdf_file is None:
        chat_history.append(("System", "Please upload a PDF file."))
        return chat_history
    if not question.strip():
        chat_history.append(("System", "Please enter a question."))
        return chat_history
    answer = answer_query(pdf_file, question)
    chat_history.append((question, answer))
    return chat_history
with gr.Blocks() as demo:
    chat_history = gr.State([])
    upload = gr.File(label="Upload PDF")
    chatbot = gr.Chatbot(label="Chat History")
    question = gr.Textbox(label="Ask a question", placeholder="Type your question and hit Enter")
    question.submit(
        fn=process_pdf_and_question,
        inputs=[upload, question, chat_history],
        outputs=[chatbot],
    )
if __name__ == "__main__":
    demo.launch()