# PDF Q&A — Hugging Face Space (Gradio + Azure OpenAI + LangChain + Chroma)
import os
import time

import gradio as gr
import openai
import tqdm
from PyPDF2 import PdfReader

from langchain import VectorDBQA
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
# Azure OpenAI connection settings. Each value is set on the `openai`
# module (used by the raw openai.Embedding calls) and mirrored into the
# process environment (read by the langchain wrappers).
openai.api_type = "azure"
openai.api_version = "2023-03-15-preview"
openai.api_base = "https://eastus-openai-sean.openai.azure.com/"
os.environ["OPENAI_API_TYPE"] = openai.api_type
os.environ["OPENAI_API_VERSION"] = openai.api_version
os.environ["OPENAI_API_BASE"] = openai.api_base
# The API key is expected to already be present in the environment:
# openai.api_key = os.environ["OPENAI_API_KEY"]
def upload_pdf(file, pdf_text, embeddings, vectorstore, azure_embeddings, qa, progress=gr.Progress(track_tqdm=True)):
    """Extract text from an uploaded PDF, embed each chunk, and build a QA chain.

    Parameters mirror the gr.State components wired up in the Blocks UI; the
    incoming values are ignored and rebuilt from scratch for the new document.

    Returns (in output order): extracted text (for the TextArea), pdf_text
    state, embeddings state, vectorstore state, azure_embeddings state, qa
    state, plus three gr.update() visibility toggles that reveal the chat UI
    and hide the upload column.
    """
    reader = PdfReader(file)
    # Concatenate the extracted text of every page into one string.
    pdf_text = "".join(page.extract_text() for page in reader.pages)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(pdf_text)

    def _embed(chunk):
        """Embed one text chunk via the Azure deployment (single retry point)."""
        response = openai.Embedding.create(
            input=chunk,
            engine="text-embedding-ada-002")
        return response['data'][0]['embedding']

    # Reset rather than append: on a second upload the old gr.State list
    # would otherwise keep the previous document's embeddings, and the
    # lengths would no longer match `texts` in the Chroma add() below.
    embeddings = []
    for text in tqdm.tqdm(texts):
        try:
            embeddings.append(_embed(text))
        except Exception as e:
            # Most likely a rate-limit error: back off once and retry.
            print(e)
            time.sleep(8)
            embeddings.append(_embed(text))

    azure_embeddings = OpenAIEmbeddings(
        deployment="text-embedding-ada-002",
        # Fixed typo: was "ytext-embedding-ada-002" (stray leading "y").
        model="text-embedding-ada-002",
    )
    vectorstore = Chroma("collection", embedding_function=azure_embeddings)
    vectorstore._collection.add(
        ids=[f"doc_{i}" for i in range(len(texts))],
        documents=texts,
        embeddings=embeddings,
        metadatas=[{"source": "source"} for _ in texts])
    qa = RetrievalQA.from_chain_type(
        llm=AzureChatOpenAI(model_name='gpt-35-turbo', deployment_name="gpt-35-turbo"),
        chain_type="stuff",
        retriever=vectorstore.as_retriever())
    return pdf_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa, gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)
def add_text(chatstate, query, qa):
    """Run `query` through the retrieval QA chain and extend the chat history.

    Returns the updated history twice (once for the Chatbot display, once for
    the chat state) followed by the unchanged qa chain, matching the output
    wiring of `text.submit` in the Blocks UI.
    """
    answer = qa.run(query)
    updated_history = [*chatstate, (query, answer)]
    return updated_history, updated_history, qa
with gr.Blocks(css="footer {visibility: hidden}", title='PDF - Q&A') as demo:
    # One gr.State per piece of per-session data. The original chained
    # assignment bound all five names to a SINGLE shared gr.State, so every
    # handler read and wrote the same component and the same component
    # appeared several times in inputs/outputs; they must be independent.
    pdf_text = gr.State([])
    embeddings = gr.State([])
    vectorstore = gr.State([])
    azure_embeddings = gr.State([])
    qa = gr.State([])

    # Chat UI starts hidden; upload_pdf reveals it once a document is indexed.
    with gr.Row(visible=False) as chat_row:
        chatbot = gr.Chatbot()
    with gr.Row(visible=False) as submit_row:
        text = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
        chatstate = gr.State([])
        text.submit(add_text, [chatstate, text, qa], [chatbot, chatstate, qa])

    with gr.Column() as upload_column:
        file = gr.File()
        upload_btn = gr.Button("Upload")
        output_text = gr.TextArea()
        upload_btn.click(
            upload_pdf,
            inputs=[file, pdf_text, embeddings, vectorstore, azure_embeddings, qa],
            outputs=[output_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa,
                     chat_row, submit_row, upload_column])

demo.launch(enable_queue=True)