Spaces:
Build error
Build error
| # %% | |
| import os | |
| import json | |
| import urllib.parse | |
| from tempfile import _TemporaryFileWrapper | |
| import pandas as pd | |
| import requests | |
| import streamlit as st | |
| from streamlit_chat import message | |
| from streamlit_extras.add_vertical_space import add_vertical_space | |
| from streamlit_extras.colored_header import colored_header | |
# Streamlit page setup; must run before any other st.* call in the script.
st.set_page_config(
    layout="wide",
    page_title="pdfGPT-chat. Ask your PDF!",
    page_icon=":robot_face:",
)
| def main(): | |
def convert_df(df):
    """Serialize *df* to CSV (no index column) and return UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")
def pdf_change():
    """Mark the PDF input as changed so embeddings are rebuilt on the next request."""
    st.session_state.pdf_change = True
def check_api(api_key):
    """Loosely validate an OpenAI key: 'sk-' prefix and exactly 51 characters."""
    has_prefix = api_key.startswith("sk-")
    return has_prefix and len(api_key) == 51
def check_url(url):
    """Return True when *url* parses with both a scheme and a network location."""
    parts = urllib.parse.urlparse(url)
    return bool(parts.scheme) and bool(parts.netloc)
def result_to_dict(r, start):
    """Unpack an lc-serve reply into {"prompt", "answer", "token_used", "gpt_model"}.

    The backend packs its fields into one '###'-separated string; *start*
    drops leading segments (e.g. a banner prefix). OpenAI-side failures
    still arrive with HTTP 200 and an unseparated message, so short
    payloads are normalized to the full key set with the message as the
    answer and zero token usage.

    Bug fixed: the old guard `len(result) >= 0` was always true, and
    `result.extend([result, 0, gpt_model])` appended the list to itself,
    so "gpt_model" mapped to a self-referential list even on success.
    """
    raw = r.json()["result"]
    parts = raw.split("###")[start:]
    keys = ["prompt", "answer", "token_used", "gpt_model"]
    # Error in OpenAI server also gives status_code 200: such payloads
    # carry fewer segments than keys, so pad instead of mis-zipping.
    if len(parts) < len(keys):
        message = parts[0] if parts else raw
        answer = parts[1] if len(parts) > 1 else message
        token_used = parts[2] if len(parts) > 2 else 0
        # gpt_model comes from the enclosing main() scope.
        parts = [message, answer, token_used, gpt_model]
    return dict(zip(keys, parts))
def load_pdf():
    """Validate sidebar inputs, send the PDF (URL or upload) to the lc-serve
    backend, and push the returned summary into the chat history.

    Reads closure state from main(): file, pdf_url, embedding_model,
    gpt_model; and module-level LCSERVE_HOST / OPENAI_API_KEY.
    """
    # Exactly one of URL / uploaded file must be provided.
    if file is None and len(pdf_url) == 0:
        return st.error("Both URL and PDF is empty. Provide at least one.")
    elif len(pdf_url) > 0:
        if not check_url(pdf_url):
            return st.error("Please enter valid URL.")
        elif file is not None:
            return st.error(
                "Both URL and PDF is provided. Please provide only one (either URL or PDF)."
            )
        # load pdf from url
        else:
            r = requests.post(
                f"{LCSERVE_HOST}/load_url",
                json={
                    "url": pdf_url,
                    "rebuild_embedding": st.session_state["pdf_change"],
                    "embedding_model": embedding_model,
                    "gpt_model": gpt_model,
                    "envs": {
                        "OPENAI_API_KEY": OPENAI_API_KEY,
                    }
                },
            )
    # load file
    else:
        _data = {
            "rebuild_embedding": st.session_state["pdf_change"],
            "embedding_model": embedding_model,
            "gpt_model": gpt_model,
            "envs": {
                "OPENAI_API_KEY": OPENAI_API_KEY,
            }
        }
        # File endpoint takes the JSON payload in a query param plus a
        # multipart file upload.
        r = requests.post(
            f"{LCSERVE_HOST}/load_file",
            params={"input_data": json.dumps(_data)},
            files={"file": file},
        )
    if r.status_code != 200:
        if "error" in r.json():
            if "message" in r.json()["error"]:
                return st.error(r.json()["error"]["message"])
            # NOTE(review): returns the raw JSON as a string rather than
            # rendering it; original nesting is ambiguous — confirm this
            # else pairs with the "message" check, not the "error" check.
            else:
                return str(r.json())
    elif r.json()["result"].startswith("Corpus Loaded."):
        st.session_state["loaded"] = True
        st.session_state["pdf_change"] = False
        # extract result (skip the "Corpus Loaded." banner segment)
        result = result_to_dict(r, 1)
        # concatenate reply
        reply_summary = "Hello there. I'm pdfGPT-chat.\nHere is a summary of your PDF:\n\n"
        reply_summary += result["answer"]
        reply_summary += "\n\nDo you have any question about your PDF?"
        # First load replaces the canned greeting; later loads append a
        # new exchange instead.
        if len(st.session_state["past"]) == 1:
            st.session_state["generated"][0] = reply_summary
        else:
            st.session_state["past"].append("Hi")
            st.session_state["generated"].append(reply_summary)
        # calculate cost
        calculate_cost(result["token_used"], result["gpt_model"])
        return st.success("The PDF file has been loaded.")
    else:
        return st.info(r.json()["result"])
def generate_response(
    lcserve_host: str,
    url: str,
    file: _TemporaryFileWrapper,
    question: str,
) -> dict:
    """Ask the lc-serve backend *question* about the PDF at *url* or in *file*.

    Always returns a dict with keys "prompt", "answer", "token_used" and
    "gpt_model" — the caller indexes these. (Bug fixed: error paths used
    to return a bare string, which made the caller's response["answer"]
    lookup raise TypeError.)

    Also reads closure state from main(): embedding_model, gpt_model,
    plus module-level OPENAI_API_KEY.
    """
    if question.strip() == "":
        # Keep the dict contract even for the validation error.
        return {
            "prompt": "",
            "answer": "[ERROR]: Question field is empty",
            "token_used": 0,
            "gpt_model": "",
        }
    _data = {
        "question": question,
        "rebuild_embedding": st.session_state["pdf_change"],
        "embedding_model": embedding_model,
        "gpt_model": gpt_model,
        "envs": {
            "OPENAI_API_KEY": OPENAI_API_KEY,
        },
    }
    # Honor the lcserve_host parameter (the old code ignored it and used
    # the module constant; callers pass LCSERVE_HOST, so behavior is the same).
    if url.strip() != "":
        r = requests.post(
            f"{lcserve_host}/ask_url",
            json={"url": url, **_data},
        )
    else:
        r = requests.post(
            f"{lcserve_host}/ask_file",
            params={"input_data": json.dumps(_data)},
            files={"file": file},
        )
    if r.status_code != 200:
        # Log the failure body; named log_file so it no longer shadows the
        # *file* parameter (the PDF upload).
        with open("langchainlog.txt", "w") as log_file:
            log_file.write(r.content.decode())
        return {
            "prompt": question,
            "answer": f"[ERROR]: {r.text}",
            "token_used": 0,
            "gpt_model": "",
        }
    return result_to_dict(r, 0)
def calculate_cost(token_used, gpt_model):
    """Accumulate token usage and estimated USD cost in session state.

    Bug fixed: the old code priced the *running total* of tokens on every
    call and added that to total_cost, double-counting all earlier calls.
    Only the tokens consumed by this call are priced now.
    """
    tokens = int(token_used)
    st.session_state["total_token"] += tokens
    # Per-1K-token prices used by the original: gpt-3.5 $0.002, otherwise $0.06.
    rate = 0.002 if "gpt-3" in gpt_model else 0.06
    st.session_state["total_cost"] += tokens * rate / 1000
# %%
# main page layout
# Containers render in declaration order, so do not reorder these.
header = st.container()
welcome_page = st.container()
response_container = st.container()
input_container = st.container()
cost_container = st.container()
load_pdf_popup = st.container()
# sidebar layout
input_details = st.sidebar.container()
preferences = st.sidebar.container()
chat_download = st.sidebar.container()
# %%
# instantiate session states
# Seed each key once per browser session; reruns keep stored values.
_state_defaults = {
    "api_key": False,
    "generated": [
        "Hello there. I'm pdfGPT-chat. Do you have any question about your PDF?"
    ],
    "loaded": False,
    "past": ["Hi"],
    "pdf_change": True,
    "total_cost": 0,
    "total_token": 0,
}
for _name, _default in _state_defaults.items():
    if _name not in st.session_state:
        st.session_state[_name] = _default
# %%
# constants
E5_URL = "https://github.com/microsoft/unilm/tree/master/e5"
# selectbox option value -> human-readable label
EMBEDDING_CHOICES = {
    "multilingual-e5-base": "Multilingual-E5 (default)",
    "e5-small-v2": "English-E5-small (faster)",
}
GPT_CHOICES = {
    "gpt-3.5-turbo": "GPT-3.5-turbo (default)",
    "gpt-4": "GPT-4 (smarter, costlier)",
}
# lc-serve backend serving the /load_* and /ask_* endpoints
LCSERVE_HOST = "http://localhost:8080"
# NOTE(review): None when the env var is unset — confirm the backend
# rejects a null key cleanly.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PDFGPT_URL = "https://github.com/bhaskatripathi/pdfGPT"
# Footer HTML/CSS appended at the bottom of the sidebar.
SIGNATURE = """<style>
.footer {
position: static;
left: 0;
bottom: 0;
width: 100%;
background: rgba(0,0,0,0);
text-align: center;
}
</style>
<div class="footer">
<p style='display: block;
text-align: center;
font-size:14px;
color:darkgray'>Developed with ❤ by asyafiqe</p>
</div>
"""
# Page header: title plus an expandable blurb describing the fork.
with header:
    st.title(":page_facing_up: pdfGPT-chat")
    with st.expander(
        "A fork of [pdfGPT](%s) with several improvements. With pdfGPT-chat, you can chat with your PDF files using [**Microsoft E5 Multilingual Text Embeddings**](%s) and **OpenAI**."
        % (PDFGPT_URL, E5_URL)
    ):
        st.markdown(
            "Compared to other tools, pdfGPT-chat provides **hallucinations-free** response, thanks to its superior embeddings and tailored prompt.<br />The generated responses from pdfGPT-chat include **citations** in square brackets ([]), indicating the **page numbers** where the relevant information is found.<br />This feature not only enhances the credibility of the responses but also aids in swiftly locating the pertinent information within the PDF file.",
            unsafe_allow_html=True,
        )
    # Colored divider under the header.
    colored_header(
        label="",
        description="",
        color_name="blue-40",
    )
# Sidebar preferences: embedding and GPT model pickers. The selected
# values (embedding_model, gpt_model) are read by the request helpers.
with preferences:
    colored_header(
        label="",
        description="",
        color_name="blue-40",
    )
    st.write("**Preferences**")
    embedding_model = st.selectbox(
        "Embedding",
        EMBEDDING_CHOICES.keys(),
        help="""[Multilingual-E5](%s) supports 100 languages.
        E5-small is much faster and suitable for PC without GPU."""
        % E5_URL,
        on_change=pdf_change,  # switching embeddings forces a rebuild
        format_func=lambda x: EMBEDDING_CHOICES[x],
    )
    gpt_model = st.selectbox(
        "GPT Model",
        GPT_CHOICES.keys(),
        help="For GPT-4 you might have to join the waitlist: https://openai.com/waitlist/gpt-4-api",
        format_func=lambda x: GPT_CHOICES[x],
    )
# %%
# sidebar
with input_details:
    # sidebar
    pdf_url = st.text_input(
        ":globe_with_meridians: Enter PDF URL here", on_change=pdf_change
    )
    st.markdown(
        "<h2 style='text-align: center; color: black;'>OR</h2>",
        unsafe_allow_html=True,
    )
    file = st.file_uploader(
        ":page_facing_up: Upload your PDF/ Research Paper / Book here",
        type=["pdf"],
        on_change=pdf_change,
    )
    if st.button("Load PDF"):
        # NOTE(review): 'loaded' is set before load_pdf() validates the
        # inputs, so the chat UI unlocks even when loading fails —
        # confirm this is intended.
        st.session_state["loaded"] = True
        with st.spinner("Loading PDF"):
            with load_pdf_popup:
                load_pdf()
# %%
# main tab
# Chat UI is shown only after a PDF has been loaded.
if st.session_state["loaded"]:
    with input_container:
        with st.form(key="input_form", clear_on_submit=True):
            user_input = st.text_area("Question:", key="input", height=100)
            submit_button = st.form_submit_button(label="Send")
        if user_input and submit_button:
            with st.spinner("Processing your question"):
                response = generate_response(
                    LCSERVE_HOST,
                    pdf_url,
                    file,
                    user_input,
                )
                st.session_state.past.append(user_input)
                st.session_state.generated.append(response["answer"])
                # calculate cost
                calculate_cost(response["token_used"], response["gpt_model"])
        if not user_input and submit_button:
            st.error("Please write your question.")
    with response_container:
        if st.session_state["generated"]:
            # Replay the whole chat history as alternating bubbles.
            for i in range(len(st.session_state["generated"])):
                message(
                    st.session_state["past"][i], is_user=True, key=str(i) + "_user"
                )
                message(st.session_state["generated"][i], key=str(i))
            cost_container.caption(
                f"Estimated cost: $ {st.session_state['total_cost']:.4f}"
            )
else:
    with welcome_page:
        st.write("")
        st.subheader(
            """:arrow_left: To start please fill input details in the sidebar and click **Load PDF**"""
        )
# %%
# placed in the end to include the last conversation
with chat_download:
    chat_history = pd.DataFrame(
        {
            "Question": st.session_state["past"],
            "Answer": st.session_state["generated"],
        }
    )
    csv = convert_df(chat_history)
    st.download_button(
        label="Download chat history",
        data=csv,
        file_name="chat history.csv",
        mime="text/csv",
    )
    add_vertical_space(2)
    st.markdown(SIGNATURE, unsafe_allow_html=True)
# %%
# # javascript
#
# # scroll halfway through the page
js = f"""
<script>
function scroll() {{
var textAreas = parent.document.querySelectorAll('section.main');
var halfwayScroll = 0.4 * textAreas[0].scrollHeight; // Calculate halfway scroll position
for (let index = 0; index < textAreas.length; index++) {{
textAreas[index].scrollTop = halfwayScroll; // Set scroll position to halfway
}}
}}
scroll(); // Call the scroll function
</script>
"""
st.components.v1.html(js)
# reduce main top padding
st.markdown(
    "<style>div.block-container{padding-top:1.5em;}</style>",
    unsafe_allow_html=True,
)
# reduce sidebar top padding
# NOTE(review): targets a generated CSS class (.css-ysnqb2.e1g8pov64)
# tied to a specific Streamlit build — verify it still matches.
st.markdown(
    "<style>.css-ysnqb2.e1g8pov64 {margin-top: -90px;}</style>",
    unsafe_allow_html=True,
)
# Run the app when executed as a script.
if __name__ == "__main__":
    main()