Spaces:

RAHMAN00700
/

Research_paper

Runtime error

App Files Files Community

RAHMAN00700 committed on Oct 12, 2024

Commit

3e1f2bb

unverified ·

1 Parent(s): 5bfe977

Update app.py

Browse files

Files changed (1) hide show

app.py +145 -41

app.py CHANGED Viewed

@@ -2,23 +2,33 @@ import streamlit as st
 from dotenv import load_dotenv
 from PyPDF2 import PdfReader
 from langchain.text_splitter import CharacterTextSplitter
-from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.chat_models import ChatOpenAI
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 from langchain.llms import HuggingFaceHub
-from ibm_watson_machine_learning.foundation_models import Model
-from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
-from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
-import requests
 import os
 import tempfile
-# Load environment variables
-load_dotenv()
-# IBM Watson credentials
 project_id = os.getenv("PROJECT_ID", None)
 credentials = {
     "url": "https://us-south.ml.cloud.ibm.com",
@@ -35,21 +45,14 @@ def getBearer(apikey):
 credentials["token"] = getBearer(credentials["apikey"])
-# Use a supported model
-model_id = ModelTypes.LLAMA_3_70B_INSTRUCT
-# Initialize Watsonx foundation model
-parameters = {
-    GenParams.DECODING_METHOD: "greedy",
-    GenParams.MAX_NEW_TOKENS: 500,
-    GenParams.MIN_NEW_TOKENS: 0,
-    GenParams.STOP_SEQUENCES: ["\n"],
-    GenParams.REPETITION_PENALTY: 2
-}
 llama_model = Model(
-    model_id=model_id,
-    params=parameters,
     credentials=credentials,
     project_id=project_id
 )
@@ -70,50 +73,151 @@ def get_text_chunks(text):
         chunk_overlap=200,
         length_function=len
     )
-    return text_splitter.split_text(text)
 # Function to create a vector store
 def get_vectorstore(text_chunks):
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    return FAISS.from_texts(text_chunks, embeddings)
 # Function to create a conversation chain
 def get_conversation_chain(vectorstore):
     llm = llama_model.to_langchain()
-    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
-    return ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), memory=memory)
 # Main function
 def main():
     st.set_page_config(page_title="Chat with your Documents", page_icon=":books:")
-    st.header("Chat with Research Papers :books:")
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = None
     user_question = st.text_input("Ask questions to research paper or upload your documents:")
     if st.button("Search") and user_question:
-        with st.spinner("Processing your question..."):
-            prompt = {"question": user_question}
-            response = llama_model.generate(prompt)
-            st.write(response)
     with st.sidebar:
         st.subheader("Your documents")
         pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
-        if pdf_docs:
             if st.button("Process"):
-                with st.spinner("Processing your documents..."):
                     raw_text = get_pdf_text(pdf_docs)
                     text_chunks = get_text_chunks(raw_text)
                     vectorstore = get_vectorstore(text_chunks)
                     st.session_state.conversation = get_conversation_chain(vectorstore)
-                    st.write("Documents loaded")
-if __name__ == "__main__":
     main()

 from dotenv import load_dotenv
 from PyPDF2 import PdfReader
 from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.chat_models import ChatOpenAI
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
+from htmlTemplates import css, bot_template, user_template
 from langchain.llms import HuggingFaceHub
+from langchain.embeddings import HuggingFaceEmbeddings
 import os
+import requests
 import tempfile
+import pandas as pd
+from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
+from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
+from langchain.vectorstores import FAISS
+from langchain.embeddings import TensorflowHubEmbeddings
+# Generation parameters handed to the module-level Watsonx Model (params=parameters).
+parameters = {
+    GenParams.DECODING_METHOD: "greedy",
+    GenParams.MAX_NEW_TOKENS: 500,
+    GenParams.MIN_NEW_TOKENS: 0,
+    # NOTE(review): a "\n" stop sequence presumably ends generation at the first
+    # newline, i.e. single-line answers -- confirm this is intended.
+    GenParams.STOP_SEQUENCES: ["\n"],
+    GenParams.REPETITION_PENALTY: 2
+}
+load_dotenv()
 project_id = os.getenv("PROJECT_ID", None)
 credentials = {
     "url": "https://us-south.ml.cloud.ibm.com",
 credentials["token"] = getBearer(credentials["apikey"])
+# Model class used to create the Watsonx foundation model below.
+from ibm_watson_machine_learning.foundation_models import Model
+model_id = "meta-llama/llama-3-70b-instruct"  # Model identifier, passed as a plain string
+# Module-level Watsonx model; get_conversation_chain() converts it via to_langchain().
 llama_model = Model(
+    model_id=model_id,
+    params=parameters,
     credentials=credentials,
     project_id=project_id
 )
         chunk_overlap=200,
         length_function=len
     )
+    chunks = text_splitter.split_text(text)
+    return chunks
 # Function to create a vector store
 def get_vectorstore(text_chunks):
+    url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
+    embeddings = TensorflowHubEmbeddings(model_url=url)
+    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+    return vectorstore
 # Function to create a conversation chain
 def get_conversation_chain(vectorstore):
     llm = llama_model.to_langchain()
+    memory = ConversationBufferMemory(
+        memory_key='chat_history', return_messages=True)
+    conversation_chain = ConversationalRetrievalChain.from_llm(
+        llm=llm,
+        retriever=vectorstore.as_retriever(),
+        memory=memory
+    )
+    return conversation_chain
+def call_model_flan(question):
+    parameters = {
+        GenParams.DECODING_METHOD: "greedy",
+        GenParams.MAX_NEW_TOKENS: 50,
+        GenParams.MIN_NEW_TOKENS: 1,
+        GenParams.STOP_SEQUENCES: ["<|endoftext|>"],
+        GenParams.REPETITION_PENALTY: 1
+    }
+    # Initialize the Watsonx foundation model
+    llm_model = Model(
+        model_id="meta-llama/llama-3-405b-instruct",
+        params=parameters,
+        credentials=credentials,
+        project_id=project_id
+    )
+    prompt = f"Considering the following question, generate 3 keywords most significant to use when searching in the Arxiv API. Provide your response as a Python list: {question}."
+    result = llm_model.generate(prompt)['results'][0]['generated_text']
+    # Convert string to a list of individual words
+    word_list = result.split(', ')
+    return word_list
+def download_pdf(url, filename):
+    response = requests.get(url)
+    with open(filename, 'wb') as file:
+        file.write(response.content)
+def download_pdf_files(url_list):
+    temp_dir = tempfile.gettempdir()  # Get the temporary directory path
+    downloaded_files = []  # List to store downloaded file paths
+    for i, url in enumerate(url_list):
+        filename = os.path.join(temp_dir, f'file_{i+1}.pdf')  # Set the absolute path in the temporary directory
+        download_pdf(url, filename)
+        downloaded_files.append(filename)  # Append the file name to the list with the path
+        print(f'Downloaded: {filename}')
+    return downloaded_files
+def delete_files_in_temp():
+    temp_dir = tempfile.gettempdir()  # Get the temporary directory path
+    for file in os.listdir(temp_dir):
+        file_path = os.path.join(temp_dir, file)
+        try:
+            if os.path.isfile(file_path):
+                os.unlink(file_path)
+                print(f"Deleted: {file_path}")
+        except Exception as e:
+            print(f"Failed to delete {file_path}: {e}")
+def arxiv_search(topic):
+    import arxiv
+    titles = []
+    pdf_url = []
+    search = arxiv.Search(
+        query=topic,
+        max_results=5,
+        sort_by=arxiv.SortCriterion.Relevance
+    )
+    titles = [result.title for result in arxiv.Client().results(search)]
+    pdf_url = [result.pdf_url for result in arxiv.Client().results(search)]
+    url_list = pdf_url
+    downloaded_files = download_pdf_files(url_list)
+    return downloaded_files, titles
+# Function to handle user input and display responses
+def handle_user_input(user_question, titles=None):
+    prompt = {"question": user_question}
+    response = st.session_state.conversation(prompt)
+    st.session_state.chat_history = response['chat_history']
+    for i, message in enumerate(st.session_state.chat_history):
+        template = user_template if i % 2 == 0 else bot_template
+        st.write(template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
 # Main function
 def main():
+    """Render the Streamlit page: arXiv search, PDF upload and chat.
+
+    "Search" derives keywords from the question, downloads matching arXiv
+    PDFs and builds a conversation chain from them. The sidebar "Process"
+    button builds the chain from uploaded PDFs instead. Whenever a question
+    and a chain are both present, the question is answered and the chat shown.
+    """
     st.set_page_config(page_title="Chat with your Documents", page_icon=":books:")
+    st.write(css, unsafe_allow_html=True)
+    # Create the session keys on first use so later reads do not fail.
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = None
+    st.header("Chat with Research papers :books:")
     user_question = st.text_input("Ask questions to research paper or upload your documents:")
     if st.button("Search") and user_question:
+        with st.spinner("Analyzing query"):
+            original_list = call_model_flan(user_question)
+            # set() removes duplicate keywords; the resulting order is not preserved.
+            unique_list = list(set(original_list))
+            topic = ' '.join(unique_list)  # full topic creation
+        with st.spinner("Searching in Database: " + topic):
+            downloaded_files, titles = arxiv_search(topic)
+        with st.spinner("Vectorizing results"):
+            # Get PDF text and split into chunks
+            raw_text = get_pdf_text(downloaded_files)
+            text_chunks = get_text_chunks(raw_text)
+            # Create vector store and conversation chain
+            vectorstore = get_vectorstore(text_chunks)
+            st.write("Documents loaded")
+            st.session_state.conversation = get_conversation_chain(vectorstore)
+            if titles is not None:
+                enumerated_strings = [f"{index + 1}. {value}" for index, value in enumerate(titles)]
+                combined_string = ', <br> '.join(enumerated_strings)
+                # NOTE(review): titles are inserted into raw HTML unescaped -- confirm this is acceptable.
+                st.write(bot_template.replace("{{MSG}}", "Relevant papers found: " + combined_string), unsafe_allow_html=True)
     with st.sidebar:
         st.subheader("Your documents")
         pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
+        if not pdf_docs:
+            st.write('You can add your document')
+        else:
             if st.button("Process"):
+                with st.spinner("Processing"):
                     raw_text = get_pdf_text(pdf_docs)
                     text_chunks = get_text_chunks(raw_text)
                     vectorstore = get_vectorstore(text_chunks)
+                    st.write("Document loaded")
+                    # Replaces any chain built earlier (e.g. from an arXiv search).
                     st.session_state.conversation = get_conversation_chain(vectorstore)
+    # Answer only when there is both a question and a ready conversation chain.
+    if user_question and st.session_state.conversation is not None:
+        handle_user_input(user_question)
+if __name__ == '__main__':
     main()