|
|
import json
import os
import re
import tempfile

import pandas as pd
import requests
import streamlit as st
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from openai import OpenAI
|
|
| |
| |
| |
|
|
| |
# Endpoint credentials come from the environment; either may be None when
# the corresponding variable is unset.
OPENAI_API_KEY = os.getenv("API_KEY")
OPENAI_API_BASE = os.getenv("API_BASE")

# Single shared chat-completion client, reused for every question.
client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_API_BASE)
|
|
| |
# System prompt for the Q&A completion call: restricts the model to the
# supplied excerpts and tells it to refuse questions outside that context.
qna_system_message = """
You are an AI assistant designed to support research teams in efficiently reviewing scientific literature. Your task is to provide evidence-based, concise, and relevant summaries based on the context provided from research papers.

User input will include the necessary context for you to answer their questions. This context will begin with the token:

###Context
The context contains excerpts from one or more research papers, along with associated metadata such as titles, authors, abstracts, keywords, and specific sections relevant to the query.

When crafting your response
-Use only the provided context to answer the question.
-If the answer is found in the context, respond with concise and insight-focused summaries.
-Include the paper title and, where applicable, arXiv ID or section reference as the source.
-If the question is unrelated to the context or the context is empty, clearly respond with: "Sorry, this is out of my knowledge base."


Please adhere to the following response guidelines:
-Provide clear, direct answers using only the given context.
-Do not include any additional information outside of the context.
-Avoid rephrasing or generalizing unless explicitly relevant to the question.
-If no relevant answer exists in the context, respond with: "Sorry, this is out of my knowledge base."
-If the context is not provided, your response should also be: "Sorry, this is out of my knowledge base."


Here is an example of how to structure your response:

Answer:
[Answer based on context]

Source:
[Source details with page or section]
"""


# Per-request user message template; generate_rag_response() substitutes the
# {context} and {question} placeholders before sending it to the model.
qna_user_message_template = """
###Context
Here are some excerpts from GEN AI Research Paper and their sources that are relevant to the Gen AI question mentioned below:
{context}
###Question
{question}
"""
|
|
@st.cache_resource
def load_and_process_pdfs(uploaded_files):
    """Load uploaded PDFs, chunk them, and build a similarity retriever.

    Parameters
    ----------
    uploaded_files : list
        Streamlit UploadedFile objects for the PDFs to index.

    Returns
    -------
    A Chroma retriever configured for similarity search over the chunked
    documents, returning the top 5 matches per query.
    """
    all_documents = []
    for uploaded_file in uploaded_files:
        # PyMuPDFLoader needs a real filesystem path, so spill the in-memory
        # upload to a temporary file first.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(uploaded_file.getvalue())
            tmp_file_path = tmp_file.name
        try:
            loader = PyMuPDFLoader(tmp_file_path)
            all_documents.extend(loader.load())
        finally:
            # Always remove the temp file; the original leaked it whenever
            # PDF parsing raised before reaching os.remove().
            os.remove(tmp_file_path)

    # Token-based chunking (cl100k_base) keeps each chunk within the
    # embedding model's token budget.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name='cl100k_base',
        chunk_size=1000,
    )
    document_chunks = text_splitter.split_documents(all_documents)

    embedding_model = OpenAIEmbeddings(
        openai_api_key=OPENAI_API_KEY,
        openai_api_base=OPENAI_API_BASE,
    )

    vectorstore = Chroma.from_documents(document_chunks, embedding_model)
    return vectorstore.as_retriever(search_type='similarity', search_kwargs={'k': 5})
|
|
def generate_rag_response(user_input, retriever, max_tokens=500, temperature=0, top_p=0.95):
    """Answer ``user_input`` using context retrieved from the indexed PDFs.

    Parameters
    ----------
    user_input : str
        The user's question.
    retriever : langchain retriever
        Similarity retriever produced by ``load_and_process_pdfs``.
    max_tokens, temperature, top_p :
        Sampling parameters forwarded to the chat-completion call.

    Returns
    -------
    str
        The model's answer, or an error message if the API call fails.
    """
    relevant_document_chunks = retriever.get_relevant_documents(query=user_input)
    context_for_query = ". ".join(d.page_content for d in relevant_document_chunks)

    # Fill both placeholders in a single pass. The original code used two
    # sequential str.replace() calls, so a literal '{question}' occurring in
    # the retrieved PDF text would get expanded with the user's input on the
    # second pass — a subtle prompt-corruption bug. re.sub with a callback
    # never re-scans substituted text.
    substitutions = {'context': context_for_query, 'question': user_input}
    user_message = re.sub(
        r'\{(context|question)\}',
        lambda m: substitutions[m.group(1)],
        qna_user_message_template,
    )

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": qna_system_message},
                {"role": "user", "content": user_message}
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Surface the failure to the UI rather than crashing the app.
        return f'Sorry, I encountered the following error: \n {e}'
|
|
| |
# --- Streamlit page layout -------------------------------------------------
st.title("LLM-Powered Research Assistant")

uploaded_files = st.file_uploader(
    "Upload PDF files", type=["pdf"], accept_multiple_files=True
)

# Build (or fetch from cache) the retriever as soon as files are uploaded.
retriever = None
if uploaded_files:
    st.info("Processing uploaded PDFs...")
    retriever = load_and_process_pdfs(uploaded_files)
    st.success("PDFs processed and ready for questioning!")

# Only offer the question box once an index exists.
if retriever:
    user_question = st.text_input("Ask a question about the uploaded documents:")
    if user_question:
        with st.spinner("Generating response..."):
            st.write(generate_rag_response(user_question, retriever))
|
|