# Source header (Hugging Face upload metadata): ishwor2048 — "Upload 3 files",
# commit 2af6090 (verified).
import streamlit as st
import os
import json
import requests
from langchain_community.document_loaders import PyMuPDFLoader
from openai import OpenAI
import tiktoken
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
import tempfile
OPENAI_API_KEY = os.environ.get("API_KEY")
OPENAI_API_BASE = os.environ.get("API_BASE")
# Initialize OpenAI client
client = OpenAI(
api_key=OPENAI_API_KEY,
base_url=OPENAI_API_BASE
)
# Define the system prompt for the model
qna_system_message = """
You are an AI assistant designed to support research teams in efficiently reviewing scientific literature. Your task is to provide evidence-based, concise, and relevant summaries based on the context provided from research papers.
User input will include the necessary context for you to answer their questions. This context will begin with the token:
###Context
The context contains excerpts from one or more research papers, along with associated metadata such as titles, authors, abstracts, keywords, and specific sections relevant to the query.
When crafting your response
-Use only the provided context to answer the question.
-If the answer is found in the context, respond with concise and insight-focused summaries.
-Include the paper title and, where applicable, arXiv ID or section reference as the source.
-If the question is unrelated to the context or the context is empty, clearly respond with: "Sorry, this is out of my knowledge base."
Please adhere to the following response guidelines:
-Provide clear, direct answers using only the given context.
-Do not include any additional information outside of the context.
-Avoid rephrasing or generalizing unless explicitly relevant to the question.
-If no relevant answer exists in the context, respond with: "Sorry, this is out of my knowledge base."
-If the context is not provided, your response should also be: "Sorry, this is out of my knowledge base."
Here is an example of how to structure your response:
Answer:
[Answer based on context]
Source:
[Source details with page or section]
"""
# Define the user message template
qna_user_message_template = """
###Context
Here are some excerpts from GEN AI Research Paper and their sources that are relevant to the Gen AI question mentioned below:
{context}
###Question
{question}
"""
@st.cache_resource
def load_and_process_pdfs(uploaded_files):
all_documents = []
for uploaded_file in uploaded_files:
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
tmp_file.write(uploaded_file.getvalue())
tmp_file_path = tmp_file.name
loader = PyMuPDFLoader(tmp_file_path)
documents = loader.load()
all_documents.extend(documents)
os.remove(tmp_file_path) # Clean up the temporary file
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
encoding_name='cl100k_base',
chunk_size=1000,
)
document_chunks = text_splitter.split_documents(all_documents)
embedding_model = OpenAIEmbeddings(
openai_api_key=OPENAI_API_KEY,
openai_api_base=OPENAI_API_BASE
)
# Create an in-memory vector store (or use a persistent one if needed)
vectorstore = Chroma.from_documents(
document_chunks,
embedding_model
)
return vectorstore.as_retriever(search_type='similarity', search_kwargs={'k': 5})
def generate_rag_response(user_input, retriever, max_tokens=500, temperature=0, top_p=0.95):
# Retrieve relevant document chunks
relevant_document_chunks = retriever.get_relevant_documents(query=user_input)
context_list = [d.page_content for d in relevant_document_chunks]
# Combine document chunks into a single context
context_for_query = ". ".join(context_list)
user_message = qna_user_message_template.replace('{context}', context_for_query)
user_message = user_message.replace('{question}', user_input)
# Generate the response
try:
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": qna_system_message},
{"role": "user", "content": user_message}
],
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p
)
response = response.choices[0].message.content.strip()
except Exception as e:
response = f'Sorry, I encountered the following error: \n {e}'
return response
# Streamlit App
st.title("LLM-Powered Research Assistant")
uploaded_files = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)
retriever = None
if uploaded_files:
st.info("Processing uploaded PDFs...")
retriever = load_and_process_pdfs(uploaded_files)
st.success("PDFs processed and ready for questioning!")
if retriever:
user_question = st.text_input("Ask a question about the uploaded documents:")
if user_question:
with st.spinner("Generating response..."):
rag_response = generate_rag_response(user_question, retriever)
st.write(rag_response)