ishwor2048 committed on
Commit
2af6090
·
verified ·
1 Parent(s): dbc1081

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +23 -0
  2. app.py +142 -0
  3. requirements.txt +11 -0
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
FROM python:3.9-slim

WORKDIR /app

# System packages:
#   build-essential - compiler toolchain for Python packages without wheels
#   curl            - used by the HEALTHCHECK below
#   git             - kept from the original image (no pinned requirement
#                     currently installs from a VCS URL)
# software-properties-common was removed: nothing in this image calls
# add-apt-repository, and the package is reportedly unavailable on newer
# Debian releases, which makes this whole layer fail once the base image
# is rebuilt on top of them.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies before copying application code so that editing
# app.py or src/ does not invalidate the slow pip layer. --no-cache-dir keeps
# pip's download cache out of the image.
COPY requirements.txt ./
RUN pip3 install --no-cache-dir -r requirements.txt

COPY app.py ./
COPY src/ ./src/

EXPOSE 8501

HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

# NOTE(review): XSRF protection is disabled, presumably so st.file_uploader
# keeps working when the app is served behind a proxy/iframe. Confirm this is
# still required before exposing the container anywhere else.
ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0","--server.enableXsrfProtection=false"]
app.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import streamlit as st
3
+ import os
4
+ import json
5
+ import requests
6
+ from langchain_community.document_loaders import PyMuPDFLoader
7
+ from openai import OpenAI
8
+ import tiktoken
9
+ import pandas as pd
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from langchain_community.embeddings.openai import OpenAIEmbeddings
12
+ from langchain_community.vectorstores import Chroma
13
+ import tempfile
14
+
15
+
16
# Credentials and endpoint are injected through the process environment
# (variables API_KEY and API_BASE); either may be None when unset.
OPENAI_API_KEY = os.getenv("API_KEY")
OPENAI_API_BASE = os.getenv("API_BASE")

# Single module-level client reused for every chat-completion request.
client = OpenAI(base_url=OPENAI_API_BASE, api_key=OPENAI_API_KEY)
24
+
25
+ # System prompt, sent as the "system" message of every chat completion: it restricts answers to the supplied ###Context, fixes the Answer/Source reply layout, and defines the exact refusal sentence for out-of-scope questions.
26
+ qna_system_message = """
27
+ You are an AI assistant designed to support research teams in efficiently reviewing scientific literature. Your task is to provide evidence-based, concise, and relevant summaries based on the context provided from research papers.
28
+
29
+ User input will include the necessary context for you to answer their questions. This context will begin with the token:
30
+
31
+ ###Context
32
+ The context contains excerpts from one or more research papers, along with associated metadata such as titles, authors, abstracts, keywords, and specific sections relevant to the query.
33
+
34
+ When crafting your response
35
+ -Use only the provided context to answer the question.
36
+ -If the answer is found in the context, respond with concise and insight-focused summaries.
37
+ -Include the paper title and, where applicable, arXiv ID or section reference as the source.
38
+ -If the question is unrelated to the context or the context is empty, clearly respond with: "Sorry, this is out of my knowledge base."
39
+
40
+
41
+ Please adhere to the following response guidelines:
42
+ -Provide clear, direct answers using only the given context.
43
+ -Do not include any additional information outside of the context.
44
+ -Avoid rephrasing or generalizing unless explicitly relevant to the question.
45
+ -If no relevant answer exists in the context, respond with: "Sorry, this is out of my knowledge base."
46
+ -If the context is not provided, your response should also be: "Sorry, this is out of my knowledge base."
47
+
48
+
49
+ Here is an example of how to structure your response:
50
+
51
+ Answer:
52
+ [Answer based on context]
53
+
54
+ Source:
55
+ [Source details with page or section]
56
+ """
57
+
58
+ # User-message template: generate_rag_response substitutes the literal {context} and {question} placeholders (retrieved excerpts and the user's question) before sending it as the "user" message.
59
+ qna_user_message_template = """
60
+ ###Context
61
+ Here are some excerpts from GEN AI Research Paper and their sources that are relevant to the Gen AI question mentioned below:
62
+ {context}
63
+ ###Question
64
+ {question}
65
+ """
66
+
67
@st.cache_resource
def load_and_process_pdfs(uploaded_files):
    """Index the uploaded PDFs and return a similarity retriever over them.

    Each upload is written to a temporary ``.pdf`` file (``PyMuPDFLoader``
    is given a filesystem path), parsed, split into chunks of up to 1000
    ``cl100k_base`` tokens, embedded with ``OpenAIEmbeddings`` and stored in
    a Chroma vector store.

    Args:
        uploaded_files: Iterable of uploaded-file objects exposing
            ``getvalue()`` that returns the PDF bytes.

    Returns:
        A retriever (``search_type='similarity'``, ``k=5``) over the chunks.

    Raises:
        ValueError: If no chunks could be produced from the uploads, e.g.
            the list is empty or the loader returned no text.
    """
    import uuid  # local import keeps this block self-contained

    all_documents = []
    for uploaded_file in uploaded_files:
        # Close the temp file before parsing so every byte is flushed to disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(uploaded_file.getvalue())
            tmp_file_path = tmp_file.name
        try:
            all_documents.extend(PyMuPDFLoader(tmp_file_path).load())
        finally:
            # Remove the temp file even when parsing fails.
            os.remove(tmp_file_path)

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name='cl100k_base',
        chunk_size=1000,
    )
    document_chunks = text_splitter.split_documents(all_documents)
    if not document_chunks:
        # Fail with a clear message instead of an opaque vector-store error.
        raise ValueError("No extractable text was found in the uploaded PDFs.")

    embedding_model = OpenAIEmbeddings(
        openai_api_key=OPENAI_API_KEY,
        openai_api_base=OPENAI_API_BASE,
    )

    # NOTE(review): Chroma's in-process client appears to be shared between
    # from_documents() calls, so reusing the default collection name would mix
    # chunks from earlier uploads into this index. A unique name per build
    # avoids that; confirm against the pinned chromadb version.
    vectorstore = Chroma.from_documents(
        document_chunks,
        embedding_model,
        collection_name=f"pdfs-{uuid.uuid4().hex}",
    )
    return vectorstore.as_retriever(search_type='similarity', search_kwargs={'k': 5})
95
+
96
def generate_rag_response(user_input, retriever, max_tokens=500, temperature=0, top_p=0.95):
    """Answer ``user_input`` from retrieved excerpts using the chat model.

    Args:
        user_input: The question typed by the user.
        retriever: Object whose ``invoke(query)`` returns documents exposing
            ``page_content``.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature passed to the chat completion.
        top_p: Nucleus-sampling parameter passed to the chat completion.

    Returns:
        The model's reply with surrounding whitespace stripped, or a
        ``'Sorry, I encountered the following error: ...'`` string when
        retrieval or generation raises.
    """
    import re  # local import keeps this block self-contained

    try:
        # Retrieval sits inside the try as well, so a failure while fetching
        # context is reported the same way as a chat-completion failure.
        relevant_document_chunks = retriever.invoke(user_input)

        # Combine document chunks into a single context
        context_for_query = ". ".join(
            d.page_content for d in relevant_document_chunks
        )

        # Fill both placeholders in one pass. Two chained str.replace calls
        # would also rewrite a literal "{question}" occurring inside the
        # retrieved text. A callable replacement avoids backslash-escape
        # processing of the inserted values.
        substitutions = {"context": context_for_query, "question": user_input}
        user_message = re.sub(
            r"\{(context|question)\}",
            lambda match: substitutions[match.group(1)],
            qna_user_message_template,
        )

        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": qna_system_message},
                {"role": "user", "content": user_message},
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        # content may be None; treat that as an empty reply.
        response = (completion.choices[0].message.content or "").strip()
    except Exception as e:  # UI boundary: report the failure instead of crashing
        response = f'Sorry, I encountered the following error: \n {e}'

    return response
124
+
125
# Streamlit App: upload PDFs, index them, then answer questions about them.
st.title("LLM-Powered Research Assistant")

uploaded_files = st.file_uploader(
    "Upload PDF files", type=["pdf"], accept_multiple_files=True
)

retriever = None
if uploaded_files:
    # A spinner disappears once indexing finishes, whereas an st.info banner
    # would stay on screen next to the success message. This also matches the
    # spinner used for answer generation below.
    with st.spinner("Processing uploaded PDFs..."):
        try:
            retriever = load_and_process_pdfs(uploaded_files)
        except Exception as exc:  # UI boundary: show a message, not a traceback
            st.error(f"Could not process the uploaded PDFs: {exc}")
    if retriever is not None:
        st.success("PDFs processed and ready for questioning!")

# The question box only appears once an index is available.
if retriever is not None:
    user_question = st.text_input("Ask a question about the uploaded documents:")
    if user_question:
        with st.spinner("Generating response..."):
            rag_response = generate_rag_response(user_question, retriever)
        st.write(rag_response)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ langchain_community==0.3.27
3
+ langchain==0.3.27
4
+ chromadb==1.0.15
5
+ pymupdf==1.26.3
6
+ tiktoken==0.9.0
7
+ datasets==4.0.0
8
+ evaluate==0.4.5
9
+ streamlit==1.35.0
10
+ openai==1.99.1
11
+ langchain_openai==0.3.28