umangagarwal1008 commited on
Commit
ee67828
·
verified ·
1 Parent(s): 9f2ea6d

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +22 -0
  2. README.md +19 -0
  3. app.py +137 -0
  4. requirements.txt +10 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9-slim
2
+
3
+ WORKDIR /app
4
+
5
+ RUN apt-get update && apt-get install -y \
6
+ build-essential \
7
+ curl \
8
+ software-properties-common \
9
+ git \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ COPY requirements.txt ./
13
+ COPY app.py ./
14
+ COPY src/ ./src/
15
+
16
+ RUN pip3 install -r requirements.txt
17
+
18
+ EXPOSE 8501
19
+
20
+ HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
21
+
22
+ ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0","--server.enableXsrfProtection=false"]
README.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Research Paper RAG
3
+ emoji: 🚀
4
+ colorFrom: red
5
+ colorTo: red
6
+ sdk: streamlit
7
+ app_port: 8501
8
+ tags:
9
+ - streamlit
10
+ pinned: false
11
+ short_description: RAG Q&A over uploaded research-paper PDFs
12
+ ---
13
+
14
+ # Welcome to Streamlit!
15
+
16
+ Edit `app.py` to customize this app to your heart's desire. :heart:
17
+
18
+ If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
19
+ forums](https://discuss.streamlit.io).
app.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import json
4
+ import requests
5
+ from langchain_community.document_loaders import PyMuPDFLoader
6
+ from openai import OpenAI
7
+ import tiktoken
8
+ import pandas as pd
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain_community.embeddings.openai import OpenAIEmbeddings
11
+ from langchain_community.vectorstores import Chroma
12
+ import tempfile
13
+
14
+
15
+ OPENAI_API_KEY = os.environ.get("API_KEY")
16
+ OPENAI_API_BASE = os.environ.get("API_BASE")
17
+
18
+ # Initialize OpenAI client
19
+ client = OpenAI(
20
+ api_key=OPENAI_API_KEY,
21
+ base_url=OPENAI_API_BASE
22
+ )
23
+
24
+ # Define the system prompt for the model
25
+ qna_system_message = """
26
+ You are an AI assistant designed to support professional doctors at St. Bernard's Medical Center. Your task is to provide evidence-based, concise, and relevant medical information to doctors' clinical questions based on the context provided.
27
+
28
+ User input will include the necessary context for you to answer their questions. This context will begin with the token: ###Context. The context contains references to specific portions of trusted medical literature and research articles relevant to the query, along with their source details.
29
+
30
+ When crafting your response:
31
+ 1. Use only the provided context to answer the question.
32
+ 2. If the answer is found in the context, respond with concise and actionable medical insights.
33
+ 3. Include the source reference with the page number, journal name, or publication, as provided in the context.
34
+ 4. If the question is unrelated to the context or the context is empty, clearly respond with: "Sorry, this is out of my knowledge base."
35
+
36
+ Please adhere to the following response guidelines:
37
+ - Provide clear, direct answers using only the given context.
38
+ - Do not include any additional information outside of the context.
39
+ - Avoid rephrasing or summarizing the context unless explicitly relevant to the question.
40
+ - If no relevant answer exists in the context, respond with: "Sorry, this is out of my knowledge base."
41
+ - If the context is not provided, your response should also be: "Sorry, this is out of my knowledge base."
42
+
43
+ Here is an example of how to structure your response:
44
+
45
+ Answer:
46
+ [Answer based on context]
47
+
48
+ Source:
49
+ [Source details with page or section]
50
+ """
51
+
52
+ # Define the user message template
53
+ qna_user_message_template = """
54
+ ###Context
55
+ Here are some excerpts from GEN AI Research Paper and their sources that are relevant to the Gen AI question mentioned below:
56
+ {context}
57
+
58
+ ###Question
59
+ {question}
60
+ """
61
+
62
+ @st.cache_resource
63
+ def load_and_process_pdfs(uploaded_files):
64
+ all_documents = []
65
+ for uploaded_file in uploaded_files:
66
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
67
+ tmp_file.write(uploaded_file.getvalue())
68
+ tmp_file_path = tmp_file.name
69
+ loader = PyMuPDFLoader(tmp_file_path)
70
+ documents = loader.load()
71
+ all_documents.extend(documents)
72
+ os.remove(tmp_file_path) # Clean up the temporary file
73
+ text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
74
+ encoding_name='cl100k_base',
75
+ chunk_size=1000,
76
+ )
77
+ document_chunks = text_splitter.split_documents(all_documents)
78
+
79
+ embedding_model = OpenAIEmbeddings(
80
+ openai_api_key=OPENAI_API_KEY,
81
+ openai_api_base=OPENAI_API_BASE
82
+ )
83
+
84
+ # Create an in-memory vector store (or use a persistent one if needed)
85
+ vectorstore = Chroma.from_documents(
86
+ document_chunks,
87
+ embedding_model
88
+ )
89
+ return vectorstore.as_retriever(search_type='similarity', search_kwargs={'k': 5})
90
+
91
+ def generate_rag_response(user_input, retriever, max_tokens=500, temperature=0, top_p=0.95):
92
+ # Retrieve relevant document chunks
93
+ relevant_document_chunks = retriever.get_relevant_documents(query=user_input)
94
+ context_list = [d.page_content for d in relevant_document_chunks]
95
+
96
+ # Combine document chunks into a single context
97
+ context_for_query = ". ".join(context_list)
98
+
99
+ user_message = qna_user_message_template.replace('{context}', context_for_query)
100
+ user_message = user_message.replace('{question}', user_input)
101
+
102
+ # Generate the response
103
+ try:
104
+ response = client.chat.completions.create(
105
+ model="gpt-4o-mini",
106
+ messages=[
107
+ {"role": "system", "content": qna_system_message},
108
+ {"role": "user", "content": user_message}
109
+ ],
110
+ max_tokens=max_tokens,
111
+ temperature=temperature,
112
+ top_p=top_p
113
+ )
114
+ response = response.choices[0].message.content.strip()
115
+ except Exception as e:
116
+ response = f'Sorry, I encountered the following error: \n {e}'
117
+
118
+ return response
119
+
120
+ # Streamlit App
121
+ st.title("LLM-Powered Research Assistant")
122
+
123
+ uploaded_files = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)
124
+
125
+ retriever = None
126
+ if uploaded_files:
127
+ st.info("Processing uploaded PDFs...")
128
+ retriever = load_and_process_pdfs(uploaded_files)
129
+ st.success("PDFs processed and ready for questioning!")
130
+
131
+
132
+ if retriever:
133
+ user_question = st.text_input("Ask a question about the uploaded documents:")
134
+ if user_question:
135
+ with st.spinner("Generating response..."):
136
+ rag_response = generate_rag_response(user_question, retriever)
137
+ st.write(rag_response)
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain_community==0.3.27
2
+ langchain==0.3.27
3
+ chromadb==1.0.15
4
+ pymupdf==1.26.3
5
+ tiktoken==0.9.0
6
+ datasets==4.0.0
7
+ evaluate==0.4.5
8
+ streamlit==1.35.0
9
+ openai==1.99.1
10
+ langchain_openai==0.3.28