sdmadhav committed on
Commit
fddd403
·
verified ·
1 Parent(s): b3b1005

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +231 -227
app.py CHANGED
@@ -1,228 +1,232 @@
1
- import streamlit as st
2
- import os
3
- import tempfile
4
- from dotenv import load_dotenv
5
-
6
- # Load environment variables
7
- load_dotenv()
8
-
9
- # Import required libraries
10
- try:
11
- from langchain_community.document_loaders import PyPDFLoader
12
- from langchain_text_splitters import RecursiveCharacterTextSplitter
13
- from langchain_core.documents import Document
14
- from langchain_community.retrievers import BM25Retriever
15
- from smolagents import Tool, CodeAgent, InferenceClientModel
16
- except ImportError as e:
17
- st.error(f"Missing dependency: {e}. Please install all requirements.")
18
- st.stop()
19
-
20
- # Custom Retriever Tool
21
- class RetrieverTool(Tool):
22
- name = "retriever"
23
- description = "Uses semantic search to retrieve the parts of the research paper that could be most relevant to answer your query."
24
- inputs = {
25
- "query": {
26
- "type": "string",
27
- "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
28
- }
29
- }
30
- output_type = "string"
31
-
32
- def __init__(self, docs, **kwargs):
33
- super().__init__(**kwargs)
34
- self.retriever = BM25Retriever.from_documents(docs, k=10)
35
-
36
- def forward(self, query: str) -> str:
37
- """Execute the retrieval based on the provided query."""
38
- assert isinstance(query, str), "Your search query must be a string"
39
-
40
- docs = self.retriever.invoke(query)
41
-
42
- return "\nRetrieved documents:\n" + "".join([
43
- f"\n\n===== Document {str(i)} (Page {doc.metadata.get('page', 'N/A')}) =====\n" + doc.page_content
44
- for i, doc in enumerate(docs)
45
- ])
46
-
47
- # Function to load and process PDF
48
- @st.cache_resource
49
- def load_and_process_pdf(pdf_path):
50
- """Load PDF and split into chunks for retrieval."""
51
- try:
52
- # Load PDF
53
- loader = PyPDFLoader(pdf_path)
54
- pages = loader.load()
55
-
56
- # Split into chunks
57
- text_splitter = RecursiveCharacterTextSplitter(
58
- chunk_size=500,
59
- chunk_overlap=50,
60
- add_start_index=True,
61
- strip_whitespace=True,
62
- separators=["\n\n", "\n", ".", " ", ""],
63
- )
64
- docs_processed = text_splitter.split_documents(pages)
65
-
66
- return docs_processed, len(pages)
67
- except Exception as e:
68
- st.error(f"Error processing PDF: {e}")
69
- return None, 0
70
-
71
- # Function to create agent
72
- @st.cache_resource
73
- def create_agent(_docs):
74
- """Create the RAG agent with retriever tool."""
75
- retriever_tool = RetrieverTool(_docs)
76
-
77
- agent = CodeAgent(
78
- tools=[retriever_tool],
79
- model=InferenceClientModel(),
80
- max_steps=4,
81
- verbosity_level=0,
82
- )
83
-
84
- return agent
85
-
86
- # Streamlit UI
87
- def main():
88
- st.set_page_config(
89
- page_title="PaperChat",
90
- page_icon="📄",
91
- layout="wide"
92
- )
93
-
94
- # Header
95
- st.title("📄 PaperChat - Research Paper Q&A Assistant")
96
- st.markdown("""
97
- Upload any research paper (PDF) and ask questions about it.
98
- Powered by Agentic RAG with retrieval capabilities.
99
- """)
100
-
101
- # Sidebar
102
- with st.sidebar:
103
- st.header("📤 Upload Paper")
104
- uploaded_file = st.file_uploader(
105
- "Choose a PDF file",
106
- type="pdf",
107
- help="Upload a research paper in PDF format"
108
- )
109
-
110
- st.markdown("---")
111
- st.subheader("📚 Example Questions")
112
- st.markdown("""
113
- - What is the main contribution of this paper?
114
- - What methodology was used?
115
- - What are the key results?
116
- - What datasets were used?
117
- - What are the limitations mentioned?
118
- """)
119
-
120
- st.markdown("---")
121
- st.subheader("ℹ️ How it works")
122
- st.markdown("""
123
- 1. Upload your paper
124
- 2. The system chunks and indexes it
125
- 3. Ask questions naturally
126
- 4. Get answers with source citations
127
- """)
128
-
129
- # Main content area
130
- if uploaded_file is not None:
131
- # Save uploaded file to temporary location
132
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
133
- tmp_file.write(uploaded_file.read())
134
- tmp_path = tmp_file.name
135
-
136
- # Process PDF
137
- with st.spinner("🔄 Processing your paper... This may take a moment."):
138
- docs, num_pages = load_and_process_pdf(tmp_path)
139
-
140
- if docs:
141
- st.success(f"✅ Paper loaded successfully! ({num_pages} pages, {len(docs)} chunks)")
142
-
143
- # Create agent
144
- with st.spinner("🤖 Initializing AI agent..."):
145
- agent = create_agent(docs)
146
-
147
- st.success("✅ Agent ready! You can now ask questions.")
148
-
149
- # Chat interface
150
- st.markdown("---")
151
- st.subheader("💬 Ask Questions")
152
-
153
- # Initialize chat history
154
- if "messages" not in st.session_state:
155
- st.session_state.messages = []
156
-
157
- # Display chat history
158
- for message in st.session_state.messages:
159
- with st.chat_message(message["role"]):
160
- st.markdown(message["content"])
161
-
162
- # Chat input
163
- if question := st.chat_input("Ask a question about the paper..."):
164
- # Add user message to chat history
165
- st.session_state.messages.append({"role": "user", "content": question})
166
-
167
- # Display user message
168
- with st.chat_message("user"):
169
- st.markdown(question)
170
-
171
- # Generate response
172
- with st.chat_message("assistant"):
173
- with st.spinner("🤔 Thinking..."):
174
- try:
175
- answer = agent.run(question)
176
- st.markdown(answer)
177
-
178
- # Add assistant response to chat history
179
- st.session_state.messages.append({"role": "assistant", "content": answer})
180
- except Exception as e:
181
- error_msg = f"Error generating answer: {str(e)}"
182
- st.error(error_msg)
183
- st.session_state.messages.append({"role": "assistant", "content": error_msg})
184
-
185
- # Clear chat button
186
- if st.button("🗑️ Clear Chat History"):
187
- st.session_state.messages = []
188
- st.rerun()
189
-
190
- # Cleanup temp file
191
- try:
192
- os.unlink(tmp_path)
193
- except:
194
- pass
195
-
196
- else:
197
- # Welcome message when no file is uploaded
198
- st.info("👈 Please upload a research paper PDF from the sidebar to get started.")
199
-
200
- st.markdown("### 🎯 What can you do with PaperChat?")
201
- col1, col2, col3 = st.columns(3)
202
-
203
- with col1:
204
- st.markdown("""
205
- #### 📖 Understand Papers
206
- - Get summaries of complex papers
207
- - Understand methodology
208
- - Learn about key findings
209
- """)
210
-
211
- with col2:
212
- st.markdown("""
213
- #### 🔍 Extract Information
214
- - Find specific details
215
- - Locate datasets used
216
- - Identify citations
217
- """)
218
-
219
- with col3:
220
- st.markdown("""
221
- #### 💡 Learn Faster
222
- - Ask follow-up questions
223
- - Clarify concepts
224
- - Compare approaches
225
- """)
226
-
227
- if __name__ == "__main__":
 
 
 
 
228
  main()
 
1
+ import streamlit as st
2
+ import os
3
+ import tempfile
4
+ from dotenv import load_dotenv
5
+
6
+ # Load environment variables
7
+ load_dotenv()
8
+
9
+ # Import required libraries
10
+ try:
11
+ from langchain_community.document_loaders import PyPDFLoader
12
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
13
+ from langchain_core.documents import Document
14
+ from langchain_community.retrievers import BM25Retriever
15
+ from smolagents import Tool, CodeAgent, InferenceClientModel
16
+ except ImportError as e:
17
+ st.error(f"Missing dependency: {e}. Please install all requirements.")
18
+ st.stop()
19
+
20
+ # Custom Retriever Tool
21
+ class RetrieverTool(Tool):
22
+ name = "retriever"
23
+ description = "Uses semantic search to retrieve the parts of the research paper that could be most relevant to answer your query."
24
+ inputs = {
25
+ "query": {
26
+ "type": "string",
27
+ "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
28
+ }
29
+ }
30
+ output_type = "string"
31
+
32
+ def __init__(self, docs, **kwargs):
33
+ super().__init__(**kwargs)
34
+ self.retriever = BM25Retriever.from_documents(docs, k=10)
35
+
36
+ def forward(self, query: str) -> str:
37
+ """Execute the retrieval based on the provided query."""
38
+ assert isinstance(query, str), "Your search query must be a string"
39
+
40
+ docs = self.retriever.invoke(query)
41
+
42
+ return "\nRetrieved documents:\n" + "".join([
43
+ f"\n\n===== Document {str(i)} (Page {doc.metadata.get('page', 'N/A')}) =====\n" + doc.page_content
44
+ for i, doc in enumerate(docs)
45
+ ])
46
+
47
+ # Function to load and process PDF
48
+ @st.cache_resource
49
+ def load_and_process_pdf(pdf_path):
50
+ """Load PDF and split into chunks for retrieval."""
51
+ try:
52
+ # Load PDF
53
+ loader = PyPDFLoader(pdf_path)
54
+ pages = loader.load()
55
+
56
+ # Split into chunks
57
+ text_splitter = RecursiveCharacterTextSplitter(
58
+ chunk_size=500,
59
+ chunk_overlap=50,
60
+ add_start_index=True,
61
+ strip_whitespace=True,
62
+ separators=["\n\n", "\n", ".", " ", ""],
63
+ )
64
+ docs_processed = text_splitter.split_documents(pages)
65
+
66
+ return docs_processed, len(pages)
67
+ except Exception as e:
68
+ st.error(f"Error processing PDF: {e}")
69
+ return None, 0
70
+
71
+ # Function to create agent
72
+ @st.cache_resource
73
+ def create_agent(_docs):
74
+ """Create the RAG agent with retriever tool."""
75
+ retriever_tool = RetrieverTool(_docs)
76
+
77
+ # Use FREE Hugging Face model (Qwen 2.5 72B via serverless inference)
78
+ agent = CodeAgent(
79
+ tools=[retriever_tool],
80
+ model=InferenceClientModel(
81
+ model_id="Qwen/Qwen2.5-72B-Instruct",
82
+ token=os.getenv("HF_TOKEN")
83
+ ),
84
+ max_steps=4,
85
+ verbosity_level=0,
86
+ )
87
+
88
+ return agent
89
+
90
+ # Streamlit UI
91
+ def main():
92
+ st.set_page_config(
93
+ page_title="PaperChat",
94
+ page_icon="📄",
95
+ layout="wide"
96
+ )
97
+
98
+ # Header
99
+ st.title("📄 PaperChat - Research Paper Q&A Assistant")
100
+ st.markdown("""
101
+ Upload any research paper (PDF) and ask questions about it.
102
+ Powered by Agentic RAG with retrieval capabilities.
103
+ """)
104
+
105
+ # Sidebar
106
+ with st.sidebar:
107
+ st.header("📤 Upload Paper")
108
+ uploaded_file = st.file_uploader(
109
+ "Choose a PDF file",
110
+ type="pdf",
111
+ help="Upload a research paper in PDF format"
112
+ )
113
+
114
+ st.markdown("---")
115
+ st.subheader("📚 Example Questions")
116
+ st.markdown("""
117
+ - What is the main contribution of this paper?
118
+ - What methodology was used?
119
+ - What are the key results?
120
+ - What datasets were used?
121
+ - What are the limitations mentioned?
122
+ """)
123
+
124
+ st.markdown("---")
125
+ st.subheader("ℹ️ How it works")
126
+ st.markdown("""
127
+ 1. Upload your paper
128
+ 2. The system chunks and indexes it
129
+ 3. Ask questions naturally
130
+ 4. Get answers with source citations
131
+ """)
132
+
133
+ # Main content area
134
+ if uploaded_file is not None:
135
+ # Save uploaded file to temporary location
136
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
137
+ tmp_file.write(uploaded_file.read())
138
+ tmp_path = tmp_file.name
139
+
140
+ # Process PDF
141
+ with st.spinner("🔄 Processing your paper... This may take a moment."):
142
+ docs, num_pages = load_and_process_pdf(tmp_path)
143
+
144
+ if docs:
145
+ st.success(f"✅ Paper loaded successfully! ({num_pages} pages, {len(docs)} chunks)")
146
+
147
+ # Create agent
148
+ with st.spinner("🤖 Initializing AI agent..."):
149
+ agent = create_agent(docs)
150
+
151
+ st.success("✅ Agent ready! You can now ask questions.")
152
+
153
+ # Chat interface
154
+ st.markdown("---")
155
+ st.subheader("💬 Ask Questions")
156
+
157
+ # Initialize chat history
158
+ if "messages" not in st.session_state:
159
+ st.session_state.messages = []
160
+
161
+ # Display chat history
162
+ for message in st.session_state.messages:
163
+ with st.chat_message(message["role"]):
164
+ st.markdown(message["content"])
165
+
166
+ # Chat input
167
+ if question := st.chat_input("Ask a question about the paper..."):
168
+ # Add user message to chat history
169
+ st.session_state.messages.append({"role": "user", "content": question})
170
+
171
+ # Display user message
172
+ with st.chat_message("user"):
173
+ st.markdown(question)
174
+
175
+ # Generate response
176
+ with st.chat_message("assistant"):
177
+ with st.spinner("🤔 Thinking..."):
178
+ try:
179
+ answer = agent.run(question)
180
+ st.markdown(answer)
181
+
182
+ # Add assistant response to chat history
183
+ st.session_state.messages.append({"role": "assistant", "content": answer})
184
+ except Exception as e:
185
+ error_msg = f"Error generating answer: {str(e)}"
186
+ st.error(error_msg)
187
+ st.session_state.messages.append({"role": "assistant", "content": error_msg})
188
+
189
+ # Clear chat button
190
+ if st.button("🗑️ Clear Chat History"):
191
+ st.session_state.messages = []
192
+ st.rerun()
193
+
194
+ # Cleanup temp file
195
+ try:
196
+ os.unlink(tmp_path)
197
+ except:
198
+ pass
199
+
200
+ else:
201
+ # Welcome message when no file is uploaded
202
+ st.info("👈 Please upload a research paper PDF from the sidebar to get started.")
203
+
204
+ st.markdown("### 🎯 What can you do with PaperChat?")
205
+ col1, col2, col3 = st.columns(3)
206
+
207
+ with col1:
208
+ st.markdown("""
209
+ #### 📖 Understand Papers
210
+ - Get summaries of complex papers
211
+ - Understand methodology
212
+ - Learn about key findings
213
+ """)
214
+
215
+ with col2:
216
+ st.markdown("""
217
+ #### 🔍 Extract Information
218
+ - Find specific details
219
+ - Locate datasets used
220
+ - Identify citations
221
+ """)
222
+
223
+ with col3:
224
+ st.markdown("""
225
+ #### 💡 Learn Faster
226
+ - Ask follow-up questions
227
+ - Clarify concepts
228
+ - Compare approaches
229
+ """)
230
+
231
+ if __name__ == "__main__":
232
  main()