Nishitha03 committed on
Commit
00ddd85
·
verified ·
1 Parent(s): 1c0d3c3

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +1539 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,1541 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
1
+ # import os
2
+ # import requests
3
+ # import arxiv
4
+ # from pathlib import Path
5
+ # from typing import List, Dict, Optional
6
+ # import streamlit as st
7
+ # from llama_index.core import (
8
+ # VectorStoreIndex,
9
+ # SimpleDirectoryReader,
10
+ # Settings,
11
+ # Document,
12
+ # StorageContext,
13
+ # load_index_from_storage
14
+ # )
15
+ # from llama_index.core.node_parser import SentenceSplitter
16
+ # from llama_index.llms.groq import Groq
17
+ # from llama_index.embeddings.huggingface import HuggingFaceEmbedding
18
+ # from llama_index.core.query_engine import RetrieverQueryEngine
19
+ # from llama_index.core.retrievers import VectorIndexRetriever
20
+ # from llama_index.core.response_synthesizers import get_response_synthesizer
21
+ # from llama_index.core.memory import ChatMemoryBuffer
22
+ # from llama_index.core.chat_engine import CondensePlusContextChatEngine
23
+ # import logging
24
+ # import hashlib
25
+ # import json
26
+ # import time
27
+ # from datetime import datetime
28
+
29
+ # # Configure logging
30
+ # logging.basicConfig(level=logging.INFO)
31
+ # logger = logging.getLogger(__name__)
32
+
33
+ # class AcademicPaperQA:
34
+ # def __init__(self, model_name="llama3-70b-8192", groq_api_key=None):
35
+ # """Initialize the Academic Paper Q&A system with Groq API"""
36
+ # self.data_dir = Path("./papers")
37
+ # self.storage_dir = Path("./storage")
38
+ # self.model_name = model_name
39
+ # self.groq_api_key = groq_api_key
40
+
41
+ # # Create directories
42
+ # self.data_dir.mkdir(exist_ok=True)
43
+ # self.storage_dir.mkdir(exist_ok=True)
44
+
45
+ # # Initialize models
46
+ # self._setup_models()
47
+
48
+ # # Initialize index and chat engine
49
+ # self.index = None
50
+ # self.query_engine = None
51
+ # self.chat_engine = None
52
+ # self.current_papers_hash = None
53
+ # self.is_ready = False
54
+
55
+ # # Chat history
56
+ # self.chat_history = []
57
+
58
+ # def _setup_models(self):
59
+ # """Setup LLM and embedding models with Groq API"""
60
+ # try:
61
+ # if not self.groq_api_key:
62
+ # raise ValueError("Groq API key is required. Please set GROQ_API_KEY environment variable or pass it directly.")
63
+
64
+ # # Initialize LLM via Groq API with settings optimized for detailed responses
65
+ # self.llm = Groq(
66
+ # model=self.model_name,
67
+ # api_key=self.groq_api_key,
68
+ # temperature=0.3, # Slightly higher for more creative/detailed responses
69
+ # max_tokens=4096, # Maximum tokens for response
70
+ # top_p=0.9, # Nucleus sampling for better quality
71
+ # system_prompt="""You are an expert academic research assistant. When answering questions about research papers, provide comprehensive, detailed responses that include:
72
+
73
+ # 1. Direct answers to the question asked
74
+ # 2. Relevant background context and explanations
75
+ # 3. Specific details from the papers including methodologies, findings, and implications
76
+ # 4. Analysis and interpretation of the information
77
+ # 5. Connections between different concepts when relevant
78
+ # 6. Limitations or caveats when appropriate
79
+
80
+ # Always aim for thorough, well-structured responses that demonstrate deep understanding of the academic content. Use clear paragraphs and explain technical concepts when necessary."""
81
+ # )
82
+
83
+ # # Initialize lightweight embedding model for CPU usage
84
+ # self.embed_model = HuggingFaceEmbedding(
85
+ # model_name="sentence-transformers/all-MiniLM-L6-v2",
86
+ # device="cpu" # Explicitly use CPU
87
+ # )
88
+
89
+ # # Configure global settings
90
+ # Settings.llm = self.llm
91
+ # Settings.embed_model = self.embed_model
92
+ # Settings.chunk_size = 512 # Smaller chunks for better CPU performance
93
+ # Settings.chunk_overlap = 50 # Reduced overlap for CPU efficiency
94
+
95
+ # logger.info(f"Models initialized successfully with {self.model_name} via Groq API")
96
+
97
+ # except Exception as e:
98
+ # logger.error(f"Error setting up models: {e}")
99
+ # raise
100
+
101
+ # def _get_papers_hash(self) -> str:
102
+ # """Generate hash of current papers in directory"""
103
+ # pdf_files = list(self.data_dir.glob("*.pdf"))
104
+ # if not pdf_files:
105
+ # return ""
106
+
107
+ # # Create hash based on filenames and file sizes
108
+ # file_info = []
109
+ # for pdf_file in sorted(pdf_files):
110
+ # file_info.append(f"{pdf_file.name}:{pdf_file.stat().st_size}")
111
+
112
+ # papers_string = "|".join(file_info)
113
+ # return hashlib.md5(papers_string.encode()).hexdigest()
114
+
115
+ # def _save_papers_metadata(self, papers_hash: str):
116
+ # """Save metadata about current papers"""
117
+ # metadata_file = self.storage_dir / "papers_metadata.json"
118
+ # metadata = {
119
+ # "papers_hash": papers_hash,
120
+ # "model_name": self.model_name
121
+ # }
122
+ # with open(metadata_file, "w") as f:
123
+ # json.dump(metadata, f)
124
+
125
+ # def _load_papers_metadata(self) -> Dict:
126
+ # """Load metadata about papers"""
127
+ # metadata_file = self.storage_dir / "papers_metadata.json"
128
+ # if metadata_file.exists():
129
+ # with open(metadata_file, "r") as f:
130
+ # return json.load(f)
131
+ # return {}
132
+
133
+ # def download_arxiv_paper(self, arxiv_id: str) -> Optional[str]:
134
+ # """Download paper from arXiv"""
135
+ # try:
136
+ # search = arxiv.Search(id_list=[arxiv_id])
137
+ # paper = next(search.results())
138
+
139
+ # filename = f"{arxiv_id.replace('/', '_')}.pdf"
140
+ # filepath = self.data_dir / filename
141
+
142
+ # paper.download_pdf(dirpath=str(self.data_dir), filename=filename)
143
+
144
+ # logger.info(f"Downloaded paper: {paper.title}")
145
+ # return str(filepath)
146
+
147
+ # except Exception as e:
148
+ # logger.error(f"Error downloading paper {arxiv_id}: {e}")
149
+ # return None
150
+
151
+ # def load_documents(self, file_paths: List[str] = None) -> List[Document]:
152
+ # """Load documents from PDF files"""
153
+ # try:
154
+ # if file_paths is None:
155
+ # reader = SimpleDirectoryReader(
156
+ # input_dir=str(self.data_dir),
157
+ # required_exts=[".pdf"]
158
+ # )
159
+ # else:
160
+ # reader = SimpleDirectoryReader(input_files=file_paths)
161
+
162
+ # documents = reader.load_data()
163
+ # logger.info(f"Loaded {len(documents)} documents")
164
+
165
+ # return documents
166
+
167
+ # except Exception as e:
168
+ # logger.error(f"Error loading documents: {e}")
169
+ # return []
170
+
171
+ # def create_index(self, documents: List[Document], save_index: bool = True):
172
+ # """Create vector index from documents with CPU-optimized settings"""
173
+ # try:
174
+ # if not documents:
175
+ # raise ValueError("No documents provided for indexing")
176
+
177
+ # logger.info(f"Creating index from {len(documents)} documents")
178
+
179
+ # # CPU-optimized sentence splitter
180
+ # sentence_splitter = SentenceSplitter(
181
+ # chunk_size=512, # Smaller chunks for CPU
182
+ # chunk_overlap=50
183
+ # )
184
+
185
+ # self.index = VectorStoreIndex.from_documents(
186
+ # documents,
187
+ # transformations=[sentence_splitter],
188
+ # show_progress=True
189
+ # )
190
+
191
+ # if save_index:
192
+ # self.index.storage_context.persist(persist_dir=str(self.storage_dir))
193
+ # current_hash = self._get_papers_hash()
194
+ # self._save_papers_metadata(current_hash)
195
+ # self.current_papers_hash = current_hash
196
+ # logger.info("Index saved to storage")
197
+
198
+ # self._create_query_engine()
199
+ # self._create_chat_engine()
200
+ # self.is_ready = True
201
+ # logger.info("Vector index created successfully")
202
+
203
+ # except Exception as e:
204
+ # logger.error(f"Error creating index: {e}")
205
+ # self.is_ready = False
206
+ # raise
207
+
208
+ # def should_rebuild_index(self) -> bool:
209
+ # """Check if index should be rebuilt based on papers"""
210
+ # current_hash = self._get_papers_hash()
211
+
212
+ # if not current_hash:
213
+ # return False
214
+
215
+ # metadata = self._load_papers_metadata()
216
+
217
+ # if not metadata:
218
+ # logger.info("No metadata found, rebuilding index")
219
+ # return True
220
+
221
+ # if metadata.get("papers_hash") != current_hash:
222
+ # logger.info("Papers hash changed, rebuilding index")
223
+ # return True
224
+
225
+ # if metadata.get("model_name") != self.model_name:
226
+ # logger.info("Model changed, rebuilding index")
227
+ # return True
228
+
229
+ # return False
230
+
231
+ # def load_index(self) -> bool:
232
+ # """Load existing index from storage if it matches current papers"""
233
+ # try:
234
+ # if self.should_rebuild_index():
235
+ # logger.info("Index needs to be rebuilt due to changes")
236
+ # return False
237
+
238
+ # index_files = list(self.storage_dir.glob("*"))
239
+ # if not index_files:
240
+ # logger.info("No index files found")
241
+ # return False
242
+
243
+ # storage_context = StorageContext.from_defaults(
244
+ # persist_dir=str(self.storage_dir)
245
+ # )
246
+ # self.index = load_index_from_storage(storage_context)
247
+ # self._create_query_engine()
248
+ # self._create_chat_engine()
249
+ # self.current_papers_hash = self._get_papers_hash()
250
+ # self.is_ready = True
251
+
252
+ # logger.info("Index loaded from storage successfully")
253
+ # return True
254
+
255
+ # except Exception as e:
256
+ # logger.error(f"Error loading index: {e}")
257
+ # self.is_ready = False
258
+ # return False
259
+
260
+ # def _create_query_engine(self):
261
+ # """Create query engine with settings for detailed responses"""
262
+ # try:
263
+ # if not self.index:
264
+ # raise ValueError("No index available for query engine")
265
+
266
+ # retriever = VectorIndexRetriever(
267
+ # index=self.index,
268
+ # similarity_top_k=3 # Reduced for CPU efficiency
269
+ # )
270
+
271
+ # response_synthesizer = get_response_synthesizer(
272
+ # response_mode="compact", # Better for detailed responses
273
+ # streaming=False,
274
+ # text_qa_template="""Context information is below.
275
+ # ---------------------
276
+ # {context_str}
277
+ # ---------------------
278
+ # Given the context information and not prior knowledge, please provide a comprehensive and detailed answer to the question. Include specific details from the research papers, explain methodologies when relevant, discuss findings thoroughly, and provide analysis and implications. Structure your response clearly with proper explanations of technical concepts.
279
+
280
+ # Question: {query_str}
281
+ # Answer: """
282
+ # )
283
+
284
+ # self.query_engine = RetrieverQueryEngine(
285
+ # retriever=retriever,
286
+ # response_synthesizer=response_synthesizer
287
+ # )
288
+
289
+ # logger.info("Query engine created successfully")
290
+
291
+ # except Exception as e:
292
+ # logger.error(f"Error creating query engine: {e}")
293
+ # raise
294
+
295
+ # def _create_chat_engine(self):
296
+ # """Create chat engine for conversational interactions"""
297
+ # try:
298
+ # if not self.index:
299
+ # raise ValueError("No index available for chat engine")
300
+
301
+ # # Create memory buffer for chat history
302
+ # memory = ChatMemoryBuffer.from_defaults(token_limit=2000) # Reduced for efficiency
303
+
304
+ # # Create chat engine
305
+ # self.chat_engine = CondensePlusContextChatEngine.from_defaults(
306
+ # retriever=VectorIndexRetriever(
307
+ # index=self.index,
308
+ # similarity_top_k=3
309
+ # ),
310
+ # memory=memory,
311
+ # llm=self.llm,
312
+ # context_prompt=(
313
+ # "You are an expert academic research assistant having a conversation about research papers. "
314
+ # "Use the following context from the papers to answer questions thoroughly and in detail. "
315
+ # "Provide comprehensive explanations, include specific findings, methodologies, and implications. "
316
+ # "Build upon previous parts of the conversation when relevant.\n"
317
+ # "Context:\n"
318
+ # "{context_str}\n"
319
+ # "Instructions: Answer the user's question in detail using the provided context."
320
+ # ),
321
+ # verbose=True
322
+ # )
323
+
324
+ # logger.info("Chat engine created successfully")
325
+
326
+ # except Exception as e:
327
+ # logger.error(f"Error creating chat engine: {e}")
328
+ # raise
329
+
330
+ # def get_loaded_papers_info(self) -> List[str]:
331
+ # """Get list of currently loaded papers"""
332
+ # pdf_files = list(self.data_dir.glob("*.pdf"))
333
+ # return [pdf_file.name for pdf_file in pdf_files]
334
+
335
+ # def clear_papers(self):
336
+ # """Clear all papers and reset index"""
337
+ # try:
338
+ # # Remove all PDF files
339
+ # for pdf_file in self.data_dir.glob("*.pdf"):
340
+ # pdf_file.unlink()
341
+
342
+ # # Clear storage
343
+ # if self.storage_dir.exists():
344
+ # import shutil
345
+ # shutil.rmtree(self.storage_dir)
346
+ # self.storage_dir.mkdir(exist_ok=True)
347
+
348
+ # # Reset everything
349
+ # self.index = None
350
+ # self.query_engine = None
351
+ # self.chat_engine = None
352
+ # self.current_papers_hash = None
353
+ # self.is_ready = False
354
+ # self.chat_history = []
355
+
356
+ # logger.info("Papers and index cleared")
357
+ # return True
358
+
359
+ # except Exception as e:
360
+ # logger.error(f"Error clearing papers: {e}")
361
+ # return False
362
+
363
+ # def clear_chat_history(self):
364
+ # """Clear chat history and reset memory"""
365
+ # try:
366
+ # self.chat_history = []
367
+ # if self.chat_engine and hasattr(self.chat_engine, 'memory'):
368
+ # self.chat_engine.memory.reset()
369
+ # logger.info("Chat history cleared")
370
+ # except Exception as e:
371
+ # logger.error(f"Error clearing chat history: {e}")
372
+
373
+ # def process_all_papers(self) -> Dict[str, str]:
374
+ # """Process all papers in the directory and create/load index"""
375
+ # try:
376
+ # current_papers = self.get_loaded_papers_info()
377
+ # if not current_papers:
378
+ # return {"error": "No papers found in directory"}
379
+
380
+ # logger.info(f"Processing {len(current_papers)} papers: {current_papers}")
381
+
382
+ # if self.load_index():
383
+ # return {"success": f"Loaded existing index for {len(current_papers)} papers"}
384
+
385
+ # logger.info("Creating new index from documents...")
386
+ # documents = self.load_documents()
387
+
388
+ # if not documents:
389
+ # return {"error": "Failed to load documents from PDF files"}
390
+
391
+ # self.create_index(documents)
392
+
393
+ # if self.is_ready:
394
+ # return {"success": f"Successfully created index for {len(current_papers)} papers"}
395
+ # else:
396
+ # return {"error": "Failed to create index"}
397
+
398
+ # except Exception as e:
399
+ # logger.error(f"Error processing papers: {e}")
400
+ # return {"error": f"Error processing papers: {str(e)}"}
401
+
402
+ # def ask_question(self, question: str, use_chat_engine: bool = True) -> Dict[str, any]:
403
+ # """Ask a question using either chat engine (conversational) or query engine (standalone)"""
404
+ # if not self.is_ready:
405
+ # return {"error": "System not ready. Please process papers first."}
406
+
407
+ # try:
408
+ # logger.info(f"Asking question: {question}")
409
+
410
+ # if use_chat_engine and self.chat_engine:
411
+ # # Use chat engine for conversational context
412
+ # response = self.chat_engine.chat(question)
413
+ # answer = str(response)
414
+
415
+ # # Add to chat history
416
+ # self.chat_history.append({
417
+ # "timestamp": datetime.now().strftime("%H:%M:%S"),
418
+ # "question": question,
419
+ # "answer": answer,
420
+ # "type": "chat"
421
+ # })
422
+
423
+ # # Get sources if available
424
+ # sources = []
425
+ # if hasattr(response, 'source_nodes') and response.source_nodes:
426
+ # for i, node in enumerate(response.source_nodes):
427
+ # sources.append({
428
+ # 'text': node.text[:400] + "..." if len(node.text) > 400 else node.text,
429
+ # 'score': node.score if hasattr(node, 'score') else 'N/A'
430
+ # })
431
+
432
+ # else:
433
+ # # Use query engine for standalone questions
434
+ # response = self.query_engine.query(question)
435
+ # answer = str(response)
436
+
437
+ # # Add to chat history
438
+ # self.chat_history.append({
439
+ # "timestamp": datetime.now().strftime("%H:%M:%S"),
440
+ # "question": question,
441
+ # "answer": answer,
442
+ # "type": "query"
443
+ # })
444
+
445
+ # sources = []
446
+ # if hasattr(response, 'source_nodes') and response.source_nodes:
447
+ # for i, node in enumerate(response.source_nodes):
448
+ # sources.append({
449
+ # 'text': node.text[:400] + "..." if len(node.text) > 400 else node.text,
450
+ # 'score': node.score if hasattr(node, 'score') else 'N/A'
451
+ # })
452
+
453
+ # logger.info(f"Generated answer length: {len(answer)} characters")
454
+
455
+ # return {
456
+ # "answer": answer,
457
+ # "sources": sources,
458
+ # "timestamp": datetime.now().strftime("%H:%M:%S")
459
+ # }
460
+
461
+ # except Exception as e:
462
+ # logger.error(f"Error answering question: {e}")
463
+ # return {"error": f"Error processing question: {str(e)}"}
464
+
465
+ # def create_streamlit_app():
466
+ # """Create Streamlit web interface with chat functionality"""
467
+ # st.set_page_config(
468
+ # page_title="Academic Paper Q&A Bot (Groq Powered)",
469
+ # page_icon="🔬",
470
+ # layout="wide"
471
+ # )
472
+
473
+ # st.title("🔬 Academic Paper Q&A Bot (Groq Powered)")
474
+
475
+ # # Custom CSS for chat interface
476
+ # st.markdown("""
477
+ # <style>
478
+ # .chat-message {
479
+ # padding: 1rem;
480
+ # border-radius: 0.5rem;
481
+ # margin-bottom: 1rem;
482
+ # display: flex;
483
+ # flex-direction: column;
484
+ # }
485
+ # .user-message {
486
+ # background-color: #e3f2fd;
487
+ # margin-left: 20%;
488
+ # }
489
+ # .bot-message {
490
+ # background-color: #f5f5f5;
491
+ # margin-right: 20%;
492
+ # }
493
+ # .message-content {
494
+ # margin: 0.5rem 0;
495
+ # }
496
+ # .message-timestamp {
497
+ # font-size: 0.8rem;
498
+ # color: #666;
499
+ # align-self: flex-end;
500
+ # }
501
+ # .stChatInputContainer {
502
+ # position: fixed;
503
+ # bottom: 0;
504
+ # background: white;
505
+ # padding: 1rem;
506
+ # border-top: 1px solid #e0e0e0;
507
+ # }
508
+ # </style>
509
+ # """, unsafe_allow_html=True)
510
+
511
+ # # API Key configuration in sidebar
512
+ # st.sidebar.header("🔑 API Configuration")
513
+ # groq_api_key = st.sidebar.text_input(
514
+ # "Groq API Key:",
515
+ # type="password",
516
+ # help="Get your free API key from https://console.groq.com/keys"
517
+ # )
518
+
519
+ # if not groq_api_key:
520
+ # groq_api_key = os.getenv("GROQ_API_KEY")
521
+
522
+ # if not groq_api_key:
523
+ # st.sidebar.error("Please enter your Groq API key or set GROQ_API_KEY environment variable")
524
+ # st.info("🔑 **To get started:**\n1. Go to https://console.groq.com/keys\n2. Create a free account\n3. Generate an API key\n4. Enter it in the sidebar")
525
+ # st.stop()
526
+
527
+ # # Model selection in sidebar
528
+ # st.sidebar.header("⚙️ Configuration")
529
+ # model_options = {
530
+ # "Llama3 70B (Most Capable)": "llama3-70b-8192",
531
+ # "Llama3 8B (Fast)": "llama3-8b-8192",
532
+ # "Mixtral 8x7B (Balanced)": "mixtral-8x7b-32768",
533
+ # "Gemma 7B (Efficient)": "gemma-7b-it"
534
+ # }
535
+
536
+ # selected_model = st.sidebar.selectbox(
537
+ # "Choose Groq Model:",
538
+ # list(model_options.keys()),
539
+ # index=0
540
+ # )
541
+
542
+ # model_name = model_options[selected_model]
543
+
544
+ # # Initialize session state
545
+ # if ('qa_system' not in st.session_state or
546
+ # st.session_state.get('current_model') != model_name or
547
+ # st.session_state.get('current_api_key') != groq_api_key):
548
+
549
+ # with st.spinner(f"Initializing system with {selected_model}..."):
550
+ # try:
551
+ # st.session_state.qa_system = AcademicPaperQA(
552
+ # model_name=model_name,
553
+ # groq_api_key=groq_api_key
554
+ # )
555
+ # st.session_state.current_model = model_name
556
+ # st.session_state.current_api_key = groq_api_key
557
+ # st.session_state.papers_loaded = False
558
+ # st.success(f"System initialized with {selected_model} via Groq API!")
559
+ # except Exception as e:
560
+ # st.error(f"Error initializing system: {e}")
561
+ # st.info("Please check your Groq API key and try again.")
562
+ # st.stop()
563
+
564
+ # if 'papers_loaded' not in st.session_state:
565
+ # st.session_state.papers_loaded = False
566
+
567
+ # # Display current model info
568
+ # st.sidebar.info(f"**Current model:** {selected_model}")
569
+ # st.sidebar.success("✅ Using Groq API (Cloud)")
570
+
571
+ # # Show system status
572
+ # if hasattr(st.session_state.qa_system, 'is_ready'):
573
+ # if st.session_state.qa_system.is_ready:
574
+ # st.sidebar.success("✅ System Ready")
575
+ # else:
576
+ # st.sidebar.warning("⚠️ Process papers first")
577
+
578
+ # # Show currently loaded papers
579
+ # current_papers = st.session_state.qa_system.get_loaded_papers_info()
580
+ # if current_papers:
581
+ # st.sidebar.subheader("📚 Loaded Papers:")
582
+ # for paper in current_papers:
583
+ # st.sidebar.text(f"📄 {paper}")
584
+
585
+ # if st.sidebar.button("🗑️ Clear All Papers"):
586
+ # with st.spinner("Clearing papers..."):
587
+ # if st.session_state.qa_system.clear_papers():
588
+ # st.session_state.papers_loaded = False
589
+ # st.sidebar.success("Papers cleared!")
590
+ # st.rerun()
591
+
592
+ # # Chat controls in sidebar
593
+ # st.sidebar.subheader("💬 Chat Controls")
594
+ # if st.sidebar.button("🧹 Clear Chat History"):
595
+ # st.session_state.qa_system.clear_chat_history()
596
+ # st.sidebar.success("Chat cleared!")
597
+ # st.rerun()
598
+
599
+ # # Response mode toggle
600
+ # use_chat_mode = st.sidebar.toggle("💬 Conversational Mode", value=True,
601
+ # help="Enable for follow-up questions and context retention")
602
+
603
+ # # Main interface
604
+ # if not st.session_state.qa_system.is_ready:
605
+ # # Show paper loading interface when system not ready
606
+ # st.header("📥 Load Academic Papers")
607
+
608
+ # col1, col2 = st.columns([1, 1])
609
+
610
+ # with col1:
611
+ # st.subheader("From arXiv")
612
+ # arxiv_id = st.text_input("Enter arXiv ID (e.g., 2301.00001)")
613
+ # if st.button("Download from arXiv"):
614
+ # if arxiv_id:
615
+ # with st.spinner("Downloading paper..."):
616
+ # filepath = st.session_state.qa_system.download_arxiv_paper(arxiv_id)
617
+ # if filepath:
618
+ # st.success(f"Downloaded paper")
619
+ # st.session_state.papers_loaded = False
620
+ # else:
621
+ # st.error("Failed to download paper")
622
+
623
+ # with col2:
624
+ # st.subheader("Upload PDF Files")
625
+ # uploaded_files = st.file_uploader(
626
+ # "Choose PDF files",
627
+ # type="pdf",
628
+ # accept_multiple_files=True
629
+ # )
630
+
631
+ # if uploaded_files:
632
+ # saved_files = []
633
+ # for uploaded_file in uploaded_files:
634
+ # file_path = st.session_state.qa_system.data_dir / uploaded_file.name
635
+ # with open(file_path, "wb") as f:
636
+ # f.write(uploaded_file.getbuffer())
637
+ # saved_files.append(str(file_path))
638
+
639
+ # st.success(f"Uploaded {len(saved_files)} files")
640
+ # st.session_state.papers_loaded = False
641
+
642
+ # # Process papers
643
+ # st.subheader("🔄 Process Papers")
644
+ # current_papers = st.session_state.qa_system.get_loaded_papers_info()
645
+
646
+ # if not current_papers:
647
+ # st.info("No papers found. Please upload or download papers first.")
648
+ # else:
649
+ # st.info(f"Found {len(current_papers)} paper(s): {', '.join(current_papers)}")
650
+
651
+ # if st.button("🚀 Process Papers", type="primary"):
652
+ # with st.spinner("Processing papers (creating embeddings on CPU)..."):
653
+ # result = st.session_state.qa_system.process_all_papers()
654
+
655
+ # if "error" in result:
656
+ # st.error(result["error"])
657
+ # st.session_state.papers_loaded = False
658
+ # else:
659
+ # st.success(result["success"])
660
+ # st.session_state.papers_loaded = True
661
+ # st.rerun()
662
+
663
+ # else:
664
+ # # Main chat interface when system is ready
665
+ # st.header("💬 Chat with Your Papers")
666
+
667
+ # # Show loaded papers info
668
+ # loaded_papers = st.session_state.qa_system.get_loaded_papers_info()
669
+ # st.info(f"📚 Chatting with {len(loaded_papers)} paper(s): {', '.join(loaded_papers)}")
670
+
671
+ # # Chat history display
672
+ # chat_container = st.container()
673
+
674
+ # with chat_container:
675
+ # # Display chat history
676
+ # for i, message in enumerate(st.session_state.qa_system.chat_history[-10:]): # Show last 10 messages
677
+ # # User message
678
+ # st.markdown(f"""
679
+ # <div class="chat-message user-message">
680
+ # <div class="message-content"><strong style="color: black;">You:</strong> {message['question']}</div>
681
+ # <div class="message-timestamp">{message['timestamp']}</div>
682
+ # </div>
683
+ # """, unsafe_allow_html=True)
684
+
685
+ # # Bot response
686
+ # st.markdown(f"""
687
+ # <div class="chat-message bot-message">
688
+ # <div class="message-content"><strong style="color: black;">Assistant:</strong></div>
689
+ # </div>
690
+ # """, unsafe_allow_html=True)
691
+
692
+ # st.write(message['answer'])
693
+ # st.markdown("---")
694
+
695
+ # # Quick question buttons
696
+ # st.subheader("🚀 Quick Questions")
697
+ # col1, col2, col3 = st.columns(3)
698
+
699
+ # quick_question = None
700
+ # with col1:
701
+ # if st.button("🎯 Main Research Question"):
702
+ # quick_question = "What is the main research question or objective addressed in this paper? Please provide a detailed explanation."
703
+ # if st.button("🔬 Methodology"):
704
+ # quick_question = "What methodology or research approach was used in this study? Please explain in detail including any experimental design, data collection methods, and analytical techniques."
705
+
706
+ # with col2:
707
+ # if st.button("📊 Key Findings"):
708
+ # quick_question = "What are the key findings and results of this research? Please provide a comprehensive summary of the main discoveries and their significance."
709
+ # if st.button("🎯 Conclusions"):
710
+ # quick_question = "What are the main conclusions and implications of this research? How do the authors interpret their findings?"
711
+
712
+ # with col3:
713
+ # if st.button("⚠️ Limitations"):
714
+ # quick_question = "What are the limitations of this study? What do the authors identify as potential weaknesses or areas for future research?"
715
+ # if st.button("📋 Summary"):
716
+ # quick_question = "Please provide a comprehensive summary of this paper including the research question, methodology, key findings, and conclusions."
717
+
718
+ # # Chat input
719
+ # st.subheader("💭 Ask Your Question")
720
+ # user_question = st.text_area("Type your question here...", height=100, placeholder="Ask anything about your papers...")
721
+
722
+ # # Use quick question if selected, otherwise use user input
723
+ # question_to_ask = quick_question if quick_question else user_question
724
+
725
+ # col1, col2 = st.columns([3, 1])
726
+ # with col1:
727
+ # if st.button("Send Message", type="primary", disabled=not question_to_ask):
728
+ # if question_to_ask:
729
+ # with st.spinner("Thinking... (Processing via Groq API)"):
730
+ # result = st.session_state.qa_system.ask_question(
731
+ # question_to_ask,
732
+ # use_chat_engine=use_chat_mode
733
+ # )
734
+
735
+ # if "error" in result:
736
+ # st.error(result["error"])
737
+ # else:
738
+ # st.rerun() # Reload to show new message
739
+
740
+ # with col2:
741
+ # response_mode = "💬 Chat Mode" if use_chat_mode else "❓ Q&A Mode"
742
+ # st.info(response_mode)
743
+
744
+ # # Sources section (show for last question if available)
745
+ # if (st.session_state.qa_system.chat_history and
746
+ # st.session_state.qa_system.chat_history[-1].get('sources')):
747
+
748
+ # with st.expander("πŸ“š View Sources", expanded=False):
749
+ # sources = st.session_state.qa_system.chat_history[-1]['sources']
750
+ # for i, source in enumerate(sources, 1):
751
+ # st.markdown(f"**Source {i}** (Relevance: {source['score']})")
752
+ # st.text(source['text'])
753
+ # st.markdown("---")
754
+
755
+ # if __name__ == "__main__":
756
+ # create_streamlit_app()
757
+
758
+
759
+
760
+
761
+ import os
762
+ import requests
763
+ import arxiv
764
+ from pathlib import Path
765
+ from typing import List, Dict, Optional
766
  import streamlit as st
767
+ from llama_index.core import (
768
+ VectorStoreIndex,
769
+ SimpleDirectoryReader,
770
+ Settings,
771
+ Document,
772
+ StorageContext,
773
+ load_index_from_storage
774
+ )
775
+ from llama_index.core.node_parser import SentenceSplitter
776
+ from llama_index.llms.groq import Groq
777
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
778
+ from llama_index.core.query_engine import RetrieverQueryEngine
779
+ from llama_index.core.retrievers import VectorIndexRetriever
780
+ from llama_index.core.response_synthesizers import get_response_synthesizer
781
+ from llama_index.core.memory import ChatMemoryBuffer
782
+ from llama_index.core.chat_engine import CondensePlusContextChatEngine
783
+ import logging
784
+ import hashlib
785
+ import json
786
+ import time
787
+ from datetime import datetime
788
+
789
# Configure logging: install an INFO-level root configuration and create the
# module-scoped logger used by the class and functions defined in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
792
+
793
class AcademicPaperQA:
    """Question answering over a folder of academic PDFs.

    PDFs in ``./papers`` are split into chunks, embedded with a HuggingFace
    sentence-embedding model on CPU, and stored in a LlamaIndex vector index
    persisted under ``./storage``. Questions are answered by a Groq-hosted LLM
    through either a conversational chat engine or a standalone query engine.

    Attributes set by ``__init__``:
        data_dir / storage_dir: working directories (created if missing).
        index, query_engine, chat_engine: built by ``create_index`` or
            ``load_index``; ``None`` until then.
        is_ready: True once an index and both engines exist.
        chat_history: list of dicts with keys ``timestamp``, ``question``,
            ``answer``, ``type`` and ``sources``.
    """

    def __init__(self, model_name="llama3-70b-8192", groq_api_key=None):
        """Create the working directories, configure models and reset state.

        Args:
            model_name: Groq model identifier handed to the LLM client.
            groq_api_key: Groq API key. ``_setup_models`` raises if it is empty.

        Raises:
            ValueError: If no API key is supplied (raised by ``_setup_models``).
        """
        self.data_dir = Path("./papers")
        self.storage_dir = Path("./storage")
        self.model_name = model_name
        self.groq_api_key = groq_api_key

        # Create directories
        self.data_dir.mkdir(exist_ok=True)
        self.storage_dir.mkdir(exist_ok=True)

        # Initialize models
        self._setup_models()

        # Initialize index and chat engine
        self.index = None
        self.query_engine = None
        self.chat_engine = None
        self.current_papers_hash = None
        self.is_ready = False

        # Chat history
        self.chat_history = []

    def _setup_models(self):
        """Build the Groq LLM and the CPU embedding model and register them globally.

        Sets ``self.llm`` and ``self.embed_model`` and writes them, together with
        the chunking parameters, into the LlamaIndex ``Settings`` singleton.

        Raises:
            ValueError: If ``self.groq_api_key`` is empty.
            Exception: Any model-construction error is logged and re-raised.
        """
        try:
            if not self.groq_api_key:
                raise ValueError("Groq API key is required. Please set GROQ_API_KEY environment variable or pass it directly.")

            system_prompt = (
                "You are an expert academic research assistant. Provide comprehensive, "
                "detailed responses about research papers including:\n"
                "\n"
                "1. Direct answers to questions\n"
                "2. Relevant background context\n"
                "3. Specific details from papers including methodologies and findings\n"
                "4. Analysis and interpretation\n"
                "5. Connections between concepts when relevant\n"
                "\n"
                "Keep responses thorough but concise to stay within token limits."
            )

            # Initialize LLM via Groq API with conservative token settings
            self.llm = Groq(
                model=self.model_name,
                api_key=self.groq_api_key,
                temperature=0.3,
                max_tokens=2048,  # Reduced max tokens to prevent context overflow
                top_p=0.9,
                system_prompt=system_prompt
            )

            # Initialize lightweight embedding model for CPU usage
            try:
                self.embed_model = HuggingFaceEmbedding(
                    model_name="sentence-transformers/all-MiniLM-L6-v2",
                    device="cpu",
                    max_length=512  # Explicit max length to prevent issues
                )
            except Exception as e:
                logger.warning(f"Failed to load HuggingFace embedding, trying alternative: {e}")
                # Fallback to a different embedding model
                self.embed_model = HuggingFaceEmbedding(
                    model_name="BAAI/bge-small-en-v1.5",
                    device="cpu",
                    max_length=512
                )

            # Configure global settings with conservative values
            Settings.llm = self.llm
            Settings.embed_model = self.embed_model
            Settings.chunk_size = 256  # Smaller chunks to prevent context overflow
            Settings.chunk_overlap = 25  # Reduced overlap

            logger.info(f"Models initialized successfully with {self.model_name} via Groq API")

        except Exception as e:
            logger.error(f"Error setting up models: {e}")
            raise

    def _get_papers_hash(self) -> str:
        """Return a fingerprint of the PDFs currently in ``data_dir``.

        The fingerprint is the MD5 hex digest of ``name:size`` pairs for every
        ``*.pdf`` file, sorted and joined with ``|``. It is used only for change
        detection, not for security. Returns ``""`` when there are no PDFs.
        """
        pdf_files = sorted(self.data_dir.glob("*.pdf"))
        if not pdf_files:
            return ""

        papers_string = "|".join(
            f"{pdf_file.name}:{pdf_file.stat().st_size}" for pdf_file in pdf_files
        )
        return hashlib.md5(papers_string.encode()).hexdigest()

    def _save_papers_metadata(self, papers_hash: str):
        """Write the papers fingerprint and model name to ``papers_metadata.json``."""
        metadata_file = self.storage_dir / "papers_metadata.json"
        metadata = {
            "papers_hash": papers_hash,
            "model_name": self.model_name
        }
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(metadata, f)

    def _load_papers_metadata(self) -> Dict:
        """Read ``papers_metadata.json`` from ``storage_dir``.

        Returns:
            The stored metadata dict, or ``{}`` when the file is missing,
            unreadable, or not a JSON object. An empty dict makes
            ``should_rebuild_index`` request a rebuild.
        """
        metadata_file = self.storage_dir / "papers_metadata.json"
        if not metadata_file.exists():
            return {}
        try:
            with open(metadata_file, "r", encoding="utf-8") as f:
                metadata = json.load(f)
        except (OSError, ValueError):
            # json.JSONDecodeError is a ValueError subclass; a corrupt file is
            # treated the same as a missing one.
            return {}
        return metadata if isinstance(metadata, dict) else {}

    def download_arxiv_paper(self, arxiv_id: str) -> Optional[str]:
        """Download one arXiv paper into ``data_dir``.

        Args:
            arxiv_id: arXiv identifier, e.g. ``"2301.00001"``. Surrounding
                whitespace is ignored; ``/`` is replaced by ``_`` in the file name.

        Returns:
            Path of the saved PDF as a string, or ``None`` if the paper was not
            found or the download failed (the error is logged).
        """
        try:
            arxiv_id = arxiv_id.strip()
            search = arxiv.Search(id_list=[arxiv_id])
            paper = next(search.results(), None)
            if paper is None:
                logger.error(f"Error downloading paper {arxiv_id}: no matching paper found")
                return None

            filename = f"{arxiv_id.replace('/', '_')}.pdf"
            filepath = self.data_dir / filename

            paper.download_pdf(dirpath=str(self.data_dir), filename=filename)

            logger.info(f"Downloaded paper: {paper.title}")
            return str(filepath)

        except Exception as e:
            logger.error(f"Error downloading paper {arxiv_id}: {e}")
            return None

    def load_documents(self, file_paths: Optional[List[str]] = None) -> List["Document"]:
        """Load PDF content as LlamaIndex documents.

        Args:
            file_paths: Explicit files to read. When ``None``, every ``.pdf``
                directly inside ``data_dir`` is read (non-recursive).

        Returns:
            Documents whose stripped text is longer than 50 characters. Texts
            longer than 50,000 characters are cut and suffixed with
            ``"... [Document truncated]"``. Returns ``[]`` on any error.
        """
        try:
            if file_paths is None:
                reader = SimpleDirectoryReader(
                    input_dir=str(self.data_dir),
                    required_exts=[".pdf"],
                    recursive=False  # Explicit setting
                )
            else:
                reader = SimpleDirectoryReader(input_files=file_paths)

            documents = reader.load_data()
            logger.info(f"Loaded {len(documents)} documents")

            # Clean and validate documents
            cleaned_documents = []
            for doc in documents:
                if doc.text and len(doc.text.strip()) > 50:  # Filter out very short documents
                    # Truncate very long documents to prevent memory issues
                    if len(doc.text) > 50000:
                        doc.text = doc.text[:50000] + "... [Document truncated]"
                    cleaned_documents.append(doc)

            logger.info(f"After cleaning: {len(cleaned_documents)} valid documents")
            return cleaned_documents

        except Exception as e:
            logger.error(f"Error loading documents: {e}")
            return []

    def create_index(self, documents: List["Document"], save_index: bool = True):
        """Chunk ``documents``, build the vector index and create both engines.

        Args:
            documents: Documents to index; must be non-empty.
            save_index: When True, persist the index to ``storage_dir`` and
                record the current papers fingerprint.

        Raises:
            ValueError: If ``documents`` is empty.
            Exception: Any indexing error is logged and re-raised after
                ``is_ready`` has been set to False.
        """
        try:
            if not documents:
                raise ValueError("No documents provided for indexing")

            logger.info(f"Creating index from {len(documents)} documents")

            # CPU-optimized sentence splitter with smaller chunks
            sentence_splitter = SentenceSplitter(
                chunk_size=256,  # Smaller chunks to prevent context overflow
                chunk_overlap=25,
                separator=" "  # Explicit separator
            )

            # Process documents in smaller batches to prevent memory issues
            batch_size = 5
            total_batches = (len(documents) + batch_size - 1) // batch_size
            all_nodes = []

            for i in range(0, len(documents), batch_size):
                batch = documents[i:i + batch_size]
                logger.info(f"Processing batch {i//batch_size + 1}/{total_batches}")

                nodes = sentence_splitter.get_nodes_from_documents(batch)
                all_nodes.extend(nodes)

            # Create index from nodes
            self.index = VectorStoreIndex(
                nodes=all_nodes,
                show_progress=True
            )

            if save_index:
                self.index.storage_context.persist(persist_dir=str(self.storage_dir))
                current_hash = self._get_papers_hash()
                self._save_papers_metadata(current_hash)
                self.current_papers_hash = current_hash
                logger.info("Index saved to storage")

            self._create_query_engine()
            self._create_chat_engine()
            self.is_ready = True
            logger.info("Vector index created successfully")

        except Exception as e:
            logger.error(f"Error creating index: {e}")
            self.is_ready = False
            raise

    def should_rebuild_index(self) -> bool:
        """Decide whether the persisted index is stale.

        Returns:
            False when there are no PDFs, or when stored metadata matches both
            the current papers fingerprint and ``model_name``. True when metadata
            is missing or either value differs.
        """
        current_hash = self._get_papers_hash()

        if not current_hash:
            return False

        metadata = self._load_papers_metadata()

        if not metadata:
            logger.info("No metadata found, rebuilding index")
            return True

        if metadata.get("papers_hash") != current_hash:
            logger.info("Papers hash changed, rebuilding index")
            return True

        if metadata.get("model_name") != self.model_name:
            logger.info("Model changed, rebuilding index")
            return True

        return False

    def load_index(self) -> bool:
        """Load the persisted index if it still matches the current papers.

        Returns:
            True if the index was loaded and both engines were created. False if
            a rebuild is needed, nothing is stored, or loading failed (the error
            is logged and ``is_ready`` is set to False).
        """
        try:
            if self.should_rebuild_index():
                logger.info("Index needs to be rebuilt due to changes")
                return False

            index_files = list(self.storage_dir.glob("*"))
            if not index_files:
                logger.info("No index files found")
                return False

            storage_context = StorageContext.from_defaults(
                persist_dir=str(self.storage_dir)
            )
            self.index = load_index_from_storage(storage_context)
            self._create_query_engine()
            self._create_chat_engine()
            self.current_papers_hash = self._get_papers_hash()
            self.is_ready = True

            logger.info("Index loaded from storage successfully")
            return True

        except Exception as e:
            logger.error(f"Error loading index: {e}")
            self.is_ready = False
            return False

    def _create_query_engine(self):
        """Create the standalone (non-conversational) query engine.

        Retrieves the top 2 chunks and synthesizes an answer in ``compact`` mode
        using a custom question-answering prompt.

        Raises:
            ValueError: If no index has been built or loaded.
        """
        try:
            if not self.index:
                raise ValueError("No index available for query engine")

            # Local import: the synthesizer expects a prompt-template object, not a
            # bare string, so the template text is wrapped before being passed in.
            from llama_index.core import PromptTemplate

            retriever = VectorIndexRetriever(
                index=self.index,
                similarity_top_k=2  # Reduced to prevent context overflow
            )

            text_qa_template = PromptTemplate(
                "Context information is below.\n"
                "---------------------\n"
                "{context_str}\n"
                "---------------------\n"
                "Based on the context information, provide a comprehensive answer to the question. "
                "Include specific details from the research papers and explain key concepts clearly.\n"
                "\n"
                "Question: {query_str}\n"
                "Answer: "
            )

            response_synthesizer = get_response_synthesizer(
                response_mode="compact",  # More efficient for context management
                streaming=False,
                text_qa_template=text_qa_template
            )

            self.query_engine = RetrieverQueryEngine(
                retriever=retriever,
                response_synthesizer=response_synthesizer
            )

            logger.info("Query engine created successfully")

        except Exception as e:
            logger.error(f"Error creating query engine: {e}")
            raise

    def _create_chat_engine(self):
        """Create the conversational chat engine with a small memory buffer.

        Raises:
            ValueError: If no index has been built or loaded.
        """
        try:
            if not self.index:
                raise ValueError("No index available for chat engine")

            # Create memory buffer with smaller token limit to prevent overflow
            memory = ChatMemoryBuffer.from_defaults(token_limit=1000)

            # Create chat engine with conservative settings
            self.chat_engine = CondensePlusContextChatEngine.from_defaults(
                retriever=VectorIndexRetriever(
                    index=self.index,
                    similarity_top_k=2  # Reduced to manage context size
                ),
                memory=memory,
                llm=self.llm,
                context_prompt=(
                    "You are an expert academic research assistant. "
                    "Use the following context to answer questions thoroughly but concisely. "
                    "Context:\n{context_str}\n"
                    "Answer the user's question based on the provided context."
                ),
                verbose=True,
                # NOTE(review): these two are forwarded as extra keyword arguments;
                # confirm the installed llama_index version actually honours them.
                context_window=4096,  # Conservative context window
                max_tokens=1500  # Conservative max tokens for response
            )

            logger.info("Chat engine created successfully")

        except Exception as e:
            logger.error(f"Error creating chat engine: {e}")
            raise

    def get_loaded_papers_info(self) -> List[str]:
        """Return the file names of all PDFs in ``data_dir``, sorted by name."""
        return sorted(pdf_file.name for pdf_file in self.data_dir.glob("*.pdf"))

    def clear_papers(self):
        """Delete all PDFs and the persisted index, then reset in-memory state.

        Returns:
            True on success, False if any step failed (the error is logged).
        """
        try:
            # Remove all PDF files
            for pdf_file in self.data_dir.glob("*.pdf"):
                pdf_file.unlink()

            # Clear storage
            if self.storage_dir.exists():
                import shutil
                shutil.rmtree(self.storage_dir)
            self.storage_dir.mkdir(exist_ok=True)

            # Reset everything
            self.index = None
            self.query_engine = None
            self.chat_engine = None
            self.current_papers_hash = None
            self.is_ready = False
            self.chat_history = []

            logger.info("Papers and index cleared")
            return True

        except Exception as e:
            logger.error(f"Error clearing papers: {e}")
            return False

    def clear_chat_history(self):
        """Empty ``chat_history`` and reset the chat engine's memory, if any."""
        try:
            self.chat_history = []
            if self.chat_engine and hasattr(self.chat_engine, 'memory'):
                self.chat_engine.memory.reset()
            logger.info("Chat history cleared")
        except Exception as e:
            logger.error(f"Error clearing chat history: {e}")

    def process_all_papers(self) -> Dict[str, str]:
        """Load the existing index for the current PDFs or build a new one.

        Returns:
            ``{"success": message}`` or ``{"error": message}``.
        """
        try:
            current_papers = self.get_loaded_papers_info()
            if not current_papers:
                return {"error": "No papers found in directory"}

            logger.info(f"Processing {len(current_papers)} papers: {current_papers}")

            if self.load_index():
                return {"success": f"Loaded existing index for {len(current_papers)} papers"}

            logger.info("Creating new index from documents...")
            documents = self.load_documents()

            if not documents:
                return {"error": "Failed to load documents from PDF files"}

            self.create_index(documents)

            if self.is_ready:
                return {"success": f"Successfully created index for {len(current_papers)} papers"}
            return {"error": "Failed to create index"}

        except Exception as e:
            logger.error(f"Error processing papers: {e}")
            return {"error": f"Error processing papers: {str(e)}"}

    def ask_question(self, question: str, use_chat_engine: bool = True) -> Dict[str, object]:
        """Answer ``question`` and record the exchange in ``chat_history``.

        Args:
            question: The user's question. Text beyond 500 characters is cut and
                suffixed with ``"..."``.
            use_chat_engine: Prefer the conversational engine. If it is missing
                or raises, the standalone query engine is used instead.

        Returns:
            ``{"answer", "sources", "timestamp"}`` on success, where ``sources``
            is a list of ``{"text", "score"}`` dicts (text capped at 300
            characters). ``{"error": message}`` when the system is not ready,
            the question is blank, or answering failed.
        """
        if not self.is_ready:
            return {"error": "System not ready. Please process papers first."}

        if not question or not question.strip():
            return {"error": "Please enter a question."}

        try:
            logger.info(f"Asking question: {question}")

            # Truncate very long questions to prevent context overflow
            if len(question) > 500:
                question = question[:500] + "..."
                logger.warning("Question truncated to prevent context overflow")

            answered_by_chat = False
            response = None
            if use_chat_engine and self.chat_engine:
                # Use chat engine for conversational context
                try:
                    response = self.chat_engine.chat(question)
                    answered_by_chat = True
                except Exception as chat_error:
                    logger.warning(f"Chat engine failed, falling back to query engine: {chat_error}")

            if not answered_by_chat:
                # Standalone question, or fallback after a chat-engine failure
                response = self.query_engine.query(question)

            answer = str(response)

            # Collect sources before recording history so the UI can show them
            sources = []
            for node in getattr(response, 'source_nodes', None) or []:
                node_text = node.text
                sources.append({
                    'text': node_text[:300] + "..." if len(node_text) > 300 else node_text,
                    'score': getattr(node, 'score', 'N/A')
                })

            timestamp = datetime.now().strftime("%H:%M:%S")

            # Add to chat history
            self.chat_history.append({
                "timestamp": timestamp,
                "question": question,
                "answer": answer,
                "type": "chat" if answered_by_chat else "query",
                "sources": sources
            })

            logger.info(f"Generated answer length: {len(answer)} characters")

            return {
                "answer": answer,
                "sources": sources,
                "timestamp": timestamp
            }

        except Exception as e:
            logger.error(f"Error answering question: {e}")
            return {"error": f"Error processing question: {str(e)}"}
1253
+
1254
def create_streamlit_app():
    """Render the Streamlit UI for loading papers and chatting about them.

    Flow on every script run:
      1. Read the Groq API key (sidebar input, else ``GROQ_API_KEY``) and stop
         if none is available.
      2. (Re)create the ``AcademicPaperQA`` instance in ``st.session_state``
         whenever the selected model or API key changes.
      3. If the system is not ready, show the download/upload/process screen;
         otherwise show the chat screen with history, quick questions, a free
         text box and the sources of the last answer.

    Quick-question buttons send their question immediately. Free text is sent
    when "Send Message" is clicked.
    """
    # Local import: used to escape user text that is embedded in raw HTML below.
    import html

    st.set_page_config(
        page_title="Academic Paper Q&A Bot (Groq Powered)",
        page_icon="🔬",
        layout="wide"
    )

    st.title("🔬 Academic Paper Q&A Bot (Groq Powered)")

    # Custom CSS for chat interface
    st.markdown("""
<style>
.chat-message {
    padding: 1rem;
    border-radius: 0.5rem;
    margin-bottom: 1rem;
    display: flex;
    flex-direction: column;
}
.user-message {
    background-color: #e3f2fd;
    margin-left: 20%;
}
.bot-message {
    background-color: #f5f5f5;
    margin-right: 20%;
}
.message-content {
    margin: 0.5rem 0;
}
.message-timestamp {
    font-size: 0.8rem;
    color: #666;
    align-self: flex-end;
}
.stChatInputContainer {
    position: fixed;
    bottom: 0;
    background: white;
    padding: 1rem;
    border-top: 1px solid #e0e0e0;
}
</style>
""", unsafe_allow_html=True)

    # API Key configuration in sidebar
    st.sidebar.header("🔑 API Configuration")
    groq_api_key = st.sidebar.text_input(
        "Groq API Key:",
        type="password",
        help="Get your free API key from https://console.groq.com/keys"
    )

    if not groq_api_key:
        groq_api_key = os.getenv("GROQ_API_KEY")

    if not groq_api_key:
        st.sidebar.error("Please enter your Groq API key or set GROQ_API_KEY environment variable")
        st.info("🔑 **To get started:**\n1. Go to https://console.groq.com/keys\n2. Create a free account\n3. Generate an API key\n4. Enter it in the sidebar")
        st.stop()

    # Model selection in sidebar
    st.sidebar.header("⚙️ Configuration")
    model_options = {
        "Llama3 8B (Fast & Stable)": "llama3-8b-8192",
        "Llama3 70B (Most Capable)": "llama3-70b-8192",
        "Mixtral 8x7B (Balanced)": "mixtral-8x7b-32768",
        "Gemma 7B (Efficient)": "gemma-7b-it"
    }

    selected_model = st.sidebar.selectbox(
        "Choose Groq Model:",
        list(model_options.keys()),
        index=0  # Default to the more stable 8B model
    )

    model_name = model_options[selected_model]

    # Initialize session state; rebuild the system when model or key changes
    if ('qa_system' not in st.session_state or
            st.session_state.get('current_model') != model_name or
            st.session_state.get('current_api_key') != groq_api_key):

        with st.spinner(f"Initializing system with {selected_model}..."):
            try:
                st.session_state.qa_system = AcademicPaperQA(
                    model_name=model_name,
                    groq_api_key=groq_api_key
                )
                st.session_state.current_model = model_name
                st.session_state.current_api_key = groq_api_key
                st.session_state.papers_loaded = False
                st.success(f"System initialized with {selected_model} via Groq API!")
            except Exception as e:
                st.error(f"Error initializing system: {e}")
                st.info("Please check your Groq API key and try again.")
                st.stop()

    if 'papers_loaded' not in st.session_state:
        st.session_state.papers_loaded = False

    qa_system = st.session_state.qa_system

    # Display current model info
    st.sidebar.info(f"**Current model:** {selected_model}")
    st.sidebar.success("✅ Using Groq API (Cloud)")
    st.sidebar.info("💬 Conversational Mode: ON")

    # Show system status
    if hasattr(qa_system, 'is_ready'):
        if qa_system.is_ready:
            st.sidebar.success("✅ System Ready")
        else:
            st.sidebar.warning("⚠️ Process papers first")

    # Show currently loaded papers
    current_papers = qa_system.get_loaded_papers_info()
    if current_papers:
        st.sidebar.subheader("📚 Loaded Papers:")
        for paper in current_papers:
            st.sidebar.text(f"📄 {paper}")

        if st.sidebar.button("🗑️ Clear All Papers"):
            with st.spinner("Clearing papers..."):
                if qa_system.clear_papers():
                    st.session_state.papers_loaded = False
                    st.sidebar.success("Papers cleared!")
                    st.rerun()

    # Chat controls in sidebar
    st.sidebar.subheader("💬 Chat Controls")
    if st.sidebar.button("🧹 Clear Chat History"):
        qa_system.clear_chat_history()
        st.sidebar.success("Chat cleared!")
        st.rerun()

    # Main interface
    if not qa_system.is_ready:
        # Show paper loading interface when system not ready
        st.header("📥 Load Academic Papers")

        col1, col2 = st.columns([1, 1])

        with col1:
            st.subheader("From arXiv")
            arxiv_id = st.text_input("Enter arXiv ID (e.g., 2301.00001)")
            if st.button("Download from arXiv"):
                if arxiv_id:
                    with st.spinner("Downloading paper..."):
                        filepath = qa_system.download_arxiv_paper(arxiv_id)
                        if filepath:
                            st.success("Downloaded paper")
                            st.session_state.papers_loaded = False
                        else:
                            st.error("Failed to download paper")

        with col2:
            st.subheader("Upload PDF Files")
            uploaded_files = st.file_uploader(
                "Choose PDF files",
                type="pdf",
                accept_multiple_files=True
            )

            if uploaded_files:
                saved_files = []
                for uploaded_file in uploaded_files:
                    # Keep only the final path component so a crafted file name
                    # cannot write outside the papers directory.
                    safe_name = Path(uploaded_file.name).name
                    file_path = qa_system.data_dir / safe_name
                    with open(file_path, "wb") as f:
                        f.write(uploaded_file.getbuffer())
                    saved_files.append(str(file_path))

                st.success(f"Uploaded {len(saved_files)} files")
                st.session_state.papers_loaded = False

        # Process papers
        st.subheader("🔄 Process Papers")
        current_papers = qa_system.get_loaded_papers_info()

        if not current_papers:
            st.info("No papers found. Please upload or download papers first.")
        else:
            st.info(f"Found {len(current_papers)} paper(s): {', '.join(current_papers)}")

            if st.button("🚀 Process Papers", type="primary"):
                with st.spinner("Processing papers (creating embeddings on CPU)..."):
                    result = qa_system.process_all_papers()

                    if "error" in result:
                        st.error(result["error"])
                        st.session_state.papers_loaded = False
                    else:
                        st.success(result["success"])
                        st.session_state.papers_loaded = True
                        st.rerun()

    else:
        # Main chat interface when system is ready
        st.header("💬 Chat with Your Papers")

        # Show loaded papers info
        loaded_papers = qa_system.get_loaded_papers_info()
        st.info(f"📚 Chatting with {len(loaded_papers)} paper(s): {', '.join(loaded_papers)}")

        # Chat history display
        chat_container = st.container()

        with chat_container:
            # Display chat history (last 10 exchanges)
            for message in qa_system.chat_history[-10:]:
                # The question is user-supplied text rendered as raw HTML, so it
                # must be escaped before interpolation.
                question_html = html.escape(str(message['question']))
                timestamp_html = html.escape(str(message['timestamp']))
                st.markdown(
                    '<div class="chat-message user-message">'
                    '<div class="message-content" style="color: black;">'
                    f'<strong>You:</strong> {question_html}'
                    '</div>'
                    f'<div class="message-timestamp">{timestamp_html}</div>'
                    '</div>',
                    unsafe_allow_html=True
                )

                # Bot response
                st.markdown(
                    '<div class="chat-message bot-message">'
                    '<div class="message-content"><strong style="color: black;">Assistant:</strong></div>'
                    '</div>',
                    unsafe_allow_html=True
                )

                st.write(message['answer'])
                st.markdown("---")

        # Quick question buttons
        st.subheader("🚀 Quick Questions")
        col1, col2, col3 = st.columns(3)

        quick_question = None
        with col1:
            if st.button("🎯 Main Research Question"):
                quick_question = "What is the main research question addressed in this paper?"
            if st.button("🔬 Methodology"):
                quick_question = "What methodology was used in this study?"

        with col2:
            if st.button("📊 Key Findings"):
                quick_question = "What are the key findings of this research?"
            if st.button("🎯 Conclusions"):
                quick_question = "What are the main conclusions of this research?"

        with col3:
            if st.button("⚠️ Limitations"):
                quick_question = "What are the limitations of this study?"
            if st.button("📋 Summary"):
                quick_question = "Please provide a summary of this paper."

        # Chat input
        st.subheader("💭 Ask Your Question")
        user_question = st.text_area("Type your question here...", height=100, placeholder="Ask anything about your papers...")

        send_clicked = st.button("Send Message", type="primary", disabled=not user_question)

        # A button is only True during the rerun triggered by its own click, so a
        # quick question must be sent in that same run; it cannot wait for a
        # later "Send Message" click.
        if quick_question:
            question_to_ask = quick_question
        elif send_clicked:
            question_to_ask = user_question
        else:
            question_to_ask = None

        if question_to_ask:
            with st.spinner("Thinking... (Processing via Groq API)"):
                result = qa_system.ask_question(
                    question_to_ask,
                    use_chat_engine=True  # Always use conversational mode
                )

                if "error" in result:
                    st.error(result["error"])
                else:
                    st.rerun()  # Reload to show new message

        # Sources section (show for last question if available)
        if (qa_system.chat_history and
                qa_system.chat_history[-1].get('sources')):

            with st.expander("📚 View Sources", expanded=False):
                sources = qa_system.chat_history[-1]['sources']
                for i, source in enumerate(sources, 1):
                    st.markdown(f"**Source {i}** (Relevance: {source['score']})")
                    st.text(source['text'])
                    st.markdown("---")
1539
 
1540
# Build the UI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    create_streamlit_app()