sidbhasin committed
Commit 3f21fcc · verified · 1 Parent(s): 633407a

Update app.py

Files changed (1)
  1. app.py +111 -132
app.py CHANGED
@@ -12,48 +12,33 @@ st.set_page_config(
     layout="wide"
 )
 
-# Custom CSS for better styling
+# Custom CSS for better chat interface
 st.markdown("""
 <style>
 .chat-container {
-    display: flex;
-    flex-direction: column;
-    gap: 20px;
+    border-radius: 10px;
+    margin-bottom: 20px;
     padding: 20px;
-    height: calc(100vh - 200px);
-    overflow-y: auto;
 }
-.message-container {
-    display: flex;
-    flex-direction: column;
-    gap: 10px;
+.user-message {
+    background-color: #e6f3ff;
     padding: 15px;
     border-radius: 10px;
-    max-width: 90%;
-}
-.user-message {
-    background-color: #2b313e;
-    color: white;
-    align-self: flex-end;
+    margin: 10px 0;
+    text-align: right;
 }
 .assistant-message {
     background-color: #f0f2f6;
-    color: black;
-    align-self: flex-start;
+    padding: 15px;
+    border-radius: 10px;
+    margin: 10px 0;
 }
 .source-info {
     font-size: 0.8em;
     color: #666;
+    margin-top: 5px;
+    padding-top: 5px;
     border-top: 1px solid #ddd;
-    margin-top: 10px;
-    padding-top: 10px;
-}
-.context-box {
-    background-color: #f8f9fa;
-    border-left: 3px solid #1f77b4;
-    padding: 10px;
-    margin-top: 10px;
-    font-size: 0.9em;
 }
 .chat-input {
     position: fixed;
@@ -64,19 +49,29 @@ st.markdown("""
     background: white;
     border-top: 1px solid #ddd;
 }
+.main {
+    margin-bottom: 100px; /* Space for fixed chat input */
+}
 </style>
 """, unsafe_allow_html=True)
 
+# Initialize session state
+if 'messages' not in st.session_state:
+    st.session_state.messages = []
+if 'text_data' not in st.session_state:
+    st.session_state.text_data = None
+
 @st.cache_resource
-def load_qa_model():
+def load_model():
     return pipeline(
         "question-answering",
         model="deepset/roberta-base-squad2",
         tokenizer="deepset/roberta-base-squad2"
     )
 
-def process_pdf(pdf_file):
+def extract_text_with_metadata(pdf_file):
     text_data = []
+
     with pdfplumber.open(pdf_file) as pdf:
         for page_num, page in enumerate(pdf.pages, 1):
             text = page.extract_text()
@@ -92,50 +87,47 @@ def process_pdf(pdf_file):
             })
     return text_data
 
-def find_best_answer(question, text_data, qa_model):
+def find_answer(question, text_data, qa_model):
     best_answer = None
     max_score = 0
-    relevant_context = []
-
-    for chunk in text_data:
-        try:
-            result = qa_model(
-                question=question,
-                context=chunk['text'],
-                max_answer_len=100
-            )
-
-            if result['score'] > max_score:
-                max_score = result['score']
-                best_answer = {
-                    'answer': result['answer'],
+
+    # Combine all text for context
+    full_text = ' '.join([item['text'] for item in text_data])
+
+    try:
+        # Get answer from model
+        result = qa_model(question=question, context=full_text)
+
+        # Find the source paragraph
+        answer_text = result['answer']
+        for item in text_data:
+            if answer_text in item['text']:
+                return {
+                    'answer': answer_text,
                     'confidence': result['score'],
-                    'page': chunk['page'],
-                    'paragraph': chunk['paragraph'],
-                    'context': chunk['context']
+                    'page': item['page'],
+                    'paragraph': item['paragraph'],
+                    'context': item['text']
                 }
-
-            # Collect relevant contexts
-            if result['score'] > 0.1: # Threshold for relevance
-                relevant_context.append(chunk['context'])
-
-        except Exception as e:
-            continue
-
-    return best_answer, relevant_context[:3] # Return top 3 relevant contexts
+
+        # If exact paragraph not found, return with first paragraph
+        return {
+            'answer': answer_text,
+            'confidence': result['score'],
+            'page': 1,
+            'paragraph': 1,
+            'context': text_data[0]['text']
+        }
+
+    except Exception as e:
+        st.error(f"Error finding answer: {str(e)}")
+        return None
 
 def main():
-    st.title("📚 Advanced PDF Question Answering")
+    st.title("📚 PDF Chat Assistant")
 
-    # Initialize session state
-    if 'messages' not in st.session_state:
-        st.session_state.messages = []
-    if 'pdf_data' not in st.session_state:
-        st.session_state.pdf_data = None
-
-    # Load QA model
     try:
-        qa_model = load_qa_model()
+        qa_model = load_model()
     except Exception as e:
         st.error(f"Error loading model: {str(e)}")
         return
@@ -143,91 +135,78 @@ def main():
     # File upload
     pdf_file = st.file_uploader("Upload PDF Document", type=['pdf'])
 
-    if pdf_file and not st.session_state.pdf_data:
+    if pdf_file and not st.session_state.text_data:
         with st.spinner("Processing PDF..."):
             try:
-                st.session_state.pdf_data = process_pdf(pdf_file)
-                st.success("PDF processed successfully! You can now ask questions.")
+                st.session_state.text_data = extract_text_with_metadata(pdf_file)
+                st.success("PDF processed successfully! You can now ask questions below.")
             except Exception as e:
                 st.error(f"Error processing PDF: {str(e)}")
                 return
 
-    # Chat interface
-    st.markdown('<div class="chat-container">', unsafe_allow_html=True)
-
-    # Display chat history
-    for message in st.session_state.messages:
-        if message["role"] == "user":
-            st.markdown(f"""
-            <div class="message-container user-message">
-                {message["content"]}
-            </div>
-            """, unsafe_allow_html=True)
-        else:
-            st.markdown(f"""
-            <div class="message-container assistant-message">
-                <div>{message["content"]}</div>
-                <div class="source-info">
-                    Source: Page {message["metadata"]["page"]},
-                    Paragraph {message["metadata"]["paragraph"]}
-                    (Confidence: {message["metadata"]["confidence"]:.1%})
+    # Display chat interface if PDF is processed
+    if st.session_state.text_data:
+        # Chat history
+        st.markdown('<div class="chat-container">', unsafe_allow_html=True)
+        for message in st.session_state.messages:
+            if message["role"] == "user":
+                st.markdown(f'<div class="user-message">{message["content"]}</div>',
+                            unsafe_allow_html=True)
+            else:
+                st.markdown(f"""
+                <div class="assistant-message">
+                    <div>{message["content"]}</div>
+                    <div class="source-info">
+                        Source: Page {message["metadata"]["page"]},
+                        Paragraph {message["metadata"]["paragraph"]}
+                        (Confidence: {message["metadata"]["confidence"]:.1%})
+                    </div>
                 </div>
-                <div class="context-box">
-                    {message["metadata"]["context"]}
-                </div>
-            </div>
-            """, unsafe_allow_html=True)
-
-    st.markdown('</div>', unsafe_allow_html=True)
+                """, unsafe_allow_html=True)
+        st.markdown('</div>', unsafe_allow_html=True)
 
-    # Question input
-    if st.session_state.pdf_data:
-        question = st.text_input("Ask a question about the document:", key="question_input")
-
-        if question:
-            # Add user question to chat history
-            st.session_state.messages.append({"role": "user", "content": question})
+        # Chat input
+        with st.container():
+            st.markdown('<div class="chat-input">', unsafe_allow_html=True)
+            question = st.text_input("Ask a question about the document:", key="question_input")
+            st.markdown('</div>', unsafe_allow_html=True)
 
-            # Generate answer
-            with st.spinner("Finding answer..."):
-                answer, relevant_contexts = find_best_answer(
-                    question,
-                    st.session_state.pdf_data,
-                    qa_model
-                )
+        if question:
+            # Add user question to chat history
+            st.session_state.messages.append({"role": "user", "content": question})
 
-            if answer:
-                # Add assistant response to chat history
-                st.session_state.messages.append({
-                    "role": "assistant",
-                    "content": answer["answer"],
-                    "metadata": {
-                        "page": answer["page"],
-                        "paragraph": answer["paragraph"],
-                        "confidence": answer["confidence"],
-                        "context": answer["context"]
-                    }
-                })
+            # Get answer
+            with st.spinner("Finding answer..."):
+                answer = find_answer(question, st.session_state.text_data, qa_model)
 
-                # Force refresh
-                st.rerun()
-            else:
-                st.error("Sorry, I couldn't find a relevant answer in the document.")
-
+            if answer:
+                # Add assistant response to chat history
+                st.session_state.messages.append({
+                    "role": "assistant",
+                    "content": answer["answer"],
+                    "metadata": {
+                        "page": answer["page"],
+                        "paragraph": answer["paragraph"],
+                        "confidence": answer["confidence"],
+                        "context": answer["context"]
+                    }
+                })
+
+                # Rerun to update chat display
+                st.rerun()
     else:
         st.markdown("""
         ### Instructions:
         1. Upload a PDF document using the file uploader above
         2. Wait for the document to be processed
-        3. Start asking questions about the content
-        4. Get detailed answers with source information and context
+        3. Use the chat interface to ask questions
+        4. Get answers with source information
 
         ### Features:
-        - Natural conversation interface
-        - Source tracking with page numbers
-        - Confidence scores
-        - Relevant context display
-        - Multiple question support
+        - Chat-like interface
+        - Source tracking
+        - Context preservation
+        - Multiple questions support
         """)
 
 if __name__ == "__main__":
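A note on the pipeline API used above: the transformers question-answering pipeline created in load_model() returns a dict with 'score', 'start', 'end', and 'answer' keys, which is exactly what find_answer() reads as result['answer'] and result['score']. For contexts longer than the model's maximum sequence length, the pipeline splits the text into overlapping spans internally and returns the best-scoring span, which is what makes the single full_text call workable. A minimal standalone sketch with the same checkpoint; the question and context strings here are illustrative only:

from transformers import pipeline

# Same checkpoint as load_model(); weights download on first use.
qa = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    tokenizer="deepset/roberta-base-squad2"
)

# Illustrative inputs, not taken from the app or any PDF.
result = qa(
    question="Which library extracts the text?",
    context="The app extracts text from each page with pdfplumber and records the page number."
)

# result is a dict of the form {'score': ..., 'start': ..., 'end': ..., 'answer': ...}
print(result['answer'], round(result['score'], 3))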
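The lines that actually build each record (old lines 83–91, new lines 78–86) are unchanged and so appear in no hunk; the surrounding code only shows that each entry must carry 'page', 'paragraph', and 'text' keys, plus a 'context' key read by the removed find_best_answer(). A hedged sketch of what that omitted loop plausibly looks like — the blank-line paragraph split and the 'context' handling are assumptions, not the hidden lines themselves:

import pdfplumber

def extract_text_with_metadata(pdf_file):
    # Sketch only: the unchanged middle of the real function is not shown in the diff.
    text_data = []
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            if not text:
                continue
            # Assumed rule: blank lines separate paragraphs.
            for para_num, paragraph in enumerate(text.split("\n\n"), 1):
                paragraph = paragraph.strip()
                if paragraph:
                    text_data.append({
                        'page': page_num,       # read back by find_answer()
                        'paragraph': para_num,  # read back by find_answer()
                        'text': paragraph,      # searched for the answer substring
                        'context': paragraph    # only needed by the removed find_best_answer()
                    })
    return text_data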
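Neither version touches the first eleven lines of app.py, so the imports never show up in the diff. Judging from the names used throughout (st, pdfplumber, pipeline), the file presumably opens roughly like the sketch below; only layout="wide" is confirmed by the first hunk, and the page_title and page_icon values are guesses:

import streamlit as st
import pdfplumber
from transformers import pipeline

st.set_page_config(
    page_title="PDF Chat Assistant",  # assumed, not visible in the diff
    page_icon="📚",                   # assumed, not visible in the diff
    layout="wide"
)

With streamlit, pdfplumber, transformers, and a backend such as torch installed, the app is started with: streamlit run app.py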