raymondEDS committed on
Commit
aa1280b
·
1 Parent(s): 104b6b9

Using pdf workaround

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +161 -46
src/streamlit_app.py CHANGED
@@ -4,6 +4,8 @@ import io
4
  import base64
5
  from datetime import datetime
6
  import json
 
 
7
 
8
  # Page configuration
9
  st.set_page_config(
@@ -19,7 +21,7 @@ if 'uploaded_documents' not in st.session_state:
19
  if 'current_user' not in st.session_state:
20
  st.session_state.current_user = "User"
21
 
22
- def save_document_info(filename, file_content, file_type):
23
  """Save document information to session state"""
24
  if 'documents' not in st.session_state.uploaded_documents:
25
  st.session_state.uploaded_documents['documents'] = []
@@ -29,15 +31,29 @@ def save_document_info(filename, file_content, file_type):
29
  'upload_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
30
  'file_type': file_type,
31
  'size': len(file_content),
32
- 'content': file_content.decode('latin-1') if isinstance(file_content, bytes) else str(file_content)
 
33
  }
34
 
35
  st.session_state.uploaded_documents['documents'].append(document_info)
36
 
37
- def extract_pdf_text(pdf_file):
38
- """Extract text from PDF file"""
39
  try:
40
- pdf_reader = PyPDF2.PdfReader(pdf_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  text = ""
42
  for page in pdf_reader.pages:
43
  text += page.extract_text() + "\n"
@@ -46,6 +62,14 @@ def extract_pdf_text(pdf_file):
46
  st.error(f"Error reading PDF: {str(e)}")
47
  return ""
48
 
 
 
 
 
 
 
 
 
49
  def main():
50
  # Sidebar for navigation
51
  with st.sidebar:
@@ -124,7 +148,7 @@ def show_upload_documents():
124
  st.markdown("---")
125
 
126
  # Add information about file upload
127
- st.info("💡 **Note:** File upload is optimized for Hugging Face Spaces. If you experience issues, try refreshing the page.")
128
 
129
  uploaded_file = st.file_uploader(
130
  "Choose a PDF file",
@@ -146,38 +170,53 @@ def show_upload_documents():
146
  for key, value in file_details.items():
147
  st.write(f"- {key}: {value}")
148
 
149
- # Extract and display PDF content
150
- pdf_text = extract_pdf_text(uploaded_file)
151
-
152
- if pdf_text.strip():
153
- st.subheader("📄 Document Preview")
154
- with st.expander("View extracted text"):
155
- st.text_area("PDF Content", pdf_text, height=300)
156
- else:
157
- st.warning("⚠️ Could not extract text from this PDF. The file may be image-based or encrypted.")
158
-
159
- # Upload button
160
- if st.button("Upload Document", type="primary"):
161
- try:
162
- # Reset file pointer to beginning
163
- uploaded_file.seek(0)
164
-
165
- # Save document info
166
- save_document_info(
167
- uploaded_file.name,
168
- uploaded_file.read(),
169
- "PDF"
170
- )
171
-
172
- st.success(f"✅ Document '{uploaded_file.name}' uploaded successfully!")
173
- st.balloons()
174
-
175
- # Clear the file uploader
176
- st.rerun()
177
-
178
- except Exception as e:
179
- st.error(f"❌ Error uploading document: {str(e)}")
180
- st.info("💡 Try uploading a smaller file or refresh the page.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
  except Exception as e:
183
  st.error(f"❌ Error processing file: {str(e)}")
@@ -192,6 +231,11 @@ def show_upload_documents():
192
  - Avoid password-protected PDFs
193
  - If upload fails, try refreshing the page
194
 
 
 
 
 
 
195
  **Supported formats:** PDF only
196
  """)
197
 
@@ -242,9 +286,26 @@ def show_my_documents():
242
  st.write(f"**Uploaded:** {doc['upload_time']}")
243
  st.write(f"**Size:** {doc['size']} bytes")
244
 
245
- # Display content if available
246
- if 'content' in doc and doc['content']:
247
- st.text_area("Document Content", doc['content'], height=400, key=f"content_{i}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
 
249
  st.markdown("---")
250
 
@@ -295,9 +356,26 @@ def show_document_library():
295
  st.write(f"**Uploaded:** {doc['upload_time']}")
296
  st.write(f"**Size:** {doc['size']} bytes")
297
 
298
- # Display content if available
299
- if 'content' in doc and doc['content']:
300
- st.text_area("Document Content", doc['content'], height=400, key=f"lib_content_{i}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
 
302
  st.markdown("---")
303
 
@@ -309,7 +387,7 @@ def show_settings():
309
  st.subheader("🔧 System Information")
310
  st.write("**Version:** Dev LMS v1.0")
311
  st.write("**Features:**")
312
- st.write("- PDF document upload")
313
  st.write("- Document search and preview")
314
  st.write("- Document library")
315
  st.write("- Session-based storage")
@@ -340,11 +418,48 @@ def show_settings():
340
  # Clear data option
341
  if st.button("🗑️ Clear All Data"):
342
  if st.session_state.uploaded_documents.get('documents'):
 
 
 
 
 
 
343
  st.session_state.uploaded_documents['documents'] = []
344
- st.success("All documents have been cleared!")
345
  st.rerun()
346
  else:
347
  st.info("No documents to clear.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
 
349
  if __name__ == "__main__":
350
  main()
 
4
  import base64
5
  from datetime import datetime
6
  import json
7
+ import tempfile
8
+ import os
9
 
10
  # Page configuration
11
  st.set_page_config(
 
21
  if 'current_user' not in st.session_state:
22
  st.session_state.current_user = "User"
23
 
24
+ def save_document_info(filename, file_content, file_type, temp_path=None):
25
  """Save document information to session state"""
26
  if 'documents' not in st.session_state.uploaded_documents:
27
  st.session_state.uploaded_documents['documents'] = []
 
31
  'upload_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
32
  'file_type': file_type,
33
  'size': len(file_content),
34
+ 'content': file_content.decode('latin-1') if isinstance(file_content, bytes) else str(file_content),
35
+ 'temp_path': temp_path # Store temp path for later use
36
  }
37
 
38
  st.session_state.uploaded_documents['documents'].append(document_info)
39
 
40
def extract_pdf_text_from_temp(temp_path):
    """Read the PDF stored at ``temp_path`` and return its text.

    The file is opened in binary mode and parsed with ``PyPDF2.PdfReader``.
    The text of every page is concatenated, each page followed by a newline.

    Args:
        temp_path: Filesystem path of the PDF to read.

    Returns:
        The concatenated page text, or an empty string when anything goes
        wrong (the error is reported to the user through ``st.error``).
    """
    try:
        with open(temp_path, "rb") as handle:
            reader = PyPDF2.PdfReader(handle)
            # Pages must be read while the file handle is still open.
            return "".join(page.extract_text() + "\n" for page in reader.pages)
    except Exception as e:
        st.error(f"Error reading PDF: {str(e)}")
        return ""
52
+
53
+ def extract_pdf_text_from_memory(uploaded_file):
54
+ """Extract text from PDF file in memory"""
55
+ try:
56
+ pdf_reader = PyPDF2.PdfReader(uploaded_file)
57
  text = ""
58
  for page in pdf_reader.pages:
59
  text += page.extract_text() + "\n"
 
62
  st.error(f"Error reading PDF: {str(e)}")
63
  return ""
64
 
65
def cleanup_temp_file(temp_path):
    """Delete the temporary file at ``temp_path`` on a best-effort basis.

    Args:
        temp_path: Path of the file to remove. ``None`` or an empty string
            is accepted and treated as "nothing to clean up".

    Returns:
        None. A file that is already gone is silently ignored; any other
        failure is surfaced to the user with ``st.warning`` instead of
        being raised.
    """
    if not temp_path:
        return
    try:
        # Remove directly rather than checking os.path.exists() first: the
        # file can disappear between the check and the removal, which would
        # turn a harmless "already deleted" case into a spurious warning.
        os.remove(temp_path)
    except FileNotFoundError:
        # Already removed (by us, another rerun, or the OS) - nothing to do.
        return
    except Exception as e:
        st.warning(f"Could not clean up temporary file: {str(e)}")
72
+
73
  def main():
74
  # Sidebar for navigation
75
  with st.sidebar:
 
148
  st.markdown("---")
149
 
150
  # Add information about file upload
151
+ st.info("💡 **Note:** File upload uses temporary storage for better compatibility with Hugging Face Spaces.")
152
 
153
  uploaded_file = st.file_uploader(
154
  "Choose a PDF file",
 
170
  for key, value in file_details.items():
171
  st.write(f"- {key}: {value}")
172
 
173
+ # Create temporary file for better PDF processing
174
+ temp_path = None
175
+ try:
176
+ with tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False) as temp:
177
+ bytes_data = uploaded_file.getvalue()
178
+ temp.write(bytes_data)
179
+ temp_path = temp.name
180
+
181
+ st.success(f"📁 File temporarily stored at: {temp_path}")
182
+
183
+ # Extract and display PDF content using temporary file
184
+ pdf_text = extract_pdf_text_from_temp(temp_path)
185
+
186
+ if pdf_text.strip():
187
+ st.subheader("📄 Document Preview")
188
+ with st.expander("View extracted text"):
189
+ st.text_area("PDF Content", pdf_text, height=300)
190
+ else:
191
+ st.warning("⚠️ Could not extract text from this PDF. The file may be image-based or encrypted.")
192
+
193
+ # Upload button
194
+ if st.button("Upload Document", type="primary"):
195
+ try:
196
+ # Save document info with temporary file path
197
+ save_document_info(
198
+ uploaded_file.name,
199
+ bytes_data,
200
+ "PDF",
201
+ temp_path
202
+ )
203
+
204
+ st.success(f"✅ Document '{uploaded_file.name}' uploaded successfully!")
205
+ st.balloons()
206
+
207
+ # Clear the file uploader
208
+ st.rerun()
209
+
210
+ except Exception as e:
211
+ st.error(f"❌ Error uploading document: {str(e)}")
212
+ st.info("💡 Try uploading a smaller file or refresh the page.")
213
+ # Clean up temp file on error
214
+ cleanup_temp_file(temp_path)
215
+
216
+ except Exception as e:
217
+ st.error(f"❌ Error creating temporary file: {str(e)}")
218
+ st.info("πŸ’‘ Please try uploading a different PDF file.")
219
+ cleanup_temp_file(temp_path)
220
 
221
  except Exception as e:
222
  st.error(f"❌ Error processing file: {str(e)}")
 
231
  - Avoid password-protected PDFs
232
  - If upload fails, try refreshing the page
233
 
234
+ **Technical details:**
235
+ - Files are temporarily stored on the server
236
+ - Text extraction uses temporary file processing
237
+ - Automatic cleanup of temporary files
238
+
239
  **Supported formats:** PDF only
240
  """)
241
 
 
286
  st.write(f"**Uploaded:** {doc['upload_time']}")
287
  st.write(f"**Size:** {doc['size']} bytes")
288
 
289
+ # Check if we have a temporary file path for better content extraction
290
+ if doc.get('temp_path') and os.path.exists(doc['temp_path']):
291
+ try:
292
+ # Extract fresh content from temporary file
293
+ fresh_content = extract_pdf_text_from_temp(doc['temp_path'])
294
+ if fresh_content.strip():
295
+ st.text_area("Document Content (Fresh Extract)", fresh_content, height=400, key=f"fresh_content_{i}")
296
+ else:
297
+ # Fall back to stored content
298
+ if 'content' in doc and doc['content']:
299
+ st.text_area("Document Content (Stored)", doc['content'], height=400, key=f"content_{i}")
300
+ except Exception as e:
301
+ st.warning(f"Could not read from temporary file: {str(e)}")
302
+ # Fall back to stored content
303
+ if 'content' in doc and doc['content']:
304
+ st.text_area("Document Content (Stored)", doc['content'], height=400, key=f"content_{i}")
305
+ else:
306
+ # Display stored content
307
+ if 'content' in doc and doc['content']:
308
+ st.text_area("Document Content", doc['content'], height=400, key=f"content_{i}")
309
 
310
  st.markdown("---")
311
 
 
356
  st.write(f"**Uploaded:** {doc['upload_time']}")
357
  st.write(f"**Size:** {doc['size']} bytes")
358
 
359
+ # Check if we have a temporary file path for better content extraction
360
+ if doc.get('temp_path') and os.path.exists(doc['temp_path']):
361
+ try:
362
+ # Extract fresh content from temporary file
363
+ fresh_content = extract_pdf_text_from_temp(doc['temp_path'])
364
+ if fresh_content.strip():
365
+ st.text_area("Document Content (Fresh Extract)", fresh_content, height=400, key=f"lib_fresh_content_{i}")
366
+ else:
367
+ # Fall back to stored content
368
+ if 'content' in doc and doc['content']:
369
+ st.text_area("Document Content (Stored)", doc['content'], height=400, key=f"lib_content_{i}")
370
+ except Exception as e:
371
+ st.warning(f"Could not read from temporary file: {str(e)}")
372
+ # Fall back to stored content
373
+ if 'content' in doc and doc['content']:
374
+ st.text_area("Document Content (Stored)", doc['content'], height=400, key=f"lib_content_{i}")
375
+ else:
376
+ # Display stored content
377
+ if 'content' in doc and doc['content']:
378
+ st.text_area("Document Content", doc['content'], height=400, key=f"lib_content_{i}")
379
 
380
  st.markdown("---")
381
 
 
387
  st.subheader("🔧 System Information")
388
  st.write("**Version:** Dev LMS v1.0")
389
  st.write("**Features:**")
390
+ st.write("- PDF document upload with temporary storage")
391
  st.write("- Document search and preview")
392
  st.write("- Document library")
393
  st.write("- Session-based storage")
 
418
  # Clear data option
419
  if st.button("🗑️ Clear All Data"):
420
  if st.session_state.uploaded_documents.get('documents'):
421
+ # Clean up temporary files before clearing data
422
+ documents = st.session_state.uploaded_documents['documents']
423
+ for doc in documents:
424
+ if doc.get('temp_path'):
425
+ cleanup_temp_file(doc['temp_path'])
426
+
427
  st.session_state.uploaded_documents['documents'] = []
428
+ st.success("All documents and temporary files have been cleared!")
429
  st.rerun()
430
  else:
431
  st.info("No documents to clear.")
432
+
433
+ st.markdown("---")
434
+
435
+ # Cleanup temporary files option
436
+ if st.button("🧹 Cleanup Temporary Files"):
437
+ documents = st.session_state.uploaded_documents.get('documents', [])
438
+ cleaned_count = 0
439
+
440
+ for doc in documents:
441
+ if doc.get('temp_path') and not os.path.exists(doc['temp_path']):
442
+ # Remove temp_path reference if file doesn't exist
443
+ doc.pop('temp_path', None)
444
+ cleaned_count += 1
445
+
446
+ if cleaned_count > 0:
447
+ st.success(f"Cleaned up {cleaned_count} missing temporary file references!")
448
+ else:
449
+ st.info("No cleanup needed - all temporary files are properly managed.")
450
+
451
+ st.markdown("---")
452
+
453
+ # System status
454
+ st.subheader("📊 System Status")
455
+ documents = st.session_state.uploaded_documents.get('documents', [])
456
+ temp_files_count = sum(1 for doc in documents if doc.get('temp_path') and os.path.exists(doc['temp_path']))
457
+
458
+ col1, col2 = st.columns(2)
459
+ with col1:
460
+ st.metric("Total Documents", len(documents))
461
+ with col2:
462
+ st.metric("Active Temp Files", temp_files_count)
463
 
464
  if __name__ == "__main__":
465
  main()