Alphin Jain committed on
Commit
1a40b86
·
1 Parent(s): 28f6084

back to normal

Browse files
Files changed (2) hide show
  1. app.py +118 -346
  2. main.py +151 -118
app.py CHANGED
@@ -2,36 +2,15 @@
2
  import streamlit as st
3
  import os
4
  import sys
5
- import time
6
  import logging
7
- import concurrent.futures # Use concurrent.futures explicitly
8
- from concurrent.futures import ThreadPoolExecutor, Future # Import Future
9
- import shutil # Import shutil for clearing directory
10
- import queue # Import the queue module
11
 
12
  # Add the project root to the sys.path to allow importing modules like config, document_processing, etc.
13
  # This assumes app.py is in the project root directory.
14
- project_root = os.path.abspath(os.path.join(os.path.dirname(__file__)))
15
- if project_root not in sys.path:
16
- sys.path.append(project_root)
17
-
18
- # Initialize ThreadPoolExecutor - reuse it across reruns
19
- # Max workers can be adjusted based on expected load and I/O vs CPU bound tasks
20
- if 'executor' not in st.session_state:
21
- st.session_state.executor = ThreadPoolExecutor(max_workers=os.cpu_count()*2) # Adjust max_workers as needed
22
-
23
- # Initialize a thread-safe queue for communication from background threads
24
- # This queue will store updates about file processing status
25
- if 'update_queue' not in st.session_state:
26
- st.session_state.update_queue = queue.Queue()
27
-
28
 
29
  try:
30
- # Import the necessary functions and constants from your main script
31
- # Ensure your main.py has the clear_upload_directory function and TEMP_UPLOAD_DIR constant
32
- # and that summarize_extracted_documents accepts the update_queue argument.
33
- from main import process_uploaded_files, setup_retrieval_system, summarize_extracted_documents, clear_upload_directory, TEMP_UPLOAD_DIR
34
-
35
  # Configure Streamlit's logging to match your application's settings
36
  logging.basicConfig(level='INFO', format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
37
  logger = logging.getLogger(__name__)
@@ -51,7 +30,7 @@ except ImportError as e:
51
  # --- Streamlit App Configuration ---
52
  st.set_page_config(
53
  page_title="Aya Insight Document Summarizer",
54
- page_icon="塘",
55
  layout="wide"
56
  )
57
 
@@ -59,30 +38,21 @@ st.set_page_config(
59
  # Initialize session state variables if they don't exist
60
  if 'api_key_entered' not in st.session_state:
61
  st.session_state.api_key_entered = False
62
- # Initialize summary_results as a dictionary to store results per file
63
- # Key: filename, Value: {'status': str, 'summary': str | None, 'error': str | None, 'success': bool, ...}
64
- if 'summary_results' not in st.session_state or st.session_state.summary_results is None:
65
- st.session_state.summary_results = {}
66
  if 'selected_filename' not in st.session_state:
67
  st.session_state.selected_filename = None
68
- # Add state variables to track the background task for each file
69
- # Key: filename, Value: concurrent.futures.Future object
70
- if 'processing_futures' not in st.session_state:
71
- st.session_state.processing_futures = {}
72
- # Overall summarizing flag
73
- if 'summarizing' not in st.session_state:
74
- st.session_state.summarizing = False
75
- if 'overall_error' not in st.session_state: # For errors in initial steps (extraction, setup)
76
- st.session_state.overall_error = None
77
 
78
 
79
  # --- API Key Input Section ---
80
  if not st.session_state.api_key_entered:
81
- st.title("白 Enter Your Cohere API Key to Unlock")
82
  api_key = st.text_input("Cohere API Key", type="password", help="Enter your Cohere API key to use the summarization service.")
83
 
84
  if st.button("Unlock"):
85
  if api_key:
 
 
86
  os.environ["COHERE_API_KEY"] = api_key # Set the environment variable
87
  st.session_state.api_key_entered = True
88
  st.success("API Key accepted. You can now upload documents.")
@@ -92,7 +62,7 @@ if not st.session_state.api_key_entered:
92
 
93
  # --- Main Application Content (Unlocked) ---
94
  if st.session_state.api_key_entered and modules_loaded:
95
- st.title("塘 Aya Insight Document Summarizer")
96
  st.markdown("""
97
  Upload one or more PDF or image files to get a structured summary for each document.
98
  """)
@@ -106,338 +76,140 @@ if st.session_state.api_key_entered and modules_loaded:
106
  )
107
 
108
  # --- Summarize Button and Logic ---
109
- # Disable the button if summarization is already in progress
110
  if uploaded_files: # Only show button if files are uploaded
111
  st.info(f"You have uploaded {len(uploaded_files)} file(s).")
112
 
113
- # Use a unique key for the button based on the summarizing state
114
- button_key = "summarize_button_processing" if st.session_state.summarizing else "summarize_button_ready"
115
-
116
- if st.button("Generate Summaries", key=button_key, disabled=st.session_state.summarizing):
117
  st.session_state.selected_filename = None # Reset selected file on new summary generation
118
- st.session_state.summary_results = {} # Clear previous results before starting new process
119
- st.session_state.processing_futures = {} # Clear previous futures
120
- st.session_state.summarizing = True # Set summarizing flag
121
- st.session_state.overall_error = None # Clear previous overall errors
122
- st.session_state.update_queue = queue.Queue() # Create a new queue for this batch
123
-
124
  if not uploaded_files:
125
  st.warning("Please upload at least one file before generating summaries.")
126
- st.session_state.summarizing = False # Reset flag if no files
127
- st.rerun() # Rerun to clear spinner if no files
128
  else:
129
  st.subheader("Processing Documents...")
130
- # Clear the temporary upload directory from previous runs
131
- clear_upload_directory()
132
- logger.info("Cleared previous upload directory content.")
133
-
134
- # --- Initial Processing (Extraction and Retrieval Setup) ---
135
- # This part still runs in the main Streamlit thread
136
- logger.info(f"Calling process_uploaded_files with {len(uploaded_files)} files.")
137
- try:
138
- # Save files and perform initial extraction
139
- extraction_results = process_uploaded_files(uploaded_files)
140
- logger.info(f"Finished document extraction. {len(extraction_results)} results obtained.")
141
-
142
- # Initialize status for all uploaded files (even those that might fail extraction)
143
- # This ensures all uploaded files get a tile.
144
- uploaded_filenames = [f.name for f in uploaded_files]
145
- for filename in uploaded_filenames:
146
- # Find the corresponding extraction result to get potential early errors
147
- initial_res = next((res for res in extraction_results if res.get('filename') == filename), {})
148
- status = 'waiting'
149
- error = initial_res.get('error')
150
- if error:
151
- status = 'extraction_error'
152
- st.warning(f"Extraction failed for {filename}: {error}")
153
-
154
- st.session_state.summary_results[filename] = {
155
- 'status': status,
156
- 'summary': None,
157
- 'error': error,
158
- 'success': False
159
- }
160
-
161
- # Filter for files that had successful extraction (have text)
162
- summarizable_extraction_results = [res for res in extraction_results if res.get('text')]
163
-
164
- if not summarizable_extraction_results:
165
- st.error("No text could be extracted from the uploaded files. Please check the file formats.")
166
- logger.error("No text extracted from any uploaded file.")
167
- st.session_state.summarizing = False # Reset flag
168
- st.rerun() # Rerun to clear spinner and show error
169
- # st.stop() # Don't stop, allow user to try again
170
-
171
-
172
- # Setup retrieval system (Vector Store and Embedding)
173
- logger.info("Calling setup_retrieval_system.")
174
- # setup_retrieval_system needs results with text
175
- extraction_results_with_chunks, retriever = setup_retrieval_system(summarizable_extraction_results)
176
- logger.info("Retriever system setup complete.")
177
-
178
- # Check if chunking was successful for summarizable files
179
- final_summarizable_results = []
180
- for res in extraction_results_with_chunks:
181
- filename = res.get('filename', 'unknown')
182
- if res.get('chunk_size', 0) > 0:
183
- final_summarizable_results.append(res)
184
- # Update status for files that are ready for summarization
185
- if filename in st.session_state.summary_results:
186
- st.session_state.summary_results[filename]['status'] = 'queued' # Ready for background task
187
- else:
188
- # Update status for files that failed chunking
189
- if filename in st.session_state.summary_results and st.session_state.summary_results[filename]['status'] == 'waiting':
190
- st.session_state.summary_results[filename].update({
191
- 'status': 'chunking_error',
192
- 'error': 'Could not create text chunks',
193
- 'success': False
194
- })
195
- st.warning(f"Chunking failed for {filename}.")
196
-
197
-
198
- if not final_summarizable_results:
199
- st.error("No usable content chunks were created from the extracted text. Summarization cannot proceed.")
200
- logger.error("No usable content chunks created.")
201
- st.session_state.summarizing = False # Reset flag
202
- st.rerun()
203
- # st.stop()
204
-
205
-
206
- # --- Submit Individual Summarization Tasks to Background ---
207
- logger.info(f"Submitting {len(final_summarizable_results)} summarization tasks to ThreadPoolExecutor.")
208
-
209
- # Submit the synchronous summarization task for each document to the background thread
210
- # Pass the queue object to the background function
211
- for doc_result in final_summarizable_results:
212
- filename = doc_result.get('filename', 'unknown')
213
- # Pass necessary data and the queue object to the background task
214
- # The summarize_extracted_documents in main.py is expected to handle a list of one document
215
- future = st.session_state.executor.submit(
216
- summarize_extracted_documents, # The synchronous function to run
217
- [doc_result], # Pass a list with a single document result
218
- retriever,
219
- st.session_state.update_queue # Pass the queue
220
- )
221
- st.session_state.processing_futures[filename] = future
222
- # Update initial status in main thread's session state immediately
223
- if filename in st.session_state.summary_results:
224
- st.session_state.summary_results[filename]['status'] = 'processing' # Initial status while task is running
225
-
226
- logger.info(f"Individual summarization tasks submitted to background for {len(st.session_state.processing_futures)} files.")
227
-
228
- # Rerun immediately to show the spinner and initial state with 'processing' tiles
229
- st.rerun()
230
-
231
- except FileNotFoundError as fnf_error:
232
- st.error(f"Configuration Error: {fnf_error}. Please check your environment settings.")
233
- logger.error(f"Configuration Error during Streamlit process: {fnf_error}", exc_info=True)
234
- st.session_state.summarizing = False # Reset flag on error
235
- st.session_state.overall_error = str(fnf_error)
236
- st.session_state.processing_futures = {} # Clear futures on error
237
- st.rerun() # Rerun to show error and reset state
238
- except Exception as e:
239
- st.error(f"An unexpected error occurred during initial processing: {e}")
240
- logger.error(f"An unexpected error occurred during initial Streamlit process: {e}", exc_info=True)
241
- st.session_state.summarizing = False # Reset flag on error
242
- st.session_state.overall_error = str(e)
243
- st.session_state.processing_futures = {} # Clear futures on error
244
- st.rerun() # Rerun to show error and reset state
245
-
246
-
247
- # --- Process Queue and Check Background Task Status ---
248
- # This block runs on every Streamlit rerun
249
- # Process any updates from the background queue
250
- updated_from_queue = False
251
- try:
252
- while not st.session_state.update_queue.empty():
253
- update = st.session_state.update_queue.get_nowait()
254
- filename = update.get('filename')
255
- status = update.get('status')
256
- result_data = update.get('result_data')
257
-
258
- if filename and filename in st.session_state.summary_results:
259
- logger.debug(f"Main thread processing queue update for {filename}: status={status}")
260
- st.session_state.summary_results[filename]['status'] = status
261
- if result_data:
262
- # Update other fields if result data is provided (e.g., summary, error, success)
263
- st.session_state.summary_results[filename].update(result_data)
264
- updated_from_queue = True
265
- st.session_state.update_queue.task_done() # Mark the task as done in the queue
266
-
267
- except queue.Empty:
268
- # This is expected when the queue is empty
269
- pass
270
- except Exception as e:
271
- logger.error(f"Error processing queue: {e}", exc_info=True)
272
- st.error(f"An error occurred while processing background updates: {e}")
273
-
274
-
275
- # Check completed futures to know when tasks are done and potentially trigger rerun
276
- completed_futures_filenames = [filename for filename, future in st.session_state.processing_futures.items() if future.done()]
277
-
278
- if completed_futures_filenames:
279
- logger.debug(f"Found {len(completed_futures_filenames)} completed futures.")
280
- rerun_needed = False
281
- for filename in completed_futures_filenames:
282
- # We don't need to get the result here as the queue callback
283
- # already put the final status/result into the queue and updated session_state.
284
- # We just need to know the future is done to remove it.
285
- try:
286
- # Optional: Check for exceptions in the future if the queue callback
287
- # didn't handle all error reporting scenarios.
288
- # future.result() # This would re-raise exceptions from the background task
289
- pass
290
- except Exception as e:
291
- logger.error(f"Exception in completed future for {filename} (already reported via queue): {e}", exc_info=True)
292
- # The queue processing should have updated the status to 'error' already.
293
-
294
- # Remove the completed future
295
- if filename in st.session_state.processing_futures:
296
- del st.session_state.processing_futures[filename]
297
- rerun_needed = True # Indicate that UI might need update if queue updates didn't trigger it
298
-
299
- # Trigger a rerun if any futures completed or if queue was processed
300
- if rerun_needed or updated_from_queue:
301
- logger.info("Completed futures processed or queue updated, triggering rerun.")
302
- st.rerun()
303
-
304
- # Check if all futures are done and no more are being added
305
- if not st.session_state.processing_futures and st.session_state.summarizing:
306
- logger.info("All background tasks completed.")
307
- st.session_state.summarizing = False # Reset overall summarizing flag
308
- # Trigger a final rerun to show completion status if not already triggered
309
- if not updated_from_queue: # Avoid double rerun if queue processing already did it
310
- st.rerun()
311
 
312
 
313
  # --- Display Document Tiles and Summaries ---
314
- # This section reads from the incrementally updated st.session_state.summary_results.
315
- # It will update whenever Streamlit reruns.
316
- if st.session_state.summary_results is not None: # Check if the dictionary exists
317
- st.subheader("Processing Status and Summaries:")
318
-
319
- if not st.session_state.summary_results and not st.session_state.summarizing and st.session_state.overall_error is None:
320
- st.info("Upload files and click 'Generate Summaries' to begin.")
321
- elif st.session_state.summarizing or st.session_state.summary_results:
322
- # Display an overall status message
323
- total_files = len(st.session_state.summary_results)
324
- # Count files that have reached a final state
325
- completed_count = sum(1 for res in st.session_state.summary_results.values() if res.get('status') in ['completed', 'error', 'skipped', 'extraction_error', 'no_text', 'chunking_error'])
326
- # Count files that are currently in an intermediate processing state
327
- processing_count = sum(1 for res in st.session_state.summary_results.values() if res.get('status') in ['waiting', 'queued', 'processing', 'summarizing'])
328
-
329
- if st.session_state.summarizing:
330
- st.info(f"Processing... {processing_count} in progress, {completed_count} completed or skipped out of {total_files} files.")
331
- elif st.session_state.overall_error:
332
- st.error(f"An error occurred during initial processing: {st.session_state.overall_error}")
333
- st.warning(f"{completed_count} files processed with status out of {total_files} attempted.")
334
- else:
335
- successful_summaries = sum(1 for res in st.session_state.summary_results.values() if res.get('status') == 'completed')
336
- st.success(f"Finished processing. {successful_summaries} summaries successfully generated out of {total_files} files.")
337
- if completed_count < total_files:
338
- st.warning(f"Some files ({total_files - completed_count}) did not complete processing.")
339
-
340
-
341
- # Display files as a grid of clickable tiles if there are results
342
- if st.session_state.summary_results:
343
  files_per_row = 3
344
- # Use the keys (filenames) for consistent ordering
345
- filenames = list(st.session_state.summary_results.keys())
346
- rows = len(filenames) // files_per_row + (len(filenames) % files_per_row > 0)
 
347
 
348
  for i in range(rows):
349
  cols = st.columns(files_per_row)
350
  for j in range(files_per_row):
351
  file_index = i * files_per_row + j
352
- if file_index < len(filenames):
353
- filename = filenames[file_index]
354
- result = st.session_state.summary_results.get(filename, {}) # Get result safely
355
- status = result.get('status', 'unknown')
356
  is_selected = st.session_state.selected_filename == filename
357
 
358
- # Determine tile appearance based on status
359
- if status == 'completed':
360
- button_label = f"塘 {filename} (Completed)"
361
- # Use a custom style or expander for completed tiles if needed,
362
- # or rely on the selection below to show the summary.
363
- elif status in ['error', 'skipped', 'extraction_error', 'no_text', 'chunking_error']:
364
- button_label = f"☒ {filename} ({status.replace('_', ' ').title()})" # e.g., "Extraction Error"
365
- elif status in ['waiting', 'queued', 'processing', 'summarizing']:
366
- button_label = f"時計 {filename} ({status.replace('_', ' ').title()})..."
367
- else:
368
- button_label = f"ファイル {filename} (Status: {status})"
369
-
370
- # Use a standard button within the column
371
- # Add a unique key based on the filename and whether it's selected
372
- button_key = f"tile_button_{filename}_{'selected' if is_selected else 'normal'}"
373
- # Use beta_container or just cols[j]
374
  with cols[j]:
375
- # Make the button disabled if it's in a processing state
376
- button_disabled = status in ['waiting', 'queued', 'processing', 'summarizing']
377
- if st.button(button_label, key=button_key, use_container_width=True, disabled=button_disabled):
378
- # Only allow selecting tiles that have finished processing (completed, error, skipped, etc.)
379
- if not button_disabled:
380
- st.session_state.selected_filename = filename
381
- logger.info(f"Selected file: {filename}")
382
- else:
383
- # This block is technically unreachable because the button is disabled,
384
- # but good practice to handle.
385
- st.info(f"Cannot select '{filename}' while it is still processing or waiting.")
386
- st.session_state.selected_filename = None # Deselect if a processing tile was somehow clicked
387
-
388
-
389
- # Display summary/details of the selected file below the tiles
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
  if st.session_state.selected_filename:
391
  st.markdown("---") # Separator
392
- st.subheader(f"Details for: {st.session_state.selected_filename}")
393
-
394
- selected_result = st.session_state.summary_results.get(st.session_state.selected_filename)
395
-
396
- if selected_result:
397
- status = selected_result.get('status', 'unknown')
398
- st.markdown(f"**Status:** {status.replace('_', ' ').title()}")
399
-
400
- if status == 'completed':
401
- summary = selected_result.get('summary')
402
- if summary:
403
- st.markdown("---")
404
- st.markdown("### Summary:")
405
- st.markdown(summary) # Render markdown summary
406
- else:
407
- st.info(f"Summary is empty for {st.session_state.selected_filename}.")
408
- elif selected_result.get('error'):
409
- st.error(f"Error details: {selected_result.get('error')}")
410
- elif status == 'skipped':
411
- st.info("This file was skipped due to initial processing issues.")
412
- elif status == 'no_text':
413
- st.info("No text could be extracted from this file.")
414
- elif status == 'chunking_error':
415
- st.info("Could not create usable text chunks from this file.")
416
  else:
417
- st.info(f"Details are not available for '{st.session_state.selected_filename}' yet as it has status: {status.replace('_', ' ').title()}")
 
 
418
 
 
 
 
 
 
 
 
419
 
420
- # --- Message if modules failed to load ---
421
- if not modules_loaded:
422
- st.error("Application modules failed to load. Please check your environment and project setup.")
423
 
424
  # --- Message if API Key is not entered and modules loaded ---
425
- if not st.session_state.api_key_entered and modules_loaded and not st.session_state.summarizing:
426
  st.info("Enter your Cohere API Key above to unlock the application functionality.")
427
 
428
- # --- Keep UI updated by rerunning periodically if processing ---
429
- # This is a simple mechanism to trigger reruns to check background task status and queue.
430
- # Adjust sleep time as needed, shorter times make it more responsive but use more CPU.
431
- # Rerun if there are active processing futures or if the queue is not empty.
432
- if st.session_state.processing_futures or not st.session_state.update_queue.empty():
433
- logger.debug(f"Active processing futures ({len(st.session_state.processing_futures)}) or queue not empty, sleeping briefly to trigger rerun.")
434
- time.sleep(0.5) # Sleep for a short duration
435
- st.rerun() # Rerun the app to check for completed futures and process queue
436
- elif st.session_state.summarizing:
437
- # If summarizing is true but no futures are active and queue is empty,
438
- # it means initial steps might be running or processing just finished.
439
- # Rerun once more to update final status.
440
- logger.debug("Summarizing is True but no active futures and queue is empty, triggering final rerun.")
441
- st.session_state.summarizing = False # Ensure flag is reset
442
- st.rerun()
443
-
 
2
  import streamlit as st
3
  import os
4
  import sys
 
5
  import logging
 
 
 
 
6
 
7
  # Add the project root to the sys.path to allow importing modules like config, document_processing, etc.
8
  # This assumes app.py is in the project root directory.
9
+ # Adjust the path if your app.py is in a subdirectory.
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  try:
12
+ # Import the necessary functions from your main script
13
+ from main import process_uploaded_files, setup_retrieval_system, summarize_extracted_documents
 
 
 
14
  # Configure Streamlit's logging to match your application's settings
15
  logging.basicConfig(level='INFO', format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
16
  logger = logging.getLogger(__name__)
 
30
  # --- Streamlit App Configuration ---
31
  st.set_page_config(
32
  page_title="Aya Insight Document Summarizer",
33
+ page_icon="📄",
34
  layout="wide"
35
  )
36
 
 
38
  # Initialize session state variables if they don't exist
39
  if 'api_key_entered' not in st.session_state:
40
  st.session_state.api_key_entered = False
41
+ if 'summary_results' not in st.session_state:
42
+ st.session_state.summary_results = None
 
 
43
  if 'selected_filename' not in st.session_state:
44
  st.session_state.selected_filename = None
 
 
 
 
 
 
 
 
 
45
 
46
 
47
  # --- API Key Input Section ---
48
  if not st.session_state.api_key_entered:
49
+ st.title("🔒 Enter Your Cohere API Key to Unlock")
50
  api_key = st.text_input("Cohere API Key", type="password", help="Enter your Cohere API key to use the summarization service.")
51
 
52
  if st.button("Unlock"):
53
  if api_key:
54
+ # Basic validation: Just check if it's not empty.
55
+ # For a real application, you might want to validate by making a small API call.
56
  os.environ["COHERE_API_KEY"] = api_key # Set the environment variable
57
  st.session_state.api_key_entered = True
58
  st.success("API Key accepted. You can now upload documents.")
 
62
 
63
  # --- Main Application Content (Unlocked) ---
64
  if st.session_state.api_key_entered and modules_loaded:
65
+ st.title("📄 Aya Insight Document Summarizer")
66
  st.markdown("""
67
  Upload one or more PDF or image files to get a structured summary for each document.
68
  """)
 
76
  )
77
 
78
  # --- Summarize Button and Logic ---
 
79
  if uploaded_files: # Only show button if files are uploaded
80
  st.info(f"You have uploaded {len(uploaded_files)} file(s).")
81
 
82
+ if st.button("Generate Summaries", key="summarize_button"):
 
 
 
83
  st.session_state.selected_filename = None # Reset selected file on new summary generation
 
 
 
 
 
 
84
  if not uploaded_files:
85
  st.warning("Please upload at least one file before generating summaries.")
 
 
86
  else:
87
  st.subheader("Processing Documents...")
88
+ all_summary_results = [] # To store results for display
89
+
90
+ # Use a spinner to indicate processing
91
+ with st.spinner("Processing documents and generating summaries... This may take a few minutes depending on file size and number."):
92
+ try:
93
+ # Step 1: Process uploaded files (Extraction)
94
+ logger.info(f"Calling process_uploaded_files with {len(uploaded_files)} files.")
95
+ extraction_results = process_uploaded_files(uploaded_files)
96
+ logger.info(f"Finished document extraction. {len(extraction_results)} results obtained.")
97
+
98
+ # Check if any files were successfully extracted
99
+ if not any(res.get('text') for res in extraction_results):
100
+ st.error("No text could be extracted from the uploaded files. Please check the file formats.")
101
+ logger.error("No text extracted from any uploaded file.")
102
+ st.session_state.summary_results = [] # Store empty results
103
+ # st.stop() # Don't stop, allow user to try again
104
+
105
+ # Step 2: Setup retrieval system (Vector Store and Embedding)
106
+ logger.info("Calling setup_retrieval_system.")
107
+ extraction_results_with_chunks, retriever = setup_retrieval_system(extraction_results)
108
+ logger.info("Retriever system setup complete.")
109
+
110
+ # Step 3: Summarize the extracted documents
111
+ logger.info("Calling summarize_extracted_documents.")
112
+ summary_results = summarize_extracted_documents(extraction_results_with_chunks, retriever)
113
+ logger.info(f"Finished summarization. {len(summary_results)} summary results obtained.")
114
+
115
+ st.session_state.summary_results = summary_results # Store results in session state
116
+
117
+ except FileNotFoundError as fnf_error:
118
+ st.error(f"Configuration Error: {fnf_error}. Please check your environment settings.")
119
+ logger.error(f"Configuration Error during Streamlit process: {fnf_error}", exc_info=True)
120
+ st.session_state.summary_results = [] # Store empty results on error
121
+ except Exception as e:
122
+ st.error(f"An unexpected error occurred during processing: {e}")
123
+ logger.error(f"An unexpected error occurred during Streamlit process: {e}", exc_info=True)
124
+ st.session_state.summary_results = [] # Store empty results on error
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
 
127
  # --- Display Document Tiles and Summaries ---
128
+ if st.session_state.summary_results is not None:
129
+ st.subheader("Summaries:")
130
+
131
+ if not st.session_state.summary_results:
132
+ st.info("No summaries were generated. Upload files and click 'Generate Summaries'.")
133
+ else:
134
+ # Display files as a grid of clickable tiles
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  files_per_row = 3
136
+ rows = len(st.session_state.summary_results) // files_per_row + (len(st.session_state.summary_results) % files_per_row > 0)
137
+
138
+ # Create a list of filenames for easy access
139
+ filenames = [res.get('filename', f'File {i+1}') for i, res in enumerate(st.session_state.summary_results)]
140
 
141
  for i in range(rows):
142
  cols = st.columns(files_per_row)
143
  for j in range(files_per_row):
144
  file_index = i * files_per_row + j
145
+ if file_index < len(st.session_state.summary_results):
146
+ result = st.session_state.summary_results[file_index]
147
+ filename = result.get('filename', f'File {file_index+1}')
 
148
  is_selected = st.session_state.selected_filename == filename
149
 
150
+ # Create a tile using a button or markdown link
151
+ # Using a button inside a column for simplicity
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  with cols[j]:
153
+ # Add a border or highlight if selected
154
+ tile_style = "border: 2px solid lightgrey; padding: 10px; margin: 5px; text-align: center; cursor: pointer;"
155
+ if is_selected:
156
+ tile_style = "border: 2px solid steelblue; padding: 10px; margin: 5px; text-align: center; cursor: pointer; background-color: #e6f3ff;" # Highlight color
157
+
158
+ # Use markdown with HTML to create the clickable tile appearance
159
+ # When clicked, set the selected filename in session state
160
+ st.markdown(
161
+ f"""
162
+ <div style="{tile_style}" onclick="document.getElementById('hidden_button_{file_index}').click()">
163
+ 📄<br>
164
+ <strong>{filename}</strong>
165
+ </div>
166
+ <button id="hidden_button_{file_index}" style="display: none;" onclick="document.getElementById('hidden_button_{file_index}').click()"></button>
167
+ """,
168
+ unsafe_allow_html=True
169
+ )
170
+ # Streamlit buttons cannot easily be triggered from click handlers in HTML rendered via st.markdown.
171
+ # A simpler approach is to use a standard button and handle the click.
172
+ # Let's use a standard button instead of complex markdown/JS.
173
+
174
+ # Alternative using a standard button:
175
+ if st.button(f"📄 {filename}", key=f"tile_button_{file_index}"):
176
+ st.session_state.selected_filename = filename
177
+ logger.info(f"Selected file: {filename}")
178
+ st.rerun() # Rerun to display the summary
179
+
180
+
181
+ # Display summary of the selected file
182
  if st.session_state.selected_filename:
183
  st.markdown("---") # Separator
184
+ st.subheader(f"Summary for: {st.session_state.selected_filename}")
185
+
186
+ # Find the summary for the selected file
187
+ selected_summary = None
188
+ selected_result = None
189
+ for result in st.session_state.summary_results:
190
+ if result.get('filename') == st.session_state.selected_filename:
191
+ selected_summary = result.get('summary')
192
+ selected_result = result
193
+ break
194
+
195
+ if selected_summary:
196
+ if selected_result.get('success'):
197
+ st.markdown(selected_summary) # Render markdown summary
 
 
 
 
 
 
 
 
 
 
198
  else:
199
+ st.error(f"Could not load summary for {st.session_state.selected_filename}: {selected_result.get('error', 'Unknown error')}")
200
+ else:
201
+ st.info(f"Summary not available for {st.session_state.selected_filename}.")
202
 
203
+ # Display overall processing status
204
+ successful_count = sum(res.get('success', False) for res in st.session_state.summary_results)
205
+ total_files = len(st.session_state.summary_results)
206
+ st.markdown(f"---") # Final separator
207
+ st.success(f"Processed {total_files} files. Successfully summarized {successful_count}.")
208
+ if successful_count < total_files:
209
+ st.warning("Some files could not be processed or summarized. See error messages above.")
210
 
 
 
 
211
 
212
  # --- Message if API Key is not entered and modules loaded ---
213
+ if not st.session_state.api_key_entered:
214
  st.info("Enter your Cohere API Key above to unlock the application functionality.")
215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py CHANGED
@@ -1,64 +1,26 @@
1
- import time
2
- import logging
3
  import os
4
  import tempfile
5
- import shutil # Import shutil for directory cleaning
6
- from typing import List, Dict, Any, Tuple, Optional
7
- from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
8
- import queue # Import the queue module
9
-
10
- # Import necessary modules from your project
11
- # Assuming config, document_processing, retrieval, summarization are accessible via sys.path
12
- # You might need to adjust these imports based on your actual project structure
13
- try:
14
- from app.config.settings import DOCS_FOLDER # Keep for local runs if needed
15
- except ImportError:
16
- # Define a default if config.config is not available
17
- DOCS_FOLDER = "docs" # Default folder name
18
-
19
 
 
 
20
  from app.document_processing.extractors import DocumentProcessorAdapter
21
  from app.retrieval.vector_store import Retriever
 
22
  from app.summarization.summarizer import DocumentSummarizer
23
 
24
- # Configure module-specific logger
25
  logger = logging.getLogger(__name__)
26
 
27
- # Add a simple print statement to confirm module loading
28
- logger.info("main.py is being loaded.")
29
-
30
- # Define a persistent temporary directory relative to the project root
31
- # This assumes app.py and main.py are in the project root
32
- PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__)))
33
- TEMP_UPLOAD_DIR = os.path.join(PROJECT_ROOT, 'temp_uploads')
34
- os.makedirs(TEMP_UPLOAD_DIR, exist_ok=True) # Ensure the directory exists
35
-
36
- def clear_upload_directory():
37
- """Clears all files from the persistent temporary upload directory."""
38
- if os.path.exists(TEMP_UPLOAD_DIR):
39
- logger.info(f"Clearing temporary upload directory: {TEMP_UPLOAD_DIR}")
40
- # Iterate through all items in the directory
41
- for item in os.listdir(TEMP_UPLOAD_DIR):
42
- item_path = os.path.join(TEMP_UPLOAD_DIR, item)
43
- try:
44
- # Check if it's a file or a symbolic link and remove it
45
- if os.path.isfile(item_path) or os.path.islink(item_path):
46
- os.unlink(item_path)
47
- logger.debug(f"Deleted file/link: {item_path}")
48
- # Check if it's a directory and remove it and its contents
49
- elif os.path.isdir(item_path):
50
- shutil.rmtree(item_path)
51
- logger.debug(f"Deleted directory: {item_path}")
52
- except Exception as e:
53
- logger.error(f"Error deleting {item_path}: {e}", exc_info=True)
54
- else:
55
- logger.info(f"Temporary upload directory does not exist, no need to clear: {TEMP_UPLOAD_DIR}")
56
 
57
 
58
  def process_uploaded_files(uploaded_files) -> List[Dict[str, Any]]:
59
  """
60
  Processes a list of files uploaded via Streamlit.
61
- Saves them into a persistent temporary folder and uses the DocumentProcessorAdapter
62
  to process that folder.
63
 
64
  Args:
@@ -74,32 +36,34 @@ def process_uploaded_files(uploaded_files) -> List[Dict[str, Any]]:
74
  start_time = time.time()
75
  logger.info(f"Starting processing for {len(uploaded_files)} uploaded files.")
76
 
77
- # Save all uploaded files into the persistent temporary directory
78
- logger.info(f"Saving files to persistent temporary directory: {TEMP_UPLOAD_DIR}")
79
- saved_files_paths = []
80
- for uploaded_file in uploaded_files:
81
- # Create a safe path within the temporary directory
82
- # Use uploaded_file.name directly, ensuring it's joined with the base temp dir
83
- file_path = os.path.join(TEMP_UPLOAD_DIR, uploaded_file.name)
84
- # Write the file content to the temporary path
85
- try:
86
- with open(file_path, "wb") as f:
87
- f.write(uploaded_file.getvalue())
88
- logger.debug(f"Saved uploaded file '{uploaded_file.name}' to '{file_path}'")
89
- saved_files_paths.append(file_path)
90
- except Exception as e:
91
- logger.error(f"Error saving uploaded file '{uploaded_file.name}' to temporary directory: {e}", exc_info=True)
92
- # Log a warning in Streamlit if a file couldn't be saved
93
- st.warning(f"Could not save uploaded file '{uploaded_file.name}' temporarily. It will be skipped.")
 
94
 
95
- # Use the DocumentProcessorAdapter to process the entire temporary folder
96
- processor = DocumentProcessorAdapter()
97
- # Call process_folder with the persistent temporary directory path
98
- # The process_folder method returns the list of extraction results
99
- extraction_results = processor.process_folder(TEMP_UPLOAD_DIR)
 
100
 
101
  end_time = time.time()
102
- logger.info(f"Finished processing uploaded files (saving and initial extraction) in {end_time - start_time:.2f} seconds.")
103
  # The extraction_results list now contains dictionaries with 'filename', 'text', 'error', etc.
104
  return extraction_results
105
 
@@ -135,18 +99,14 @@ def setup_retrieval_system(extraction_results: List[Dict[str, Any]]) -> Tuple[Li
135
  raise
136
 
137
 
138
- def summarize_extracted_documents(extraction_results: List[Dict[str, Any]], retriever: Retriever, update_queue: queue.Queue) -> List[Dict[str, Any]]:
139
  """
140
- Summarizes documents based on extraction results and a configured retriever,
141
- reporting progress via a queue.
142
 
143
  Args:
144
  extraction_results: List of dictionaries from document extraction (should include chunk_size
145
  populated by setup_retrieval_system).
146
- EXPECTED to be a list containing *one* document result dictionary
147
- when called from the Streamlit app for individual file processing.
148
  retriever: An initialized Retriever instance.
149
- update_queue: A thread-safe queue.Queue object to put progress updates into.
150
 
151
  Returns:
152
  A list of dictionaries, each containing the summary result for a file.
@@ -156,94 +116,167 @@ def summarize_extracted_documents(extraction_results: List[Dict[str, Any]], retr
156
  - 'summary': The generated summary string (if successful), or None.
157
  - 'error': An error message string (if not successful), or None.
158
  - 'processing_time': Time taken for summarization of this file.
159
- - 'status': Current processing status (e.g., 'completed', 'error').
160
  """
161
- # This function is designed to handle a list of results, but the Streamlit app
162
- # calls it with a list containing a single document result for parallel processing.
163
- # The internal logic iterates through the provided list.
164
-
165
  start_time = time.time()
166
- logger.info(f"Starting summarization for {len(extraction_results)} document(s) in background.")
167
 
168
  # Initialize the summarizer with the retriever
169
  summarizer = DocumentSummarizer(retriever)
170
 
171
- results = [] # List to store results for each document processed by this call
172
 
173
- # Filter out results that failed extraction or have no text/chunks from the input list
174
  # Summarization requires extracted text and successful chunking (chunk_size > 0)
175
- summarizable_input_results = [
176
  res for res in extraction_results
177
  if res.get('text') and res.get('chunk_size', 0) > 0 and res.get('error') is None
178
  ]
179
- skipped_input_results = [
180
  res for res in extraction_results
181
- if res not in summarizable_input_results
182
  ]
183
 
184
- if skipped_input_results:
185
- logger.warning(f"Skipping summarization for {len(skipped_input_results)} input file(s) due to extraction errors or no text/chunks.")
186
- for res in skipped_input_results:
187
  # Add entries for skipped files to the results list
188
- skipped_result = {
189
  'filename': res.get('filename', 'unknown'),
190
  'success': False,
191
  'summary': None,
192
  'error': res.get('error', 'Extraction failed or no text/chunks'),
193
  'processing_time': 0, # No summarization time for skipped files
194
- 'status': 'skipped' # Add status for skipped files
195
- }
196
- results.append(skipped_result)
197
- # Put status update into the queue
198
- update_queue.put({'filename': skipped_result['filename'], 'status': skipped_result['status'], 'result_data': skipped_result})
199
 
200
 
201
- # Process only the summarizable documents from the input list
202
- for result in summarizable_input_results:
203
  file_start_time = time.time()
204
  filename = result.get('filename', 'unknown')
 
205
  language = result.get('language', 'en')
206
- chunk_size = result.get('chunk_size', 0) # Should be > 0 for summarizable_input_results
207
 
208
- logger.info(f"Background: Summarizing document: {filename}")
209
- # Put status update into the queue
210
- update_queue.put({'filename': filename, 'status': 'summarizing'})
211
 
212
  try:
213
- # Call the summarizer for this single document
 
214
  summary = summarizer.summerize_document(filename, language, chunk_size)
215
 
216
  file_end_time = time.time()
217
- logger.info(f"Background: Finished summarizing {filename} in {file_end_time - file_start_time:.2f} seconds.")
218
- summary_result = {
219
  'filename': filename,
220
  'success': True,
221
  'summary': summary, # Return the summary string
222
  'error': None,
223
  'processing_time': file_end_time - file_start_time,
224
- 'status': 'completed' # Add completed status
225
  }
226
- results.append(summary_result) # Add to the results list for this call
227
- # Put status update into the queue
228
- update_queue.put({'filename': filename, 'status': summary_result['status'], 'result_data': summary_result})
229
-
230
  except Exception as e:
231
  file_end_time = time.time()
232
  error_msg = str(e)
233
- logger.error(f"Background: Error summarizing document {filename}: {e}", exc_info=True)
234
- error_result = {
235
  'filename': filename,
236
  'success': False,
237
  'summary': None,
238
  'error': error_msg,
239
  'processing_time': file_end_time - file_start_time,
240
- 'status': 'error' # Add error status
241
  }
242
- results.append(error_result) # Add to the results list for this call
243
- # Put status update into the queue
244
- update_queue.put({'filename': filename, 'status': error_result['status'], 'result_data': error_result})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
  end_time = time.time()
247
- logger.info(f"Background: Finished processing {len(extraction_results)} document(s) in {end_time - start_time:.2f} seconds.")
248
- # Return the list of results processed by this specific call
249
  return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging # Import logging
 
2
  import os
3
  import tempfile
4
+ import time
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from typing import List, Dict, Any, Tuple
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ from app.config.settings import DOCS_FOLDER
9
+ # Import classes from the renamed modules
10
  from app.document_processing.extractors import DocumentProcessorAdapter
11
  from app.retrieval.vector_store import Retriever
12
+ from app.summarization.output import SummaryOutputManager
13
  from app.summarization.summarizer import DocumentSummarizer
14
 
15
+ # Configure logging for the main script
16
  logger = logging.getLogger(__name__)
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
 
20
def process_uploaded_files(uploaded_files) -> List[Dict[str, Any]]:
    """
    Extract text from files uploaded through Streamlit.

    Each upload is written into a throwaway temporary directory, and that
    directory is then handed to ``DocumentProcessorAdapter.process_folder``.
    The directory and everything in it are removed before this function
    returns.

    Args:
        uploaded_files: Sequence of uploaded-file objects exposing ``.name``
            and ``.getvalue()`` (the file content as bytes). May be empty
            or ``None``.

    Returns:
        The list returned by ``process_folder`` for the temporary directory
        (dictionaries with 'filename', 'text', 'error', etc.), or an empty
        list when nothing was uploaded.
    """
    if not uploaded_files:
        # Nothing to save or extract; avoid creating a directory for no reason.
        return []

    start_time = time.time()
    logger.info("Starting processing for %d uploaded files.", len(uploaded_files))

    # The directory is deleted automatically when the 'with' block exits, so
    # extraction has to happen inside it.
    with tempfile.TemporaryDirectory() as tmpdir:
        logger.info("Using temporary directory: %s", tmpdir)

        for uploaded_file in uploaded_files:
            # Keep only the final path component so a crafted name such as
            # "../../x" (or its backslash variant) cannot escape tmpdir.
            safe_name = os.path.basename(str(uploaded_file.name).replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                logger.warning(
                    "Skipping uploaded file with unusable name %r.", uploaded_file.name
                )
                continue

            file_path = os.path.join(tmpdir, safe_name)
            try:
                with open(file_path, "wb") as f:
                    f.write(uploaded_file.getvalue())
                logger.debug("Saved uploaded file '%s' to '%s'", uploaded_file.name, file_path)
            except Exception as e:
                # Best effort per file: one bad upload must not abort the batch.
                # This module does not import streamlit, so report via the
                # logger instead of a UI call that would raise NameError here.
                logger.error(
                    "Error saving uploaded file '%s' to temporary directory: %s",
                    uploaded_file.name,
                    e,
                    exc_info=True,
                )
                logger.warning(
                    "Could not save uploaded file '%s' temporarily. It will be skipped.",
                    uploaded_file.name,
                )

        # Process the whole temporary folder in one call.
        processor = DocumentProcessorAdapter()
        extraction_results = processor.process_folder(tmpdir)

    end_time = time.time()
    logger.info("Finished processing uploaded files in %.2f seconds.", end_time - start_time)
    return extraction_results
69
 
 
99
  raise
100
 
101
 
102
def summarize_extracted_documents(extraction_results: List[Dict[str, Any]], retriever: Retriever) -> List[Dict[str, Any]]:
    """
    Summarize extracted documents using a configured retriever.

    Documents that cannot be summarized (extraction error, no text, or no
    chunks) are reported as failures without calling the summarizer. The
    remaining documents are summarized concurrently on a thread pool.

    Args:
        extraction_results: List of dictionaries from document extraction
            (should include 'chunk_size' populated by setup_retrieval_system).
        retriever: An initialized Retriever instance.

    Returns:
        A list with one dictionary per input document. Skipped documents come
        first, followed by summarized documents in completion order. Each
        dictionary contains:
        - 'filename': Name of the document ('unknown' if missing).
        - 'success': True if a summary was produced, otherwise False.
        - 'summary': The generated summary string (if successful), or None.
        - 'error': An error message string (if not successful), or None.
        - 'processing_time': Time taken for summarization of this file.
    """
    start_time = time.time()
    logger.info("Starting summarization for %d documents.", len(extraction_results))

    # Initialize the summarizer with the retriever
    summarizer = DocumentSummarizer(retriever)

    results: List[Dict[str, Any]] = []

    # Partition the input in a single pass. Summarization requires extracted
    # text, successful chunking (chunk_size > 0) and no extraction error.
    summarizable_results: List[Dict[str, Any]] = []
    skipped_results: List[Dict[str, Any]] = []
    for res in extraction_results:
        # 'or 0' guards against a stored chunk_size of None.
        has_chunks = (res.get('chunk_size') or 0) > 0
        if res.get('text') and has_chunks and res.get('error') is None:
            summarizable_results.append(res)
        else:
            skipped_results.append(res)

    if skipped_results:
        logger.warning(
            "Skipping summarization for %d files due to extraction errors or no text/chunks.",
            len(skipped_results),
        )
        for res in skipped_results:
            results.append({
                'filename': res.get('filename', 'unknown'),
                'success': False,
                'summary': None,
                # 'error' may be present but None (e.g. empty text), so fall
                # back to the generic message in that case as well.
                'error': res.get('error') or 'Extraction failed or no text/chunks',
                'processing_time': 0,  # No summarization time for skipped files
            })

    def process_single_summary(result: Dict[str, Any]) -> Dict[str, Any]:
        """Summarize one document result and return its outcome dictionary."""
        file_start_time = time.time()
        filename = result.get('filename', 'unknown')
        # Use detected language, default to English if detection failed
        language = result.get('language', 'en')
        chunk_size = result.get('chunk_size', 0)  # > 0 for summarizable results

        logger.info("Summarizing document: %s", filename)

        try:
            summary = summarizer.summerize_document(filename, language, chunk_size)
        except Exception as e:
            # Boundary handler: a failure in one document becomes a failed
            # result entry instead of aborting the whole batch.
            elapsed = time.time() - file_start_time
            logger.error("Error summarizing document %s: %s", filename, e, exc_info=True)
            return {
                'filename': filename,
                'success': False,
                'summary': None,
                'error': str(e),
                'processing_time': elapsed,
            }

        elapsed = time.time() - file_start_time
        logger.info("Finished summarizing %s in %.2f seconds.", filename, elapsed)
        return {
            'filename': filename,
            'success': True,
            'summary': summary,
            'error': None,
            'processing_time': elapsed,
        }

    if summarizable_results:
        with ThreadPoolExecutor(max_workers=None) as executor:
            # Map each future to its filename so failures can be attributed.
            futures = {
                executor.submit(process_single_summary, res): res.get('filename', 'unknown')
                for res in summarizable_results
            }

            # Collect results as they complete
            for future in as_completed(futures):
                filename = futures[future]
                try:
                    results.append(future.result())
                    logger.debug("Summary result received for %s", filename)
                except Exception as exc:
                    # process_single_summary handles its own errors; this only
                    # fires if retrieving the future's result itself fails.
                    logger.error(
                        "Exception retrieving summary result for %s: %s",
                        filename,
                        exc,
                        exc_info=True,
                    )
                    results.append({
                        'filename': filename,
                        'success': False,
                        'summary': None,
                        'error': f"Failed to retrieve result: {exc}",
                        'processing_time': 0,  # Unknown when retrieval failed
                    })

    end_time = time.time()
    logger.info("Finished batch summarization in %.2f seconds.", end_time - start_time)
    return results
213
+
214
+
215
+ # if __name__ == "__main__":
216
+ # start_time = time.time()
217
+ # logger.info("Starting document summarization process (command line).")
218
+ #
219
+ # try:
220
+ # # Step 1: Process documents from the predefined folder
221
+ # logger.info(f"Processing documents from: {DOCS_FOLDER}")
222
+ # # DocumentProcessorAdapter().process_folder returns a list of extraction result dicts
223
+ # extraction_results = DocumentProcessorAdapter().process_folder(DOCS_FOLDER)
224
+ # logger.info(f"Document Processing Time taken: {time.time()-start_time:.2f} seconds")
225
+ #
226
+ # # Step 2: Setup retrieval system
227
+ # setup_start_time = time.time()
228
+ # # setup_retrieval_system takes extraction results and returns updated results (with chunk_size) and the retriever
229
+ # extraction_results_with_chunks, retriever = setup_retrieval_system(extraction_results)
230
+ # logger.info(f"Retriever Setup Time taken: {time.time() - setup_start_time:.2f} seconds")
231
+ #
232
+ # # Step 3: Summarize the documents
233
+ # summarization_start_time = time.time()
234
+ # # For command line, we might still want to save files locally
235
+ # output_manager = SummaryOutputManager() # Uses default output_dir from settings
236
+ # # summarize_extracted_documents performs the summarization and returns results
237
+ # summary_results = summarize_extracted_documents(extraction_results_with_chunks, retriever)
238
+ #
239
+ # # Step 4: Save summaries to files (for command-line only)
240
+ # logger.info("Saving summaries to files.")
241
+ # saved_count = 0
242
+ # for res in summary_results:
243
+ # if res['success'] and res['summary']:
244
+ # # Use the output_manager to save the summary string
245
+ # output_manager.save_summary(res['filename'], res['summary'], formats=['markdown'])
246
+ # saved_count += 1
247
+ # logger.info(f"Saved {saved_count} summaries.")
248
+ #
249
+ #
250
+ # logger.info(f"Summarization Time taken: {time.time() - summarization_start_time:.2f} seconds")
251
+ #
252
+ #
253
+ # # Output results summary to console
254
+ # logger.info("\n" + "=" * 50)
255
+ # logger.info("Summarization Process Complete.")
256
+ # logger.info("=" * 50)
257
+ # successful_count = sum(res.get('success', False) for res in summary_results)
258
+ # total_processed = len(summary_results) # Includes skipped files if they were added to results list earlier
259
+ # total_time = time.time() - start_time
260
+ #
261
+ # logger.info(f"Total files attempted: {len(extraction_results)}") # Total files found/attempted extraction
262
+ # logger.info(f"Files successfully extracted and summarizable: {len(extraction_results_with_chunks)}") # Files with text and chunks
263
+ # logger.info(f"Files summarized: {successful_count}/{total_processed}")
264
+ # logger.info(f"Total process time: {total_time:.2f} seconds")
265
+ # logger.info("=" * 50)
266
+ #
267
+ # # Print individual results status
268
+ # logger.info("\nIndividual File Results:")
269
+ # for result in summary_results:
270
+ # name = result.get('filename', 'unknown')
271
+ # status = "SUCCESS" if result['success'] else "FAILED"
272
+ # time_taken = result.get('processing_time', 0)
273
+ # error_msg = result.get('error', '')
274
+ # logger.info(f"- {name}: {status} ({time_taken:.2f}s) {f'Error: {error_msg}' if error_msg else ''}")
275
+ #
276
+ #
277
+ # except FileNotFoundError as fnf_error:
278
+ # logger.error(f"Configuration Error: {fnf_error}")
279
+ # print(f"Error: {fnf_error}")
280
+ # except Exception as main_error:
281
+ # logger.error(f"An unexpected error occurred during the main process: {main_error}", exc_info=True)
282
+ # print(f"An unexpected error occurred: {main_error}")