chrissoria committed on
Commit
09f46a9
·
1 Parent(s): 1bc0f81

Fix PDF progress: show document-based progress since catllm processes pages internally

Browse files
Files changed (1) hide show
  1. app.py +16 -29
app.py CHANGED
@@ -981,16 +981,13 @@ with col_input:
981
  model_source = get_model_source(model)
982
  items_list = input_data if isinstance(input_data, list) else [input_data]
983
 
984
- # For PDFs, count total pages; for others, count items
 
 
 
985
  if input_type_selected == "pdf":
986
- # Build list of (pdf_path, page_count) and calculate total pages
987
  pdf_page_counts = [(pdf_path, count_pdf_pages(pdf_path)) for pdf_path in items_list]
988
  total_pages = sum(pc for _, pc in pdf_page_counts)
989
- progress_unit = "page"
990
- total_units = total_pages
991
- else:
992
- total_units = len(items_list)
993
- progress_unit = "item"
994
 
995
  # Progress UI
996
  progress_bar = st.progress(0)
@@ -998,33 +995,27 @@ with col_input:
998
 
999
  all_results = []
1000
  start_time = time.time()
1001
- processed_units = 0
1002
 
1003
  for i, item in enumerate(items_list):
1004
- # For PDFs, get page count for this document
1005
- if input_type_selected == "pdf":
1006
- item_pages = pdf_page_counts[i][1]
1007
- else:
1008
- item_pages = 1
1009
-
1010
  # Update progress before processing
1011
- progress = processed_units / total_units if total_units > 0 else 0
1012
  progress_bar.progress(min(progress, 1.0))
1013
 
1014
  # Calculate ETA
1015
  elapsed = time.time() - start_time
1016
- if processed_units > 0:
1017
- avg_time_per_unit = elapsed / processed_units
1018
- remaining_units = total_units - processed_units
1019
- eta_seconds = avg_time_per_unit * remaining_units
1020
  eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
1021
  else:
1022
  eta_str = ""
1023
 
1024
  if input_type_selected == "pdf":
1025
- status_text.text(f"Processing page {processed_units + 1} of {total_units} ({progress*100:.0f}%){eta_str}")
 
1026
  else:
1027
- status_text.text(f"Processing {progress_unit} {processed_units + 1} of {total_units} ({progress*100:.0f}%){eta_str}")
1028
 
1029
  try:
1030
  classify_kwargs = {
@@ -1042,25 +1033,21 @@ with col_input:
1042
  item_result = catllm.classify(**classify_kwargs)
1043
  all_results.append(item_result)
1044
 
1045
- # Update processed count (pages for PDFs, 1 for others)
1046
- processed_units += item_pages
1047
-
1048
  # Update progress after processing
1049
- progress = processed_units / total_units if total_units > 0 else 1.0
1050
  progress_bar.progress(min(progress, 1.0))
1051
 
1052
  except Exception as e:
1053
- st.warning(f"Error on {progress_unit} {i+1}: {str(e)}")
1054
- processed_units += item_pages # Still count as processed
1055
  continue
1056
 
1057
  # Complete progress
1058
  progress_bar.progress(1.0)
1059
  processing_time = time.time() - start_time
1060
  if input_type_selected == "pdf":
1061
- status_text.text(f"Completed {total_pages} pages in {processing_time:.1f}s")
1062
  else:
1063
- status_text.text(f"Completed {total_units} items in {processing_time:.1f}s")
1064
 
1065
  if all_results:
1066
  # Combine results
 
981
  model_source = get_model_source(model)
982
  items_list = input_data if isinstance(input_data, list) else [input_data]
983
 
984
+ # Progress tracking
985
+ total_items = len(items_list)
986
+
987
+ # For PDFs, also get page counts for display
988
  if input_type_selected == "pdf":
 
989
  pdf_page_counts = [(pdf_path, count_pdf_pages(pdf_path)) for pdf_path in items_list]
990
  total_pages = sum(pc for _, pc in pdf_page_counts)
 
 
 
 
 
991
 
992
  # Progress UI
993
  progress_bar = st.progress(0)
 
995
 
996
  all_results = []
997
  start_time = time.time()
 
998
 
999
  for i, item in enumerate(items_list):
 
 
 
 
 
 
1000
  # Update progress before processing
1001
+ progress = i / total_items if total_items > 0 else 0
1002
  progress_bar.progress(min(progress, 1.0))
1003
 
1004
  # Calculate ETA
1005
  elapsed = time.time() - start_time
1006
+ if i > 0:
1007
+ avg_time_per_item = elapsed / i
1008
+ remaining_items = total_items - i
1009
+ eta_seconds = avg_time_per_item * remaining_items
1010
  eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
1011
  else:
1012
  eta_str = ""
1013
 
1014
  if input_type_selected == "pdf":
1015
+ doc_pages = pdf_page_counts[i][1]
1016
+ status_text.text(f"Processing document {i+1} of {total_items} ({doc_pages} pages) ({progress*100:.0f}%){eta_str}")
1017
  else:
1018
+ status_text.text(f"Processing item {i+1} of {total_items} ({progress*100:.0f}%){eta_str}")
1019
 
1020
  try:
1021
  classify_kwargs = {
 
1033
  item_result = catllm.classify(**classify_kwargs)
1034
  all_results.append(item_result)
1035
 
 
 
 
1036
  # Update progress after processing
1037
+ progress = (i + 1) / total_items if total_items > 0 else 1.0
1038
  progress_bar.progress(min(progress, 1.0))
1039
 
1040
  except Exception as e:
1041
+ st.warning(f"Error on item {i+1}: {str(e)}")
 
1042
  continue
1043
 
1044
  # Complete progress
1045
  progress_bar.progress(1.0)
1046
  processing_time = time.time() - start_time
1047
  if input_type_selected == "pdf":
1048
+ status_text.text(f"Completed {total_items} document(s) ({total_pages} pages) in {processing_time:.1f}s")
1049
  else:
1050
+ status_text.text(f"Completed {total_items} items in {processing_time:.1f}s")
1051
 
1052
  if all_results:
1053
  # Combine results