Spaces:
Running
Running
Commit
·
09f46a9
1
Parent(s):
1bc0f81
Fix PDF progress: show document-based progress since catllm processes pages internally
Browse files
app.py
CHANGED
|
@@ -981,16 +981,13 @@ with col_input:
|
|
| 981 |
model_source = get_model_source(model)
|
| 982 |
items_list = input_data if isinstance(input_data, list) else [input_data]
|
| 983 |
|
| 984 |
-
#
|
|
|
|
|
|
|
|
|
|
| 985 |
if input_type_selected == "pdf":
|
| 986 |
-
# Build list of (pdf_path, page_count) and calculate total pages
|
| 987 |
pdf_page_counts = [(pdf_path, count_pdf_pages(pdf_path)) for pdf_path in items_list]
|
| 988 |
total_pages = sum(pc for _, pc in pdf_page_counts)
|
| 989 |
-
progress_unit = "page"
|
| 990 |
-
total_units = total_pages
|
| 991 |
-
else:
|
| 992 |
-
total_units = len(items_list)
|
| 993 |
-
progress_unit = "item"
|
| 994 |
|
| 995 |
# Progress UI
|
| 996 |
progress_bar = st.progress(0)
|
|
@@ -998,33 +995,27 @@ with col_input:
|
|
| 998 |
|
| 999 |
all_results = []
|
| 1000 |
start_time = time.time()
|
| 1001 |
-
processed_units = 0
|
| 1002 |
|
| 1003 |
for i, item in enumerate(items_list):
|
| 1004 |
-
# For PDFs, get page count for this document
|
| 1005 |
-
if input_type_selected == "pdf":
|
| 1006 |
-
item_pages = pdf_page_counts[i][1]
|
| 1007 |
-
else:
|
| 1008 |
-
item_pages = 1
|
| 1009 |
-
|
| 1010 |
# Update progress before processing
|
| 1011 |
-
progress =
|
| 1012 |
progress_bar.progress(min(progress, 1.0))
|
| 1013 |
|
| 1014 |
# Calculate ETA
|
| 1015 |
elapsed = time.time() - start_time
|
| 1016 |
-
if
|
| 1017 |
-
|
| 1018 |
-
|
| 1019 |
-
eta_seconds =
|
| 1020 |
eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
|
| 1021 |
else:
|
| 1022 |
eta_str = ""
|
| 1023 |
|
| 1024 |
if input_type_selected == "pdf":
|
| 1025 |
-
|
|
|
|
| 1026 |
else:
|
| 1027 |
-
status_text.text(f"Processing
|
| 1028 |
|
| 1029 |
try:
|
| 1030 |
classify_kwargs = {
|
|
@@ -1042,25 +1033,21 @@ with col_input:
|
|
| 1042 |
item_result = catllm.classify(**classify_kwargs)
|
| 1043 |
all_results.append(item_result)
|
| 1044 |
|
| 1045 |
-
# Update processed count (pages for PDFs, 1 for others)
|
| 1046 |
-
processed_units += item_pages
|
| 1047 |
-
|
| 1048 |
# Update progress after processing
|
| 1049 |
-
progress =
|
| 1050 |
progress_bar.progress(min(progress, 1.0))
|
| 1051 |
|
| 1052 |
except Exception as e:
|
| 1053 |
-
st.warning(f"Error on
|
| 1054 |
-
processed_units += item_pages # Still count as processed
|
| 1055 |
continue
|
| 1056 |
|
| 1057 |
# Complete progress
|
| 1058 |
progress_bar.progress(1.0)
|
| 1059 |
processing_time = time.time() - start_time
|
| 1060 |
if input_type_selected == "pdf":
|
| 1061 |
-
status_text.text(f"Completed {total_pages} pages in {processing_time:.1f}s")
|
| 1062 |
else:
|
| 1063 |
-
status_text.text(f"Completed {
|
| 1064 |
|
| 1065 |
if all_results:
|
| 1066 |
# Combine results
|
|
|
|
| 981 |
model_source = get_model_source(model)
|
| 982 |
items_list = input_data if isinstance(input_data, list) else [input_data]
|
| 983 |
|
| 984 |
+
# Progress tracking
|
| 985 |
+
total_items = len(items_list)
|
| 986 |
+
|
| 987 |
+
# For PDFs, also get page counts for display
|
| 988 |
if input_type_selected == "pdf":
|
|
|
|
| 989 |
pdf_page_counts = [(pdf_path, count_pdf_pages(pdf_path)) for pdf_path in items_list]
|
| 990 |
total_pages = sum(pc for _, pc in pdf_page_counts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 991 |
|
| 992 |
# Progress UI
|
| 993 |
progress_bar = st.progress(0)
|
|
|
|
| 995 |
|
| 996 |
all_results = []
|
| 997 |
start_time = time.time()
|
|
|
|
| 998 |
|
| 999 |
for i, item in enumerate(items_list):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1000 |
# Update progress before processing
|
| 1001 |
+
progress = i / total_items if total_items > 0 else 0
|
| 1002 |
progress_bar.progress(min(progress, 1.0))
|
| 1003 |
|
| 1004 |
# Calculate ETA
|
| 1005 |
elapsed = time.time() - start_time
|
| 1006 |
+
if i > 0:
|
| 1007 |
+
avg_time_per_item = elapsed / i
|
| 1008 |
+
remaining_items = total_items - i
|
| 1009 |
+
eta_seconds = avg_time_per_item * remaining_items
|
| 1010 |
eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
|
| 1011 |
else:
|
| 1012 |
eta_str = ""
|
| 1013 |
|
| 1014 |
if input_type_selected == "pdf":
|
| 1015 |
+
doc_pages = pdf_page_counts[i][1]
|
| 1016 |
+
status_text.text(f"Processing document {i+1} of {total_items} ({doc_pages} pages) ({progress*100:.0f}%){eta_str}")
|
| 1017 |
else:
|
| 1018 |
+
status_text.text(f"Processing item {i+1} of {total_items} ({progress*100:.0f}%){eta_str}")
|
| 1019 |
|
| 1020 |
try:
|
| 1021 |
classify_kwargs = {
|
|
|
|
| 1033 |
item_result = catllm.classify(**classify_kwargs)
|
| 1034 |
all_results.append(item_result)
|
| 1035 |
|
|
|
|
|
|
|
|
|
|
| 1036 |
# Update progress after processing
|
| 1037 |
+
progress = (i + 1) / total_items if total_items > 0 else 1.0
|
| 1038 |
progress_bar.progress(min(progress, 1.0))
|
| 1039 |
|
| 1040 |
except Exception as e:
|
| 1041 |
+
st.warning(f"Error on item {i+1}: {str(e)}")
|
|
|
|
| 1042 |
continue
|
| 1043 |
|
| 1044 |
# Complete progress
|
| 1045 |
progress_bar.progress(1.0)
|
| 1046 |
processing_time = time.time() - start_time
|
| 1047 |
if input_type_selected == "pdf":
|
| 1048 |
+
status_text.text(f"Completed {total_items} document(s) ({total_pages} pages) in {processing_time:.1f}s")
|
| 1049 |
else:
|
| 1050 |
+
status_text.text(f"Completed {total_items} items in {processing_time:.1f}s")
|
| 1051 |
|
| 1052 |
if all_results:
|
| 1053 |
# Combine results
|