chrissoria committed on
Commit
34012b0
·
1 Parent(s): b42cd15

Show page progress for PDFs, rename button to Categorize Data

Browse files
Files changed (1) hide show
  1. app.py +58 -14
app.py CHANGED
@@ -24,6 +24,17 @@ MAX_CATEGORIES = 10
24
  INITIAL_CATEGORIES = 3
25
  MAX_FILE_SIZE_MB = 100
26
 
 
 
 
 
 
 
 
 
 
 
 
27
  # Free models (uses Space secrets - no user API key needed)
28
  FREE_MODEL_CHOICES = [
29
  "Qwen/Qwen3-VL-235B-A22B-Instruct:novita",
@@ -961,7 +972,7 @@ with col_input:
961
  model = st.selectbox("Model", options=PAID_MODEL_CHOICES, key="classify_model_paid")
962
  api_key = st.text_input("API Key", type="password", key="classify_api_key")
963
 
964
- if st.button("Classify Data", type="primary", use_container_width=True):
965
  if input_data is None:
966
  st.error("Please upload data first")
967
  elif not categories_entered:
@@ -982,7 +993,18 @@ with col_input:
982
  st.error(f"{provider} API key not configured")
983
  else:
984
  model_source = get_model_source(model)
985
- total_items = len(input_data) if isinstance(input_data, list) else 1
 
 
 
 
 
 
 
 
 
 
 
986
 
987
  # Progress UI
988
  progress_bar = st.progress(0)
@@ -990,23 +1012,33 @@ with col_input:
990
 
991
  all_results = []
992
  start_time = time.time()
 
993
 
994
- for i, item in enumerate(input_data if isinstance(input_data, list) else [input_data]):
995
- # Update progress
996
- progress = (i / total_items)
997
- progress_bar.progress(progress)
 
 
 
 
 
 
998
 
999
  # Calculate ETA
1000
  elapsed = time.time() - start_time
1001
- if i > 0:
1002
- avg_time_per_item = elapsed / i
1003
- remaining_items = total_items - i
1004
- eta_seconds = avg_time_per_item * remaining_items
1005
  eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
1006
  else:
1007
  eta_str = ""
1008
 
1009
- status_text.text(f"Processing item {i+1} of {total_items} ({progress*100:.0f}%){eta_str}")
 
 
 
1010
 
1011
  try:
1012
  classify_kwargs = {
@@ -1023,14 +1055,26 @@ with col_input:
1023
 
1024
  item_result = catllm.classify(**classify_kwargs)
1025
  all_results.append(item_result)
 
 
 
 
 
 
 
 
1026
  except Exception as e:
1027
- st.warning(f"Error on item {i+1}: {str(e)}")
 
1028
  continue
1029
 
1030
  # Complete progress
1031
  progress_bar.progress(1.0)
1032
  processing_time = time.time() - start_time
1033
- status_text.text(f"Completed {total_items} items in {processing_time:.1f}s")
 
 
 
1034
 
1035
  if all_results:
1036
  # Combine results
@@ -1125,7 +1169,7 @@ with col_output:
1125
  with st.expander("See the Code"):
1126
  st.code(results['code'], language='python')
1127
  else:
1128
- st.info("Upload data, select categories, and click 'Classify Data' to see results here.")
1129
 
1130
  # Bottom buttons
1131
  col_reset, col_code = st.columns(2)
 
24
  INITIAL_CATEGORIES = 3
25
  MAX_FILE_SIZE_MB = 100
26
 
27
def count_pdf_pages(pdf_path):
    """Count the number of pages in a PDF file.

    This is a best-effort helper: any failure (PyMuPDF not importable, the
    file being missing or unreadable, an error while counting or closing)
    yields the fallback value instead of raising.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The page count reported by PyMuPDF, or 1 if the file can't be read.
    """
    try:
        # Imported lazily and inside the try so a missing PyMuPDF install
        # degrades to the fallback rather than raising.
        import fitz  # PyMuPDF

        doc = fitz.open(pdf_path)
        try:
            return len(doc)
        finally:
            # Close the handle even when counting fails; previously it was
            # only closed on the success path.
            doc.close()
    except Exception:
        return 1  # Default to 1 if can't read
37
+
38
  # Free models (uses Space secrets - no user API key needed)
39
  FREE_MODEL_CHOICES = [
40
  "Qwen/Qwen3-VL-235B-A22B-Instruct:novita",
 
972
  model = st.selectbox("Model", options=PAID_MODEL_CHOICES, key="classify_model_paid")
973
  api_key = st.text_input("API Key", type="password", key="classify_api_key")
974
 
975
+ if st.button("Categorize Data", type="primary", use_container_width=True):
976
  if input_data is None:
977
  st.error("Please upload data first")
978
  elif not categories_entered:
 
993
  st.error(f"{provider} API key not configured")
994
  else:
995
  model_source = get_model_source(model)
996
+ items_list = input_data if isinstance(input_data, list) else [input_data]
997
+
998
+ # For PDFs, count total pages; for others, count items
999
+ if input_type_selected == "pdf":
1000
+ # Build list of (pdf_path, page_count) and calculate total pages
1001
+ pdf_page_counts = [(pdf_path, count_pdf_pages(pdf_path)) for pdf_path in items_list]
1002
+ total_pages = sum(pc for _, pc in pdf_page_counts)
1003
+ progress_unit = "page"
1004
+ total_units = total_pages
1005
+ else:
1006
+ total_units = len(items_list)
1007
+ progress_unit = "item"
1008
 
1009
  # Progress UI
1010
  progress_bar = st.progress(0)
 
1012
 
1013
  all_results = []
1014
  start_time = time.time()
1015
+ processed_units = 0
1016
 
1017
+ for i, item in enumerate(items_list):
1018
+ # For PDFs, get page count for this document
1019
+ if input_type_selected == "pdf":
1020
+ item_pages = pdf_page_counts[i][1]
1021
+ else:
1022
+ item_pages = 1
1023
+
1024
+ # Update progress before processing
1025
+ progress = processed_units / total_units if total_units > 0 else 0
1026
+ progress_bar.progress(min(progress, 1.0))
1027
 
1028
  # Calculate ETA
1029
  elapsed = time.time() - start_time
1030
+ if processed_units > 0:
1031
+ avg_time_per_unit = elapsed / processed_units
1032
+ remaining_units = total_units - processed_units
1033
+ eta_seconds = avg_time_per_unit * remaining_units
1034
  eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
1035
  else:
1036
  eta_str = ""
1037
 
1038
+ if input_type_selected == "pdf":
1039
+ status_text.text(f"Processing page {processed_units + 1} of {total_units} ({progress*100:.0f}%){eta_str}")
1040
+ else:
1041
+ status_text.text(f"Processing {progress_unit} {processed_units + 1} of {total_units} ({progress*100:.0f}%){eta_str}")
1042
 
1043
  try:
1044
  classify_kwargs = {
 
1055
 
1056
  item_result = catllm.classify(**classify_kwargs)
1057
  all_results.append(item_result)
1058
+
1059
+ # Update processed count (pages for PDFs, 1 for others)
1060
+ processed_units += item_pages
1061
+
1062
+ # Update progress after processing
1063
+ progress = processed_units / total_units if total_units > 0 else 1.0
1064
+ progress_bar.progress(min(progress, 1.0))
1065
+
1066
  except Exception as e:
1067
+ st.warning(f"Error on {progress_unit} {i+1}: {str(e)}")
1068
+ processed_units += item_pages # Still count as processed
1069
  continue
1070
 
1071
  # Complete progress
1072
  progress_bar.progress(1.0)
1073
  processing_time = time.time() - start_time
1074
+ if input_type_selected == "pdf":
1075
+ status_text.text(f"Completed {total_pages} pages in {processing_time:.1f}s")
1076
+ else:
1077
+ status_text.text(f"Completed {total_units} items in {processing_time:.1f}s")
1078
 
1079
  if all_results:
1080
  # Combine results
 
1169
  with st.expander("See the Code"):
1170
  st.code(results['code'], language='python')
1171
  else:
1172
+ st.info("Upload data, select categories, and click 'Categorize Data' to see results here.")
1173
 
1174
  # Bottom buttons
1175
  col_reset, col_code = st.columns(2)