chrissoria committed on
Commit
af04675
·
1 Parent(s): a9678de

Process PDF pages one at a time for real page-by-page progress

Browse files
Files changed (1) hide show
  1. app.py +139 -63
app.py CHANGED
@@ -52,6 +52,46 @@ def extract_text_from_pdfs(pdf_paths):
52
  print(f"Error extracting text from {pdf_path}: {e}")
53
  return all_texts
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  # Free models - display name -> actual API model name
56
  FREE_MODELS_MAP = {
57
  "Qwen3 235B": "Qwen/Qwen3-VL-235B-A22B-Instruct:novita",
@@ -984,14 +1024,6 @@ with col_input:
984
  model_source = get_model_source(model)
985
  items_list = input_data if isinstance(input_data, list) else [input_data]
986
 
987
- # Progress tracking
988
- total_items = len(items_list)
989
-
990
- # For PDFs, also get page counts for display
991
- if input_type_selected == "pdf":
992
- pdf_page_counts = [(pdf_path, count_pdf_pages(pdf_path)) for pdf_path in items_list]
993
- total_pages = sum(pc for _, pc in pdf_page_counts)
994
-
995
  # Progress UI
996
  progress_bar = st.progress(0)
997
  status_text = st.empty()
@@ -999,56 +1031,114 @@ with col_input:
999
  all_results = []
1000
  start_time = time.time()
1001
 
1002
- for i, item in enumerate(items_list):
1003
- # Update progress before processing
1004
- progress = i / total_items if total_items > 0 else 0
1005
- progress_bar.progress(min(progress, 1.0))
1006
-
1007
- # Calculate ETA
1008
- elapsed = time.time() - start_time
1009
- if i > 0:
1010
- avg_time_per_item = elapsed / i
1011
- remaining_items = total_items - i
1012
- eta_seconds = avg_time_per_item * remaining_items
1013
- eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
1014
- else:
1015
- eta_str = ""
1016
 
1017
- if input_type_selected == "pdf":
1018
- doc_pages = pdf_page_counts[i][1]
1019
- status_text.text(f"Processing document {i+1} of {total_items} ({doc_pages} pages) ({progress*100:.0f}%){eta_str}")
1020
- else:
1021
- status_text.text(f"Processing item {i+1} of {total_items} ({progress*100:.0f}%){eta_str}")
1022
 
1023
- try:
1024
- classify_kwargs = {
1025
- 'input_data': [item],
1026
- 'categories': categories_entered,
1027
- 'api_key': actual_api_key,
1028
- 'input_type': input_type_selected,
1029
- 'description': description,
1030
- 'user_model': model,
1031
- 'model_source': model_source
1032
- }
1033
- if mode:
1034
- classify_kwargs['mode'] = mode
1035
-
1036
- item_result = catllm.classify(**classify_kwargs)
1037
- all_results.append(item_result)
1038
-
1039
- # Update progress after processing
1040
- progress = (i + 1) / total_items if total_items > 0 else 1.0
1041
  progress_bar.progress(min(progress, 1.0))
1042
 
1043
- except Exception as e:
1044
- st.warning(f"Error on item {i+1}: {str(e)}")
1045
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1046
 
1047
  # Complete progress
1048
  progress_bar.progress(1.0)
1049
  processing_time = time.time() - start_time
1050
  if input_type_selected == "pdf":
1051
- status_text.text(f"Completed {total_items} document(s) ({total_pages} pages) in {processing_time:.1f}s")
1052
  else:
1053
  status_text.text(f"Completed {total_items} items in {processing_time:.1f}s")
1054
 
@@ -1056,20 +1146,6 @@ with col_input:
1056
  # Combine results
1057
  result_df = pd.concat(all_results, ignore_index=True)
1058
 
1059
- # For PDFs, replace temp file paths with original filenames
1060
- if input_type_selected == "pdf" and 'pdf_input' in result_df.columns:
1061
- pdf_name_map = st.session_state.get('pdf_name_map', {})
1062
- def replace_temp_path(val):
1063
- if pd.isna(val):
1064
- return val
1065
- val_str = str(val)
1066
- for temp_path, orig_name in pdf_name_map.items():
1067
- if temp_path in val_str:
1068
- # Replace temp path with original name, keep page suffix
1069
- return val_str.replace(temp_path, orig_name)
1070
- return val_str
1071
- result_df['pdf_input'] = result_df['pdf_input'].apply(replace_temp_path)
1072
-
1073
  # Save CSV
1074
  with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
1075
  result_df.to_csv(f.name, index=False)
 
52
  print(f"Error extracting text from {pdf_path}: {e}")
53
  return all_texts
54
 
55
+
56
+ def extract_pdf_pages(pdf_paths, pdf_name_map, mode="image"):
57
+ """
58
+ Extract individual pages from PDFs.
59
+ Returns list of (page_data, page_label) tuples.
60
+ For image mode: page_data is path to temp image file
61
+ For text mode: page_data is extracted text
62
+ """
63
+ import fitz # PyMuPDF
64
+ pages = []
65
+
66
+ for pdf_path in pdf_paths:
67
+ orig_name = pdf_name_map.get(pdf_path, os.path.basename(pdf_path).replace('.pdf', ''))
68
+ try:
69
+ doc = fitz.open(pdf_path)
70
+ for page_num, page in enumerate(doc, 1):
71
+ page_label = f"{orig_name}_p{page_num}"
72
+
73
+ if mode == "text":
74
+ # Extract text
75
+ text = page.get_text().strip()
76
+ if text:
77
+ pages.append((text, page_label, "text"))
78
+ else:
79
+ # Render as image (for image or both mode)
80
+ pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x zoom for better quality
81
+ img_path = tempfile.NamedTemporaryFile(delete=False, suffix='.png').name
82
+ pix.save(img_path)
83
+
84
+ if mode == "both":
85
+ text = page.get_text().strip()
86
+ pages.append((img_path, page_label, "image", text))
87
+ else:
88
+ pages.append((img_path, page_label, "image"))
89
+ doc.close()
90
+ except Exception as e:
91
+ print(f"Error extracting pages from {pdf_path}: {e}")
92
+
93
+ return pages
94
+
95
  # Free models - display name -> actual API model name
96
  FREE_MODELS_MAP = {
97
  "Qwen3 235B": "Qwen/Qwen3-VL-235B-A22B-Instruct:novita",
 
1024
  model_source = get_model_source(model)
1025
  items_list = input_data if isinstance(input_data, list) else [input_data]
1026
 
 
 
 
 
 
 
 
 
1027
  # Progress UI
1028
  progress_bar = st.progress(0)
1029
  status_text = st.empty()
 
1031
  all_results = []
1032
  start_time = time.time()
1033
 
1034
+ # For PDFs, extract pages and process one at a time
1035
+ if input_type_selected == "pdf":
1036
+ pdf_name_map = st.session_state.get('pdf_name_map', {})
1037
+ status_text.text("Extracting PDF pages...")
 
 
 
 
 
 
 
 
 
 
1038
 
1039
+ pages = extract_pdf_pages(items_list, pdf_name_map, mode)
1040
+ total_pages = len(pages)
 
 
 
1041
 
1042
+ for i, page_data in enumerate(pages):
1043
+ # Update progress
1044
+ progress = i / total_pages if total_pages > 0 else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1045
  progress_bar.progress(min(progress, 1.0))
1046
 
1047
+ # Calculate ETA
1048
+ elapsed = time.time() - start_time
1049
+ if i > 0:
1050
+ avg_time = elapsed / i
1051
+ eta_seconds = avg_time * (total_pages - i)
1052
+ eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
1053
+ else:
1054
+ eta_str = ""
1055
+
1056
+ page_label = page_data[1]
1057
+ status_text.text(f"Processing page {i+1} of {total_pages} ({page_label}) ({progress*100:.0f}%){eta_str}")
1058
+
1059
+ try:
1060
+ if page_data[2] == "text":
1061
+ # Text mode - classify as text
1062
+ result = catllm.classify(
1063
+ input_data=[page_data[0]],
1064
+ categories=categories_entered,
1065
+ api_key=actual_api_key,
1066
+ input_type="text",
1067
+ description=description,
1068
+ user_model=model,
1069
+ model_source=model_source
1070
+ )
1071
+ else:
1072
+ # Image mode - classify as image
1073
+ result = catllm.classify(
1074
+ input_data=[page_data[0]],
1075
+ categories=categories_entered,
1076
+ api_key=actual_api_key,
1077
+ input_type="image",
1078
+ description=description,
1079
+ user_model=model,
1080
+ model_source=model_source
1081
+ )
1082
+
1083
+ # Replace the input column with the page label
1084
+ if 'image_input' in result.columns:
1085
+ result['pdf_input'] = page_label
1086
+ result = result.drop(columns=['image_input'])
1087
+ elif 'text_input' in result.columns:
1088
+ result['pdf_input'] = page_label
1089
+ result = result.drop(columns=['text_input'])
1090
+ else:
1091
+ result['pdf_input'] = page_label
1092
+
1093
+ all_results.append(result)
1094
+ except Exception as e:
1095
+ st.warning(f"Error on {page_label}: {str(e)}")
1096
+ continue
1097
+
1098
+ total_items = total_pages
1099
+ else:
1100
+ # Non-PDF processing (text, images)
1101
+ total_items = len(items_list)
1102
+
1103
+ for i, item in enumerate(items_list):
1104
+ progress = i / total_items if total_items > 0 else 0
1105
+ progress_bar.progress(min(progress, 1.0))
1106
+
1107
+ elapsed = time.time() - start_time
1108
+ if i > 0:
1109
+ avg_time = elapsed / i
1110
+ eta_seconds = avg_time * (total_items - i)
1111
+ eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
1112
+ else:
1113
+ eta_str = ""
1114
+
1115
+ status_text.text(f"Processing item {i+1} of {total_items} ({progress*100:.0f}%){eta_str}")
1116
+
1117
+ try:
1118
+ item_result = catllm.classify(
1119
+ input_data=[item],
1120
+ categories=categories_entered,
1121
+ api_key=actual_api_key,
1122
+ input_type=input_type_selected,
1123
+ description=description,
1124
+ user_model=model,
1125
+ model_source=model_source
1126
+ )
1127
+ all_results.append(item_result)
1128
+
1129
+ # Update progress after processing
1130
+ progress = (i + 1) / total_items if total_items > 0 else 1.0
1131
+ progress_bar.progress(min(progress, 1.0))
1132
+
1133
+ except Exception as e:
1134
+ st.warning(f"Error on item {i+1}: {str(e)}")
1135
+ continue
1136
 
1137
  # Complete progress
1138
  progress_bar.progress(1.0)
1139
  processing_time = time.time() - start_time
1140
  if input_type_selected == "pdf":
1141
+ status_text.text(f"Completed {total_items} pages in {processing_time:.1f}s")
1142
  else:
1143
  status_text.text(f"Completed {total_items} items in {processing_time:.1f}s")
1144
 
 
1146
  # Combine results
1147
  result_df = pd.concat(all_results, ignore_index=True)
1148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1149
  # Save CSV
1150
  with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
1151
  result_df.to_csv(f.name, index=False)