Spaces:
Running
Running
Commit
·
af04675
1
Parent(s):
a9678de
Process PDF pages one at a time for real page-by-page progress
Browse files
app.py
CHANGED
|
@@ -52,6 +52,46 @@ def extract_text_from_pdfs(pdf_paths):
|
|
| 52 |
print(f"Error extracting text from {pdf_path}: {e}")
|
| 53 |
return all_texts
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
# Free models - display name -> actual API model name
|
| 56 |
FREE_MODELS_MAP = {
|
| 57 |
"Qwen3 235B": "Qwen/Qwen3-VL-235B-A22B-Instruct:novita",
|
|
@@ -984,14 +1024,6 @@ with col_input:
|
|
| 984 |
model_source = get_model_source(model)
|
| 985 |
items_list = input_data if isinstance(input_data, list) else [input_data]
|
| 986 |
|
| 987 |
-
# Progress tracking
|
| 988 |
-
total_items = len(items_list)
|
| 989 |
-
|
| 990 |
-
# For PDFs, also get page counts for display
|
| 991 |
-
if input_type_selected == "pdf":
|
| 992 |
-
pdf_page_counts = [(pdf_path, count_pdf_pages(pdf_path)) for pdf_path in items_list]
|
| 993 |
-
total_pages = sum(pc for _, pc in pdf_page_counts)
|
| 994 |
-
|
| 995 |
# Progress UI
|
| 996 |
progress_bar = st.progress(0)
|
| 997 |
status_text = st.empty()
|
|
@@ -999,56 +1031,114 @@ with col_input:
|
|
| 999 |
all_results = []
|
| 1000 |
start_time = time.time()
|
| 1001 |
|
| 1002 |
-
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
-
|
| 1007 |
-
# Calculate ETA
|
| 1008 |
-
elapsed = time.time() - start_time
|
| 1009 |
-
if i > 0:
|
| 1010 |
-
avg_time_per_item = elapsed / i
|
| 1011 |
-
remaining_items = total_items - i
|
| 1012 |
-
eta_seconds = avg_time_per_item * remaining_items
|
| 1013 |
-
eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
|
| 1014 |
-
else:
|
| 1015 |
-
eta_str = ""
|
| 1016 |
|
| 1017 |
-
|
| 1018 |
-
|
| 1019 |
-
status_text.text(f"Processing document {i+1} of {total_items} ({doc_pages} pages) ({progress*100:.0f}%){eta_str}")
|
| 1020 |
-
else:
|
| 1021 |
-
status_text.text(f"Processing item {i+1} of {total_items} ({progress*100:.0f}%){eta_str}")
|
| 1022 |
|
| 1023 |
-
|
| 1024 |
-
|
| 1025 |
-
|
| 1026 |
-
'categories': categories_entered,
|
| 1027 |
-
'api_key': actual_api_key,
|
| 1028 |
-
'input_type': input_type_selected,
|
| 1029 |
-
'description': description,
|
| 1030 |
-
'user_model': model,
|
| 1031 |
-
'model_source': model_source
|
| 1032 |
-
}
|
| 1033 |
-
if mode:
|
| 1034 |
-
classify_kwargs['mode'] = mode
|
| 1035 |
-
|
| 1036 |
-
item_result = catllm.classify(**classify_kwargs)
|
| 1037 |
-
all_results.append(item_result)
|
| 1038 |
-
|
| 1039 |
-
# Update progress after processing
|
| 1040 |
-
progress = (i + 1) / total_items if total_items > 0 else 1.0
|
| 1041 |
progress_bar.progress(min(progress, 1.0))
|
| 1042 |
|
| 1043 |
-
|
| 1044 |
-
|
| 1045 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1046 |
|
| 1047 |
# Complete progress
|
| 1048 |
progress_bar.progress(1.0)
|
| 1049 |
processing_time = time.time() - start_time
|
| 1050 |
if input_type_selected == "pdf":
|
| 1051 |
-
status_text.text(f"Completed {total_items}
|
| 1052 |
else:
|
| 1053 |
status_text.text(f"Completed {total_items} items in {processing_time:.1f}s")
|
| 1054 |
|
|
@@ -1056,20 +1146,6 @@ with col_input:
|
|
| 1056 |
# Combine results
|
| 1057 |
result_df = pd.concat(all_results, ignore_index=True)
|
| 1058 |
|
| 1059 |
-
# For PDFs, replace temp file paths with original filenames
|
| 1060 |
-
if input_type_selected == "pdf" and 'pdf_input' in result_df.columns:
|
| 1061 |
-
pdf_name_map = st.session_state.get('pdf_name_map', {})
|
| 1062 |
-
def replace_temp_path(val):
|
| 1063 |
-
if pd.isna(val):
|
| 1064 |
-
return val
|
| 1065 |
-
val_str = str(val)
|
| 1066 |
-
for temp_path, orig_name in pdf_name_map.items():
|
| 1067 |
-
if temp_path in val_str:
|
| 1068 |
-
# Replace temp path with original name, keep page suffix
|
| 1069 |
-
return val_str.replace(temp_path, orig_name)
|
| 1070 |
-
return val_str
|
| 1071 |
-
result_df['pdf_input'] = result_df['pdf_input'].apply(replace_temp_path)
|
| 1072 |
-
|
| 1073 |
# Save CSV
|
| 1074 |
with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
|
| 1075 |
result_df.to_csv(f.name, index=False)
|
|
|
|
| 52 |
print(f"Error extracting text from {pdf_path}: {e}")
|
| 53 |
return all_texts
|
| 54 |
|
| 55 |
+
|
| 56 |
+
def extract_pdf_pages(pdf_paths, pdf_name_map, mode="image"):
    """
    Extract individual pages from PDFs so they can be processed one at a time.

    Args:
        pdf_paths: Iterable of PDF file paths to open with PyMuPDF.
        pdf_name_map: Mapping of PDF path -> display name used in page labels.
            Paths missing from the map fall back to the file's basename with
            a trailing ``.pdf`` extension removed.
        mode: ``"text"`` extracts page text; any other value renders each page
            to a PNG image. ``"both"`` renders the image and also carries the
            page text along.

    Returns:
        A list of tuples, one per extracted page, in document/page order.
        The label is ``"<name>_p<page number>"`` with 1-based page numbers.

        * text mode:  ``(text, page_label, "text")`` - pages whose stripped
          text is empty are skipped.
        * image mode: ``(image_path, page_label, "image")``
        * both mode:  ``(image_path, page_label, "image", text)``

        Image files are created with ``delete=False`` and are not removed
        here; cleaning them up is left to the caller.

    Errors while reading a PDF are printed and that PDF is abandoned; pages
    extracted before the error are kept and remaining PDFs are still processed.
    """
    import fitz  # PyMuPDF

    pages = []

    for pdf_path in pdf_paths:
        orig_name = pdf_name_map.get(
            pdf_path, _strip_pdf_suffix(os.path.basename(pdf_path))
        )
        try:
            doc = fitz.open(pdf_path)
            try:
                for page_num, page in enumerate(doc, 1):
                    page_label = f"{orig_name}_p{page_num}"

                    if mode == "text":
                        text = page.get_text().strip()
                        if text:
                            pages.append((text, page_label, "text"))
                        continue

                    # Render as image (for image or both mode).
                    # 2x zoom for better quality.
                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))

                    # Reserve a temp path and close our handle before writing,
                    # so no open file object is leaked and the renderer can
                    # reopen the path on platforms with exclusive file locks.
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp:
                        img_path = tmp.name
                    try:
                        pix.save(img_path)
                    except Exception:
                        # Don't leave an orphaned empty temp file behind.
                        if os.path.exists(img_path):
                            os.remove(img_path)
                        raise

                    if mode == "both":
                        text = page.get_text().strip()
                        pages.append((img_path, page_label, "image", text))
                    else:
                        pages.append((img_path, page_label, "image"))
            finally:
                # Always release the document, even when a page fails mid-way.
                doc.close()
        except Exception as e:
            # Best effort: report the failure and move on to the next PDF.
            print(f"Error extracting pages from {pdf_path}: {e}")

    return pages


def _strip_pdf_suffix(filename):
    """Return *filename* without a trailing ``.pdf`` extension (any letter case)."""
    suffix = '.pdf'
    if filename.lower().endswith(suffix):
        return filename[:-len(suffix)]
    return filename
|
| 94 |
+
|
| 95 |
# Free models - display name -> actual API model name
|
| 96 |
FREE_MODELS_MAP = {
|
| 97 |
"Qwen3 235B": "Qwen/Qwen3-VL-235B-A22B-Instruct:novita",
|
|
|
|
| 1024 |
model_source = get_model_source(model)
|
| 1025 |
items_list = input_data if isinstance(input_data, list) else [input_data]
|
| 1026 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1027 |
# Progress UI
|
| 1028 |
progress_bar = st.progress(0)
|
| 1029 |
status_text = st.empty()
|
|
|
|
| 1031 |
all_results = []
|
| 1032 |
start_time = time.time()
|
| 1033 |
|
| 1034 |
+
# For PDFs, extract pages and process one at a time
|
| 1035 |
+
if input_type_selected == "pdf":
|
| 1036 |
+
pdf_name_map = st.session_state.get('pdf_name_map', {})
|
| 1037 |
+
status_text.text("Extracting PDF pages...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1038 |
|
| 1039 |
+
pages = extract_pdf_pages(items_list, pdf_name_map, mode)
|
| 1040 |
+
total_pages = len(pages)
|
|
|
|
|
|
|
|
|
|
| 1041 |
|
| 1042 |
+
for i, page_data in enumerate(pages):
|
| 1043 |
+
# Update progress
|
| 1044 |
+
progress = i / total_pages if total_pages > 0 else 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1045 |
progress_bar.progress(min(progress, 1.0))
|
| 1046 |
|
| 1047 |
+
# Calculate ETA
|
| 1048 |
+
elapsed = time.time() - start_time
|
| 1049 |
+
if i > 0:
|
| 1050 |
+
avg_time = elapsed / i
|
| 1051 |
+
eta_seconds = avg_time * (total_pages - i)
|
| 1052 |
+
eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
|
| 1053 |
+
else:
|
| 1054 |
+
eta_str = ""
|
| 1055 |
+
|
| 1056 |
+
page_label = page_data[1]
|
| 1057 |
+
status_text.text(f"Processing page {i+1} of {total_pages} ({page_label}) ({progress*100:.0f}%){eta_str}")
|
| 1058 |
+
|
| 1059 |
+
try:
|
| 1060 |
+
if page_data[2] == "text":
|
| 1061 |
+
# Text mode - classify as text
|
| 1062 |
+
result = catllm.classify(
|
| 1063 |
+
input_data=[page_data[0]],
|
| 1064 |
+
categories=categories_entered,
|
| 1065 |
+
api_key=actual_api_key,
|
| 1066 |
+
input_type="text",
|
| 1067 |
+
description=description,
|
| 1068 |
+
user_model=model,
|
| 1069 |
+
model_source=model_source
|
| 1070 |
+
)
|
| 1071 |
+
else:
|
| 1072 |
+
# Image mode - classify as image
|
| 1073 |
+
result = catllm.classify(
|
| 1074 |
+
input_data=[page_data[0]],
|
| 1075 |
+
categories=categories_entered,
|
| 1076 |
+
api_key=actual_api_key,
|
| 1077 |
+
input_type="image",
|
| 1078 |
+
description=description,
|
| 1079 |
+
user_model=model,
|
| 1080 |
+
model_source=model_source
|
| 1081 |
+
)
|
| 1082 |
+
|
| 1083 |
+
# Replace the input column with the page label
|
| 1084 |
+
if 'image_input' in result.columns:
|
| 1085 |
+
result['pdf_input'] = page_label
|
| 1086 |
+
result = result.drop(columns=['image_input'])
|
| 1087 |
+
elif 'text_input' in result.columns:
|
| 1088 |
+
result['pdf_input'] = page_label
|
| 1089 |
+
result = result.drop(columns=['text_input'])
|
| 1090 |
+
else:
|
| 1091 |
+
result['pdf_input'] = page_label
|
| 1092 |
+
|
| 1093 |
+
all_results.append(result)
|
| 1094 |
+
except Exception as e:
|
| 1095 |
+
st.warning(f"Error on {page_label}: {str(e)}")
|
| 1096 |
+
continue
|
| 1097 |
+
|
| 1098 |
+
total_items = total_pages
|
| 1099 |
+
else:
|
| 1100 |
+
# Non-PDF processing (text, images)
|
| 1101 |
+
total_items = len(items_list)
|
| 1102 |
+
|
| 1103 |
+
for i, item in enumerate(items_list):
|
| 1104 |
+
progress = i / total_items if total_items > 0 else 0
|
| 1105 |
+
progress_bar.progress(min(progress, 1.0))
|
| 1106 |
+
|
| 1107 |
+
elapsed = time.time() - start_time
|
| 1108 |
+
if i > 0:
|
| 1109 |
+
avg_time = elapsed / i
|
| 1110 |
+
eta_seconds = avg_time * (total_items - i)
|
| 1111 |
+
eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
|
| 1112 |
+
else:
|
| 1113 |
+
eta_str = ""
|
| 1114 |
+
|
| 1115 |
+
status_text.text(f"Processing item {i+1} of {total_items} ({progress*100:.0f}%){eta_str}")
|
| 1116 |
+
|
| 1117 |
+
try:
|
| 1118 |
+
item_result = catllm.classify(
|
| 1119 |
+
input_data=[item],
|
| 1120 |
+
categories=categories_entered,
|
| 1121 |
+
api_key=actual_api_key,
|
| 1122 |
+
input_type=input_type_selected,
|
| 1123 |
+
description=description,
|
| 1124 |
+
user_model=model,
|
| 1125 |
+
model_source=model_source
|
| 1126 |
+
)
|
| 1127 |
+
all_results.append(item_result)
|
| 1128 |
+
|
| 1129 |
+
# Update progress after processing
|
| 1130 |
+
progress = (i + 1) / total_items if total_items > 0 else 1.0
|
| 1131 |
+
progress_bar.progress(min(progress, 1.0))
|
| 1132 |
+
|
| 1133 |
+
except Exception as e:
|
| 1134 |
+
st.warning(f"Error on item {i+1}: {str(e)}")
|
| 1135 |
+
continue
|
| 1136 |
|
| 1137 |
# Complete progress
|
| 1138 |
progress_bar.progress(1.0)
|
| 1139 |
processing_time = time.time() - start_time
|
| 1140 |
if input_type_selected == "pdf":
|
| 1141 |
+
status_text.text(f"Completed {total_items} pages in {processing_time:.1f}s")
|
| 1142 |
else:
|
| 1143 |
status_text.text(f"Completed {total_items} items in {processing_time:.1f}s")
|
| 1144 |
|
|
|
|
| 1146 |
# Combine results
|
| 1147 |
result_df = pd.concat(all_results, ignore_index=True)
|
| 1148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1149 |
# Save CSV
|
| 1150 |
with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
|
| 1151 |
result_df.to_csv(f.name, index=False)
|