Spaces:
Running
Running
Commit
·
34012b0
1
Parent(s):
b42cd15
Show page progress for PDFs, rename button to Categorize Data
Browse files
app.py
CHANGED
|
@@ -24,6 +24,17 @@ MAX_CATEGORIES = 10
|
|
| 24 |
INITIAL_CATEGORIES = 3
|
| 25 |
MAX_FILE_SIZE_MB = 100
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
# Free models (uses Space secrets - no user API key needed)
|
| 28 |
FREE_MODEL_CHOICES = [
|
| 29 |
"Qwen/Qwen3-VL-235B-A22B-Instruct:novita",
|
|
@@ -961,7 +972,7 @@ with col_input:
|
|
| 961 |
model = st.selectbox("Model", options=PAID_MODEL_CHOICES, key="classify_model_paid")
|
| 962 |
api_key = st.text_input("API Key", type="password", key="classify_api_key")
|
| 963 |
|
| 964 |
-
if st.button("
|
| 965 |
if input_data is None:
|
| 966 |
st.error("Please upload data first")
|
| 967 |
elif not categories_entered:
|
|
@@ -982,7 +993,18 @@ with col_input:
|
|
| 982 |
st.error(f"{provider} API key not configured")
|
| 983 |
else:
|
| 984 |
model_source = get_model_source(model)
|
| 985 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 986 |
|
| 987 |
# Progress UI
|
| 988 |
progress_bar = st.progress(0)
|
|
@@ -990,23 +1012,33 @@ with col_input:
|
|
| 990 |
|
| 991 |
all_results = []
|
| 992 |
start_time = time.time()
|
|
|
|
| 993 |
|
| 994 |
-
for i, item in enumerate(
|
| 995 |
-
#
|
| 996 |
-
|
| 997 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 998 |
|
| 999 |
# Calculate ETA
|
| 1000 |
elapsed = time.time() - start_time
|
| 1001 |
-
if
|
| 1002 |
-
|
| 1003 |
-
|
| 1004 |
-
eta_seconds =
|
| 1005 |
eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
|
| 1006 |
else:
|
| 1007 |
eta_str = ""
|
| 1008 |
|
| 1009 |
-
|
|
|
|
|
|
|
|
|
|
| 1010 |
|
| 1011 |
try:
|
| 1012 |
classify_kwargs = {
|
|
@@ -1023,14 +1055,26 @@ with col_input:
|
|
| 1023 |
|
| 1024 |
item_result = catllm.classify(**classify_kwargs)
|
| 1025 |
all_results.append(item_result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1026 |
except Exception as e:
|
| 1027 |
-
st.warning(f"Error on
|
|
|
|
| 1028 |
continue
|
| 1029 |
|
| 1030 |
# Complete progress
|
| 1031 |
progress_bar.progress(1.0)
|
| 1032 |
processing_time = time.time() - start_time
|
| 1033 |
-
|
|
|
|
|
|
|
|
|
|
| 1034 |
|
| 1035 |
if all_results:
|
| 1036 |
# Combine results
|
|
@@ -1125,7 +1169,7 @@ with col_output:
|
|
| 1125 |
with st.expander("See the Code"):
|
| 1126 |
st.code(results['code'], language='python')
|
| 1127 |
else:
|
| 1128 |
-
st.info("Upload data, select categories, and click '
|
| 1129 |
|
| 1130 |
# Bottom buttons
|
| 1131 |
col_reset, col_code = st.columns(2)
|
|
|
|
| 24 |
INITIAL_CATEGORIES = 3
|
| 25 |
MAX_FILE_SIZE_MB = 100
|
| 26 |
|
| 27 |
+
def count_pdf_pages(pdf_path):
    """Count the number of pages in a PDF file.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        The number of pages in the document, or 1 if the page count cannot
        be determined for any reason (PyMuPDF not importable, file missing
        or unreadable, etc.).
    """
    try:
        # Imported lazily so a missing PyMuPDF install degrades to the
        # fallback below instead of breaking module import.
        import fitz  # PyMuPDF

        doc = fitz.open(pdf_path)
        try:
            return len(doc)
        finally:
            # Always release the document handle, even if counting fails.
            doc.close()
    except Exception:
        # Best-effort by design: treat an unreadable PDF as a single page.
        return 1  # Default to 1 if can't read
|
| 37 |
+
|
| 38 |
# Free models (uses Space secrets - no user API key needed)
|
| 39 |
FREE_MODEL_CHOICES = [
|
| 40 |
"Qwen/Qwen3-VL-235B-A22B-Instruct:novita",
|
|
|
|
| 972 |
model = st.selectbox("Model", options=PAID_MODEL_CHOICES, key="classify_model_paid")
|
| 973 |
api_key = st.text_input("API Key", type="password", key="classify_api_key")
|
| 974 |
|
| 975 |
+
if st.button("Categorize Data", type="primary", use_container_width=True):
|
| 976 |
if input_data is None:
|
| 977 |
st.error("Please upload data first")
|
| 978 |
elif not categories_entered:
|
|
|
|
| 993 |
st.error(f"{provider} API key not configured")
|
| 994 |
else:
|
| 995 |
model_source = get_model_source(model)
|
| 996 |
+
items_list = input_data if isinstance(input_data, list) else [input_data]
|
| 997 |
+
|
| 998 |
+
# For PDFs, count total pages; for others, count items
|
| 999 |
+
if input_type_selected == "pdf":
|
| 1000 |
+
# Build list of (pdf_path, page_count) and calculate total pages
|
| 1001 |
+
pdf_page_counts = [(pdf_path, count_pdf_pages(pdf_path)) for pdf_path in items_list]
|
| 1002 |
+
total_pages = sum(pc for _, pc in pdf_page_counts)
|
| 1003 |
+
progress_unit = "page"
|
| 1004 |
+
total_units = total_pages
|
| 1005 |
+
else:
|
| 1006 |
+
total_units = len(items_list)
|
| 1007 |
+
progress_unit = "item"
|
| 1008 |
|
| 1009 |
# Progress UI
|
| 1010 |
progress_bar = st.progress(0)
|
|
|
|
| 1012 |
|
| 1013 |
all_results = []
|
| 1014 |
start_time = time.time()
|
| 1015 |
+
processed_units = 0
|
| 1016 |
|
| 1017 |
+
for i, item in enumerate(items_list):
|
| 1018 |
+
# For PDFs, get page count for this document
|
| 1019 |
+
if input_type_selected == "pdf":
|
| 1020 |
+
item_pages = pdf_page_counts[i][1]
|
| 1021 |
+
else:
|
| 1022 |
+
item_pages = 1
|
| 1023 |
+
|
| 1024 |
+
# Update progress before processing
|
| 1025 |
+
progress = processed_units / total_units if total_units > 0 else 0
|
| 1026 |
+
progress_bar.progress(min(progress, 1.0))
|
| 1027 |
|
| 1028 |
# Calculate ETA
|
| 1029 |
elapsed = time.time() - start_time
|
| 1030 |
+
if processed_units > 0:
|
| 1031 |
+
avg_time_per_unit = elapsed / processed_units
|
| 1032 |
+
remaining_units = total_units - processed_units
|
| 1033 |
+
eta_seconds = avg_time_per_unit * remaining_units
|
| 1034 |
eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
|
| 1035 |
else:
|
| 1036 |
eta_str = ""
|
| 1037 |
|
| 1038 |
+
if input_type_selected == "pdf":
|
| 1039 |
+
status_text.text(f"Processing page {processed_units + 1} of {total_units} ({progress*100:.0f}%){eta_str}")
|
| 1040 |
+
else:
|
| 1041 |
+
status_text.text(f"Processing {progress_unit} {processed_units + 1} of {total_units} ({progress*100:.0f}%){eta_str}")
|
| 1042 |
|
| 1043 |
try:
|
| 1044 |
classify_kwargs = {
|
|
|
|
| 1055 |
|
| 1056 |
item_result = catllm.classify(**classify_kwargs)
|
| 1057 |
all_results.append(item_result)
|
| 1058 |
+
|
| 1059 |
+
# Update processed count (pages for PDFs, 1 for others)
|
| 1060 |
+
processed_units += item_pages
|
| 1061 |
+
|
| 1062 |
+
# Update progress after processing
|
| 1063 |
+
progress = processed_units / total_units if total_units > 0 else 1.0
|
| 1064 |
+
progress_bar.progress(min(progress, 1.0))
|
| 1065 |
+
|
| 1066 |
except Exception as e:
|
| 1067 |
+
st.warning(f"Error on {progress_unit} {i+1}: {str(e)}")
|
| 1068 |
+
processed_units += item_pages # Still count as processed
|
| 1069 |
continue
|
| 1070 |
|
| 1071 |
# Complete progress
|
| 1072 |
progress_bar.progress(1.0)
|
| 1073 |
processing_time = time.time() - start_time
|
| 1074 |
+
if input_type_selected == "pdf":
|
| 1075 |
+
status_text.text(f"Completed {total_pages} pages in {processing_time:.1f}s")
|
| 1076 |
+
else:
|
| 1077 |
+
status_text.text(f"Completed {total_units} items in {processing_time:.1f}s")
|
| 1078 |
|
| 1079 |
if all_results:
|
| 1080 |
# Combine results
|
|
|
|
| 1169 |
with st.expander("See the Code"):
|
| 1170 |
st.code(results['code'], language='python')
|
| 1171 |
else:
|
| 1172 |
+
st.info("Upload data, select categories, and click 'Categorize Data' to see results here.")
|
| 1173 |
|
| 1174 |
# Bottom buttons
|
| 1175 |
col_reset, col_code = st.columns(2)
|