Spaces:
Running
on
Zero
Running
on
Zero
baseline08_beta0.3.2_02Oct25: HF Spaces GPU support, update requirements.txt ; - fix progress bar ; - minor tweaks/fixes
Browse files- requirements.txt +13 -2
- ui/gradio_ui.py +44 -18
requirements.txt
CHANGED
|
@@ -1,8 +1,19 @@
|
|
| 1 |
gradio>=5.44.0 # gradio[mcp]>=5.44.0
|
| 2 |
#mcp>=1.15.0 # MCP Python SDK (Model Context Protocol)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
marker-pdf[full]>=1.10.0 # pip install marker (GitHub: https://github.com/datalab-to/marker)
|
| 4 |
weasyprint>=59.0 # optional fallback if pandoc is not available
|
| 5 |
#pandoc==2.3 # for Markdown → PDF conversion
|
| 6 |
-
python-magic==0.4.27 # file-type detection
|
| 7 |
#pypdfium2 # Python binding to PDFium for PDF rendering, inspection, manipulation and creation
|
| 8 |
-
|
|
|
|
|
|
| 1 |
gradio>=5.44.0 # gradio[mcp]>=5.44.0
|
| 2 |
#mcp>=1.15.0 # MCP Python SDK (Model Context Protocol)
|
| 3 |
+
|
| 4 |
+
#gradio[mcp]>=5.44.0 # Gradio as MCP Server
|
| 5 |
+
#pydantic>=2.11.7 # dependants: Marker-pdf and Gradio[mcp]
|
| 6 |
+
|
| 7 |
+
## HF Spaces recommendation: https://huggingface.co/docs/hub/spaces-gpus#frameworks
|
| 8 |
+
--extra-index-url https://download.pytorch.org/whl/cu113
|
| 9 |
+
#torch
|
| 10 |
+
torch>=2.7.1 # ZeroGPU support
|
| 11 |
+
spaces>=0.42.1 # HF Spaces (default on HF Spaces)
|
| 12 |
+
#huggingface_hub>=0.34.0 # HuggingFace integration
|
| 13 |
+
|
| 14 |
marker-pdf[full]>=1.10.0 # pip install marker (GitHub: https://github.com/datalab-to/marker)
|
| 15 |
weasyprint>=59.0 # optional fallback if pandoc is not available
|
| 16 |
#pandoc==2.3 # for Markdown → PDF conversion
|
|
|
|
| 17 |
#pypdfium2 # Python binding to PDFium for PDF rendering, inspection, manipulation and creation
|
| 18 |
+
|
| 19 |
+
#python-magic==0.4.27 # file-type detection: thin layer over the libmagic C library
|
ui/gradio_ui.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
from ast import Interactive
|
| 3 |
import gradio as gr
|
| 4 |
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
|
|
| 5 |
import asyncio ##future
|
| 6 |
import time
|
| 7 |
|
|
@@ -9,6 +10,7 @@ from pathlib import Path, WindowsPath
|
|
| 9 |
from typing import Optional, Union #, Dict, List, Any, Tuple
|
| 10 |
|
| 11 |
from huggingface_hub import get_token
|
|
|
|
| 12 |
|
| 13 |
#import file_handler
|
| 14 |
from file_handler import file_utils
|
|
@@ -57,6 +59,7 @@ except Exception as exc:
|
|
| 57 |
# pool executor to convert files called by Gradio
|
| 58 |
##SMY: TODO: future: refactor to gradio_process.py and
|
| 59 |
## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math"}
|
|
|
|
| 60 |
def convert_batch(
|
| 61 |
pdf_files, #: list[str],
|
| 62 |
pdf_files_count: int,
|
|
@@ -246,7 +249,7 @@ def convert_batch(
|
|
| 246 |
time.sleep(0.25)
|
| 247 |
yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
|
| 248 |
|
| 249 |
-
# Use progress.tqdm to integrate with the executor map
|
| 250 |
#results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
|
| 251 |
for result_interim in progress.tqdm(
|
| 252 |
iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
|
|
@@ -257,9 +260,25 @@ def convert_batch(
|
|
| 257 |
#yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
|
| 258 |
#progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
|
| 259 |
#progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
|
| 260 |
-
#time.sleep(0.25)
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
progress((11,16), desc=f"ProcessPoolExecutor: Got Results from files conversion")
|
| 264 |
time.sleep(0.25)
|
| 265 |
except Exception as exc:
|
|
@@ -526,6 +545,8 @@ def build_interface() -> gr.Blocks:
|
|
| 526 |
# Concatenate the new files with the existing ones in the state
|
| 527 |
updated_files = current_state + new_file_paths
|
| 528 |
updated_filenames = [Path(f).name for f in updated_files]
|
|
|
|
|
|
|
| 529 |
|
| 530 |
# Return the updated state and a message to the user
|
| 531 |
#file_info = "\n".join(updated_files)
|
|
@@ -533,7 +554,7 @@ def build_interface() -> gr.Blocks:
|
|
| 533 |
#message = f"Accumulated {len(updated_files)} file(s) total.\n\nAll file paths:\n{file_info}"
|
| 534 |
message = f"Accumulated {len(updated_files)} file(s) total: \n{filename_info}"
|
| 535 |
|
| 536 |
-
return updated_files, message, gr.update(interactive=True), gr.update(interactive=True)
|
| 537 |
|
| 538 |
# with gr.Blocks(title=TITLE) as demo
|
| 539 |
with gr.Blocks(title=TITLE, css=custom_css) as demo:
|
|
@@ -685,9 +706,6 @@ def build_interface() -> gr.Blocks:
|
|
| 685 |
|
| 686 |
logout_status_md = gr.Markdown(visible=True) #visible=False)
|
| 687 |
|
| 688 |
-
# The gr.State component to hold the accumulated list of files
|
| 689 |
-
uploaded_file_list = gr.State([]) ##NB: initial value of `gr.State` must be able to be deepcopied
|
| 690 |
-
|
| 691 |
# --- PDF & HTML → Markdown tab ---
|
| 692 |
with gr.Tab(" π PDF & HTML β Markdown"):
|
| 693 |
gr.Markdown(f"#### {DESCRIPTION_PDF_HTML}")
|
|
@@ -742,7 +760,7 @@ def build_interface() -> gr.Blocks:
|
|
| 742 |
with gr.Tab(" π PDF β Markdown (Flag for DEPRECATION)", interactive=False, visible=True): #False
|
| 743 |
gr.Markdown(f"#### {DESCRIPTION_PDF}")
|
| 744 |
|
| 745 |
-
|
| 746 |
label="Upload PDF files",
|
| 747 |
file_count="directory", ## handle directory and files upload #"multiple",
|
| 748 |
type="filepath",
|
|
@@ -821,10 +839,15 @@ def build_interface() -> gr.Blocks:
|
|
| 821 |
label="Conversion Logs",
|
| 822 |
lines=5,
|
| 823 |
#max_lines=25,
|
| 824 |
-
|
|
|
|
| 825 |
)
|
| 826 |
|
| 827 |
# Initialise gr.State
|
|
|
|
|
|
|
|
|
|
|
|
|
| 828 |
state_max_workers = gr.State(4) #max_workers_sl,
|
| 829 |
state_max_retries = gr.State(2) #max_retries_sl,
|
| 830 |
state_tz_hours = gr.State(value=None)
|
|
@@ -925,7 +948,7 @@ def build_interface() -> gr.Blocks:
|
|
| 925 |
#yield [], msg, '', ''
|
| 926 |
#return [], f"Files list cleared.", [], []
|
| 927 |
yield [], msg, None, None
|
| 928 |
-
return [], f"Files list cleared.", None, None
|
| 929 |
|
| 930 |
#hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=hf_login_logout_btn)
|
| 931 |
##unused
|
|
@@ -939,14 +962,14 @@ def build_interface() -> gr.Blocks:
|
|
| 939 |
file_btn.upload(
|
| 940 |
fn=accumulate_files,
|
| 941 |
inputs=[file_btn, uploaded_file_list],
|
| 942 |
-
outputs=[uploaded_file_list, output_textbox, process_button, clear_button]
|
| 943 |
)
|
| 944 |
|
| 945 |
# Event handler for the directory upload button
|
| 946 |
dir_btn.upload(
|
| 947 |
fn=accumulate_files,
|
| 948 |
inputs=[dir_btn, uploaded_file_list],
|
| 949 |
-
outputs=[uploaded_file_list, output_textbox, process_button, clear_button]
|
| 950 |
)
|
| 951 |
|
| 952 |
# Event handler for the "Clear" button
|
|
@@ -966,11 +989,12 @@ def build_interface() -> gr.Blocks:
|
|
| 966 |
## and then use the return value of the function to update the component.
|
| 967 |
## Discarding for now. #//TODO: investigate further.
|
| 968 |
## SMY: Solved: using gr.State
|
|
|
|
| 969 |
inputs_arg = [
|
| 970 |
#pdf_files,
|
| 971 |
##pdf_files_wrap(pdf_files), # wrap pdf_files in a list (if not already)
|
| 972 |
uploaded_file_list,
|
| 973 |
-
files_count, #pdf_files_count,
|
| 974 |
provider_dd,
|
| 975 |
model_tb,
|
| 976 |
hf_provider_dd,
|
|
@@ -1017,10 +1041,12 @@ def build_interface() -> gr.Blocks:
|
|
| 1017 |
|
| 1018 |
##gr.File .upload() event, fire only after a file has been uploaded
|
| 1019 |
# Event handler for the pdf file upload button
|
| 1020 |
-
|
|
|
|
|
|
|
| 1021 |
fn=accumulate_files,
|
| 1022 |
-
inputs=[
|
| 1023 |
-
outputs=[uploaded_file_list, log_output]
|
| 1024 |
)
|
| 1025 |
#inputs_arg[0] = files_upload
|
| 1026 |
btn_pdf_convert.click(
|
|
@@ -1061,7 +1087,7 @@ def build_interface() -> gr.Blocks:
|
|
| 1061 |
|
| 1062 |
btn_pdf_count.click(
|
| 1063 |
fn=get_file_count,
|
| 1064 |
-
inputs=[
|
| 1065 |
outputs=[files_count, log_output]
|
| 1066 |
)
|
| 1067 |
btn_html_count.click(
|
|
|
|
| 2 |
from ast import Interactive
|
| 3 |
import gradio as gr
|
| 4 |
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 5 |
+
import tqdm
|
| 6 |
import asyncio ##future
|
| 7 |
import time
|
| 8 |
|
|
|
|
| 10 |
from typing import Optional, Union #, Dict, List, Any, Tuple
|
| 11 |
|
| 12 |
from huggingface_hub import get_token
|
| 13 |
+
import spaces ##HuggingFace spaces to accelerate GPU support on HF Spaces
|
| 14 |
|
| 15 |
#import file_handler
|
| 16 |
from file_handler import file_utils
|
|
|
|
| 59 |
# pool executor to convert files called by Gradio
|
| 60 |
##SMY: TODO: future: refactor to gradio_process.py and
|
| 61 |
## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math"}
|
| 62 |
+
@spaces.GPU
|
| 63 |
def convert_batch(
|
| 64 |
pdf_files, #: list[str],
|
| 65 |
pdf_files_count: int,
|
|
|
|
| 249 |
time.sleep(0.25)
|
| 250 |
yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
|
| 251 |
|
| 252 |
+
'''# Use progress.tqdm to integrate with the executor map
|
| 253 |
#results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
|
| 254 |
for result_interim in progress.tqdm(
|
| 255 |
iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
|
|
|
|
| 260 |
#yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
|
| 261 |
#progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
|
| 262 |
#progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
|
| 263 |
+
#time.sleep(0.25)'''
|
| 264 |
+
def get_results_pool_map(pdf_files, pdf_files_count, progress2=gr.Progress()):
|
| 265 |
+
#Use progress.tqdm to integrate with the executor map
|
| 266 |
+
#results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
|
| 267 |
+
for result_interim in progress2.tqdm(
|
| 268 |
+
iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
|
| 269 |
+
desc=f"ProcessPoolExecutor: Pooling file conversion ... pool.map",
|
| 270 |
+
total=pdf_files_count):
|
| 271 |
+
results.append(result_interim)
|
| 272 |
+
|
| 273 |
+
# Update the Gradio UI to improve the user experience
|
| 274 |
+
#yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
|
| 275 |
+
progress2((0,len(pdf_files)), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
|
| 276 |
+
#progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
|
| 277 |
+
time.sleep(0.75) #.sleep(0.25)
|
| 278 |
+
|
| 279 |
+
return results
|
| 280 |
+
results = get_results_pool_map(pdf_files, pdf_files_count)
|
| 281 |
+
yield gr.update(interactive=True), f"ProcessPoolExecutor: Got Results from files conversion: [{str(results)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
|
| 282 |
progress((11,16), desc=f"ProcessPoolExecutor: Got Results from files conversion")
|
| 283 |
time.sleep(0.25)
|
| 284 |
except Exception as exc:
|
|
|
|
| 545 |
# Concatenate the new files with the existing ones in the state
|
| 546 |
updated_files = current_state + new_file_paths
|
| 547 |
updated_filenames = [Path(f).name for f in updated_files]
|
| 548 |
+
|
| 549 |
+
updated_files_count = len(updated_files)
|
| 550 |
|
| 551 |
# Return the updated state and a message to the user
|
| 552 |
#file_info = "\n".join(updated_files)
|
|
|
|
| 554 |
#message = f"Accumulated {len(updated_files)} file(s) total.\n\nAll file paths:\n{file_info}"
|
| 555 |
message = f"Accumulated {len(updated_files)} file(s) total: \n{filename_info}"
|
| 556 |
|
| 557 |
+
return updated_files, updated_files_count, message, gr.update(interactive=True), gr.update(interactive=True)
|
| 558 |
|
| 559 |
# with gr.Blocks(title=TITLE) as demo
|
| 560 |
with gr.Blocks(title=TITLE, css=custom_css) as demo:
|
|
|
|
| 706 |
|
| 707 |
logout_status_md = gr.Markdown(visible=True) #visible=False)
|
| 708 |
|
|
|
|
|
|
|
|
|
|
| 709 |
# --- PDF & HTML → Markdown tab ---
|
| 710 |
with gr.Tab(" π PDF & HTML β Markdown"):
|
| 711 |
gr.Markdown(f"#### {DESCRIPTION_PDF_HTML}")
|
|
|
|
| 760 |
with gr.Tab(" π PDF β Markdown (Flag for DEPRECATION)", interactive=False, visible=True): #False
|
| 761 |
gr.Markdown(f"#### {DESCRIPTION_PDF}")
|
| 762 |
|
| 763 |
+
files_upload_pdf_fl = gr.File(
|
| 764 |
label="Upload PDF files",
|
| 765 |
file_count="directory", ## handle directory and files upload #"multiple",
|
| 766 |
type="filepath",
|
|
|
|
| 839 |
label="Conversion Logs",
|
| 840 |
lines=5,
|
| 841 |
#max_lines=25,
|
| 842 |
+
interactive=True, #False
|
| 843 |
+
show_label=False,
|
| 844 |
)
|
| 845 |
|
| 846 |
# Initialise gr.State
|
| 847 |
+
# The gr.State component to hold the accumulated list of files
|
| 848 |
+
uploaded_file_list = gr.State([]) ##NB: initial value of `gr.State` must be able to be deepcopied
|
| 849 |
+
uploaded_files_count = gr.State(0) ## initial files count
|
| 850 |
+
|
| 851 |
state_max_workers = gr.State(4) #max_workers_sl,
|
| 852 |
state_max_retries = gr.State(2) #max_retries_sl,
|
| 853 |
state_tz_hours = gr.State(value=None)
|
|
|
|
| 948 |
#yield [], msg, '', ''
|
| 949 |
#return [], f"Files list cleared.", [], []
|
| 950 |
yield [], msg, None, None
|
| 951 |
+
return [], 0, f"Files list cleared.", None, None
|
| 952 |
|
| 953 |
#hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=hf_login_logout_btn)
|
| 954 |
##unused
|
|
|
|
| 962 |
file_btn.upload(
|
| 963 |
fn=accumulate_files,
|
| 964 |
inputs=[file_btn, uploaded_file_list],
|
| 965 |
+
outputs=[uploaded_file_list, uploaded_files_count, output_textbox, process_button, clear_button]
|
| 966 |
)
|
| 967 |
|
| 968 |
# Event handler for the directory upload button
|
| 969 |
dir_btn.upload(
|
| 970 |
fn=accumulate_files,
|
| 971 |
inputs=[dir_btn, uploaded_file_list],
|
| 972 |
+
outputs=[uploaded_file_list, uploaded_files_count, output_textbox, process_button, clear_button]
|
| 973 |
)
|
| 974 |
|
| 975 |
# Event handler for the "Clear" button
|
|
|
|
| 989 |
## and then use the return value of the function to update the component.
|
| 990 |
## Discarding for now. #//TODO: investigate further.
|
| 991 |
## SMY: Solved: using gr.State
|
| 992 |
+
|
| 993 |
inputs_arg = [
|
| 994 |
#pdf_files,
|
| 995 |
##pdf_files_wrap(pdf_files), # wrap pdf_files in a list (if not already)
|
| 996 |
uploaded_file_list,
|
| 997 |
+
uploaded_files_count, #files_count, #pdf_files_count,
|
| 998 |
provider_dd,
|
| 999 |
model_tb,
|
| 1000 |
hf_provider_dd,
|
|
|
|
| 1041 |
|
| 1042 |
##gr.File .upload() event, fire only after a file has been uploaded
|
| 1043 |
# Event handler for the pdf file upload button
|
| 1044 |
+
##TODO:
|
| 1045 |
+
#outputs=[uploaded_file_list, updated_files_count, output_textbox, process_button, clear_button]
|
| 1046 |
+
files_upload_pdf_fl.upload(
|
| 1047 |
fn=accumulate_files,
|
| 1048 |
+
inputs=[files_upload_pdf_fl, uploaded_file_list],
|
| 1049 |
+
outputs=[uploaded_file_list, uploaded_files_count, log_output, files_upload_pdf_fl, clear_button]
|
| 1050 |
)
|
| 1051 |
#inputs_arg[0] = files_upload
|
| 1052 |
btn_pdf_convert.click(
|
|
|
|
| 1087 |
|
| 1088 |
btn_pdf_count.click(
|
| 1089 |
fn=get_file_count,
|
| 1090 |
+
inputs=[files_upload_pdf_fl],
|
| 1091 |
outputs=[files_count, log_output]
|
| 1092 |
)
|
| 1093 |
btn_html_count.click(
|