semmyk committed on
Commit
04ec8b6
·
1 Parent(s): 7757db2

baseline08_beta0.3.2_02Oct25: HF Spaces GPU support, update requirements.txt ; - fix progress bar ; - minor tweaks/fixes

Browse files
Files changed (2) hide show
  1. requirements.txt +13 -2
  2. ui/gradio_ui.py +44 -18
requirements.txt CHANGED
@@ -1,8 +1,19 @@
1
  gradio>=5.44.0 # gradio[mcp]>=5.44.0
2
  #mcp>=1.15.0 # MCP Python SDK (Model Context Protocol)
 
 
 
 
 
 
 
 
 
 
 
3
  marker-pdf[full]>=1.10.0 # pip install marker (GitHub: https://github.com/datalab-to/marker)
4
  weasyprint>=59.0 # optional fallback if pandoc is not available
5
  #pandoc==2.3 # for Markdown → PDF conversion
6
- python-magic==0.4.27 # file-type detection
7
  #pypdfium2 # Python binding to PDFium for PDF rendering, inspection, manipulation and creation
8
- #huggingface_hub>=0.34.0 # HuggingFace integration
 
 
1
  gradio>=5.44.0 # gradio[mcp]>=5.44.0
2
  #mcp>=1.15.0 # MCP Python SDK (Model Context Protocol)
3
+
4
+ #gradio[mcp]>=5.44.0 # Gradio as MCP Server
5
+ #pydantic>=2.11.7 # dependants: Marker-pdf and Gradio[mcp]
6
+
7
+ ## HF Spaces recommendation: https://huggingface.co/docs/hub/spaces-gpus#frameworks
8
+ space--extra-index-url https://download.pytorch.org/whl/cu113
9
+ #torch
10
+ torch>=2.7.1 # ZeroGPU support
11
+ spaces>=0.42.1 # HF Spaces (default on HF Spaces)
12
+ #huggingface_hub>=0.34.0 # HuggingFace integration
13
+
14
  marker-pdf[full]>=1.10.0 # pip install marker (GitHub: https://github.com/datalab-to/marker)
15
  weasyprint>=59.0 # optional fallback if pandoc is not available
16
  #pandoc==2.3 # for Markdown → PDF conversion
 
17
  #pypdfium2 # Python binding to PDFium for PDF rendering, inspection, manipulation and creation
18
+
19
+ #python-magic==0.4.27 # file-type detection: thin layer over the libmagic C library
ui/gradio_ui.py CHANGED
@@ -2,6 +2,7 @@
2
  from ast import Interactive
3
  import gradio as gr
4
  from concurrent.futures import ProcessPoolExecutor, as_completed
 
5
  import asyncio ##future
6
  import time
7
 
@@ -9,6 +10,7 @@ from pathlib import Path, WindowsPath
9
  from typing import Optional, Union #, Dict, List, Any, Tuple
10
 
11
  from huggingface_hub import get_token
 
12
 
13
  #import file_handler
14
  from file_handler import file_utils
@@ -57,6 +59,7 @@ except Exception as exc:
57
  # pool executor to convert files called by Gradio
58
  ##SMY: TODO: future: refactor to gradio_process.py and
59
  ## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math""}
 
60
  def convert_batch(
61
  pdf_files, #: list[str],
62
  pdf_files_count: int,
@@ -246,7 +249,7 @@ def convert_batch(
246
  time.sleep(0.25)
247
  yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
248
 
249
- # Use progress.tqdm to integrate with the executor map
250
  #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
251
  for result_interim in progress.tqdm(
252
  iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
@@ -257,9 +260,25 @@ def convert_batch(
257
  #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
258
  #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
259
  #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
260
- #time.sleep(0.25)
261
-
262
- yield gr.update(interactive=True), f"ProcessPoolExecutor: Got Results from files conversion: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  progress((11,16), desc=f"ProcessPoolExecutor: Got Results from files conversion")
264
  time.sleep(0.25)
265
  except Exception as exc:
@@ -526,6 +545,8 @@ def build_interface() -> gr.Blocks:
526
  # Concatenate the new files with the existing ones in the state
527
  updated_files = current_state + new_file_paths
528
  updated_filenames = [Path(f).name for f in updated_files]
 
 
529
 
530
  # Return the updated state and a message to the user
531
  #file_info = "\n".join(updated_files)
@@ -533,7 +554,7 @@ def build_interface() -> gr.Blocks:
533
  #message = f"Accumulated {len(updated_files)} file(s) total.\n\nAll file paths:\n{file_info}"
534
  message = f"Accumulated {len(updated_files)} file(s) total: \n{filename_info}"
535
 
536
- return updated_files, message, gr.update(interactive=True), gr.update(interactive=True)
537
 
538
  # with gr.Blocks(title=TITLE) as demo
539
  with gr.Blocks(title=TITLE, css=custom_css) as demo:
@@ -685,9 +706,6 @@ def build_interface() -> gr.Blocks:
685
 
686
  logout_status_md = gr.Markdown(visible=True) #visible=False)
687
 
688
- # The gr.State component to hold the accumulated list of files
689
- uploaded_file_list = gr.State([]) ##NB: initial value of `gr.State` must be able to be deepcopied
690
-
691
  # --- PDF & HTML → Markdown tab ---
692
  with gr.Tab(" πŸ“„ PDF & HTML ➜ Markdown"):
693
  gr.Markdown(f"#### {DESCRIPTION_PDF_HTML}")
@@ -742,7 +760,7 @@ def build_interface() -> gr.Blocks:
742
  with gr.Tab(" πŸ“„ PDF ➜ Markdown (Flag for DEPRECATION)", interactive=False, visible=True): #False
743
  gr.Markdown(f"#### {DESCRIPTION_PDF}")
744
 
745
- files_upload_pdf = gr.File(
746
  label="Upload PDF files",
747
  file_count="directory", ## handle directory and files upload #"multiple",
748
  type="filepath",
@@ -821,10 +839,15 @@ def build_interface() -> gr.Blocks:
821
  label="Conversion Logs",
822
  lines=5,
823
  #max_lines=25,
824
- #interactive=False
 
825
  )
826
 
827
  # Initialise gr.State
 
 
 
 
828
  state_max_workers = gr.State(4) #max_workers_sl,
829
  state_max_retries = gr.State(2) #max_retries_sl,
830
  state_tz_hours = gr.State(value=None)
@@ -925,7 +948,7 @@ def build_interface() -> gr.Blocks:
925
  #yield [], msg, '', ''
926
  #return [], f"Files list cleared.", [], []
927
  yield [], msg, None, None
928
- return [], f"Files list cleared.", None, None
929
 
930
  #hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=hf_login_logout_btn)
931
  ##unused
@@ -939,14 +962,14 @@ def build_interface() -> gr.Blocks:
939
  file_btn.upload(
940
  fn=accumulate_files,
941
  inputs=[file_btn, uploaded_file_list],
942
- outputs=[uploaded_file_list, output_textbox, process_button, clear_button]
943
  )
944
 
945
  # Event handler for the directory upload button
946
  dir_btn.upload(
947
  fn=accumulate_files,
948
  inputs=[dir_btn, uploaded_file_list],
949
- outputs=[uploaded_file_list, output_textbox, process_button, clear_button]
950
  )
951
 
952
  # Event handler for the "Clear" button
@@ -966,11 +989,12 @@ def build_interface() -> gr.Blocks:
966
  ## and then use the return value of the function to update the component.
967
  ## Discarding for now. #//TODO: investigate further.
968
  ## SMY: Solved: using gr.State
 
969
  inputs_arg = [
970
  #pdf_files,
971
  ##pdf_files_wrap(pdf_files), # wrap pdf_files in a list (if not already)
972
  uploaded_file_list,
973
- files_count, #pdf_files_count,
974
  provider_dd,
975
  model_tb,
976
  hf_provider_dd,
@@ -1017,10 +1041,12 @@ def build_interface() -> gr.Blocks:
1017
 
1018
  ##gr.File .upload() event, fire only after a file has been uploaded
1019
  # Event handler for the pdf file upload button
1020
- files_upload_pdf.upload(
 
 
1021
  fn=accumulate_files,
1022
- inputs=[files_upload_pdf, uploaded_file_list],
1023
- outputs=[uploaded_file_list, log_output]
1024
  )
1025
  #inputs_arg[0] = files_upload
1026
  btn_pdf_convert.click(
@@ -1061,7 +1087,7 @@ def build_interface() -> gr.Blocks:
1061
 
1062
  btn_pdf_count.click(
1063
  fn=get_file_count,
1064
- inputs=[files_upload_pdf],
1065
  outputs=[files_count, log_output]
1066
  )
1067
  btn_html_count.click(
 
2
  from ast import Interactive
3
  import gradio as gr
4
  from concurrent.futures import ProcessPoolExecutor, as_completed
5
+ import tqdm
6
  import asyncio ##future
7
  import time
8
 
 
10
  from typing import Optional, Union #, Dict, List, Any, Tuple
11
 
12
  from huggingface_hub import get_token
13
+ import spaces ##HuggingFace spaces to accelerate GPU support on HF Spaces
14
 
15
  #import file_handler
16
  from file_handler import file_utils
 
59
  # pool executor to convert files called by Gradio
60
  ##SMY: TODO: future: refactor to gradio_process.py and
61
  ## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math""}
62
+ @spaces.GPU
63
  def convert_batch(
64
  pdf_files, #: list[str],
65
  pdf_files_count: int,
 
249
  time.sleep(0.25)
250
  yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
251
 
252
+ '''# Use progress.tqdm to integrate with the executor map
253
  #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
254
  for result_interim in progress.tqdm(
255
  iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
 
260
  #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
261
  #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
262
  #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
263
+ #time.sleep(0.25)'''
264
+ def get_results_pool_map(pdf_files, pdf_files_count, progress2=gr.Progress()):
265
+ #Use progress.tqdm to integrate with the executor map
266
+ #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
267
+ for result_interim in progress2.tqdm(
268
+ iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
269
+ desc=f"ProcessPoolExecutor: Pooling file conversion ... pool.map",
270
+ total=pdf_files_count):
271
+ results.append(result_interim)
272
+
273
+ # Update the Gradio UI to improve user-friendly eXperience
274
+ #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
275
+ progress2((0,len(pdf_files)), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
276
+ #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
277
+ time.sleep(0.75) #.sleep(0.25)
278
+
279
+ return results
280
+ results = get_results_pool_map(pdf_files, pdf_files_count)
281
+ yield gr.update(interactive=True), f"ProcessPoolExecutor: Got Results from files conversion: [{str(results)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
282
  progress((11,16), desc=f"ProcessPoolExecutor: Got Results from files conversion")
283
  time.sleep(0.25)
284
  except Exception as exc:
 
545
  # Concatenate the new files with the existing ones in the state
546
  updated_files = current_state + new_file_paths
547
  updated_filenames = [Path(f).name for f in updated_files]
548
+
549
+ updated_files_count = len(updated_files)
550
 
551
  # Return the updated state and a message to the user
552
  #file_info = "\n".join(updated_files)
 
554
  #message = f"Accumulated {len(updated_files)} file(s) total.\n\nAll file paths:\n{file_info}"
555
  message = f"Accumulated {len(updated_files)} file(s) total: \n{filename_info}"
556
 
557
+ return updated_files, updated_files_count, message, gr.update(interactive=True), gr.update(interactive=True)
558
 
559
  # with gr.Blocks(title=TITLE) as demo
560
  with gr.Blocks(title=TITLE, css=custom_css) as demo:
 
706
 
707
  logout_status_md = gr.Markdown(visible=True) #visible=False)
708
 
 
 
 
709
  # --- PDF & HTML → Markdown tab ---
710
  with gr.Tab(" πŸ“„ PDF & HTML ➜ Markdown"):
711
  gr.Markdown(f"#### {DESCRIPTION_PDF_HTML}")
 
760
  with gr.Tab(" πŸ“„ PDF ➜ Markdown (Flag for DEPRECATION)", interactive=False, visible=True): #False
761
  gr.Markdown(f"#### {DESCRIPTION_PDF}")
762
 
763
+ files_upload_pdf_fl = gr.File(
764
  label="Upload PDF files",
765
  file_count="directory", ## handle directory and files upload #"multiple",
766
  type="filepath",
 
839
  label="Conversion Logs",
840
  lines=5,
841
  #max_lines=25,
842
+ interactive=True, #False
843
+ show_label=False,
844
  )
845
 
846
  # Initialise gr.State
847
+ # The gr.State component to hold the accumulated list of files
848
+ uploaded_file_list = gr.State([]) ##NB: initial value of `gr.State` must be able to be deepcopied
849
+ uploaded_files_count = gr.State(0) ## initial files count
850
+
851
  state_max_workers = gr.State(4) #max_workers_sl,
852
  state_max_retries = gr.State(2) #max_retries_sl,
853
  state_tz_hours = gr.State(value=None)
 
948
  #yield [], msg, '', ''
949
  #return [], f"Files list cleared.", [], []
950
  yield [], msg, None, None
951
+ return [], 0, f"Files list cleared.", None, None
952
 
953
  #hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=hf_login_logout_btn)
954
  ##unused
 
962
  file_btn.upload(
963
  fn=accumulate_files,
964
  inputs=[file_btn, uploaded_file_list],
965
+ outputs=[uploaded_file_list, uploaded_files_count, output_textbox, process_button, clear_button]
966
  )
967
 
968
  # Event handler for the directory upload button
969
  dir_btn.upload(
970
  fn=accumulate_files,
971
  inputs=[dir_btn, uploaded_file_list],
972
+ outputs=[uploaded_file_list, uploaded_files_count, output_textbox, process_button, clear_button]
973
  )
974
 
975
  # Event handler for the "Clear" button
 
989
  ## and then use the return value of the function to update the component.
990
  ## Discarding for now. #//TODO: investigate further.
991
  ## SMY: Solved: using gr.State
992
+
993
  inputs_arg = [
994
  #pdf_files,
995
  ##pdf_files_wrap(pdf_files), # wrap pdf_files in a list (if not already)
996
  uploaded_file_list,
997
+ uploaded_files_count, #files_count, #pdf_files_count,
998
  provider_dd,
999
  model_tb,
1000
  hf_provider_dd,
 
1041
 
1042
  ##gr.File .upload() event, fire only after a file has been uploaded
1043
  # Event handler for the pdf file upload button
1044
+ ##TODO:
1045
+ #outputs=[uploaded_file_list, updated_files_count, output_textbox, process_button, clear_button]
1046
+ files_upload_pdf_fl.upload(
1047
  fn=accumulate_files,
1048
+ inputs=[files_upload_pdf_fl, uploaded_file_list],
1049
+ outputs=[uploaded_file_list, uploaded_files_count, log_output, files_upload_pdf_fl, clear_button]
1050
  )
1051
  #inputs_arg[0] = files_upload
1052
  btn_pdf_convert.click(
 
1087
 
1088
  btn_pdf_count.click(
1089
  fn=get_file_count,
1090
+ inputs=[files_upload_pdf_fl],
1091
  outputs=[files_count, log_output]
1092
  )
1093
  btn_html_count.click(