Marthee committed on
Commit
13fb83e
·
verified ·
1 Parent(s): 756ad9e

Update InitialMarkups.py

Browse files
Files changed (1) hide show
  1. InitialMarkups.py +7 -10
InitialMarkups.py CHANGED
@@ -6,10 +6,9 @@ Automatically generated by Colab.
6
  Original file is located at
7
  https://colab.research.google.com/drive/12XfVkmKmN3oVjHhLVE0_GgkftgArFEK2
8
  """
 
9
 
10
- pip install pymupdf
11
 
12
- pip install fuzzywuzzy
13
 
14
  from io import BytesIO
15
  import re
@@ -600,7 +599,7 @@ def same_start_word(s1, s2):
600
  return words1[0].lower() == words2[0].lower()
601
  return False
602
 
603
- baselink='https://marthee-nbslink.hf.space/view-pdf?'
604
  def extract_section_under_header(pdf_path):
605
  top_margin = 70
606
  bottom_margin = 50
@@ -968,11 +967,9 @@ def extract_section_under_header(pdf_path):
968
  page_highlights[page_num] = bbox
969
  highlight_boxes(docHighlights, page_highlights)
970
 
971
- docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
972
- return json_output
973
-
974
- pdflink='https://www.dropbox.com/scl/fi/jtffcxszwpcnc6wdo61p6/WH007-JAC-RP-XX-SP-AA-8501-Redoak-Pump-House-Specification.pdf?rlkey=unq4ag9eajezv2j6y6ewkkk5u&e=29&st=wu3vsd70&dl=0'
975
-
976
- jsonOutput=extract_section_under_header(pdflink)
977
- print(jsonOutput)
978
 
 
 
 
 
 
6
  Original file is located at
7
  https://colab.research.google.com/drive/12XfVkmKmN3oVjHhLVE0_GgkftgArFEK2
8
  """
9
+ baselink='https://find-initialmarkups.hf.space/view-pdf?'
10
 
 
11
 
 
12
 
13
  from io import BytesIO
14
  import re
 
599
  return words1[0].lower() == words2[0].lower()
600
  return False
601
 
602
+
603
  def extract_section_under_header(pdf_path):
604
  top_margin = 70
605
  bottom_margin = 50
 
967
  page_highlights[page_num] = bbox
968
  highlight_boxes(docHighlights, page_highlights)
969
 
970
+ # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True)
 
 
 
 
 
 
971
 
972
+ pdf_bytes = BytesIO()
973
+ docHighlights.save(pdf_bytes)
974
+ print('JSONN',json_output)
975
+ return pdf_bytes.getvalue(), docHighlights , json_output