arasuezofis committed on
Commit
ba1c3af
·
verified ·
1 Parent(s): 236fc22

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -10
app.py CHANGED
@@ -4,9 +4,10 @@ from pdf2image import convert_from_bytes
4
  from PIL import Image
5
  import io
6
  import os
 
7
 
8
  # -----------------------
9
- # Ensure Tesseract knows where to find traineddata
10
  # -----------------------
11
  os.environ["TESSDATA_PREFIX"] = "/usr/share/tesseract-ocr/5/tessdata/"
12
 
@@ -36,23 +37,25 @@ lang_code = language_options[selected_lang]
36
  # -----------------------
37
  # Helper functions
38
  # -----------------------
 
39
  def image_to_searchable_pdf(image_obj: Image.Image, lang: str):
40
  """Convert PIL Image → searchable PDF"""
41
  return pytesseract.image_to_pdf_or_hocr(image_obj, extension="pdf", lang=lang)
42
 
43
  def pdf_to_searchable_pdf(pdf_bytes: bytes, lang: str):
44
- """Convert PDF bytes → searchable PDF page by page"""
45
  pages = convert_from_bytes(pdf_bytes)
46
- final_pdf = io.BytesIO()
47
 
48
- for idx, page in enumerate(pages):
49
- ocred_pdf = pytesseract.image_to_pdf_or_hocr(page, extension="pdf", lang=lang)
50
- if idx == 0:
51
- final_pdf.write(ocred_pdf)
52
- else:
53
- # Remove repeated PDF header
54
- final_pdf.write(ocred_pdf[28:])
55
 
 
 
56
  return final_pdf.getvalue()
57
 
58
  # -----------------------
 
4
  from PIL import Image
5
  import io
6
  import os
7
+ from PyPDF2 import PdfReader, PdfWriter
8
 
9
  # -----------------------
10
+ # Set Tesseract data path
11
  # -----------------------
12
  # Export TESSDATA_PREFIX so Tesseract looks for its traineddata files in this
  # directory. The path is hard-coded.
  # NOTE(review): the "5" path segment presumably tracks the installed Tesseract
  # major version -- confirm it matches the runtime image.
  os.environ["TESSDATA_PREFIX"] = "/usr/share/tesseract-ocr/5/tessdata/"
13
 
 
37
  # -----------------------
38
  # Helper functions
39
  # -----------------------
40
+
41
  def image_to_searchable_pdf(image_obj: Image.Image, lang: str):
42
  """Convert PIL Image → searchable PDF"""
43
  return pytesseract.image_to_pdf_or_hocr(image_obj, extension="pdf", lang=lang)
44
 
45
  def pdf_to_searchable_pdf(pdf_bytes: bytes, lang: str):
46
+ """Convert multi-page PDF → single searchable PDF"""
47
  pages = convert_from_bytes(pdf_bytes)
48
+ pdf_writer = PdfWriter()
49
 
50
+ for page in pages:
51
+ # OCR each page
52
+ ocred_pdf_bytes = pytesseract.image_to_pdf_or_hocr(page, extension="pdf", lang=lang)
53
+ reader = PdfReader(io.BytesIO(ocred_pdf_bytes))
54
+ for p in reader.pages:
55
+ pdf_writer.add_page(p)
 
56
 
57
+ final_pdf = io.BytesIO()
58
+ pdf_writer.write(final_pdf)
59
  return final_pdf.getvalue()
60
 
61
  # -----------------------