arasuezofis committed on
Commit
8536ff7
·
verified ·
1 Parent(s): 4d1900c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -2
app.py CHANGED
@@ -3,6 +3,12 @@ import pytesseract
3
  from pdf2image import convert_from_bytes
4
  from PIL import Image
5
  import io
 
 
 
 
 
 
6
 
7
  # -----------------------
8
  # Streamlit Page Config
@@ -46,7 +52,7 @@ def pdf_to_searchable_pdf(pdf_bytes: bytes, lang_code: str):
46
  final_pdf = io.BytesIO()
47
 
48
  for idx, page in enumerate(pages):
49
- ocred_pdf = pytesseract.image_to_pdf_or_hocr(page, extension="pdf", lang=lang_code)
50
 
51
  if idx == 0:
52
  final_pdf.write(ocred_pdf)
@@ -56,7 +62,6 @@ def pdf_to_searchable_pdf(pdf_bytes: bytes, lang_code: str):
56
 
57
  return final_pdf.getvalue()
58
 
59
-
60
  # -----------------------
61
  # Streamlit File Upload
62
  # -----------------------
 
3
  from pdf2image import convert_from_bytes
4
  from PIL import Image
5
  import io
6
+ import os
7
+
8
+ # -----------------------
9
+ # Set Tesseract data path (HF Spaces)
10
+ # -----------------------
11
+ os.environ["TESSDATA_PREFIX"] = "/usr/share/tesseract-ocr/5/tessdata/"
12
 
13
  # -----------------------
14
  # Streamlit Page Config
 
52
  final_pdf = io.BytesIO()
53
 
54
  for idx, page in enumerate(pages):
55
+ ocred_pdf = pytesseract.image_to_pdf_or_hocr(page, extension='pdf', lang=lang_code)
56
 
57
  if idx == 0:
58
  final_pdf.write(ocred_pdf)
 
62
 
63
  return final_pdf.getvalue()
64
 
 
65
  # -----------------------
66
  # Streamlit File Upload
67
  # -----------------------