Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,6 +3,12 @@ import pytesseract
|
|
| 3 |
from pdf2image import convert_from_bytes
|
| 4 |
from PIL import Image
|
| 5 |
import io
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
# -----------------------
|
| 8 |
# Streamlit Page Config
|
|
@@ -46,7 +52,7 @@ def pdf_to_searchable_pdf(pdf_bytes: bytes, lang_code: str):
|
|
| 46 |
final_pdf = io.BytesIO()
|
| 47 |
|
| 48 |
for idx, page in enumerate(pages):
|
| 49 |
-
ocred_pdf = pytesseract.image_to_pdf_or_hocr(page, extension=
|
| 50 |
|
| 51 |
if idx == 0:
|
| 52 |
final_pdf.write(ocred_pdf)
|
|
@@ -56,7 +62,6 @@ def pdf_to_searchable_pdf(pdf_bytes: bytes, lang_code: str):
|
|
| 56 |
|
| 57 |
return final_pdf.getvalue()
|
| 58 |
|
| 59 |
-
|
| 60 |
# -----------------------
|
| 61 |
# Streamlit File Upload
|
| 62 |
# -----------------------
|
|
|
|
| 3 |
from pdf2image import convert_from_bytes
|
| 4 |
from PIL import Image
|
| 5 |
import io
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
# -----------------------
|
| 9 |
+
# Set Tesseract data path (HF Spaces)
|
| 10 |
+
# -----------------------
|
| 11 |
+
os.environ["TESSDATA_PREFIX"] = "/usr/share/tesseract-ocr/5/tessdata/"
|
| 12 |
|
| 13 |
# -----------------------
|
| 14 |
# Streamlit Page Config
|
|
|
|
| 52 |
final_pdf = io.BytesIO()
|
| 53 |
|
| 54 |
for idx, page in enumerate(pages):
|
| 55 |
+
ocred_pdf = pytesseract.image_to_pdf_or_hocr(page, extension='pdf', lang=lang_code)
|
| 56 |
|
| 57 |
if idx == 0:
|
| 58 |
final_pdf.write(ocred_pdf)
|
|
|
|
| 62 |
|
| 63 |
return final_pdf.getvalue()
|
| 64 |
|
|
|
|
| 65 |
# -----------------------
|
| 66 |
# Streamlit File Upload
|
| 67 |
# -----------------------
|