arasuezofis committed on
Commit
236fc22
·
verified ·
1 Parent(s): 8536ff7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -36
app.py CHANGED
@@ -6,12 +6,12 @@ import io
6
  import os
7
 
8
  # -----------------------
9
- # Set Tesseract data path (HF Spaces)
10
  # -----------------------
11
  os.environ["TESSDATA_PREFIX"] = "/usr/share/tesseract-ocr/5/tessdata/"
12
 
13
  # -----------------------
14
- # Streamlit Page Config
15
  # -----------------------
16
  st.set_page_config(page_title="Image/PDF → Searchable PDF", layout="centered")
17
 
@@ -19,72 +19,60 @@ st.title("📄 Image / PDF to Searchable PDF (OCR)")
19
  st.write(
20
  "Upload an image (PNG/JPG/JPEG) or a PDF. The app will convert it into a searchable PDF using OCR."
21
  )
22
- st.write("Supports English (eng) and Hindi (hin).")
23
 
24
  # -----------------------
25
- # Language Selection
26
  # -----------------------
27
- lang = st.selectbox(
28
- "Select OCR Language",
29
- {
30
- "English": "eng",
31
- "Hindi": "hin",
32
- "English + Hindi": "eng+hin"
33
- }
34
- )
35
 
36
  # -----------------------
37
- # Helper Functions
38
  # -----------------------
 
 
 
39
 
40
- def image_to_searchable_pdf(image_obj: Image.Image, lang_code: str):
41
- """
42
- Convert PIL Image → searchable PDF using Tesseract OCR
43
- """
44
- return pytesseract.image_to_pdf_or_hocr(image_obj, extension="pdf", lang=lang_code)
45
-
46
-
47
- def pdf_to_searchable_pdf(pdf_bytes: bytes, lang_code: str):
48
- """
49
- Convert PDF bytes → searchable PDF page by page
50
- """
51
  pages = convert_from_bytes(pdf_bytes)
52
  final_pdf = io.BytesIO()
53
 
54
  for idx, page in enumerate(pages):
55
- ocred_pdf = pytesseract.image_to_pdf_or_hocr(page, extension='pdf', lang=lang_code)
56
-
57
  if idx == 0:
58
  final_pdf.write(ocred_pdf)
59
  else:
60
- # Remove extra PDF header for subsequent pages
61
  final_pdf.write(ocred_pdf[28:])
62
 
63
  return final_pdf.getvalue()
64
 
65
  # -----------------------
66
- # Streamlit File Upload
67
  # -----------------------
68
-
69
  uploaded_file = st.file_uploader(
70
  "Upload Image or PDF", type=["png", "jpg", "jpeg", "pdf"]
71
  )
72
 
73
  if uploaded_file:
74
- # Read file once to avoid BodyStreamBuffer errors
75
  file_bytes = uploaded_file.getvalue()
76
-
77
- st.info("Processing file… This may take a few seconds.")
78
 
79
  try:
80
- # Handle image files
81
  if uploaded_file.type.startswith("image"):
82
  img = Image.open(io.BytesIO(file_bytes))
83
- result_pdf = image_to_searchable_pdf(img, lang)
84
 
85
- # Handle PDF files
86
  elif uploaded_file.type == "application/pdf":
87
- result_pdf = pdf_to_searchable_pdf(file_bytes, lang)
88
 
89
  else:
90
  st.error("Unsupported file type")
 
6
  import os
7
 
8
  # -----------------------
9
+ # Ensure Tesseract knows where to find traineddata
10
  # -----------------------
11
  os.environ["TESSDATA_PREFIX"] = "/usr/share/tesseract-ocr/5/tessdata/"
12
 
13
  # -----------------------
14
+ # Streamlit page config
15
  # -----------------------
16
  st.set_page_config(page_title="Image/PDF → Searchable PDF", layout="centered")
17
 
 
19
  st.write(
20
  "Upload an image (PNG/JPG/JPEG) or a PDF. The app will convert it into a searchable PDF using OCR."
21
  )
22
+ st.write("Supports English (eng), Hindi (hin), or both.")
23
 
24
  # -----------------------
25
+ # Language selection mapping
26
  # -----------------------
27
+ language_options = {
28
+ "English": "eng",
29
+ "Hindi": "hin",
30
+ "English + Hindi": "eng+hin"
31
+ }
32
+
33
+ selected_lang = st.selectbox("Select OCR Language", list(language_options.keys()))
34
+ lang_code = language_options[selected_lang]
35
 
36
  # -----------------------
37
+ # Helper functions
38
  # -----------------------
39
def image_to_searchable_pdf(image_obj: Image.Image, lang: str):
    """Run Tesseract OCR on a PIL image and return the PDF output.

    Args:
        image_obj: The image to OCR.
        lang: Tesseract language code(s), e.g. "eng" or "eng+hin".

    Returns:
        Whatever ``pytesseract.image_to_pdf_or_hocr`` produces for the
        ``pdf`` extension.
    """
    ocr_options = {"extension": "pdf", "lang": lang}
    searchable_pdf = pytesseract.image_to_pdf_or_hocr(image_obj, **ocr_options)
    return searchable_pdf
42
 
43
def pdf_to_searchable_pdf(pdf_bytes: bytes, lang: str):
    """Convert PDF bytes → one multi-page searchable PDF.

    The input is rasterised page by page with ``convert_from_bytes``; the
    page images are then handed to Tesseract as a single batch so that
    Tesseract itself writes one well-formed multi-page PDF.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        lang: Tesseract language code(s), e.g. "eng" or "eng+hin".

    Returns:
        The searchable PDF produced by Tesseract, or ``b""`` when the input
        yields no pages.
    """
    # Function-scope import: nothing else in the file needs it.
    import tempfile

    pages = convert_from_bytes(pdf_bytes)
    if not pages:
        # Matches the previous result for an input with no pages.
        return b""

    # Why not OCR each page separately and concatenate the results?
    # Every per-page PDF is a complete document with its own xref table and
    # trailer holding absolute byte offsets, so gluing them together (even
    # after trimming a fixed-size header) does not yield a valid PDF.
    # Tesseract accepts a text file listing image paths and emits a single
    # multi-page PDF for it, which avoids any manual merging.
    with tempfile.TemporaryDirectory() as workdir:
        page_paths = []
        for idx, page in enumerate(pages):
            page_path = os.path.join(workdir, f"page_{idx:05d}.png")
            # Carry over DPI metadata when the image has any.
            dpi = page.info.get("dpi")
            save_kwargs = {"dpi": dpi} if dpi else {}
            page.save(page_path, format="PNG", **save_kwargs)
            page_paths.append(page_path)

        list_path = os.path.join(workdir, "pages.txt")
        with open(list_path, "w", encoding="utf-8") as list_file:
            list_file.write("\n".join(page_paths) + "\n")

        # A str argument is treated by pytesseract as an input file path and
        # passed straight to the tesseract binary.
        return pytesseract.image_to_pdf_or_hocr(
            list_path, extension="pdf", lang=lang
        )
57
 
58
  # -----------------------
59
+ # File uploader
60
  # -----------------------
 
61
  uploaded_file = st.file_uploader(
62
  "Upload Image or PDF", type=["png", "jpg", "jpeg", "pdf"]
63
  )
64
 
65
  if uploaded_file:
 
66
  file_bytes = uploaded_file.getvalue()
67
+ st.info("Processing file… This may take a few seconds…")
 
68
 
69
  try:
 
70
  if uploaded_file.type.startswith("image"):
71
  img = Image.open(io.BytesIO(file_bytes))
72
+ result_pdf = image_to_searchable_pdf(img, lang_code)
73
 
 
74
  elif uploaded_file.type == "application/pdf":
75
+ result_pdf = pdf_to_searchable_pdf(file_bytes, lang_code)
76
 
77
  else:
78
  st.error("Unsupported file type")