pdf_to_text / app.py
Ranjit89's picture
Update app.py
1248273 verified
import os
import re
import shutil
import tempfile

import cv2
import gradio as gr
import pytesseract
from pdf2image import convert_from_path, pdfinfo_from_path
from tqdm import tqdm
# Directory in which rendered page images are written while a PDF is processed.
TEMP_DIR = "temp_pages"
# Language code passed to Tesseract through pytesseract's `lang` argument.
# NOTE(review): "ben" selects the Bengali model, while the UI describes the
# app as Assamese OCR - confirm whether "asm" traineddata is intended/installed.
TESS_LANG = "ben"
# Collapses any run of whitespace (including newlines) into a single space.
_WHITESPACE_RE = re.compile(r"\s+")

# Translation table that deletes ZERO WIDTH NON-JOINER (U+200C) and
# ZERO WIDTH JOINER (U+200D) from the OCR output.
_ZERO_WIDTH_JOINERS = str.maketrans("", "", "\u200c\u200d")


def _clean_text(text):
    """Remove zero-width joiners and squeeze all whitespace to single spaces."""
    return _WHITESPACE_RE.sub(" ", text.translate(_ZERO_WIDTH_JOINERS)).strip()


def _ocr_page(pdf_path, page_num, work_dir):
    """Render one PDF page at 300 dpi, binarise it and return the raw OCR text.

    Args:
        pdf_path: Path of the PDF on disk.
        page_num: 1-based page number to render.
        work_dir: Existing directory used for the intermediate PNG.

    Raises:
        RuntimeError: If OpenCV cannot load the rendered page image.
    """
    pages = convert_from_path(
        pdf_path,
        dpi=300,
        first_page=page_num,
        last_page=page_num,
        fmt="png",
    )
    img_path = os.path.join(work_dir, f"page_{page_num}.png")
    pages[0].save(img_path, "PNG")
    try:
        img = cv2.imread(img_path)
    finally:
        # The pixels are in memory now (or loading failed); either way the
        # file is no longer needed, so only one page image is on disk at once.
        os.remove(img_path)

    # cv2.imread signals failure by returning None instead of raising.
    if img is None:
        raise RuntimeError(
            f"Could not read rendered image for page {page_num}"
        )

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.fastNlMeansDenoising(gray)
    # Threshold value 0 is ignored: THRESH_OTSU picks the threshold itself.
    _, thresh = cv2.threshold(
        gray,
        0,
        255,
        cv2.THRESH_BINARY + cv2.THRESH_OTSU,
    )
    return pytesseract.image_to_string(
        thresh,
        lang=TESS_LANG,
        config="--oem 1 --psm 3",
    )


def process_pdf(pdf_file):
    """OCR every page of an uploaded PDF and write the text to a file.

    Each page is rendered, preprocessed and recognised one at a time; the
    cleaned text of every page becomes one line of the output file.

    Args:
        pdf_file: The uploaded file. Either an object exposing the path as
            ``.name`` or a plain path string.

    Returns:
        Path of a UTF-8 text file named ``assamese_book.txt`` that holds the
        extracted text, one line per page.

    Raises:
        gr.Error: If no file was supplied.
        RuntimeError: If a rendered page image cannot be loaded.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file.")

    # Accept both file-like wrappers (with .name) and bare path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    total_pages = pdfinfo_from_path(pdf_path)["Pages"]

    # A private sub-directory per call keeps concurrent requests from
    # deleting each other's page images during cleanup.
    os.makedirs(TEMP_DIR, exist_ok=True)
    work_dir = tempfile.mkdtemp(prefix="job_", dir=TEMP_DIR)
    try:
        all_text = [
            _clean_text(_ocr_page(pdf_path, page_num, work_dir))
            for page_num in tqdm(
                range(1, total_pages + 1),
                desc="PDF -> OCR",
                unit="page",
            )
        ]
    finally:
        # Runs on success and on failure so page images never accumulate.
        shutil.rmtree(work_dir, ignore_errors=True)

    # Write into a unique directory so simultaneous requests cannot overwrite
    # each other's result; the download keeps its original file name.
    # NOTE(review): this directory is left in place so the returned file can
    # still be served - confirm the host clears the system temp dir.
    output_dir = tempfile.mkdtemp(prefix="ocr_out_")
    output_txt = os.path.join(output_dir, "assamese_book.txt")
    with open(output_txt, "w", encoding="utf-8") as f:
        f.write("\n".join(all_text))

    return output_txt
# Upload widget: accepts a single file restricted to the .pdf extension.
_pdf_input = gr.File(
    file_types=[".pdf"],
    label='Input PDF: "Israel - Hem Barua.pdf"',
)

# Download widget that exposes the text file path returned by process_pdf.
_text_output = gr.File(label="Download Extracted Text")

# Wire the OCR function to the two widgets.
demo = gr.Interface(
    title="Assamese PDF OCR",
    description="Upload scanned Assamese PDFs and extract text.",
    fn=process_pdf,
    inputs=_pdf_input,
    outputs=_text_output,
)

# Start the web server as soon as this module is executed.
demo.launch()