"""Gradio app that OCRs a scanned PDF page by page and returns the text file."""

import os
import re
import shutil
import tempfile

import cv2
import gradio as gr
import pytesseract
from pdf2image import convert_from_path, pdfinfo_from_path
from tqdm import tqdm

# Name prefix of the per-request scratch directory that holds page images.
TEMP_DIR = "temp_pages"

# Tesseract language code passed to every OCR call.
# NOTE(review): "ben" selects the Bengali model; if an Assamese ("asm")
# traineddata file is installed it may recognise Assamese-specific letters
# better -- confirm before changing.
TESS_LANG = "ben"

# File name of the downloadable result.
OUTPUT_TXT = "assamese_book.txt"

# Rasterisation resolution and Tesseract engine/page-segmentation options.
PAGE_DPI = 300
TESS_CONFIG = "--oem 1 --psm 3"

# Built once at import time instead of once per page.
_ZERO_WIDTH_CHARS = str.maketrans("", "", "\u200c\u200d")
_WHITESPACE_RUN = re.compile(r"\s+")


def _resolve_pdf_path(pdf_file):
    """Return the filesystem path of the uploaded PDF.

    Accepts either a path (``str`` / ``os.PathLike``) or an object exposing
    the path as ``.name``, so the callback works whichever form the
    ``gr.File`` input delivers.

    Raises:
        gr.Error: if no file was supplied.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file first.")
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def _clean_text(text):
    """Drop ZWNJ/ZWJ characters and collapse whitespace runs to one space."""
    text = text.translate(_ZERO_WIDTH_CHARS)
    return _WHITESPACE_RUN.sub(" ", text).strip()


def _ocr_page(pdf_path, page_num, work_dir):
    """Rasterise one PDF page, binarise it, and return its cleaned OCR text.

    Args:
        pdf_path: path of the source PDF.
        page_num: 1-based page number to process.
        work_dir: existing directory used for the temporary page image.

    Raises:
        RuntimeError: if the rendered page image cannot be read back.
    """
    # Convert a single page at a time so only one page image is in memory.
    pages = convert_from_path(
        pdf_path,
        dpi=PAGE_DPI,
        first_page=page_num,
        last_page=page_num,
        fmt="png",
    )

    img_path = os.path.join(work_dir, f"page_{page_num}.png")
    pages[0].save(img_path, "PNG")
    try:
        img = cv2.imread(img_path)
    finally:
        # The pixels are now in memory (or the read failed); either way the
        # file is no longer needed, so at most one page image is on disk.
        os.remove(img_path)

    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise RuntimeError(f"Could not read rendered image for page {page_num}")

    # Preprocess: grayscale -> denoise -> Otsu binarisation.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.fastNlMeansDenoising(gray)
    _, thresh = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    text = pytesseract.image_to_string(
        thresh, lang=TESS_LANG, config=TESS_CONFIG
    )
    return _clean_text(text)


def process_pdf(pdf_file):
    """OCR every page of a PDF and return the path of the resulting text file.

    Each call works in its own scratch directory and writes its own output
    file, so simultaneous requests cannot delete or overwrite each other's
    files. The scratch directory is removed even when a page fails.

    Args:
        pdf_file: the uploaded PDF, as a path or an object with ``.name``.

    Returns:
        Path of a UTF-8 text file named ``OUTPUT_TXT`` holding one line of
        cleaned text per page, in page order.

    Raises:
        gr.Error: if no file was supplied.
        RuntimeError: if a rendered page image cannot be read back.
    """
    pdf_path = _resolve_pdf_path(pdf_file)
    total_pages = pdfinfo_from_path(pdf_path)["Pages"]

    work_dir = tempfile.mkdtemp(prefix=f"{TEMP_DIR}_")
    try:
        all_text = [
            _ocr_page(pdf_path, page_num, work_dir)
            for page_num in tqdm(
                range(1, total_pages + 1),
                desc="PDF -> OCR",
                unit="page",
            )
        ]
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)

    # A fresh directory per call keeps the download's file name stable while
    # giving every request a distinct path. It is not removed here because
    # the caller still has to read the returned file.
    output_dir = tempfile.mkdtemp(prefix="ocr_output_")
    output_path = os.path.join(output_dir, OUTPUT_TXT)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(all_text))

    return output_path


demo = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(
        label='Input PDF: "Israel - Hem Barua.pdf"',
        file_types=[".pdf"],
    ),
    outputs=gr.File(label="Download Extracted Text"),
    title="Assamese PDF OCR",
    description="Upload scanned Assamese PDFs and extract text.",
)

demo.launch()