Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import shutil | |
| import cv2 | |
| import pytesseract | |
| import gradio as gr | |
| from pdf2image import convert_from_path, pdfinfo_from_path | |
| from tqdm import tqdm | |
# Directory that holds the temporary per-page PNG files while a PDF is
# being processed by process_pdf.
| TEMP_DIR = "temp_pages" | |
# Language code passed to pytesseract as ``lang``. "ben" is Tesseract's
# Bengali model -- presumably used because Assamese shares the script;
# NOTE(review): confirm whether the dedicated "asm" model would be better.
| TESS_LANG = "ben" | |
def process_pdf(pdf_file):
    """Run OCR over every page of an uploaded PDF and save the text to a file.

    Each page is rendered at 300 DPI, written as a PNG under ``TEMP_DIR``,
    read back with OpenCV, converted to grayscale, denoised, binarized with
    Otsu's threshold and passed to Tesseract using ``TESS_LANG``. For each
    page the zero-width joiner / non-joiner characters are removed and every
    run of whitespace is collapsed to one space; pages are then joined with
    newlines, so the output has one line per page.

    Args:
        pdf_file: The upload handed over by the ``gr.File`` input: either an
            object whose ``.name`` attribute is the path on disk, or the
            path itself as a string.

    Returns:
        Path of the UTF-8 text file (``"assamese_book.txt"``) holding the
        extracted text.

    Raises:
        gr.Error: If no file was supplied.
        RuntimeError: If a rendered page image cannot be read back.
    """
    OUTPUT_TXT = "assamese_book.txt"

    if pdf_file is None:
        # Surfaces a readable message in the UI instead of an AttributeError.
        raise gr.Error("Please upload a PDF file.")

    # Accept both file-like uploads (``.name``) and plain string paths.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # -----------------------------------
    # GET TOTAL PAGES
    # -----------------------------------
    total_pages = pdfinfo_from_path(pdf_path)["Pages"]

    all_text = []

    # Loop invariants, built once instead of once per page.
    # NOTE(review): ZWJ/ZWNJ can be orthographically significant in
    # Bengali-Assamese script; stripping is kept as in the original code --
    # confirm this is intended.
    zero_width = {0x200C: None, 0x200D: None}
    whitespace_run = re.compile(r"\s+")

    os.makedirs(TEMP_DIR, exist_ok=True)
    try:
        # -----------------------------------
        # PROCESS PAGES
        # -----------------------------------
        # One page at a time keeps memory flat for long books.
        for page_num in tqdm(
            range(1, total_pages + 1),
            desc="PDF -> OCR",
            unit="page",
        ):
            pages = convert_from_path(
                pdf_path,
                dpi=300,
                first_page=page_num,
                last_page=page_num,
                fmt="png",
            )
            page = pages[0]

            img_path = os.path.join(TEMP_DIR, f"page_{page_num}.png")
            page.save(img_path, "PNG")

            # -----------------------------------
            # PREPROCESS
            # -----------------------------------
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals failure by returning None, not raising.
                raise RuntimeError(
                    f"Could not read rendered page image: {img_path}"
                )

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            gray = cv2.fastNlMeansDenoising(gray)
            _, thresh = cv2.threshold(
                gray,
                0,
                255,
                cv2.THRESH_BINARY + cv2.THRESH_OTSU,
            )

            # -----------------------------------
            # OCR
            # -----------------------------------
            text = pytesseract.image_to_string(
                thresh,
                lang=TESS_LANG,
                config="--oem 1 --psm 3",
            )

            # -----------------------------------
            # CLEAN
            # -----------------------------------
            text = text.translate(zero_width)
            text = whitespace_run.sub(" ", text).strip()
            all_text.append(text)

            # Delete the page image right away to keep disk usage low.
            os.remove(img_path)
    finally:
        # Always remove the scratch directory, including after a failure
        # part-way through the book.
        shutil.rmtree(TEMP_DIR, ignore_errors=True)

    # -----------------------------------
    # SAVE TEXT
    # -----------------------------------
    with open(OUTPUT_TXT, "w", encoding="utf-8") as f:
        f.write("\n".join(all_text))

    return OUTPUT_TXT
# Gradio front end: one PDF upload in, one downloadable text file out.
_pdf_input = gr.File(
    label='Input PDF: "Israel - Hem Barua.pdf"',
    file_types=[".pdf"],
)
_text_output = gr.File(label="Download Extracted Text")

demo = gr.Interface(
    fn=process_pdf,
    inputs=_pdf_input,
    outputs=_text_output,
    title="Assamese PDF OCR",
    description="Upload scanned Assamese PDFs and extract text.",
)

# Start the web app as soon as the module is executed.
demo.launch()