Spaces:

Ranjit89
/

pdf_to_text

Sleeping

App Files Files Community

pdf_to_text / app.py

Ranjit89

Update app.py

1248273 verified 8 days ago

raw

history blame contribute delete

2.69 kB

	import os
	import re
	import shutil
	import cv2
	import pytesseract
	import gradio as gr

	from pdf2image import convert_from_path, pdfinfo_from_path
	from tqdm import tqdm


	# Scratch directory where process_pdf writes each rendered page image before
	# OCR; process_pdf removes the whole directory when it is done.
	TEMP_DIR = "temp_pages"
	# Language code handed to pytesseract.image_to_string through its lang= argument.
	# NOTE(review): the UI describes the input as Assamese while this value is
	# "ben" — presumably deliberate (shared script / installed model); confirm.
	TESS_LANG = "ben"


	def _clean_ocr_text(text):
	    """Normalize the raw OCR output of one page into a single line.

	    Removes zero-width non-joiner (U+200C) and zero-width joiner (U+200D)
	    characters, collapses every run of whitespace (newlines included) into
	    one space, and trims both ends.
	    """
	    text = text.replace("\u200c", "").replace("\u200d", "")
	    return re.sub(r"\s+", " ", text).strip()


	def _binarize_page_image(img_path):
	    """Load a page image and return it as a denoised black/white image.

	    Args:
	        img_path: Path of the rendered page image on disk.

	    Returns:
	        The thresholded image produced by ``cv2.threshold``.

	    Raises:
	        RuntimeError: if OpenCV could not read ``img_path``.
	    """
	    img = cv2.imread(img_path)
	    if img is None:
	        # cv2.imread reports an unreadable file by returning None rather
	        # than raising, which would otherwise fail obscurely in cvtColor.
	        raise RuntimeError(f"Could not read rendered page image: {img_path}")

	    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
	    gray = cv2.fastNlMeansDenoising(gray)
	    _, thresh = cv2.threshold(
	        gray,
	        0,
	        255,
	        cv2.THRESH_BINARY + cv2.THRESH_OTSU,
	    )
	    return thresh


	def process_pdf(pdf_file):
	    """OCR every page of an uploaded PDF and save the text to a file.

	    Each page is rendered at 300 DPI, written as a temporary PNG under
	    ``TEMP_DIR``, binarized, and passed to Tesseract using ``TESS_LANG``.
	    The cleaned text of each page becomes one line of the output file.
	    ``TEMP_DIR`` is removed afterwards, even when processing fails.

	    Args:
	        pdf_file: The uploaded PDF, either an object exposing its path as
	            ``.name`` or a plain path string.

	    Returns:
	        Path of the UTF-8 text file holding the extracted text.

	    Raises:
	        gr.Error: if no file was provided.
	        RuntimeError: if a rendered page image cannot be read back.
	    """
	    if pdf_file is None:
	        raise gr.Error("Please upload a PDF file first.")

	    output_path = "assamese_book.txt"

	    # Accept both an object carrying the path in ``.name`` and a bare path.
	    pdf_path = getattr(pdf_file, "name", pdf_file)

	    os.makedirs(TEMP_DIR, exist_ok=True)
	    try:
	        total_pages = pdfinfo_from_path(pdf_path)["Pages"]
	        all_text = []

	        for page_num in tqdm(
	            range(1, total_pages + 1),
	            desc="PDF -> OCR",
	            unit="page",
	        ):
	            # Render a single page per call so only one 300-DPI image is
	            # held at a time.
	            pages = convert_from_path(
	                pdf_path,
	                dpi=300,
	                first_page=page_num,
	                last_page=page_num,
	                fmt="png",
	            )

	            img_path = os.path.join(TEMP_DIR, f"page_{page_num}.png")
	            pages[0].save(img_path, "PNG")

	            thresh = _binarize_page_image(img_path)
	            # The pixels are in memory now; drop the file right away to
	            # keep disk usage flat across long documents.
	            os.remove(img_path)

	            text = pytesseract.image_to_string(
	                thresh,
	                lang=TESS_LANG,
	                config="--oem 1 --psm 3",
	            )
	            all_text.append(_clean_ocr_text(text))
	    finally:
	        # Always clear the scratch directory, including leftovers from a
	        # page that failed midway.
	        shutil.rmtree(TEMP_DIR, ignore_errors=True)

	    with open(output_path, "w", encoding="utf-8") as f:
	        f.write("\n".join(all_text))

	    return output_path


	# Upload widget that only accepts PDF files.
	pdf_upload = gr.File(
	    file_types=[".pdf"],
	    label='Input PDF: "Israel - Hem Barua.pdf"',
	)

	# Download widget that serves the file path returned by process_pdf.
	text_download = gr.File(label="Download Extracted Text")

	# Single-function UI: one PDF in, one text file out.
	demo = gr.Interface(
	    fn=process_pdf,
	    inputs=pdf_upload,
	    outputs=text_download,
	    title="Assamese PDF OCR",
	    description="Upload scanned Assamese PDFs and extract text.",
	)

	# Start serving as soon as this file is executed.
	demo.launch()