Spaces:

lolhaha002
/

Pdf-Extractor

Sleeping

App Files Files Community

Pdf-Extractor / app.py

lolhaha002

Update app.py

27dca6d verified 3 months ago

raw

history blame contribute delete

2.21 kB

	import gradio as gr
	from pdf2image import convert_from_path
	from PIL import Image
	import pytesseract

	# Tesseract language spec handed to pytesseract as `lang=`: Gujarati plus English
	# ('+' combines languages). 'eng' was added so that English words mixed into the
	# Gujarati text, like "Mean" or "Statistics", are recognised as well.
	OCR_LANG = "guj+eng"

	def extract_gujarati_text(pdf_file, page_number):
	    """Render one page of an uploaded PDF and run OCR on it.

	    Args:
	        pdf_file: The uploaded PDF, given either as a path string or as an
	            object whose ``name`` attribute holds the path. ``None`` means
	            nothing has been uploaded yet.
	        page_number: 1-based page to extract. Whole-number floats such as
	            ``2.0`` are accepted; ``None``, fractional, non-numeric and
	            values below 1 are rejected with a message.

	    Returns:
	        A ``(text, image)`` tuple holding the recognised text and the
	        rendered page image. On any failure ``text`` is a human-readable
	        message and ``image`` is ``None``; no exception is raised.
	    """
	    if pdf_file is None:
	        return "Please upload a PDF file first.", None

	    # The number box can deliver None (cleared field) or a float, so
	    # normalise to an int before it reaches pdf2image.
	    try:
	        page = int(page_number)
	    except (TypeError, ValueError, OverflowError):
	        page = None
	    if page is None or page < 1 or page != page_number:
	        return "Please enter a whole page number (1 or greater).", None

	    try:
	        # Accept both a plain path string and a file-like wrapper with .name.
	        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

	        # Convert only the requested PDF page to an image.
	        images = convert_from_path(pdf_path, first_page=page, last_page=page)
	        if not images:
	            # Nothing was rendered: the requested page lies beyond the
	            # end of the document.
	            return f"Page {page} does not exist in this PDF.", None
	        image = images[0]

	        # --oem 3: default OCR engine mode; --psm 3: fully automatic page
	        # segmentation, chosen for block-heavy textbook layouts.
	        custom_config = r'--oem 3 --psm 3'

	        text = pytesseract.image_to_string(image, lang=OCR_LANG, config=custom_config)
	        return text, image

	    except Exception as e:
	        # UI boundary: report the failure in the text box instead of raising.
	        return f"Error: {e}", None

	# Custom stylesheet for the Gradio app. It pins the element with id `img_out` and
	# the textarea inside the element with id `txt_out` to 80% of the viewport height,
	# so the page image and the text box are roughly the same height for easy
	# comparison, and removes the container's minimum height.
	css = """
	.gradio-container {min-height: 0px !important;}
	#img_out {height: 80vh !important;}
	#txt_out textarea {height: 80vh !important;}
	"""

	# Assemble the UI: upload controls on top, then the rendered page and the OCR
	# text side by side so they can be compared.
	# NOTE(review): the nesting below was reconstructed from a copy whose
	# indentation had been flattened -- confirm the button's placement.
	with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
	    gr.Markdown("## 📚 Side-by-Side Gujarati OCR (Textbook Mode)")

	    # Input Section at the top
	    with gr.Row(variant="panel"):
	        with gr.Column(scale=1):
	            pdf = gr.File(label="1. Upload PDF", file_types=[".pdf"])
	        with gr.Column(scale=1):
	            # precision=0 makes Gradio round the value and hand the callback
	            # an int, which is what the PDF page selection expects.
	            page = gr.Number(
	                label="2. Page Number",
	                minimum=1,
	                value=1,
	                step=1,
	                precision=0,
	            )
	            button = gr.Button("3. Extract Text & Compare", variant="primary", size="lg")

	    # Output Section: Side-by-Side Comparison
	    with gr.Row():
	        with gr.Column(scale=1):
	            # Left: Original Image
	            image_output = gr.Image(label="📄 Original Page Snapshot", type="pil", elem_id="img_out")

	        with gr.Column(scale=1):
	            # Right: Extracted Text
	            text_output = gr.Textbox(
	                label="📝 Extracted Text (Editable)",
	                elem_id="txt_out",
	                show_copy_button=True,
	                interactive=True,
	            )

	    # Event listeners must be registered inside the Blocks context.
	    # The callback returns (text, image), matching the order of `outputs`.
	    button.click(fn=extract_gujarati_text, inputs=[pdf, page], outputs=[text_output, image_output])

	demo.launch()