Spaces:

lolhaha002
/

Pdf-Extractor

Sleeping

Pdf-Extractor / app(backup).py

Upload app(backup).py

aab000f verified 3 months ago

981 Bytes

	import gradio as gr
	from pdf2image import convert_from_path
	from PIL import Image
	import pytesseract

	# Language code handed to pytesseract as the `lang` argument for OCR.
	OCR_LANG = "guj"

	def extract_gujarati_text(pdf_file, page_number):
	    """Run OCR on a single page of an uploaded PDF.

	    Args:
	        pdf_file: The uploaded PDF, either an object exposing its path as
	            ``.name`` or a plain path string.
	        page_number: 1-based page to process. It is converted with
	            ``int()`` before being handed to pdf2image, so a float such as
	            ``1.0`` is accepted.

	    Returns:
	        A ``(text, image)`` tuple: the OCR text first, then the rendered
	        page image.

	    Raises:
	        gr.Error: If no file was uploaded, the page number is missing or
	            below 1, or the PDF has no such page.
	    """
	    if pdf_file is None:
	        raise gr.Error("Please upload a PDF file first.")
	    if page_number is None:
	        raise gr.Error("Please enter a page number.")

	    # The number widget may deliver a float; pdf2image needs a real int.
	    page = int(page_number)
	    if page < 1:
	        raise gr.Error("Page number must be 1 or greater.")

	    # Accept both file-like wrappers (path in `.name`) and plain path strings.
	    pdf_path = getattr(pdf_file, "name", pdf_file)

	    images = convert_from_path(pdf_path, first_page=page, last_page=page)
	    if not images:
	        # Nothing was rendered, i.e. the requested page is out of range.
	        raise gr.Error(f"Page {page} does not exist in this PDF.")

	    image = images[0]
	    text = pytesseract.image_to_string(image, lang=OCR_LANG)
	    return text, image

	# Build the UI: upload + page selector on top, page snapshot and OCR text below.
	with gr.Blocks() as demo:
	    gr.Markdown("## 📚 Gujarati OCR from PDF (with Page Snapshot)")

	    pdf = gr.File(label="📤 Upload Gujarati PDF", file_types=[".pdf"])
	    # precision=0 makes Gradio round the value and pass it to the callback
	    # as an int, which is what a page number has to be.
	    page = gr.Number(
	        label="📄 Page Number",
	        minimum=1,
	        value=1,
	        step=1,
	        precision=0,
	    )
	    button = gr.Button("🔍 Extract Text")

	    with gr.Row():
	        image_output = gr.Image(label="🖼️ PDF Page Snapshot")
	        text_output = gr.Textbox(label="📝 Extracted Gujarati Text", lines=20)

	    # The callback returns (text, image), so the outputs are listed in that
	    # order even though the image is displayed first in the row.
	    button.click(
	        fn=extract_gujarati_text,
	        inputs=[pdf, page],
	        outputs=[text_output, image_output],
	    )

	# Start the Gradio server once the layout has been fully defined.
	demo.launch()