Spaces:

mohamed12ahmed
/

ocrextractsamrformat

Runtime error

App Files Files Community

ocrextractsamrformat / app.py

mohamed12ahmed

Update app.py

a2e1f84 verified 5 months ago

raw

history blame contribute delete

3.1 kB

	import gradio as gr
	import cv2
	import numpy as np
	import pytesseract
	#pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
	pytesseract.pytesseract.tesseract_cmd = "tesseract"

	from PIL import Image
	from docx import Document
	import io


	# ---------- Step 1: Image preprocessing ----------
	def preprocess_image(image):
	    """Straighten a photographed document by warping its outline to a rectangle.

	    The image is converted to grayscale, smoothed with a bilateral filter and
	    edge-detected with Canny. The five largest contours are then scanned and
	    the first one whose polygon approximation has exactly four vertices is
	    treated as the document outline and perspective-warped to an upright
	    rectangle.

	    Args:
	        image: PIL image. RGB channel order is assumed, since the grayscale
	            conversion uses ``cv2.COLOR_RGB2GRAY``.

	    Returns:
	        A new PIL image holding the warped document, or ``image`` itself when
	        no usable four-sided contour is found.
	    """
	    # Convert once; the same array feeds edge detection and the final warp.
	    pixels = np.array(image)

	    gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)
	    gray = cv2.bilateralFilter(gray, 11, 17, 17)
	    edges = cv2.Canny(gray, 30, 200)

	    contours, _ = cv2.findContours(edges, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
	    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]

	    doc_contour = None
	    for contour in contours:
	        perimeter = cv2.arcLength(contour, True)
	        approx = cv2.approxPolyDP(contour, 0.02 * perimeter, True)
	        if len(approx) == 4:
	            doc_contour = approx
	            break

	    if doc_contour is None:
	        return image

	    pts = doc_contour.reshape(4, 2)
	    rect = np.zeros((4, 2), dtype="float32")

	    # Order corners as top-left, top-right, bottom-right, bottom-left:
	    # x + y is smallest at the top-left and largest at the bottom-right,
	    # y - x is smallest at the top-right and largest at the bottom-left.
	    s = pts.sum(axis=1)
	    rect[0] = pts[np.argmin(s)]
	    rect[2] = pts[np.argmax(s)]

	    diff = np.diff(pts, axis=1)
	    rect[1] = pts[np.argmin(diff)]
	    rect[3] = pts[np.argmax(diff)]

	    # The sum/difference heuristic can assign one vertex to two corners
	    # (e.g. a diamond-shaped outline); such a quad has no valid transform.
	    if len(np.unique(rect, axis=0)) < 4:
	        return image

	    (tl, tr, br, bl) = rect
	    widthA = np.linalg.norm(br - bl)
	    widthB = np.linalg.norm(tr - tl)
	    heightA = np.linalg.norm(tr - br)
	    heightB = np.linalg.norm(tl - bl)
	    maxWidth = int(max(widthA, widthB))
	    maxHeight = int(max(heightA, heightB))

	    # A side shorter than 2 px collapses the destination corners onto each
	    # other, making the perspective transform singular; keep the original.
	    if maxWidth < 2 or maxHeight < 2:
	        return image

	    dst = np.array(
	        [
	            [0, 0],
	            [maxWidth - 1, 0],
	            [maxWidth - 1, maxHeight - 1],
	            [0, maxHeight - 1],
	        ],
	        dtype="float32",
	    )

	    M = cv2.getPerspectiveTransform(rect, dst)
	    warped = cv2.warpPerspective(pixels, M, (maxWidth, maxHeight))
	    return Image.fromarray(warped)


	# ---------- Step 2: OCR & Word export ----------
	def extract_text(image):
	    """Run OCR on a document image and package the text as a Word file.

	    The image is first passed through ``preprocess_image``; Tesseract then
	    reads it with the English and Arabic language packs. Every non-blank
	    line of the recognised text becomes one paragraph of a DOCX document,
	    which is serialised into an in-memory buffer.

	    Args:
	        image: the image to read, forwarded to ``preprocess_image``.

	    Returns:
	        A ``(text, (buffer, filename))`` tuple: the raw OCR text, and a pair
	        holding the rewound ``io.BytesIO`` with the DOCX bytes plus the
	        suggested file name.
	    """
	    text = pytesseract.image_to_string(preprocess_image(image), lang="eng+ara")

	    # Build the Word document, skipping lines that are empty or whitespace.
	    word_doc = Document()
	    for row in filter(str.strip, text.splitlines()):
	        word_doc.add_paragraph(row)

	    docx_stream = io.BytesIO()
	    word_doc.save(docx_stream)
	    docx_stream.seek(0)  # rewind so consumers read from the start

	    return text, (docx_stream, "document_output.docx")


	# ---------- Step 3: Gradio UI ----------
	def process_image(image):
	    """Gradio callback: OCR the uploaded image and return text plus a DOCX path.

	    ``extract_text`` hands back the DOCX as an in-memory ``(buffer, filename)``
	    pair, but the ``gr.File`` output component expects a path on disk. The
	    buffer is therefore written to a fresh temporary directory (under its
	    suggested file name, so the download keeps that name) and the path is
	    returned instead.

	    Args:
	        image: PIL image from the upload component, or ``None`` when the
	            button is pressed before anything has been uploaded.

	    Returns:
	        ``(text, docx_path)`` for the textbox and file outputs.

	    Raises:
	        gr.Error: if no image was provided.
	    """
	    import os
	    import tempfile

	    if image is None:
	        # Surface a readable message in the UI instead of an OpenCV failure.
	        raise gr.Error("Please upload a document image first.")

	    text, docx_payload = extract_text(image)

	    if isinstance(docx_payload, tuple):
	        buffer, filename = docx_payload
	        # The system temp dir is a location Gradio is allowed to serve from.
	        # NOTE: the directory is not removed afterwards.
	        out_dir = tempfile.mkdtemp(prefix="ocr_docx_")
	        docx_path = os.path.join(out_dir, os.path.basename(filename))
	        with open(docx_path, "wb") as fh:
	            fh.write(buffer.getvalue())
	        return text, docx_path

	    # Already something gr.File can consume (a path or None); pass through.
	    return text, docx_payload


	# Page layout: upload on top, results (text + download) below, then the button.
	with gr.Blocks(title="AI Document OCR (Light Version)") as demo:
	    gr.Markdown(
	        "## 📄 AI Document OCR (Light Version)\n"
	        "Upload a scanned or skewed document, and the model will correct it and extract formatted text."
	    )

	    with gr.Row():
	        input_image = gr.Image(type="pil", label="Upload Document")

	    with gr.Row():
	        output_text = gr.Textbox(label="Extracted Text", lines=15)
	        output_file = gr.File(label="Download as .docx")

	    with gr.Row():
	        submit_btn = gr.Button("Extract Text")

	    # Wire the button: one image in, OCR text and the DOCX download out.
	    submit_btn.click(
	        fn=process_image,
	        inputs=input_image,
	        outputs=[output_text, output_file],
	    )

	# Start the web server as soon as the module is executed.
	demo.launch()