Spaces:

eoeooe
/

OcrBroTest

Sleeping

App Files Files Community

OcrBroTest / app.py

eoeooe

Update app.py

f221e96 verified 5 months ago

raw

history blame contribute delete

2.28 kB

	import cv2
	import pytesseract
	from pytesseract import Output
	import numpy as np
	import gradio as gr
	from PIL import Image
	from collections import defaultdict
	import re

	def ocr_scb_slip_text(image):
	    """Run Tesseract OCR on an uploaded slip image and return its text.

	    The picture is converted to grayscale, binarised with an adaptive
	    Gaussian threshold, and passed through one dilate and one erode step
	    with a 2x2 kernel before recognition. Recognised words are grouped by
	    their (block, paragraph, line) indices, joined with spaces inside a
	    line and with newlines between lines.

	    Args:
	        image: The uploaded picture (a PIL image in RGB order, as delivered
	            by the Gradio input), or ``None`` when nothing was uploaded.

	    Returns:
	        The recognised text as a single string; an empty string when
	        ``image`` is ``None`` or no word passed the confidence filter.
	    """
	    # Gradio calls the function with None when the user submits without an
	    # image; return early instead of failing inside OpenCV.
	    if image is None:
	        return ""

	    # Convert PIL (RGB) -> OpenCV (BGR).
	    img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

	    # Preprocessing intended to make the OCR more accurate.
	    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
	    gray = cv2.adaptiveThreshold(
	        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	        cv2.THRESH_BINARY, 11, 2
	    )
	    kernel = np.ones((2, 2), np.uint8)
	    gray = cv2.dilate(gray, kernel, iterations=1)
	    gray = cv2.erode(gray, kernel, iterations=1)

	    # Tesseract config.
	    # NOTE(review): the whitelist holds only ASCII letters and digits, which
	    # appears to exclude Thai characters and punctuation even though the
	    # `tha` language is loaded -- confirm this restriction is intended.
	    custom_config = r'-l eng+tha --oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'

	    # OCR
	    data = pytesseract.image_to_data(gray, config=custom_config, output_type=Output.DICT)
	    lines = defaultdict(list)

	    # Group the recognised words into lines.
	    for i, raw_text in enumerate(data['text']):
	        # The confidence may arrive as an int, a float, or a string such as
	        # '95.5'; float() accepts all of them, whereas int() raises on a
	        # fractional string.
	        if float(data['conf'][i]) <= 0:
	            continue
	        text = raw_text.strip()
	        if text:
	            line_id = (data['block_num'][i], data['par_num'][i], data['line_num'][i])
	            lines[line_id].append(text)

	    # One output line per detected text line, in (block, par, line) order.
	    extracted_texts = [" ".join(words) for _, words in sorted(lines.items())]
	    final_text = "\n".join(extracted_texts)

	    # Remove whitespace between Thai characters. Lookarounds keep the
	    # neighbouring characters out of the match, so every gap in a run such
	    # as "ก ข ค" is removed (a consuming pattern skips every second gap).
	    final_text = re.sub(r'(?<=[\u0E00-\u0E7F])\s+(?=[\u0E00-\u0E7F])', '', final_text)

	    return final_text

	# Gradio UI: one image upload as input, the recognised text as output.
	demo = gr.Interface(
	    title="OCR สลิป SCB (ข้อความรวม)",
	    description="OCR สลิป SCB รวมโลโก้ SCB และข้อความทั้งหมดเป็นข้อความเดียว",
	    fn=ocr_scb_slip_text,
	    inputs=gr.Image(type="pil", label="อัปโหลดสลิป SCB"),
	    outputs=gr.Textbox(label="ข้อความ OCR รวมทุกบรรทัด"),
	)

	# Launch the interface only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()