Spaces:
Sleeping
Sleeping
| import cv2 | |
| import pytesseract | |
| from pytesseract import Output | |
| import numpy as np | |
| import gradio as gr | |
| from PIL import Image | |
| from collections import defaultdict | |
| import re | |
# Horizontal whitespace sitting between two Thai characters.
# Lookarounds leave the Thai characters themselves unconsumed, so every gap in
# a run such as "ก ข ค" is matched; a pattern that captures both neighbours
# skips every second gap. Only spaces/tabs are matched so that line breaks
# between recognized lines are never removed.
_THAI_GAP_RE = re.compile(r'(?<=[\u0E00-\u0E7F])[ \t]+(?=[\u0E00-\u0E7F])')

# English + Thai models, default engine (--oem 3), page treated as one uniform
# block of text (--psm 6). Deliberately no tessedit_char_whitelist: an
# ASCII-alphanumeric whitelist would exclude every Thai character as well as
# the '.', ',' and '-' that appear in amounts, dates and account numbers.
_TESSERACT_CONFIG = r'-l eng+tha --oem 3 --psm 6'

# Columns of the image_to_data result that the line grouping relies on.
_OCR_COLUMNS = ("text", "conf", "block_num", "par_num", "line_num")


def _preprocess_for_ocr(image):
    """Return a binarized, lightly cleaned single-channel array for OCR."""
    # PIL delivers RGB; OpenCV works in BGR.
    bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    # Gaussian adaptive threshold with an 11x11 neighbourhood and offset 2.
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2,
    )
    # One dilate followed by one erode with a 2x2 kernel.
    kernel = np.ones((2, 2), np.uint8)
    binary = cv2.dilate(binary, kernel, iterations=1)
    return cv2.erode(binary, kernel, iterations=1)


def _group_words_by_line(data):
    """Return one space-joined string per recognized line, in reading order."""
    if any(column not in data for column in _OCR_COLUMNS):
        # Nothing usable came back from the OCR engine.
        return []

    lines = defaultdict(list)
    rows = zip(*(data[column] for column in _OCR_COLUMNS))
    for text, conf, block_num, par_num, line_num in rows:
        # The confidence column may hold ints or numeric strings, including
        # fractional ones such as '96.5', so int() is not safe here.
        try:
            confidence = float(conf)
        except (TypeError, ValueError):
            continue
        word = text.strip()
        if confidence > 0 and word:
            lines[(block_num, par_num, line_num)].append(word)

    # Keys are unique (block, paragraph, line) tuples, so sorting the items
    # orders the lines without ever comparing the word lists.
    return [" ".join(words) for _, words in sorted(lines.items())]


def ocr_scb_slip_text(image):
    """Run OCR on a slip image and return the recognized text.

    The image is binarized, passed to Tesseract with the English and Thai
    models, and the recognized words are regrouped into lines by their
    (block, paragraph, line) numbers. Spaces that Tesseract inserts between
    adjacent Thai characters on the same line are removed.

    Args:
        image: The slip image; expected to be an RGB PIL image (it is
            converted with ``np.array`` and ``COLOR_RGB2BGR``). ``None`` is
            accepted and yields an empty result.

    Returns:
        The recognized text, one line per recognized text line, joined with
        ``"\\n"``. An empty string if there is no image or no word with a
        positive confidence.
    """
    if image is None:
        # Nothing was supplied; avoid crashing inside OpenCV.
        return ""

    prepared = _preprocess_for_ocr(image)
    data = pytesseract.image_to_data(
        prepared, config=_TESSERACT_CONFIG, output_type=Output.DICT
    )
    return "\n".join(
        _THAI_GAP_RE.sub("", line) for line in _group_words_by_line(data)
    )
# Gradio front end: a single image upload whose OCR result is shown in a
# text box.
demo = gr.Interface(
    title="OCR สลิป SCB (ข้อความรวม)",
    description="OCR สลิป SCB รวมโลโก้ SCB และข้อความทั้งหมดเป็นข้อความเดียว",
    fn=ocr_scb_slip_text,
    inputs=gr.Image(type="pil", label="อัปโหลดสลิป SCB"),
    outputs=gr.Textbox(label="ข้อความ OCR รวมทุกบรรทัด"),
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()