Spaces:
Sleeping
Sleeping
| import re | |
| import json | |
| from PIL import Image | |
| import pytesseract | |
class HostList:
    """Run Tesseract OCR on a "host list" image and extract host/MID/TID records.

    The OCR text is scanned line by line; the values found after a colon are
    grouped, in order of appearance, into records with the keys ``host``,
    ``mid`` and ``tid``.
    """

    # Keys of one record, in the order their values are expected in the OCR text.
    _RECORD_FIELDS = ('host', 'mid', 'tid')

    # Patterns for the text report returned by ``pytesseract.image_to_osd``.
    _ROTATE_RE = re.compile(r'(?<=Rotate: )\d+')
    # The script is matched with ``\w+`` rather than ``\d+`` because Tesseract
    # normally reports it by name (e.g. ``Script: Latin``); digits still match.
    _SCRIPT_RE = re.compile(r'(?<=Script: )\w+')

    _ALNUM_RE = re.compile(r'[a-zA-Z0-9]+')
    _NON_WORD_RE = re.compile(r'\W')

    def __init__(self, is_debug=False) -> None:
        """Store the debug flag and the Tesseract configuration string.

        Args:
            is_debug: When true, intermediate results are printed to stdout.
        """
        self.is_debug = is_debug

        # Host List Style (hlstyle) configuration for pytesseract
        # - psm means page segmentation (Ref. https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/)
        # - fixed slashed zero issue with custom traineddata: https://github.com/ReceiptManager/receipt-parser-server/tree/master/tessdata
        self.hlstyle_config = r'--psm 6 --tessdata-dir ./tessdata -l eng_slashed_zeros'

    @classmethod
    def _parse_osd(cls, osd: str):
        """Extract ``(angle, script)`` from an OSD text report.

        Each element is the matched text as a string, or ``None`` when the
        corresponding ``Rotate:`` / ``Script:`` entry is not present.
        """
        rotate_match = cls._ROTATE_RE.search(osd)
        script_match = cls._SCRIPT_RE.search(osd)
        # group(0) is the whole match; the patterns define no capture groups,
        # so asking for group(1) would raise IndexError.
        angle = rotate_match.group(0) if rotate_match else None
        script = script_match.group(0) if script_match else None
        return (angle, script)

    def get_orientation(self, image: "Image.Image"):
        """Detect the rotation and script of ``image`` using Tesseract OSD.

        Args:
            image: Image passed straight to ``pytesseract.image_to_osd``.

        Returns:
            A ``(angle, script)`` tuple of strings; an element is ``None``
            when the OSD report does not contain that entry.
        """
        osd = pytesseract.image_to_osd(image)
        angle, script = self._parse_osd(osd)

        if self.is_debug:
            print("---------------------------------")
            print(f"angle : {angle}")
            print(f"script : {script}")

        return (angle, script)

    def post_processes(self, result: str):
        """Group the values found in OCR text into host/MID/TID records.

        Only lines containing a colon are considered, and the value is the
        text between the first and the second colon of the line. Values with
        no ASCII letter or digit are ignored. The first usable value becomes
        ``host`` (non-word characters removed); the next two become ``mid``
        and ``tid`` (longest space-separated token of the value). A record is
        stored once all three are present; a trailing incomplete record is
        dropped.

        Args:
            result: Raw OCR text.

        Returns:
            ``result`` followed by a separator line and the records rendered
            as indented JSON of the form ``{"data": [{...}, ...]}``.
        """
        records = []
        record = {}

        if self.is_debug:
            print("---------------------------------")
            print("post-processes:\n")

        for line in result.splitlines():
            if ':' not in line:
                continue

            infos = line.split(':')[1]
            if self.is_debug:
                print(infos)

            # Skip values that are only punctuation / whitespace.
            if not self._ALNUM_RE.search(infos):
                continue

            # The number of fields already collected tells which one is next.
            field = self._RECORD_FIELDS[len(record)]
            if field == 'host':
                record[field] = self._NON_WORD_RE.sub('', infos)
            else:
                record[field] = max(infos.split(' '), key=len)

            if len(record) == len(self._RECORD_FIELDS):
                records.append(record)
                if self.is_debug:
                    print(json.dumps(record))
                record = {}

        data = {'data': records}
        if self.is_debug:
            print(json.dumps(data))

        return f'{result}\n-------------------\n{json.dumps(data, indent=2)}'

    def process_image(self, image: "Image.Image"):
        """OCR ``image`` with the host-list configuration and post-process the text.

        Args:
            image: Image passed straight to ``pytesseract.image_to_string``.

        Returns:
            The string produced by ``post_processes`` for the recognised text.
        """
        text = pytesseract.image_to_string(image, config=self.hlstyle_config)
        return self.post_processes(text)