# landify-cccd-ocr
#
# (Hosting-page residue kept as a comment so the module parses:
#  status "Sleeping"; file size: 13,987 bytes.)


import os
import re
import json
import cv2
import time
import threading
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from paddleocr import PaddleOCR
from vietocr.tool.predictor import Predictor
from vietocr.tool.config import Cfg

# Absolute directory that contains this module; Extractor.__init__ joins it
# with the recognition weights file name.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

# Optional shared, pre-built instances.  `detector` is consulted by
# Extractor.__init__ (a non-None value is reused instead of building a new
# Predictor).  `ocr` is not read anywhere in this file.
ocr, detector = None, None


class Extractor:
    """Detect, recognise and parse the text fields of an ID-card image.

    Two OCR engines are combined: a PaddleOCR instance (``self.ocr``) that is
    run on whole frames by :meth:`Detection`, and a VietOCR ``Predictor``
    (``self.detector``) that recognises the text of a single warped crop in
    :meth:`WarpAndRec`.  :meth:`GetInformationAndSave` turns a list of
    recognised lines into a dictionary of named fields using label-matching
    heuristics.
    """

    # Loose date fragment ("dd/dd") used to spot lines that carry a date.
    _DATE_PATTERN = r'[0-9][0-9]/[0-9][0-9]'

    # Fragments that mark a line as NOT being part of the residence address
    # (dates, long digit runs and pieces of other printed labels).
    _RESIDENCE_NOISE_PATTERN = (
        r'[0-9][0-9]/[0-9][0-9]/|[0-9]{4,10}|Date|Demo|Dis|Dec|Dale|fer|ting|gical|ping|exp|ver|pate|cond'
        r'|trị|đến|không|Không|Có|Pat|ter|ity'
    )

    # Fields whose box is reset to [] when the field itself stayed empty.
    _RESET_BOX_FIELDS = ('Gender', 'Nationality', 'Name', 'Date_of_birth', 'Place_of_origin')

    # Every "<field>_box" key the result dictionary is guaranteed to contain.
    _ALL_BOX_KEYS = (
        'Name_box',
        'Date_of_birth_box',
        'Date_of_issue_box',
        'Gender_box',
        'Nationality_box',
        'Place_of_origin_box',
        'Place_of_residence_box',
    )

    def __init__(self):
        """Build the PaddleOCR engine and obtain the VietOCR predictor.

        The predictor is taken from the module-level ``detector`` when that
        is not ``None``; otherwise a new one is created from ``self.config``.
        """
        self.config = Cfg.load_config_from_name('vgg_seq2seq')
        self.config['weights'] = os.path.join(CURRENT_DIR, "seq2seqocr.pth")
        self.config['cnn']['pretrained'] = False
        self.config['device'] = 'cpu'

        # NOTE(review): these model directories are relative to the current
        # working directory, whereas the weights above are resolved against
        # CURRENT_DIR - confirm the process is always started from the
        # directory that holds ./models.
        self.ocr = PaddleOCR(
            lang='en',
            use_gpu=False,
            ocr_version='PP-OCRv3',
            det_model_dir='./models/det/en_PP-OCRv3_det_infer/',
            rec_model_dir='./models/rec/en_PP-OCRv3_rec_infer/',
            cls_model_dir='./models/cls/ch_ppocr_mobile_v2.0_cls_infer/'
        )

        if detector is None:
            self.detector = Predictor(self.config)
        else:
            self.detector = detector

    ####################################################################################################

    def Detection(self, frame):
        """Run PaddleOCR on ``frame`` and return the first element of its output.

        Args:
            frame: Image passed straight to ``self.ocr.ocr``.

        Returns:
            ``annotations[0]`` of the OCR output, or ``None`` when the OCR
            output itself is empty.  The element may itself be ``None``
            (presumably when no text was found - verify against the
            installed PaddleOCR version).
        """
        annotations = self.ocr.ocr(frame, rec=True, cls=False)
        if not annotations:
            return None
        return annotations[0]

    ####################################################################################################

    @staticmethod
    def _warp_quad(frame, pt_A, pt_B, pt_C, pt_D):
        """Return the quadrilateral A(top-left), B(bottom-left), C(bottom-right),
        D(top-right) of ``frame`` rectified to an axis-aligned image.

        All points are in [cols, rows] order.  The output size is the longer
        of each pair of opposite edges, truncated to an integer.
        """

        def _edge(p, q):
            # Euclidean (L2) length of the edge p-q.
            return np.sqrt(((p[0] - q[0]) ** 2) + ((p[1] - q[1]) ** 2))

        maxWidth = max(int(_edge(pt_A, pt_D)), int(_edge(pt_B, pt_C)))
        maxHeight = max(int(_edge(pt_A, pt_B)), int(_edge(pt_C, pt_D)))

        input_pts = np.float32([pt_A, pt_B, pt_C, pt_D])
        output_pts = np.float32([[0, 0],
                                 [0, maxHeight - 1],
                                 [maxWidth - 1, maxHeight - 1],
                                 [maxWidth - 1, 0]])

        # Perspective transform mapping the source quad onto the rectangle.
        M = cv2.getPerspectiveTransform(input_pts, output_pts)
        return cv2.warpPerspective(frame, M, (maxWidth, maxHeight), flags=cv2.INTER_LINEAR)

    ####################################################################################################

    def WarpAndSave(self, frame, fileName, top_left, top_right, bottom_right, bottom_left):
        """Rectify the given quadrilateral of ``frame`` and write it to ``fileName``.

        Args:
            frame: Source image.
            fileName: Destination path handed to ``cv2.imwrite``.
            top_left, top_right, bottom_right, bottom_left: Corner points in
                [cols, rows] order.

        Returns:
            bool: the result reported by ``cv2.imwrite`` (previously the
            method returned ``True`` even when the write failed).
        """
        matWarped = self._warp_quad(
            frame,
            (top_left[0], top_left[1]),
            (bottom_left[0], bottom_left[1]),
            (bottom_right[0], bottom_right[1]),
            (top_right[0], top_right[1]),
        )
        return bool(cv2.imwrite(fileName, matWarped))

    ####################################################################################################

    def WarpAndRec(self, frame, top_left, top_right, bottom_right, bottom_left):
        """Rectify a padded quadrilateral of ``frame`` and recognise its text.

        Args:
            frame: Source image.
            top_left, top_right, bottom_right, bottom_left: Corner points in
                [cols, rows] order.

        Returns:
            list: ``[text, box]`` where ``text`` is the predictor output and
            ``box`` is the padded quad as
            ``[top_left, top_right, bottom_right, bottom_left]`` tuples.
        """
        # The padding scales with frame.shape[0] (the first array dimension).
        # The original code called this value `w`; the arithmetic is kept
        # unchanged so existing crops stay the same.
        padding = int(4.0 * frame.shape[0] / 640)

        # Grow the quad outwards by `padding` on every side.
        pt_A = top_left[0] - padding, top_left[1] - padding
        pt_B = bottom_left[0] - padding, bottom_left[1] + padding
        pt_C = bottom_right[0] + padding, bottom_right[1] + padding
        pt_D = top_right[0] + padding, top_right[1] - padding

        matWarped = self._warp_quad(frame, pt_A, pt_B, pt_C, pt_D)

        # NOTE(review): the array is handed to PIL as-is; if `frame` comes
        # from cv2.imread its channels are BGR - confirm this is intended.
        s = self.detector.predict(Image.fromarray(matWarped))

        return [s, [pt_A, pt_D, pt_C, pt_B]]

    ####################################################################################################

    @staticmethod
    def _entry_at(results, index):
        """Return ``results[index]`` or ``None`` when the index is out of range.

        Negative indices are treated as out of range so that "the line
        before the first one" never wraps around to the end of the list.
        """
        if 0 <= index < len(results):
            return results[index]
        return None

    @classmethod
    def _find_dated_entry(cls, results, i):
        """Return the first of lines i, i-1, i+1 containing a date fragment, else ``None``."""
        for index in (i, i - 1, i + 1):
            entry = cls._entry_at(results, index)
            if entry is not None and re.search(cls._DATE_PATTERN, entry[0]):
                return entry
        return None

    @staticmethod
    def _store_date(result, key, entry):
        """Store the last ':'/whitespace-separated token of ``entry`` under ``key`` (and its box)."""
        if entry is None:
            result[key] = ''
            result[key + '_box'] = []
        else:
            result[key] = (re.split(r':|\s+', entry[0]))[-1].strip()
            result[key + '_box'] = entry[1]

    @classmethod
    def _residence_candidate(cls, results, index):
        """Return the line at ``index`` (or the last line when that one has at most
        five characters); ``None`` when ``index`` is past the end."""
        entry = cls._entry_at(results, index)
        if entry is None:
            return None
        return entry if len(entry[0]) > 5 else results[-1]

    def GetInformationAndSave(self, _results, _idnumber, _idnumberbox):
        """Map recognised text lines onto named ID-card fields.

        Args:
            _results: Sequence of recognised lines in reading order.  Each
                entry is indexable with the text at ``[0]`` and its box at
                ``[1]`` (the shape returned by :meth:`WarpAndRec`).  ``None``
                is treated as an empty sequence.
            _idnumber: Value copied into ``'ID_number'``.
            _idnumberbox: Value copied into ``'ID_number_box'``.

        Returns:
            dict: ``ID_number``, ``Name``, ``Date_of_birth``,
            ``Date_of_issue``, ``Gender``, ``Nationality``,
            ``Place_of_origin``, ``Place_of_residence`` plus a ``*_box`` key
            for each of them.  Fields that could not be found are ``''`` and
            their box is ``[]``; every key is always present.

        Neighbouring lines that do not exist (label on the first or last
        line) are treated as "no match" instead of raising ``IndexError`` or
        wrapping around to the other end of the list.
        """
        # NOTE: these prints emit the recognised card content to stdout.
        print("---------------------------------")
        print(_results)

        if _results is None:
            _results = []

        result = {
            'ID_number': _idnumber,
            'Name': '',
            'Date_of_birth': '',
            'Date_of_issue': '',
            'Gender': '',
            'Nationality': '',
            'Place_of_origin': '',
            'Place_of_residence': '',
            'ID_number_box': _idnumberbox,
        }

        noise = self._RESIDENCE_NOISE_PATTERN
        last = len(_results) - 1

        for i, res in enumerate(_results):
            s = res[0]
            print(s)

            # --- Name label: the value is on one of the next two lines. ---
            if re.search(r'tên|name', s):
                following = self._entry_at(_results, i + 1)
                if following is not None and not re.search(r'[0-9]', following[0]):
                    name = following
                else:
                    # Next line holds digits (or is missing): try the one after.
                    name = self._entry_at(_results, i + 2)
                if name is not None:
                    result['Name'] = name[0].title()
                    result['Name_box'] = name[1] if name[1] else []

                if result['Date_of_birth'] == '':
                    # A date two lines above the name label is taken as the DOB.
                    dob = self._entry_at(_results, i - 2)
                    if dob is not None and not re.search(self._DATE_PATTERN, dob[0]):
                        dob = None
                    self._store_date(result, 'Date_of_birth', dob)
                continue

            # --- Date of issue. ---
            if re.search(r'month|year|date', s) and (not result['Date_of_issue']):
                self._store_date(result, 'Date_of_issue', self._find_dated_entry(_results, i))
                continue

            # --- Date of birth (optionally followed by the nationality line). ---
            if re.search(r'sinh|birth|bith', s) and (not result['Date_of_birth']):
                self._store_date(result, 'Date_of_birth', self._find_dated_entry(_results, i))

                following = self._entry_at(_results, i + 1)
                if following is not None and re.search(r"Việt Nam", following[0]):
                    result['Nationality'] = 'Việt Nam'
                    result['Nationality_box'] = following[1]
                continue

            # --- Gender: deliberately falls through, because the same line
            # may also carry the nationality label. ---
            if re.search(r'Giới|Sex', s):
                result['Gender'] = 'Nữ' if re.search(r'Nữ|nữ', s) else 'Nam'
                result['Gender_box'] = res[1] if res[1] else []

            # --- Nationality. ---
            if re.search(r'Quốc|tịch|Nat', s):
                inline = re.split(r':|,|[.]|ty|tịch', s)[-1].strip()
                following = self._entry_at(_results, i + 1)
                if (not re.search(r'ty|ing', inline)) and len(inline) >= 3:
                    source = res
                elif following is not None and not re.search(r'[0-9][0-9]/[0-9][0-9]/', following[0]):
                    source = following
                else:
                    source = self._entry_at(_results, i - 1)

                if source is not None:
                    nationality = re.split(r':|-|,|[.]|ty|[0-9]|tịch', source[0])[-1].strip().title()
                    result['Nationality_box'] = source[1] if source[1] else []

                    # Drop everything up to (and including) any very short
                    # token; such tokens are treated as OCR debris.  The token
                    # is escaped because it is arbitrary OCR text, not a regex.
                    for token in re.split(r'\s+', nationality):
                        if token and len(token) < 3:
                            nationality = re.split(re.escape(token), nationality)[-1].strip().title()
                    if re.search(r'Nam', nationality):
                        nationality = 'Việt Nam'
                    result['Nationality'] = nationality
                continue

            # --- Place of origin: label line (optional inline part) + next line. ---
            if re.search(r'Quê|origin|ongin|ngin|orging', s):
                following = self._entry_at(_results, i + 1)
                if following is not None and not re.search(r'[0-9]{4}', following[0]):
                    inline = re.split(r':|;|of|ging|gin|ggong', s)[-1].strip()
                    if len(inline) > 2:
                        result['Place_of_origin'] = inline + ', ' + following[0]
                    else:
                        result['Place_of_origin'] = following[0]
                    result['Place_of_origin_box'] = following[1]
                continue

            # --- Place of residence. ---
            if re.search(r'Nơi|trú|residence', s):
                final = _results[-1]
                vals2 = self._residence_candidate(_results, i + 2)
                vals3 = self._residence_candidate(_results, i + 3)

                if (re.split(r':|;|residence|ence|end', s))[-1].strip() != '':
                    # The label line already carries the start of the address;
                    # look for its continuation.
                    if vals2 is not None and not re.search(noise, vals2[0]):
                        PlaceOfResidence = [res, vals2]
                    elif vals3 is not None and not re.search(noise, vals3[0]):
                        PlaceOfResidence = [res, vals3]
                    elif not re.search(noise, final[0]):
                        PlaceOfResidence = [res, final]
                    else:
                        PlaceOfResidence = [final, []]
                elif vals2 is not None and not re.search(noise, vals2[0]):
                    PlaceOfResidence = [vals2, []]
                else:
                    PlaceOfResidence = [final, []]

                print('PlaceOfResidence: {}'.format(PlaceOfResidence))
                head, tail = PlaceOfResidence
                if tail:
                    result['Place_of_residence'] = (
                        re.split(r':|;|residence|sidencs|ence|end', head[0])[-1].strip()
                        + ' ' + str(tail[0]).strip()
                    )
                    result['Place_of_residence_box'] = tail[1]
                else:
                    result['Place_of_residence'] = head[0]
                    result['Place_of_residence_box'] = head[1]
                continue

            # --- Last line not claimed by a `continue` branch above: use it
            # (or the line before it) as the residence when none was found. ---
            if i == last and result['Place_of_residence'] == '':
                fallback = None
                for candidate in (_results[-1], self._entry_at(_results, last - 1)):
                    if candidate is not None and not re.search(noise, candidate[0]):
                        fallback = candidate
                        break
                result['Place_of_residence'] = fallback[0] if fallback else ''
                result['Place_of_residence_box'] = fallback[1] if fallback else []

        # Guarantee a stable schema no matter which branches ran: empty
        # fields get an empty box, and every *_box key exists.
        for field in self._RESET_BOX_FIELDS:
            if result[field] == '':
                result[field + '_box'] = []
        for box_key in self._ALL_BOX_KEYS:
            result.setdefault(box_key, [])

        return result