File size: 5,016 Bytes
f8b25ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import asyncio
from PIL import Image
import pytesseract
import re, cv2
import imutils
from concurrent.futures import ThreadPoolExecutor
from app.models.ocrtemplate import *
from app.core.database import get_database
from app.core.config import settings
from typing import Any
from fastapi import HTTPException
from pytesseract import Output

# Tell pytesseract which Tesseract executable to invoke.
# NOTE(review): this is a hard-coded Windows install path, set as an import-time
# side effect. Because the literal is a raw string, the doubled backslashes are
# kept as two characters each - confirm that is intended.
pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

def identify_structure(line):
    """Classify a single line of OCR text by its apparent layout.

    The checks run in order and the first one that matches wins:

    * ``'mixed-column'`` - the stripped line holds more than one colon.
    * ``'key-value'``    - it holds a colon, or begins with a run of letters
      followed by whitespace and then a letter or digit.
    * ``'table-header'`` - it holds more than one all-uppercase word.
    * ``'table-row'``    - it holds more than one standalone integer.
    * ``'text'``         - none of the above.

    Args:
        line: One line of text; surrounding whitespace is ignored.

    Returns:
        One of the five label strings listed above.
    """
    stripped = line.strip()
    colon_count = stripped.count(':')

    if colon_count > 1:
        return 'mixed-column'
    if colon_count or re.search(r'^[A-Za-z]+\s+[A-Za-z0-9]+', stripped):
        return 'key-value'

    if len(re.findall(r'\b[A-Z]+\b', stripped)) > 1:
        return 'table-header'

    # Reaching this point means at most one uppercase word was found, so the
    # only remaining question is how many numeric tokens the line contains.
    if len(re.findall(r'\b\d+\b', stripped)) > 1:
        return 'table-row'
    return 'text'

def format_extracted_text(text):
    """Reformat raw OCR output into one logical entry per line.

    Blank lines are discarded and every remaining line is stripped, then
    classified with ``identify_structure`` and handled as follows:

    * ``'mixed-column'``: the line is split on ``':'`` and consecutive
      segments are paired into ``"key: value"`` entries, one per output line.
      If an odd number of segments leaves a final one unpaired, it is kept as
      its own output line rather than being dropped.
    * ``'key-value'``: emitted unchanged; ends any open table.
    * ``'table-header'``: emitted unchanged; opens a table.
    * ``'table-row'`` while a table is open: emitted unchanged.
    * anything else: emitted unchanged; if a table was open it is closed and
      a ``"\\n"`` element is inserted first, which shows up as blank lines in
      the joined result.

    Args:
        text: Raw multi-line text, lines separated by ``'\\n'``.

    Returns:
        The formatted entries joined with ``'\\n'``.
    """
    formatted_text = []
    in_table = False

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        structure = identify_structure(line)

        if structure == 'mixed-column':
            parts = [part.strip() for part in line.split(':')]
            # Pair segments (0, 1), (2, 3), ... as key/value entries.
            formatted_text.extend(
                f"{key}: {value}" for key, value in zip(parts[::2], parts[1::2])
            )
            # An odd segment count leaves the last one without a partner;
            # keep it so recognised text is never silently discarded.
            if len(parts) % 2 and parts[-1]:
                formatted_text.append(parts[-1])
            in_table = False
        elif structure == 'key-value':
            formatted_text.append(line)
            in_table = False
        elif structure == 'table-header':
            formatted_text.append(line)
            in_table = True
        elif structure == 'table-row' and in_table:
            formatted_text.append(line)
        else:
            if in_table:
                # Leaving a table: add a separator element before the line.
                in_table = False
                formatted_text.append("\n")
            formatted_text.append(line)

    return "\n".join(formatted_text)

def refine_text_formatting(text):
    """Apply three whitespace/punctuation clean-up passes to *text*, in order.

    1. Collapse every run of whitespace into a single space.
    2. Replace the whitespace character following a period with a newline.
    3. Rewrite every colon, with any whitespace around it, as ``": "``.

    Args:
        text: The string to clean up.

    Returns:
        The cleaned-up string.
    """
    # Order matters: each pass operates on the output of the previous one.
    substitutions = (
        (r'\s+', ' '),
        (r'\.\s', '.\n'),
        (r'\s*:\s*', ': '),
    )
    refined = text
    for pattern, replacement in substitutions:
        refined = re.sub(pattern, replacement, refined)
    return refined

def do_ocr(image_path):
    """Run Tesseract on an image file and return the formatted text.

    Args:
        image_path: Path handed to ``PIL.Image.open``.

    Returns:
        The recognised text after ``format_extracted_text`` has been applied.
    """
    # Open the image in a context manager so its file handle is released as
    # soon as OCR finishes, including when pytesseract raises.
    with Image.open(image_path) as image:
        extracted_text = pytesseract.image_to_string(image)
    return format_extracted_text(extracted_text)

async def do_ocr_tesseract(image_path):
    """Run ``do_ocr`` in a worker thread and return its result.

    The blocking OCR call is handed to the event loop's default executor, so
    the loop stays free while it runs and no thread pool is created and torn
    down on each call.

    Args:
        image_path: Path forwarded unchanged to ``do_ocr``.

    Returns:
        The formatted text produced by ``do_ocr``.
    """
    # Inside a coroutine the running loop is the one to use;
    # get_running_loop() makes that explicit.
    loop = asyncio.get_running_loop()
    # Passing None selects the loop's default executor.
    return await loop.run_in_executor(None, do_ocr, image_path)


async def create_data_from_template(template_name:str, fields:Dict[str, str], user_id:str) -> OCRTemplateInDB:
    """Build an ``OCRTemplateInDB`` record, store it, and return it.

    The record is inserted into the ``"extracted data"`` collection of the
    database named by ``settings.MongoDB_NAME``.

    Args:
        template_name: Passed to ``OCRTemplateInDB`` as ``template_name``.
        fields: Passed to ``OCRTemplateInDB`` as ``fields``.
        user_id: Passed to ``OCRTemplateInDB`` as ``user_id``.

    Returns:
        The ``OCRTemplateInDB`` instance that was inserted.
    """
    template = OCRTemplateInDB(template_name=template_name, fields=fields, user_id=user_id)
    db = get_database(settings.MongoDB_NAME)
    # The insert result is not used; any error from the driver propagates
    # to the caller.
    await db["extracted data"].insert_one(template.dict())
    return template


def preprocess_image(image: Any) -> Any:
    """Turn a BGR image into a cleaned-up binary image.

    Steps, in order: grayscale conversion, 3x3 Gaussian blur, inverted Otsu
    threshold, one pass of morphological opening with a 3x3 rectangular
    kernel, and a final inversion (``255 - value``).

    Args:
        image: Image accepted by ``cv2.cvtColor`` with ``COLOR_BGR2GRAY``.

    Returns:
        The inverted result of the opening step.
    """
    # Grayscale, then a light blur before thresholding.
    smoothed = cv2.GaussianBlur(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), (3, 3), 0)

    # Otsu selects the threshold; only the thresholded image is kept.
    _, binary = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Opening removes small specks from the inverted binary image.
    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    opened = cv2.morphologyEx(binary, cv2.MORPH_OPEN, rect_kernel, iterations=1)

    # Undo the inversion introduced by THRESH_BINARY_INV.
    return 255 - opened

async def detect_rotation(image_path: str) -> Any:
    """Load an image and rotate it by the angle Tesseract's OSD reports.

    The detected orientation, correction angle and script are printed to
    standard output.

    NOTE(review): every call in here is synchronous even though the function
    is ``async`` - confirm that blocking the event loop is acceptable.

    Args:
        image_path: Path handed to ``cv2.imread``.

    Returns:
        The loaded image after ``imutils.rotate_bound`` has been applied with
        the OSD ``"rotate"`` angle.

    Raises:
        HTTPException: Status 400 when ``cv2.imread`` returns ``None``.
    """
    bgr_image = cv2.imread(image_path)
    if bgr_image is None:
        raise HTTPException(status_code=400, detail="Image not found or unable to read")

    # OSD runs on an RGB copy; the original BGR array is the one rotated below.
    osd = pytesseract.image_to_osd(
        cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB),
        output_type=Output.DICT,
    )

    # Report what Tesseract detected, one line per field.
    report_lines = (
        ("[INFO] detected orientation: {}", "orientation"),
        ("[INFO] rotate by {} degrees to correct", "rotate"),
        ("[INFO] detected script: {}", "script"),
    )
    for template, key in report_lines:
        print(template.format(osd[key]))

    return imutils.rotate_bound(bgr_image, angle=osd["rotate"])

async def tesseract_ocr(image_path: str) -> str:
    """OCR an image after orientation correction and preprocessing.

    Pipeline: ``detect_rotation`` -> ``preprocess_image`` ->
    ``pytesseract.image_to_string`` with ``--psm 6`` ->
    ``format_extracted_text``.

    Args:
        image_path: Path of the image, forwarded to ``detect_rotation``.

    Returns:
        The formatted text recognised in the image.
    """
    upright = await detect_rotation(image_path)
    cleaned = preprocess_image(upright)
    raw_text = pytesseract.image_to_string(cleaned, config='--psm 6')
    return format_extracted_text(raw_text)
# Example usage
async def main():
    """Run both OCR pipelines on the sample image and print each result."""
    sample_path = 'KTP.jpg'

    # The two pipelines run one after the other, plain OCR first.
    plain_result = await do_ocr_tesseract(sample_path)
    preprocessed_result = await tesseract_ocr(sample_path)

    for output in (plain_result, preprocessed_result):
        print(output)

# asyncio.run(main())