# productivity / app /ocr.py
# ariansyahdedy's picture
# add tesseract-ocr
# 82700d7
import easyocr
import fitz
import pytesseract
from PIL import Image
import cv2
import io, os
import pymupdf
from fastapi import HTTPException
import numpy as np
from pytesseract import Output
import imutils
# Tell pytesseract which Tesseract executable to run. The bare name 'tesseract'
# relies on the binary being on PATH (the Unix/Linux setup). For a Windows
# install, point it at the full executable path instead, e.g.:
#   pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
pytesseract.pytesseract.tesseract_cmd = 'tesseract'
def reformat_ocr_result(result):
    """Group raw OCR lines into ``"<Label> : <value>"`` strings.

    Each item of ``result`` is a string such as ``"<text> (confidence: 0.1234)"``
    (the format produced by ``easyocr_ocr``). A line whose text contains one of
    the known field labels starts a new field; the lines that follow, up to the
    next label, are joined with single spaces to form that field's value. Lines
    seen before the first label are ignored.

    Args:
        result: Iterable of OCR line strings, optionally carrying a trailing
            ``" (confidence: ...)"`` annotation.

    Returns:
        A list of formatted ``"<Label> : <value>"`` strings, in the order the
        labels were encountered. Empty when no label was found.
    """
    # OCR spelling of each label -> the prefix emitted in the output. Some keys
    # are deliberately "misspelt" (e.g. "RTIRW") because that is how the OCR
    # engine tends to read "RT/RW".
    mapping = {
        "NIK": "NIK : ",
        "Nama": "Nama : ",
        "TempatITgl Lahir": "Tempat/Tgl Lahir : ",
        "Jenis Kelamin": "Jenis Kelamin : ",
        "GolDarah": "Gol Darah : ",
        "Alamat": "Alamat : ",
        "RTIRW": "RT/RW : ",
        "KelDesa": "Kel/Desa : ",
        "Kecamatan": "Kecamatan : ",
        "Agama": "Agama : ",
        "Status Perkawinan": "Status Perkawinan : ",
        "Pekerjaan": "Pekerjaan : ",
        "Kewarganegaraan": "Kewarganegaraan : ",
        "Berlaku Hingga": "Berlaku Hingga : "
    }
    formatted_output = []
    current_key = None
    current_value = []

    for line in result:
        # Only the trailing annotation is the confidence suffix, so split from
        # the right and at most once.
        text = line.rsplit(' (confidence: ', 1)[0]

        # First label (in mapping order) that occurs anywhere in the text.
        matched_key = next((key for key in mapping if key in text), None)

        if matched_key is None:
            # Plain value line: belongs to the field currently being collected.
            if current_key is not None:
                current_value.append(text)
            continue

        # A new label starts: flush the field collected so far.
        if current_key is not None:
            formatted_output.append(f"{mapping[current_key]}{' '.join(current_value)}")
        current_key = matched_key
        current_value = []

        # The OCR engine sometimes merges label and value into one line
        # ("Agama: ISLAM"); keep what follows the label instead of dropping it.
        inline_value = text.partition(matched_key)[2].lstrip(' :').rstrip()
        if inline_value:
            current_value.append(inline_value)

    # Flush the last field.
    if current_key is not None:
        formatted_output.append(f"{mapping[current_key]}{' '.join(current_value)}")

    print(formatted_output)
    return formatted_output
async def detect_rotation(image):
    """Load an image from disk and rotate it upright using Tesseract OSD.

    Args:
        image: Path of the image file, as accepted by ``cv2.imread``.

    Returns:
        The loaded BGR image rotated by the angle Tesseract reports under
        ``"rotate"``. If Tesseract cannot determine the orientation, the
        image is returned unrotated so OCR can still be attempted.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    image_path = image
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising.
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Tesseract expects RGB channel ordering; OpenCV loads BGR.
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    try:
        results = pytesseract.image_to_osd(rgb, output_type=Output.DICT)
    except pytesseract.TesseractError as exc:
        # Orientation detection can fail (for instance on images with very
        # little text); carry on with the image as loaded instead of aborting.
        print("[INFO] orientation detection failed, keeping image as-is: {}".format(exc))
        return image

    # display the orientation information
    print("[INFO] detected orientation: {}".format(
        results["orientation"]))
    print("[INFO] rotate by {} degrees to correct".format(
        results["rotate"]))
    print("[INFO] detected script: {}".format(results["script"]))

    # rotate_bound enlarges the canvas so no part of the image is cropped
    rotated = imutils.rotate_bound(image, angle=results["rotate"])
    return rotated
# Lazily created EasyOCR reader, shared by every call to easyocr_ocr.
_EASYOCR_READER = None


def _get_easyocr_reader():
    """Return the shared Indonesian EasyOCR reader, creating it on first use."""
    global _EASYOCR_READER
    if _EASYOCR_READER is None:
        # Constructing a Reader loads the recognition models, which is slow;
        # do it once instead of on every request.
        _EASYOCR_READER = easyocr.Reader(['id'])
    return _EASYOCR_READER


def easyocr_ocr(image_path):
    """Run EasyOCR on an image and return its fields as formatted strings.

    Args:
        image_path: Image input passed straight to ``Reader.readtext``.

    Returns:
        The list produced by ``reformat_ocr_result`` from the recognised
        text lines, each annotated with its confidence.
    """
    reader = _get_easyocr_reader()
    result = reader.readtext(image_path)
    # readtext yields (bounding box, text, confidence) triples.
    result_list = [
        f"{text} (confidence: {confidence:.4f})"
        for (_bbox, text, confidence) in result
    ]
    return reformat_ocr_result(result_list)
def pdf_page_to_image(pdf_page):
    """Render a PDF page into a PIL image.

    Args:
        pdf_page: Page object exposing ``get_pixmap()`` (a fitz/PyMuPDF page).

    Returns:
        A PIL ``Image`` in "RGB" mode built from the pixmap's raw samples.
    """
    pixmap = pdf_page.get_pixmap()
    dimensions = [pixmap.width, pixmap.height]
    # The raw sample buffer is interpreted as packed RGB pixels.
    return Image.frombytes("RGB", dimensions, pixmap.samples)
async def tesseract_ocr_pdf(pdf_path):
    """OCR every page of a PDF with Tesseract and return the combined text.

    Each page is rendered to a PNG inside a private temporary directory and
    passed to ``tesseract_ocr``. The directory is removed afterwards.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz.open``.

    Returns:
        The text of all pages, each page followed by a newline.

    Raises:
        HTTPException: With status 400 if anything fails while opening,
            rendering or recognising the PDF.
    """
    import tempfile  # only this function needs it

    try:
        page_texts = []
        # The context managers close the document and delete the temporary
        # images even when a page fails part-way through.
        with fitz.open(pdf_path) as pdf_document:
            with tempfile.TemporaryDirectory() as temp_dir:
                for page_number in range(len(pdf_document)):
                    page = pdf_document.load_page(page_number)
                    image = pdf_page_to_image(page)

                    # A per-call directory keeps concurrent requests from
                    # overwriting each other's page images.
                    temp_image_path = os.path.join(
                        temp_dir, f"temp_page_{page_number}.png"
                    )
                    image.save(temp_image_path)

                    # Perform OCR on the saved image
                    page_texts.append(await tesseract_ocr(temp_image_path))

        return "".join(f"{page_text}\n" for page_text in page_texts)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        raise HTTPException(status_code=400, detail="Error processing PDF file") from e
async def tesseract_ocr(image_path):
    """Run Tesseract on an image file and return its text as a single line.

    The image is first orientation-corrected by ``detect_rotation``, then
    cleaned up (grayscale, Gaussian blur, Otsu threshold, morphological
    opening) before recognition.

    Args:
        image_path: Path of the image file to recognise.

    Returns:
        The recognised text with every run of whitespace (including line
        breaks) collapsed to one space and no leading/trailing whitespace.
    """
    image = await detect_rotation(image_path)

    # Grayscale, Gaussian blur, Otsu's threshold
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Morph open to remove noise and invert image
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
    invert = 255 - opening

    # --psm 6: treat the image as a single uniform block of text.
    result = pytesseract.image_to_string(invert, config='--psm 6')

    # split() with no argument splits on any whitespace run, so this flattens
    # line breaks and collapses repeated spaces in one pass.
    return ' '.join(result.split())