import functools
import io
import os
import tempfile

import easyocr
import fitz
import pytesseract
from PIL import Image
import cv2
import pymupdf
from fastapi import HTTPException
import numpy as np
from pytesseract import Output
import imutils

# Rely on the `tesseract` executable being discoverable on PATH.
# On a Windows machine where it is not, point this at the full path instead,
# e.g. r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'.
pytesseract.pytesseract.tesseract_cmd = 'tesseract'

# Maps a label as it shows up in raw OCR text to the prefix used in the
# formatted output. Insertion order matters: the first label found inside a
# text fragment wins.
# NOTE(review): spellings such as "TempatITgl Lahir" and "RTIRW" look like the
# OCR engine reading "/" as "I" -- confirm against real scans before "fixing".
_FIELD_LABELS = {
    "NIK": "NIK : ",
    "Nama": "Nama : ",
    "TempatITgl Lahir": "Tempat/Tgl Lahir : ",
    "Jenis Kelamin": "Jenis Kelamin : ",
    "GolDarah": "Gol Darah : ",
    "Alamat": "Alamat : ",
    "RTIRW": "RT/RW : ",
    "KelDesa": "Kel/Desa : ",
    "Kecamatan": "Kecamatan : ",
    "Agama": "Agama : ",
    "Status Perkawinan": "Status Perkawinan : ",
    "Pekerjaan": "Pekerjaan : ",
    "Kewarganegaraan": "Kewarganegaraan : ",
    "Berlaku Hingga": "Berlaku Hingga : "
}


def reformat_ocr_result(result):
    """Group raw OCR fragments into "<label> : <value>" strings.

    Args:
        result: Iterable of strings shaped like
            "<text> (confidence: <score>)"; everything from
            " (confidence: " onwards is ignored.

    Returns:
        A list with one formatted string per label encountered, in the order
        the labels appeared. The value is every non-label fragment seen after
        that label and before the next one, joined with single spaces.

    Fragments that appear before the first label are dropped, and a fragment
    that contains a label contributes only the label: any other text in that
    same fragment is discarded.
    """
    formatted_output = []
    current_key = None
    current_value = []

    for line in result:
        # Keep only the recognised text, without the confidence suffix.
        text = line.split(' (confidence: ')[0]

        matched_key = next(
            (key for key in _FIELD_LABELS if key in text), None
        )

        if matched_key is None:
            # Plain value fragment: attach it to the label being collected.
            if current_key:
                current_value.append(text)
            continue

        # A new label starts here, so flush the one collected so far.
        if current_key:
            formatted_output.append(
                f"{_FIELD_LABELS[current_key]}{' '.join(current_value)}"
            )
            current_value = []
        current_key = matched_key

    # Flush the last label, which has no following label to trigger it.
    if current_key:
        formatted_output.append(
            f"{_FIELD_LABELS[current_key]}{' '.join(current_value)}"
        )

    print(formatted_output)
    return formatted_output


async def detect_rotation(image):
    """Load an image from disk and rotate it to Tesseract's detected upright.

    Args:
        image: Path of the image file, passed straight to ``cv2.imread``.

    Returns:
        The image as loaded by OpenCV, rotated by the angle Tesseract's
        orientation-and-script detection reports under ``"rotate"``.

    NOTE(review): nothing here guards against an unreadable file or against
    Tesseract failing to determine an orientation; any such error propagates
    to the caller unchanged.
    """
    # Tesseract is given RGB channel ordering, while the rotation below is
    # applied to the image exactly as OpenCV loaded it.
    image = cv2.imread(image)
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = pytesseract.image_to_osd(rgb, output_type=Output.DICT)

    # Display the orientation information.
    print("[INFO] detected orientation: {}".format(
        results["orientation"]))
    print("[INFO] rotate by {} degrees to correct".format(
        results["rotate"]))
    print("[INFO] detected script: {}".format(results["script"]))

    # rotate_bound enlarges the canvas so no corner is cropped by the turn.
    rotated = imutils.rotate_bound(image, angle=results["rotate"])
    return rotated


@functools.lru_cache(maxsize=1)
def _get_easyocr_reader():
    """Return the shared EasyOCR reader, creating it on first use only."""
    return easyocr.Reader(['id'])


def easyocr_ocr(image_path):
    """Run EasyOCR on an image and return the labelled fields.

    Args:
        image_path: Image location handed to ``Reader.readtext``.

    Returns:
        The list produced by ``reformat_ocr_result`` for the recognised text.
    """
    # Constructing a Reader loads the recognition models, so the instance is
    # built once and reused instead of being recreated for every request.
    reader = _get_easyocr_reader()
    result = reader.readtext(image_path)
    result_list = [
        f"{text} (confidence: {confidence:.4f})"
        for (bbox, text, confidence) in result
    ]
    return reformat_ocr_result(result_list)


# Function to convert PDF page to PIL image
def pdf_page_to_image(pdf_page):
    """Render a PDF page to an RGB PIL image.

    Args:
        pdf_page: Page object exposing ``get_pixmap()`` (as returned by
            ``load_page`` on a document opened with ``fitz``).

    Returns:
        A ``PIL.Image.Image`` built from the pixmap's raw samples.
    """
    # Rendering uses get_pixmap()'s defaults; the raw samples are read as RGB.
    pix = pdf_page.get_pixmap()
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    return img


async def tesseract_ocr_pdf(pdf_path):
    """OCR every page of a PDF with Tesseract and concatenate the text.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.

    Returns:
        The text of each page followed by a newline, in page order. An empty
        document yields an empty string.

    Raises:
        HTTPException: status 400 when anything goes wrong while opening,
            rendering or recognising the document. The underlying error is
            attached as the exception's cause.
    """
    try:
        page_texts = []
        # Page images live in a private directory so that concurrent calls
        # cannot overwrite each other's files, and so that nothing is left
        # behind on disk once OCR has finished or failed.
        with tempfile.TemporaryDirectory() as temp_dir:
            pdf_document = fitz.open(pdf_path)
            try:
                for page_number in range(len(pdf_document)):
                    page = pdf_document.load_page(page_number)
                    image = pdf_page_to_image(page)

                    # tesseract_ocr works from a file path, so the rendered
                    # page has to be written out first.
                    temp_image_path = os.path.join(
                        temp_dir, f"temp_page_{page_number}.png"
                    )
                    image.save(temp_image_path)

                    page_texts.append(await tesseract_ocr(temp_image_path))
            finally:
                # Release the document even when a page fails.
                pdf_document.close()

        return "".join(f"{page_text}\n" for page_text in page_texts)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        raise HTTPException(
            status_code=400, detail="Error processing PDF file"
        ) from e


async def tesseract_ocr(image_path):
    """Recognise the text of an image file with Tesseract.

    The image is rotated upright, cleaned up (grayscale, Gaussian blur,
    Otsu threshold, morphological opening) and then read with page
    segmentation mode 6.

    Args:
        image_path: Path of the image file.

    Returns:
        The recognised text on a single line: every run of whitespace,
        including line breaks, is collapsed to one space and leading/trailing
        whitespace is removed.
    """
    image = await detect_rotation(image_path)

    # Grayscale, Gaussian blur, Otsu's threshold
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.threshold(
        blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )[1]

    # Morph open to remove noise and invert image
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
    invert = 255 - opening

    result = pytesseract.image_to_string(invert, config='--psm 6')

    # Replacing line breaks with spaces leaves runs of several spaces behind;
    # splitting on whitespace and re-joining flattens them all in one pass.
    return " ".join(result.split())