Spaces:
Runtime error
Runtime error
| import easyocr | |
| import fitz | |
| import pytesseract | |
| from PIL import Image | |
| import cv2 | |
| import io, os | |
| import pymupdf | |
| from fastapi import HTTPException | |
| import numpy as np | |
| from pytesseract import Output | |
| import imutils | |
# Location of the Tesseract executable used by pytesseract.
# Defaults to 'tesseract', i.e. the binary is looked up on PATH (the usual
# Unix/Linux setup). Set the TESSERACT_CMD environment variable to point at an
# explicit binary instead of editing this file, e.g. on Windows:
#   C:\Program Files\Tesseract-OCR\tesseract.exe
pytesseract.pytesseract.tesseract_cmd = os.environ.get('TESSERACT_CMD', 'tesseract')
def reformat_ocr_result(result):
    """Group OCR lines under the field label that precedes them.

    Each entry of ``result`` is a string shaped like
    ``"<text> (confidence: <score>)"``; only the part before
    ``" (confidence: "`` is used. A line whose text contains one of the known
    labels starts a new field (nothing else from that line is kept). Every
    following line is collected as that field's value until the next label
    line shows up. Lines seen before the first label are dropped.

    Returns a list of ``"<Label> : <value parts joined by spaces>"`` strings in
    the order the labels were encountered. The list is also printed.
    """
    # Raw label as it appears in the OCR text -> prefix used in the output.
    # NOTE(review): keys such as "TempatITgl Lahir" / "RTIRW" presumably mirror
    # how the OCR engine misreads "/" in the printed labels - confirm.
    labels = {
        "NIK": "NIK : ",
        "Nama": "Nama : ",
        "TempatITgl Lahir": "Tempat/Tgl Lahir : ",
        "Jenis Kelamin": "Jenis Kelamin : ",
        "GolDarah": "Gol Darah : ",
        "Alamat": "Alamat : ",
        "RTIRW": "RT/RW : ",
        "KelDesa": "Kel/Desa : ",
        "Kecamatan": "Kecamatan : ",
        "Agama": "Agama : ",
        "Status Perkawinan": "Status Perkawinan : ",
        "Pekerjaan": "Pekerjaan : ",
        "Kewarganegaraan": "Kewarganegaraan : ",
        "Berlaku Hingga": "Berlaku Hingga : "
    }

    fields = []
    active_label = None
    pending_parts = []

    for entry in result:
        # Drop the trailing confidence annotation, keep the recognised text.
        text = entry.split(' (confidence: ')[0]

        # First label (in declaration order) contained in the text, if any.
        matched_label = next((label for label in labels if label in text), None)

        if matched_label is None:
            # Plain value line: only meaningful once a label has been seen.
            if active_label:
                pending_parts.append(text)
            continue

        # A new label closes the field that was being collected.
        if active_label:
            fields.append(f"{labels[active_label]}{' '.join(pending_parts)}")
            pending_parts = []
        active_label = matched_label

    # Emit the field that was still open when the input ran out.
    if active_label:
        fields.append(f"{labels[active_label]}{' '.join(pending_parts)}")

    print(fields)
    return fields
async def detect_rotation(image):
    """Load an image from disk and rotate it upright using Tesseract OSD.

    Args:
        image: Path of the image file; it is loaded with ``cv2.imread``.

    Returns:
        The loaded BGR image rotated by the angle reported by Tesseract's
        orientation-and-script detection. If detection fails, the image is
        returned unrotated so that OCR can still be attempted.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    image_path = image
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with a clear message instead of inside cvtColor.
        raise ValueError(f"Could not read image: {image_path}")

    # Tesseract expects RGB channel ordering, OpenCV loads BGR.
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    try:
        results = pytesseract.image_to_osd(rgb, output_type=Output.DICT)
    except pytesseract.TesseractError as e:
        # Orientation detection can fail (e.g. too little text on the page);
        # that should not prevent the caller from running OCR at all.
        print(f"[INFO] orientation detection failed, keeping image as-is: {e}")
        return image

    # display the orientation information
    print("[INFO] detected orientation: {}".format(
        results["orientation"]))
    print("[INFO] rotate by {} degrees to correct".format(
        results["rotate"]))
    print("[INFO] detected script: {}".format(results["script"]))

    # rotate the image to correct the orientation
    rotated = imutils.rotate_bound(image, angle=results["rotate"])
    return rotated
# Shared EasyOCR reader, created lazily on first use. Constructing a Reader
# loads the recognition models, so it is done once instead of per call.
_EASYOCR_READER = None


def _get_easyocr_reader():
    """Return the shared Indonesian ('id') EasyOCR reader, creating it once."""
    global _EASYOCR_READER
    if _EASYOCR_READER is None:
        _EASYOCR_READER = easyocr.Reader(['id'])
    return _EASYOCR_READER


def easyocr_ocr(image_path):
    """Run EasyOCR on an image and return the recognised fields.

    Args:
        image_path: Image input passed straight to ``Reader.readtext``.

    Returns:
        The list produced by ``reformat_ocr_result`` from the detections,
        each detection being rendered as
        ``"<text> (confidence: <score with 4 decimals>)"`` first.
    """
    detections = _get_easyocr_reader().readtext(image_path)
    # Each detection is a (bounding box, text, confidence) triple.
    result_list = [
        f"{text} (confidence: {confidence:.4f})"
        for (_bbox, text, confidence) in detections
    ]
    return reformat_ocr_result(result_list)
def pdf_page_to_image(pdf_page):
    """Convert a PDF page to a PIL image.

    The page is rasterised with ``get_pixmap()`` (default settings) and the
    resulting raw samples are wrapped in an ``"RGB"`` PIL image of the same
    width and height.
    """
    pixmap = pdf_page.get_pixmap()
    size = [pixmap.width, pixmap.height]
    return Image.frombytes("RGB", size, pixmap.samples)
async def tesseract_ocr_pdf(pdf_path):
    """Run Tesseract OCR over every page of a PDF and return the joined text.

    Each page is rendered to a PNG inside a private temporary directory,
    passed through ``tesseract_ocr``, and the per-page results are
    concatenated with a newline after each page. The temporary images are
    removed and the document is closed even when a page fails.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The OCR text of all pages, each page followed by a newline.

    Raises:
        HTTPException: Status 400 if the PDF cannot be opened or any page
            cannot be processed.
    """
    import tempfile  # local import: only needed by this function

    try:
        pdf_document = fitz.open(pdf_path)
        try:
            page_texts = []
            # A per-call directory keeps concurrent requests from overwriting
            # each other's page images and guarantees cleanup afterwards.
            with tempfile.TemporaryDirectory() as temp_dir:
                for page_number, page in enumerate(pdf_document):
                    image = pdf_page_to_image(page)
                    temp_image_path = os.path.join(
                        temp_dir, f"temp_page_{page_number}.png"
                    )
                    image.save(temp_image_path)
                    # Perform OCR on the saved image
                    page_texts.append(await tesseract_ocr(temp_image_path))
        finally:
            pdf_document.close()
        return "".join(f"{page_text}\n" for page_text in page_texts)
    except Exception as e:
        # Boundary handler: any failure is reported to the client as a 400.
        print(f"Error processing PDF: {e}")
        raise HTTPException(status_code=400, detail="Error processing PDF file") from e
async def tesseract_ocr(image_path):
    """OCR an image file with Tesseract after cleaning it up.

    The image is first orientation-corrected by ``detect_rotation``, then
    converted to grayscale, blurred (3x3 Gaussian), binarised with an
    inverted Otsu threshold, de-noised with a 3x3 morphological opening and
    inverted back before being handed to Tesseract with ``--psm 6``.

    Args:
        image_path: Path of the image file to recognise.

    Returns:
        The recognised text on a single line: newlines and carriage returns
        are replaced by spaces and runs of spaces are collapsed to one.
    """
    import re  # local import: only needed by this function

    image = await detect_rotation(image_path)

    # Grayscale, Gaussian blur, Otsu's threshold
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Morph open to remove noise and invert image
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
    invert = 255 - opening

    result = pytesseract.image_to_string(invert, config='--psm 6')
    result = result.replace('\n', ' ').replace('\r', ' ')
    # Replacing line breaks leaves runs of spaces behind; collapse them.
    return re.sub(r' {2,}', ' ', result)