Spaces:

ariansyahdedy
/

productivity

Runtime error

App Files Files Community

productivity / app /ocr.py

ariansyahdedy

add tesseract-ocr

82700d7 over 1 year ago

raw

history blame contribute delete

5.04 kB

	import easyocr
	import fitz
	import pytesseract
	from PIL import Image
	import cv2
	import io, os
	import pymupdf
	from fastapi import HTTPException
	import numpy as np
	from pytesseract import Output
	import imutils


	# Executable that pytesseract launches for every OCR/OSD call. The bare
	# command name 'tesseract' is used, so the binary is expected to be on PATH.
	#
	# Alternative kept for reference (not active): pick a hard-coded Windows
	# install path when running on Windows, otherwise fall back to PATH lookup.
	#   if os.name == 'nt':
	#       pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
	#   else:
	#       pytesseract.pytesseract.tesseract_cmd = 'tesseract'
	pytesseract.pytesseract.tesseract_cmd = 'tesseract'

	def reformat_ocr_result(result):
	    """Group raw OCR lines into ``"<Label> : <value>"`` strings.

	    Each entry of ``result`` is a string of the form
	    ``"<text> (confidence: <score>)"``; only the part before the confidence
	    suffix is used. A line containing one of the known field names starts a
	    new field (any other text on that same line is discarded). Every
	    following line that contains no field name is appended to the value of
	    the field currently being collected. Lines seen before the first field
	    name are ignored.

	    Args:
	        result: Iterable of OCR result strings.

	    Returns:
	        List of formatted ``"<Label> : <value>"`` strings, in the order the
	        fields were encountered. The list is also printed to stdout.
	    """
	    # Field name as it comes out of the OCR step -> prefix used in the output.
	    # NOTE(review): keys such as "TempatITgl Lahir" and "RTIRW" appear to be
	    # OCR misreadings of "/" as "I" - confirm against real OCR output.
	    labels = {
	        "NIK": "NIK : ",
	        "Nama": "Nama : ",
	        "TempatITgl Lahir": "Tempat/Tgl Lahir : ",
	        "Jenis Kelamin": "Jenis Kelamin : ",
	        "GolDarah": "Gol Darah : ",
	        "Alamat": "Alamat : ",
	        "RTIRW": "RT/RW : ",
	        "KelDesa": "Kel/Desa : ",
	        "Kecamatan": "Kecamatan : ",
	        "Agama": "Agama : ",
	        "Status Perkawinan": "Status Perkawinan : ",
	        "Pekerjaan": "Pekerjaan : ",
	        "Kewarganegaraan": "Kewarganegaraan : ",
	        "Berlaku Hingga": "Berlaku Hingga : "
	    }

	    entries = []
	    active_label = None
	    pieces = []

	    for raw_line in result:
	        # Drop the trailing " (confidence: ...)" annotation.
	        text = raw_line.split(' (confidence: ')[0]

	        # First field name (in declaration order) contained in the text.
	        matched = next((name for name in labels if name in text), None)

	        if matched is None:
	            # Plain value text: only kept once a field has been opened.
	            if active_label:
	                pieces.append(text)
	            continue

	        # A new field starts here, so close the one collected so far.
	        if active_label:
	            entries.append(f"{labels[active_label]}{' '.join(pieces)}")
	            pieces = []
	        active_label = matched

	    # Close the field that was still open when the input ran out.
	    if active_label:
	        entries.append(f"{labels[active_label]}{' '.join(pieces)}")

	    print(entries)
	    return entries

	async def detect_rotation(image):
	    """Load an image file and rotate it by the angle Tesseract OSD reports.

	    Args:
	        image: Path of the image file; handed to ``cv2.imread``.

	    Returns:
	        The loaded image (as read by ``cv2.imread``) rotated by the ``"rotate"``
	        value of Tesseract's orientation-and-script detection result.

	    Raises:
	        ValueError: If ``cv2.imread`` could not load the file.
	    """
	    loaded = cv2.imread(image)
	    if loaded is None:
	        # cv2.imread reports an unreadable or missing file by returning None
	        # instead of raising; fail here with a message that names the file
	        # rather than with an opaque error inside cvtColor.
	        raise ValueError(f"Could not read image: {image!r}")

	    # The OSD input is converted from BGR to RGB channel ordering first.
	    rgb = cv2.cvtColor(loaded, cv2.COLOR_BGR2RGB)
	    results = pytesseract.image_to_osd(rgb, output_type=Output.DICT)

	    # Report what was detected.
	    print("[INFO] detected orientation: {}".format(
	        results["orientation"]))
	    print("[INFO] rotate by {} degrees to correct".format(
	        results["rotate"]))
	    print("[INFO] detected script: {}".format(results["script"]))

	    # Rotate the originally loaded image (not the RGB copy) to correct it.
	    rotated = imutils.rotate_bound(loaded, angle=results["rotate"])
	    return rotated


	# Lazily created EasyOCR reader shared by all calls to easyocr_ocr().
	_EASYOCR_READER = None


	def _get_easyocr_reader():
	    """Return the shared EasyOCR reader, creating it on first use."""
	    global _EASYOCR_READER
	    if _EASYOCR_READER is None:
	        # Constructing a Reader loads the recognition models, so it is done
	        # once and reused instead of being repeated for every image.
	        _EASYOCR_READER = easyocr.Reader(['id'])
	    return _EASYOCR_READER


	def easyocr_ocr(image_path):
	    """Run EasyOCR on an image and return the recognised fields.

	    Args:
	        image_path: Image input passed straight to ``Reader.readtext``.

	    Returns:
	        The list produced by ``reformat_ocr_result`` from the detected text
	        lines, each annotated with its confidence before reformatting.
	    """
	    detections = _get_easyocr_reader().readtext(image_path)
	    # Each detection is (bounding box, text, confidence); the box is unused.
	    result_list = [
	        f"{text} (confidence: {confidence:.4f})"
	        for (_bbox, text, confidence) in detections
	    ]
	    return reformat_ocr_result(result_list)

	# Render a single PDF page and wrap its pixels in a PIL image
	def pdf_page_to_image(pdf_page):
	    """Rasterise ``pdf_page`` and return it as a PIL image.

	    Args:
	        pdf_page: Page object exposing ``get_pixmap()``.

	    Returns:
	        A PIL image built in "RGB" mode from the pixmap's width, height and
	        raw sample bytes.
	    """
	    pixmap = pdf_page.get_pixmap()
	    dimensions = [pixmap.width, pixmap.height]
	    return Image.frombytes("RGB", dimensions, pixmap.samples)

	async def tesseract_ocr_pdf(pdf_path):
	    """OCR every page of a PDF and return the concatenated text.

	    Each page is rendered to a PNG, run through ``tesseract_ocr``, and its
	    text is followed by a newline in the result.

	    Args:
	        pdf_path: Path of the PDF file; handed to ``fitz.open``.

	    Returns:
	        The text of all pages, each page terminated by ``"\\n"``.

	    Raises:
	        HTTPException: With status 400 if opening, rendering or OCR fails for
	            any reason; the original exception is attached as the cause.
	    """
	    import tempfile  # local import: only needed by this function

	    try:
	        pdf_document = fitz.open(pdf_path)
	        try:
	            page_texts = []
	            # Page images go into a private temporary directory so that they
	            # are removed afterwards and concurrent calls cannot overwrite
	            # each other's files.
	            with tempfile.TemporaryDirectory() as temp_dir:
	                for page_number in range(len(pdf_document)):
	                    page = pdf_document.load_page(page_number)
	                    image = pdf_page_to_image(page)
	                    temp_image_path = os.path.join(
	                        temp_dir, f"temp_page_{page_number}.png"
	                    )
	                    image.save(temp_image_path)
	                    # Perform OCR on the saved image.
	                    page_text = await tesseract_ocr(temp_image_path)
	                    page_texts.append(page_text + "\n")
	        finally:
	            # Release the document even when a page fails.
	            pdf_document.close()
	        return "".join(page_texts)
	    except Exception as e:
	        print(f"Error opening PDF: {e}")
	        raise HTTPException(status_code=400, detail="Error processing PDF file") from e

	async def tesseract_ocr(image_path):
	    """Run Tesseract on an image file after rotation correction and cleanup.

	    Args:
	        image_path: Path of the image file; forwarded to ``detect_rotation``.

	    Returns:
	        The recognised text with newline and carriage-return characters
	        replaced by spaces.
	    """
	    upright = await detect_rotation(image_path)

	    # Grayscale -> 3x3 Gaussian blur -> inverted Otsu threshold.
	    grayscale = cv2.cvtColor(upright, cv2.COLOR_BGR2GRAY)
	    smoothed = cv2.GaussianBlur(grayscale, (3, 3), 0)
	    _, binary = cv2.threshold(
	        smoothed, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
	    )

	    # Morphological opening to remove noise, then flip the polarity back.
	    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
	    denoised = cv2.morphologyEx(binary, cv2.MORPH_OPEN, rect_kernel, iterations=1)
	    ocr_input = 255 - denoised

	    raw_text = pytesseract.image_to_string(ocr_input, config='--psm 6')
	    # NOTE(review): the last replace maps a space to itself and has no
	    # effect as written; it may have been meant to collapse double spaces.
	    return raw_text.replace('\n', ' ').replace('\r', ' ').replace(' ', ' ')