import easyocr
import fitz
import pytesseract
from PIL import Image
import cv2
import io, os
import pymupdf
from fastapi  import HTTPException
import numpy as np
from pytesseract import Output
import imutils


# Select the Tesseract executable that pytesseract invokes.
# Disabled alternative: absolute path to a Windows installation.
# pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
# Active setting: the bare command name, which must be resolvable via PATH.
pytesseract.pytesseract.tesseract_cmd = 'tesseract'
# Disabled alternative: pick the executable per platform instead of
# hard-coding one value.
# if os.name == 'nt':
#     # Windows
#     pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
# else:
#     # Unix/Linux, where tesseract should be in the PATH
#     pytesseract.pytesseract.tesseract_cmd = 'tesseract'

def reformat_ocr_result(result):
    """Group raw OCR lines into ``"<Label> : <value>"`` strings.

    Each entry of ``result`` is a string, optionally followed by a
    ``" (confidence: ...)"`` suffix, which is stripped. An entry containing
    one of the known field labels starts a new field; any text on that same
    entry is discarded. Following entries that contain no label are joined
    with single spaces to form the field's value. Entries seen before the
    first label are ignored.

    Args:
        result: Iterable of OCR line strings.

    Returns:
        List of formatted strings, one per label occurrence, in input order.
        The list is also printed to stdout.
    """
    # OCR spelling of each field label -> prefix used in the output.
    labels = {
        "NIK": "NIK : ",
        "Nama": "Nama : ",
        "TempatITgl Lahir": "Tempat/Tgl Lahir : ",
        "Jenis Kelamin": "Jenis Kelamin : ",
        "GolDarah": "Gol Darah : ",
        "Alamat": "Alamat : ",
        "RTIRW": "RT/RW : ",
        "KelDesa": "Kel/Desa : ",
        "Kecamatan": "Kecamatan : ",
        "Agama": "Agama : ",
        "Status Perkawinan": "Status Perkawinan : ",
        "Pekerjaan": "Pekerjaan : ",
        "Kewarganegaraan": "Kewarganegaraan : ",
        "Berlaku Hingga": "Berlaku Hingga : "
    }

    rows = []
    active_label = None
    pieces = []

    for entry in result:
        # Drop the trailing confidence annotation, if present.
        text = entry.split(' (confidence: ')[0]

        # First label (in table order) that occurs in this entry, if any.
        matched = next((label for label in labels if label in text), None)

        if matched is None:
            # Plain text belongs to the field currently being collected.
            if active_label:
                pieces.append(text)
            continue

        # A new label closes the field that was being collected.
        if active_label:
            rows.append(labels[active_label] + ' '.join(pieces))
            pieces = []
        active_label = matched

    # Flush the field that was still open when the input ended.
    if active_label:
        rows.append(labels[active_label] + ' '.join(pieces))

    print(rows)
    return rows

async def detect_rotation(image):
    """Load an image from disk and rotate it by the angle Tesseract suggests.

    Args:
        image: Path of the image file, passed to ``cv2.imread``.

    Returns:
        The loaded image rotated by the ``"rotate"`` angle reported by
        Tesseract's orientation and script detection (OSD).

    Raises:
        ValueError: If OpenCV could not read an image from ``image``.
    """
    loaded = cv2.imread(image)
    # cv2.imread signals failure by returning None rather than raising, so
    # fail here with a clear message instead of an opaque cvtColor error.
    if loaded is None:
        raise ValueError(f"Could not read image file: {image}")

    # Convert from BGR to RGB channel ordering and let Tesseract determine
    # the text orientation.
    rgb = cv2.cvtColor(loaded, cv2.COLOR_BGR2RGB)
    results = pytesseract.image_to_osd(rgb, output_type=Output.DICT)

    # Display the orientation information.
    print("[INFO] detected orientation: {}".format(
        results["orientation"]))
    print("[INFO] rotate by {} degrees to correct".format(
        results["rotate"]))
    print("[INFO] detected script: {}".format(results["script"]))

    # Rotate the original (BGR) image to correct the orientation.
    rotated = imutils.rotate_bound(loaded, angle=results["rotate"])
    return rotated


# Shared EasyOCR reader, created on first use by _get_easyocr_reader().
_EASYOCR_READER = None


def _get_easyocr_reader():
    """Return the shared EasyOCR reader, constructing it on first call."""
    global _EASYOCR_READER
    if _EASYOCR_READER is None:
        # Construct the reader once and reuse it, instead of rebuilding it
        # for every OCR request.
        _EASYOCR_READER = easyocr.Reader(['id'])
    return _EASYOCR_READER


def easyocr_ocr(image_path):
    """Run EasyOCR on an image and return the formatted field strings.

    Args:
        image_path: Image input forwarded to ``Reader.readtext``.

    Returns:
        The list produced by ``reformat_ocr_result`` for the detected text.
    """
    detections = _get_easyocr_reader().readtext(image_path)
    # Each detection is (bbox, text, confidence); the bounding box is unused.
    result_list = [
        f"{text} (confidence: {confidence:.4f})"
        for (_, text, confidence) in detections
    ]
    return reformat_ocr_result(result_list)

# Render a PDF page and wrap the resulting pixels in a PIL image
def pdf_page_to_image(pdf_page):
    """Convert a PDF page into a PIL image.

    Args:
        pdf_page: Page object exposing ``get_pixmap()``.

    Returns:
        A PIL image built from the pixmap's samples, interpreted as "RGB".
    """
    # NOTE(review): assumes the default pixmap has packed RGB samples
    # without an alpha channel; confirm if get_pixmap options ever change.
    pixmap = pdf_page.get_pixmap()
    dimensions = [pixmap.width, pixmap.height]
    return Image.frombytes("RGB", dimensions, pixmap.samples)

async def tesseract_ocr_pdf(pdf_path):
    """Run Tesseract OCR on every page of a PDF and return the combined text.

    Each page is rendered to a PNG inside a private temporary directory,
    passed to ``tesseract_ocr``, and the directory is removed afterwards.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz.open``.

    Returns:
        The OCR text of all pages, each page's text followed by a newline.

    Raises:
        HTTPException: With status 400 if opening, rendering, or OCR fails.
    """
    # Local import keeps this block self-contained; tempfile is stdlib.
    import tempfile

    try:
        pdf_document = fitz.open(pdf_path)
        try:
            page_texts = []
            # A per-call directory avoids filename collisions between
            # concurrent requests and guarantees the page images are deleted.
            with tempfile.TemporaryDirectory() as temp_dir:
                for page_number in range(len(pdf_document)):
                    page = pdf_document.load_page(page_number)
                    image = pdf_page_to_image(page)
                    temp_image_path = os.path.join(
                        temp_dir, f"temp_page_{page_number}.png"
                    )
                    image.save(temp_image_path)
                    # Perform OCR on the saved image.
                    page_texts.append(await tesseract_ocr(temp_image_path))
        finally:
            # Close the document even when a page fails to process.
            pdf_document.close()
        return "".join(f"{page_text}\n" for page_text in page_texts)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        raise HTTPException(
            status_code=400, detail="Error processing PDF file"
        ) from e

async def tesseract_ocr(image_path):
    """OCR a single image file with Tesseract after basic clean-up.

    The image is first rotated via ``detect_rotation``, then converted to
    grayscale, blurred, Otsu-thresholded (inverted), opened with a 3x3
    rectangular kernel, and inverted again before recognition.

    Args:
        image_path: Path of the image file to process.

    Returns:
        The recognised text with newlines and carriage returns replaced by
        spaces, followed by one pass replacing double spaces with one space.
    """
    upright = await detect_rotation(image_path)

    # Grayscale -> Gaussian blur -> inverted Otsu binarisation.
    grayscale = cv2.cvtColor(upright, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.GaussianBlur(grayscale, (3, 3), 0)
    _, binary = cv2.threshold(
        smoothed, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    # Morphological opening to remove noise, then invert the image back.
    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    cleaned = cv2.morphologyEx(binary, cv2.MORPH_OPEN, rect_kernel, iterations=1)
    ocr_input = 255 - cleaned

    recognised = pytesseract.image_to_string(ocr_input, config='--psm 6')
    # Flatten to one line; the replacements run in this exact order.
    for needle in ('\n', '\r', '  '):
        recognised = recognised.replace(needle, ' ')

    return recognised