import os

import cv2
import numpy as np
from pdf2image import convert_from_path

from main import RapidOCR
from image_enhancement import enhance_image

# Initialize OCR engine once.
ocr_engine = RapidOCR()


def adaptive_threshold_to_rgb(image_rgb):
    """
    Binarize the lightness of an RGB image while keeping its chroma.

    The image is converted to LAB, the L channel is replaced by its
    Gaussian adaptive threshold (blockSize=11, C=2, output values 0/255),
    and the result is merged with the untouched A and B channels and
    converted back to RGB.

    Parameters:
        image_rgb (numpy.ndarray): Input RGB image.

    Returns:
        thresholded_rgb (numpy.ndarray): RGB image after thresholding the L channel.
    """
    image_lab = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2LAB)
    l_channel, a_channel, b_channel = cv2.split(image_lab)

    # Only the lightness channel is thresholded; A and B are left as they were.
    thresholded_l = cv2.adaptiveThreshold(
        l_channel,
        maxValue=255,
        adaptiveMethod=cv2.ADAPTIVE_THRESH_GAUSSIAN_C,  # or ADAPTIVE_THRESH_MEAN_C
        thresholdType=cv2.THRESH_BINARY,
        blockSize=11,
        C=2,
    )

    updated_lab = cv2.merge((thresholded_l, a_channel, b_channel))
    return cv2.cvtColor(updated_lab, cv2.COLOR_LAB2RGB)


def ocr_detect(image, ocr_engine):
    """
    Run OCR on the image and look for two consecutive recognized rows that
    each contain more than one '<' character.

    Parameters:
        image (numpy.ndarray): Input image.
        ocr_engine: OCR engine instance; called as
            ocr_engine(image, use_det=True, use_cls=False, use_rec=True)
            and expected to return a (result, extra) pair.

    Returns:
        detected (bool): True if such a pair of rows was found, else False.
        row1 (str): The first row of the pair, or None.
        row2 (str): The second row of the pair, or None.
    """
    result, _ = ocr_engine(image, use_det=True, use_cls=False, use_rec=True)
    if not result:
        return False, None, None

    # Index 1 of every result entry is used as the recognized string.
    texts = [entry[1] for entry in result]
    for current_row, next_row in zip(texts, texts[1:]):
        if current_row.count("<") > 1 and next_row.count("<") > 1:
            return True, current_row, next_row
    return False, None, None


def rotate_until_detect(image, ocr_engine, max_attempts=4):
    """
    Try OCR detection on the image, rotating it 90° clockwise after every
    failed attempt, for at most max_attempts attempts.

    Parameters:
        image (numpy.ndarray): Input image.
        ocr_engine: OCR engine instance.
        max_attempts (int): Maximum number of OCR attempts.

    Returns:
        image (numpy.ndarray): The image in the orientation that succeeded,
            or the image after max_attempts rotations if nothing was found.
        detected (bool): True if OCR detection succeeded.
        row1, row2 (str, str): The two detected rows (if found; otherwise None).
    """
    for _ in range(max_attempts):
        detected, row1, row2 = ocr_detect(image, ocr_engine)
        if detected:
            return image, True, row1, row2
        # A rotation also follows the last failed attempt, so the returned
        # image has been rotated once per failed attempt.
        image = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    return image, False, None, None


def _detect_with_fallback(img, ocr_engine):
    """Try rotation-based detection on img, then on its adaptive-thresholded version."""
    _, detected, row1, row2 = rotate_until_detect(img, ocr_engine)
    if detected:
        return True, row1, row2

    adaptive_img = adaptive_threshold_to_rgb(img)
    _, detected, row1, row2 = rotate_until_detect(adaptive_img, ocr_engine)
    return detected, row1, row2


def process_pdf(pdf_f, ocr_engine, enhance_params, save_images=False):
    """
    Process a single PDF file: render pages 1-3 at 300 dpi, enhance each
    page, and attempt OCR detection (first on the enhanced page, then on an
    adaptive-thresholded copy). Processing stops at the first page that
    yields two consecutive matching rows.

    Parameters:
        pdf_f (str): File path of the PDF.
        ocr_engine: The OCR engine instance.
        enhance_params (dict): Parameters for image enhancement.
        save_images (bool): If True, write each enhanced page as
            '<pdf stem>_<page number>.jpg' relative to the current
            working directory (default: False).

    Returns:
        (pdf_success, detected_rows):
            pdf_success (bool): True if detection succeeded in any page.
            detected_rows (tuple): (row1, row2) from the successful page,
                or (None, None) if not.
    """
    images = convert_from_path(pdf_f, dpi=300, first_page=1, last_page=3)
    bs_name = os.path.basename(pdf_f)
    bs_name_0 = os.path.splitext(bs_name)[0]

    pdf_success = False
    detected_rows = (None, None)

    for i, pil_image in enumerate(images):
        img = np.array(pil_image)
        img = enhance_image(img, enhance_params, verbose=False)
        # NOTE(review): the scaling by 255 implies enhance_image returns
        # floats nominally in [0, 1] — confirm. Clipping first prevents
        # out-of-range values from wrapping around in the uint8 cast.
        img = np.clip(img * 255., 0, 255).astype(np.uint8)

        if save_images:
            # cv2.imwrite expects BGR channel order.
            enhanced_img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            cv2.imwrite(f'{bs_name_0}_{i + 1}.jpg', enhanced_img_bgr)

        detected, row1, row2 = _detect_with_fallback(img, ocr_engine)
        if detected:
            pdf_success = True
            detected_rows = (row1, row2)
            break
        print(f"OCR detection failed on page {i + 1} of {bs_name}.")

    if pdf_success:
        print(f"PDF file {bs_name_0} processed successfully.")
    else:
        print(f"PDF file {bs_name_0} did NOT yield a successful OCR detection.")

    return pdf_success, detected_rows


def main(data_path=None):
    """
    Walk a folder tree, run process_pdf on every PDF found, and print the
    detected rows (or a failure notice) for each file.

    Parameters:
        data_path (str | None): Root folder to search for PDFs. When None,
            the original hard-coded sample folder is used.
    """
    if data_path is None:
        data_path = '/home/tung/Tung_Works/OCR_code/OCR-20250423T073748Z-001/OCR/OCR辨識失敗-部分樣本'

    # Match the extension case-insensitively so '*.PDF' files are not skipped.
    list_pdf = [
        os.path.join(root, file)
        for root, _, files in os.walk(data_path)
        for file in files
        if file.lower().endswith('.pdf')
    ]

    # Define image enhancement parameters.
    enhance_params = {
        'local_contrast': 1.2,     # 1.2x increase in detail
        'mid_tones': 0.5,          # middle range
        'tonal_width': 0.5,        # middle range
        'areas_dark': 0.7,         # 70% improvement in dark areas
        'areas_bright': 0.5,       # 50% improvement in bright areas
        'brightness': 0.1,         # slight increase in overall brightness
        'saturation_degree': 1.2,  # 1.2x increase in color saturation
        'preserve_tones': True,
        'color_correction': True,
    }

    # Process each PDF and report the outcome.
    for pdf_f in list_pdf:
        print("")
        print(f"--- Processing PDF: {pdf_f} ---")
        success, detected_rows = process_pdf(pdf_f, ocr_engine, enhance_params, save_images=False)
        if success:
            print("PDF:", os.path.basename(pdf_f))
            print("Row 1:", detected_rows[0])
            print("Row 2:", detected_rows[1])
        else:
            print("No successful detection for this PDF.")


if __name__ == '__main__':
    main()