Spaces:
Build error
Build error
| import os | |
| import cv2 | |
| import numpy as np | |
| from pdf2image import convert_from_path | |
| from main import RapidOCR | |
| from image_enhancement import enhance_image | |
| # Build the OCR engine a single time at import; main() hands this shared instance to process_pdf for every file. | |
| ocr_engine = RapidOCR() | |
def adaptive_threshold_to_rgb(image_rgb):
    """
    Binarize the lightness of an RGB image while leaving its chroma planes as-is.

    The image is converted to LAB, the L plane is run through a Gaussian
    adaptive threshold (block size 11, constant 2, binary output with a
    maximum of 255), and the thresholded plane is merged with the untouched
    A and B planes before converting back to RGB.

    Parameters:
        image_rgb (numpy.ndarray): Input RGB image.

    Returns:
        numpy.ndarray: RGB image rebuilt from the thresholded L plane.
    """
    # Split the LAB representation into its three planes.
    lightness, chroma_a, chroma_b = cv2.split(
        cv2.cvtColor(image_rgb, cv2.COLOR_RGB2LAB)
    )

    # Positional order: src, maxValue, adaptiveMethod, thresholdType, blockSize, C.
    binary_lightness = cv2.adaptiveThreshold(
        lightness,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,  # or ADAPTIVE_THRESH_MEAN_C
        cv2.THRESH_BINARY,
        11,
        2,
    )

    # Recombine with the original A and B planes and go back to RGB.
    lab_out = cv2.merge((binary_lightness, chroma_a, chroma_b))
    return cv2.cvtColor(lab_out, cv2.COLOR_LAB2RGB)
def ocr_detect(image, ocr_engine):
    """
    Run OCR on an image and look for two adjacent rows that are rich in '<'.

    Rows are compared in the order the engine returned them. A pair qualifies
    when both rows each contain more than one '<' character; the first
    qualifying pair is returned.

    Parameters:
        image (numpy.ndarray): Input image.
        ocr_engine: Callable OCR engine; called with use_det=True,
            use_cls=False, use_rec=True and expected to return a
            (result, extra) pair where each result entry holds the
            recognized text at index 1.

    Returns:
        tuple: (True, row1, row2) for the first qualifying pair,
        otherwise (False, None, None).
    """
    result, _ = ocr_engine(image, use_det=True, use_cls=False, use_rec=True)
    if not result:
        # Nothing recognized at all.
        return False, None, None

    texts = [entry[1] for entry in result]
    for upper, lower in zip(texts, texts[1:]):
        upper_marks = upper.count("<")
        lower_marks = lower.count("<")
        # Both rows need at least two '<' characters.
        if min(upper_marks, lower_marks) > 1:
            return True, upper, lower
    return False, None, None
def rotate_until_detect(image, ocr_engine, max_attempts=4):
    """
    Retry OCR detection across successive 90-degree clockwise rotations.

    Each attempt runs ocr_detect on the current orientation. On success the
    loop stops and the image is returned in the orientation that matched.
    On failure the image is rotated 90 degrees clockwise before the next
    attempt; this rotation also happens after the last failed attempt, so
    when every attempt fails the returned image has been rotated
    max_attempts times.

    Parameters:
        image (numpy.ndarray): Input image.
        ocr_engine: OCR engine instance forwarded to ocr_detect.
        max_attempts (int): Maximum number of OCR attempts.

    Returns:
        tuple: (image, detected, row1, row2) where detected is True when a
        qualifying row pair was found, and row1/row2 are the two rows
        (None when nothing was found).
    """
    outcome = (False, None, None)
    remaining = max_attempts
    while remaining > 0:
        outcome = ocr_detect(image, ocr_engine)
        if outcome[0]:
            break
        # No match at this orientation: turn a quarter clockwise and retry.
        image = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
        remaining -= 1

    found, first_row, second_row = outcome
    return image, found, first_row, second_row
def _detect_with_fallback(img, ocr_engine):
    """Try rotation-based OCR on img, then on its adaptive-thresholded copy.

    Returns (detected, row1, row2); row1 and row2 are None when neither
    attempt finds a qualifying pair of rows.
    """
    _, detected, row1, row2 = rotate_until_detect(img, ocr_engine)
    if detected:
        return detected, row1, row2
    # Fallback: threshold the lightness channel and try every rotation again.
    _, detected, row1, row2 = rotate_until_detect(
        adaptive_threshold_to_rgb(img), ocr_engine
    )
    return detected, row1, row2


def process_pdf(pdf_f, ocr_engine, enhance_params, save_images=False):
    """
    Search the first pages of a PDF for two consecutive '<'-rich OCR rows.

    Pages 1 to 3 are rendered at 300 dpi, enhanced, and passed to OCR (all
    four rotations, first on the enhanced image and then on an
    adaptive-thresholded copy). Processing stops at the first page that
    yields a detection. A status line is printed for each failed page and
    for the overall outcome.

    Parameters:
        pdf_f (str): File path of the PDF.
        ocr_engine: The OCR engine instance.
        enhance_params (dict): Parameters forwarded to enhance_image.
        save_images (bool): If True, write each enhanced page as
            '<pdf stem>_<page number>.jpg' in the current working directory
            (default: False).

    Returns:
        (pdf_success, detected_rows):
            pdf_success (bool): True if detection succeeded on any page.
            detected_rows (tuple): (row1, row2) from the successful page,
                or (None, None) if no page succeeded.
    """
    images = convert_from_path(pdf_f, dpi=300, first_page=1, last_page=3)
    bs_name = os.path.basename(pdf_f)
    bs_name_0 = os.path.splitext(bs_name)[0]

    pdf_success = False
    detected_rows = (None, None)
    for page_no, pil_image in enumerate(images, start=1):
        # Convert the PIL page to a NumPy array and enhance it.
        img = enhance_image(np.array(pil_image), enhance_params, verbose=False)
        # Assumes enhance_image returns floats nominally in [0, 1] -- TODO confirm.
        # Clip before the cast so slight overshoot saturates instead of
        # wrapping around in uint8.
        img = np.clip(img * 255., 0, 255).astype(np.uint8)

        # Optionally save the enhanced image (OpenCV writes BGR).
        if save_images:
            enhanced_img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            cv2.imwrite(f'{bs_name_0}_{page_no}.jpg', enhanced_img_bgr)

        detected, row1, row2 = _detect_with_fallback(img, ocr_engine)
        if detected:
            pdf_success = True
            detected_rows = (row1, row2)
            break
        print(f"OCR detection failed on page {page_no} of {bs_name}.")

    if pdf_success:
        print(f"PDF file {bs_name_0} processed successfully.")
    else:
        print(f"PDF file {bs_name_0} did NOT yield a successful OCR detection.")
    return pdf_success, detected_rows
def main(data_path='/home/tung/Tung_Works/OCR_code/OCR-20250423T073748Z-001/OCR/OCR辨識失敗-部分樣本'):
    """
    Run the OCR row detection over every PDF found under a folder.

    The folder is walked recursively; each PDF is passed to process_pdf with
    a fixed set of enhancement parameters, and the detected rows (or a
    failure notice) are printed per file.

    Parameters:
        data_path (str): Root folder to search for PDFs. Defaults to the
            original hard-coded sample folder so calling main() with no
            arguments behaves as before.
    """
    # Match the extension case-insensitively so '.PDF' files are not skipped.
    list_pdf = [
        os.path.join(root, file)
        for root, _, files in os.walk(data_path)
        for file in files if file.lower().endswith('.pdf')
    ]

    # Define image enhancement parameters.
    enhance_params = {
        'local_contrast': 1.2,     # 1.2x increase in detail
        'mid_tones': 0.5,          # middle range
        'tonal_width': 0.5,        # middle range
        'areas_dark': 0.7,         # 70% improvement in dark areas
        'areas_bright': 0.5,       # 50% improvement in bright areas
        'brightness': 0.1,         # slight increase in overall brightness
        'saturation_degree': 1.2,  # 1.2x increase in color saturation
        'preserve_tones': True,
        'color_correction': True,
    }

    # Process each PDF and report its result.
    for pdf_f in list_pdf:
        print("")
        print(f"--- Processing PDF: {pdf_f} ---")
        success, detected_rows = process_pdf(pdf_f, ocr_engine, enhance_params, save_images=False)
        if success:
            print("PDF:", os.path.basename(pdf_f))
            print("Row 1:", detected_rows[0])
            print("Row 2:", detected_rows[1])
        else:
            print("No successful detection for this PDF.")


if __name__ == '__main__':
    main()