# NOTE: this file was recovered from a Hugging Face Space page scrape
# (the "Spaces / Sleeping" status text and pipe-table wrappers are artifacts).
# Standard library
import logging
import os
import tempfile

# Third-party
import easyocr
import fitz  # PyMuPDF

# Module-level logger configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
    """
    Extract text, words, and normalized bounding boxes from a scanned PDF
    using PyMuPDF (page rendering) and EasyOCR (text recognition).

    NOTE: despite the legacy name, this function uses EasyOCR — not
    Tesseract or LayoutLM. The name is kept so existing callers keep
    working; the 0-1000 bbox normalization follows the LayoutLM convention.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: One dict per page with keys:
            'text' (str): all OCR segments joined with spaces,
            'words' (list[str]): individual whitespace-split words,
            'bbox' (list[list[int]]): per-word [x0, y0, x1, y1] boxes,
                normalized to 0-1000 (each word carries its segment's box),
            'image_dims' (list[int]): rendered page [width, height] in px.
        Returns an empty list if the PDF is empty/unreadable or any
        exception occurs.
    """
    # Pre-bind so the finally block is safe even if setup raises before
    # these are assigned (the original code could NameError on temp_path).
    temp_path = None
    doc = None
    try:
        # Work on a private copy of the PDF so the source file is never
        # touched by downstream tooling.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            with open(pdf_path, "rb") as f:
                tmp.write(f.read())
            temp_path = tmp.name
        logger.info(f"Temporary PDF created at: {temp_path}")

        doc = fitz.open(temp_path)
        if not doc.page_count:
            logger.error(f"PDF is empty or unreadable: {pdf_path}")
            return []

        all_pages = []
        # Initialize EasyOCR once per call (model load is expensive);
        # adjust languages as needed.
        reader = easyocr.Reader(['en'], gpu=False)
        for page_num in range(len(doc)):
            page = doc[page_num]
            # Render at ~400 DPI (PDF native resolution is 72) for better
            # small-text detection.
            pix = page.get_pixmap(matrix=fitz.Matrix(400 / 72, 400 / 72))
            img_path = f"{temp_path}_page_{page_num}.png"
            pix.save(img_path)
            logger.info(f"Image generated at: {img_path}, Dimensions: {pix.width}x{pix.height}")
            image_width, image_height = pix.width, pix.height

            # Perform OCR; always delete the per-page image afterwards,
            # even if readtext raises (the original leaked it on error).
            try:
                results = reader.readtext(img_path)
            finally:
                if os.path.exists(img_path):
                    os.unlink(img_path)

            if not results:
                logger.warning(f"No text detected on page {page_num + 1}")

            # Concatenated text kept for compatibility with existing callers.
            text = " ".join(res[1] for res in results)
            words = []
            bboxes = []
            for res in results:
                segment_text = res[1]
                # EasyOCR returns a 4-point quadrilateral; take the min/max
                # over all points so rotated/skewed quads still yield a
                # valid axis-aligned [x0, y0, x1, y1] (corner indices 0 and
                # 2 alone are only correct for axis-aligned boxes).
                quad_xs = [point[0] for point in res[0]]
                quad_ys = [point[1] for point in res[0]]
                normalized_bbox = [
                    int((min(quad_xs) / image_width) * 1000),
                    int((min(quad_ys) / image_height) * 1000),
                    int((max(quad_xs) / image_width) * 1000),
                    int((max(quad_ys) / image_height) * 1000),
                ]
                # Clamp into the 0-1000 range expected downstream.
                normalized_bbox = [max(0, min(1000, coord)) for coord in normalized_bbox]
                # Every word in the segment shares the segment's box —
                # EasyOCR does not give per-word geometry.
                for word in segment_text.split():
                    words.append(word)
                    bboxes.append(normalized_bbox)

            if text.strip():
                all_pages.append({
                    "text": text,
                    "words": words,
                    "bbox": bboxes,
                    "image_dims": [image_width, image_height],
                })
            else:
                # Placeholder entry keeps page indices aligned for callers.
                all_pages.append({
                    "text": f"Page {page_num + 1}: No text detected",
                    "words": [],
                    "bbox": [],
                    "image_dims": [image_width, image_height],
                })

        logger.info(f"Extracted data from {len(all_pages)} pages")
        return all_pages
    except Exception as e:
        logger.error(f"OCR failed: {str(e)}")
        return []
    finally:
        # Close the document even on error (the original leaked the
        # handle), then remove the temporary PDF copy.
        if doc is not None:
            doc.close()
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)