Spaces:
Runtime error
Runtime error
| import fitz # PyMuPDF | |
| from PIL import Image | |
| import pytesseract | |
| import io | |
| import os | |
| def extract_images_from_pdf(pdf_path, output_folder, dpi=300): | |
| doc = fitz.open(pdf_path) | |
| images = [] | |
| for page_num in range(len(doc)): | |
| page = doc.load_page(page_num) | |
| pix = page.get_pixmap(matrix=fitz.Matrix(dpi / 72, dpi / 72)) | |
| img_bytes = pix.tobytes("ppm") | |
| # Convert PPM to JPEG using PIL and save | |
| image = Image.open(io.BytesIO(img_bytes)) | |
| output_filename = f"{output_folder}/page_{page_num + 1}.jpg" | |
| image.save(output_filename, "JPEG") | |
| # Also store the image bytes for OCR processing | |
| img_converted_bytes = io.BytesIO() | |
| image.save(img_converted_bytes, format='JPEG') | |
| images.append(img_converted_bytes.getvalue()) | |
| print(f"Images saved to {output_folder}") | |
| return images | |
| def ocr_images(images): | |
| text_from_images = [] | |
| for image_bytes in images: | |
| image = Image.open(io.BytesIO(image_bytes)) | |
| text = pytesseract.image_to_string(image, lang='ces') | |
| text_from_images.append(text) | |
| return text_from_images | |
| def save_text_to_file(text, file_path): | |
| with open(file_path, 'w', encoding='utf-8') as file: | |
| file.write(text) | |
| # Path to your PDF file and output locations | |
| pdf_path = "norm.pdf" | |
| output_folder = "extracted_images" | |
| output_text_file = "ocr_results.txt" | |
| # Ensure the output directory exists | |
| os.makedirs(output_folder, exist_ok=True) | |
| # Extract images from the PDF and save them as JPEG | |
| images = extract_images_from_pdf(pdf_path, output_folder) | |
| # Transcribe text from the extracted images with Czech language | |
| transcribed_texts = ocr_images(images) | |
| # Combine all texts into a single string | |
| combined_text = "\n".join(transcribed_texts) | |
| # Save OCR results to a text file | |
| save_text_to_file(combined_text, output_text_file) | |
| print(f"OCR results saved to {output_text_file}") | |