"""Gradio demo that runs a Hugging Face OCR model over each page of a PDF."""

import io

import fitz  # PyMuPDF for handling PDFs
import gradio as gr
import torch
import torchvision
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

# Initialize the OCR model and processor from Hugging Face.
# Both are loaded once at import time and shared by every request.
model_name = "allenai/olmOCR-2-7B-1025-FP8"
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForVision2Seq.from_pretrained(model_name)


def ocr_pdf(pdf_file) -> str:
    """Run OCR on every page of an uploaded PDF.

    Args:
        pdf_file: The value delivered by the ``gr.File`` component. Either
            an object exposing the file path as ``.name`` or the path
            itself as a string; ``None`` when the upload has been cleared.

    Returns:
        One ``"Page N:\\n<text>"`` section per page, separated by blank
        lines. An empty string when no file is supplied or the PDF has no
        pages.
    """
    if pdf_file is None:
        # The change event also fires when the upload is cleared.
        return ""

    # Accept both a file wrapper (path in ``.name``) and a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    ocr_results = []
    # The context manager closes the document even if a page fails.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            # Rasterize the page with PyMuPDF's default settings, then hand
            # it to PIL via an in-memory PNG.
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes("png")))

            # NOTE(review): the processor is called with the image only and
            # generate() runs with its default length limit. Confirm against
            # the model card whether a text prompt / chat template and an
            # explicit max_new_tokens are required for full-page output.
            inputs = processor(images=img, return_tensors="pt")

            # Inference only: no gradients are needed.
            with torch.no_grad():
                outputs = model.generate(**inputs)

            # Decode the first (and only) generated sequence to text.
            ocr_text = processor.decode(outputs[0], skip_special_tokens=True)

            # Prefix each page's text with its 1-based page number.
            ocr_results.append(f"Page {page_number}:\n{ocr_text}")

    # Join all page sections into one string for display.
    return "\n\n".join(ocr_results)


def create_gradio_interface() -> gr.Blocks:
    """Build the Gradio UI: a PDF upload wired to an OCR results textbox.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks() as demo:
        gr.Markdown("### OCR of PDF Pages using olmocr Model")

        # ``type`` is left at the component default: the legacy value
        # "file" is rejected by newer Gradio releases, and ``ocr_pdf``
        # handles whichever value shape the default produces.
        file_input = gr.File(label="Upload PDF")
        output_text = gr.Textbox(label="OCR Results", lines=15)

        # Re-run OCR whenever the uploaded file changes.
        file_input.change(ocr_pdf, inputs=file_input, outputs=output_text)

    return demo


# Create and launch the Gradio app
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch()