File size: 2,088 Bytes
7e0ccd5
 
 
33b67be
 
7e0ccd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import gradio as gr
import fitz  # PyMuPDF for handling PDFs
from transformers import AutoModelForVision2Seq, AutoProcessor
import torch 
import torchvision
from PIL import Image
import io

# Initialize the OCR model and processor from Hugging Face.
# Both objects are created once at module import time and are shared by every
# call to the OCR handler below. Only the model id is passed to
# `from_pretrained`, so dtype and device placement are left at library defaults.
# NOTE(review): presumably the weights are fetched from the Hub on first run,
# which would make importing this module slow — confirm for the deployment.
model_name = "allenai/olmOCR-2-7B-1025-FP8"
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForVision2Seq.from_pretrained(model_name)

# Function to perform OCR on a PDF, page by page using olmocr
def ocr_pdf(pdf_file):
    """Run OCR on every page of an uploaded PDF and return the combined text.

    Args:
        pdf_file: The uploaded PDF. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object whose ``name`` attribute holds the
            path (such as a temporary-file wrapper). ``None`` is accepted
            and treated as "no file".

    Returns:
        A single string with one section per page. Each section starts with
        a ``"Page N:"`` line followed by the decoded model output, and
        sections are separated by a blank line. Returns an empty string when
        ``pdf_file`` is ``None`` or the document has no pages.
    """
    import os  # local import: only needed to recognise path-like inputs

    # The handler can be invoked with no file (e.g. the upload was cleared);
    # return empty text rather than failing on `None.name`.
    if pdf_file is None:
        return ""

    # Accept both a plain path and a file-like object exposing `.name`.
    # pathlib paths must be handled here: their `.name` is only the basename.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    ocr_results = []  # one formatted entry per page, in page order

    # The context manager closes the document even if a page fails midway.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # Rasterize the page and hand it to PIL via an in-memory PNG
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes("png")))

            # NOTE(review): the processor is called with the image only and
            # generation uses default length settings — confirm against the
            # model card that no text prompt / max_new_tokens is required.
            inputs = processor(images=img, return_tensors="pt")

            # Inference only: no gradients needed
            with torch.no_grad():
                outputs = model.generate(**inputs)

            ocr_text = processor.decode(outputs[0], skip_special_tokens=True)
            ocr_results.append(f"Page {page_num}:\n{ocr_text}")

    # Join all page results into one string for display
    return "\n\n".join(ocr_results)

# Gradio interface
def create_gradio_interface():
    """Build the Gradio Blocks UI for PDF OCR.

    The layout is a heading, a PDF upload widget, and a text box. Whenever
    the upload value changes, ``ocr_pdf`` is called with it and its return
    value is shown in the text box.

    Returns:
        The assembled ``gr.Blocks`` app. It is not launched here.
    """
    with gr.Blocks() as demo:
        gr.Markdown("### OCR of PDF Pages using olmocr Model")
        # No explicit `type`: the legacy value "file" is rejected by
        # Gradio 4+ (only "filepath" / "binary" are valid) and raises a
        # ValueError at construction. The default works on old and new
        # releases and still yields a value exposing the path via `.name`.
        file_input = gr.File(label="Upload PDF")
        output_text = gr.Textbox(label="OCR Results", lines=15)

        # Re-run OCR every time the uploaded file changes
        file_input.change(ocr_pdf, inputs=file_input, outputs=output_text)

    return demo

# Create and launch the Gradio app
if __name__ == "__main__":
    # Script entry point: build the UI and start serving it right away
    create_gradio_interface().launch()