# OCR_DEMO / app.py
# Huzaifa424's picture
# Update app.py
# e4e2cb3 verified
import gradio as gr
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor
from PyPDF2 import PdfReader
import tempfile
import os
from pdf2image import convert_from_path
# Hugging Face access token, read from the HF_TOKEN environment variable
# (None when the variable is unset).
token = os.getenv("HF_TOKEN")

# Model and processor setup
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load the model
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    token=token,  # `use_auth_token` is deprecated in favour of `token`
    torch_dtype=torch.bfloat16,
    device_map="auto",  # Automatically allocates the model across available devices
)

# Load the matching processor from the same repository, authenticating the
# same way as the model download above.
processor = AutoProcessor.from_pretrained(model_id, token=token)
def process_pdf(pdf_file, max_new_tokens=1500):
    """Extract text from each page of a PDF.

    Every page is rendered to an image with ``convert_from_path`` and passed,
    together with an OCR instruction, to the module-level ``processor`` and
    ``model``.

    Args:
        pdf_file: Path to the PDF (``str`` or ``os.PathLike``), or an
            uploaded-file object that exposes the path as ``.name``.
            ``None`` (nothing uploaded) produces an empty result.
        max_new_tokens: Upper bound on the number of tokens generated per
            page.

    Returns:
        A dict mapping ``"Page N"`` (1-based, in page order) to the text
        generated for that page.
    """
    if pdf_file is None:
        return {}

    # A plain path is used as-is; anything else is expected to carry the
    # path in `.name`. (Checking the type first matters: `Path.name` would
    # be only the file's base name.)
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    # Read the PDF using pdf2image to convert pages to images
    images = convert_from_path(pdf_path)

    # The OCR instruction is the same for every page, so the chat prompt is
    # built once instead of once per page.
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "Extract all the text from this image:"}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

    extracted_text = {}
    for page_number, page_image in enumerate(images, start=1):
        # Prepare the input
        inputs = processor(
            page_image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt"
        ).to(model.device)

        # Generate the output
        output = model.generate(**inputs, max_new_tokens=max_new_tokens)

        # The generated sequence starts with the prompt tokens; drop them and
        # the special tokens so only the model's answer is kept.
        prompt_length = inputs["input_ids"].shape[-1]
        page_text = processor.decode(
            output[0][prompt_length:],
            skip_special_tokens=True,
        )
        extracted_text[f"Page {page_number}"] = page_text

    return extracted_text
def display_results(pdf_file):
    """Process the PDF and build the values for the page selector and text box.

    Args:
        pdf_file: The uploaded PDF as delivered by the file input.

    Returns:
        A 2-tuple ``(checkboxes, text)``: an update that fills the checkbox
        group with one unselected choice per page, and a string holding each
        page label followed by its extracted text.

    Raises:
        gr.Error: If no file has been uploaded.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file first.")

    extracted_text = process_pdf(pdf_file)

    # A CheckboxGroup value is the list of *selected* choices; listing the
    # pages as options requires updating `choices` instead.
    checkboxes = gr.update(choices=list(extracted_text), value=[])

    # A Textbox shows a string, so render the page -> text pairs explicitly
    # rather than handing it the dict.
    text = "\n\n".join(
        f"{page}:\n{page_text}" for page, page_text in extracted_text.items()
    )
    return checkboxes, text
def create_interface():
    """Assemble the Gradio Blocks UI and return it without launching it."""
    with gr.Blocks() as demo:
        gr.Markdown("# PDF OCR Extractor with Key-Value Pairs")

        # First row: PDF upload next to the button that starts extraction.
        with gr.Row():
            upload = gr.File(label="Upload PDF", file_types=[".pdf"])
            run_button = gr.Button("Extract Text")

        # Second row: page selector next to the read-only result text.
        with gr.Row():
            page_selector = gr.CheckboxGroup(label="Select Pages", choices=[])
            result_box = gr.Textbox(label="Extracted Text", lines=10, interactive=False)

        # Clicking the button feeds the upload to display_results and routes
        # its two return values to the selector and the text box.
        run_button.click(
            fn=display_results,
            inputs=[upload],
            outputs=[page_selector, result_box],
        )

    return demo
if __name__ == "__main__":
    # Build the UI and start the Gradio server only when run as a script.
    create_interface().launch()