Spaces:

Huzaifa424
/

OCR_DEMO

Runtime error

App Files Community

OCR_DEMO / app.py

Huzaifa424

Update app.py

e4e2cb3 verified over 1 year ago

raw

history blame contribute delete

2.7 kB

	import gradio as gr
	import requests
	import torch
	from PIL import Image
	from transformers import MllamaForConditionalGeneration, AutoProcessor
	from PyPDF2 import PdfReader
	import tempfile
	import os
	from pdf2image import convert_from_path
	# Hugging Face access token, read from the HF_TOKEN environment variable.
	# It is None when the variable is unset, in which case the downloads below
	# are attempted without explicit credentials.
	token = os.getenv("HF_TOKEN")

	# Model and processor setup
	model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

	# Load the model
	model = MllamaForConditionalGeneration.from_pretrained(
	    model_id,
	    token=token,  # `use_auth_token` is deprecated in favour of `token`
	    torch_dtype=torch.bfloat16,
	    device_map="auto",  # Automatically allocates the model across available devices
	)

	# The processor files live in the same repository as the weights, so the
	# same token is passed here; otherwise this download is unauthenticated
	# even though the model download above was not.
	processor = AutoProcessor.from_pretrained(model_id, token=token)

	def process_pdf(pdf_file):
	    """Extract text from each page of a PDF.

	    Every page is rendered to an image with pdf2image and sent to the
	    vision-language model together with a fixed OCR instruction.

	    Args:
	        pdf_file: The uploaded file as delivered by the Gradio file input.
	            Either an object exposing the file path as ``.name`` or the
	            path itself. ``None`` (nothing uploaded) yields an empty result.

	    Returns:
	        dict: Maps ``"Page N"`` (1-based) to the text generated for that page.
	    """
	    if pdf_file is None:
	        return {}

	    # Accept both a tempfile-like object with `.name` and a plain path
	    # string, so the callback does not depend on how gr.File delivers it.
	    pdf_path = getattr(pdf_file, "name", pdf_file)

	    # Read the PDF using pdf2image to convert pages to images
	    images = convert_from_path(pdf_path)

	    # The OCR instruction is the same for every page, so the chat prompt is
	    # built once instead of on each loop iteration.
	    messages = [
	        {"role": "user", "content": [
	            {"type": "image"},
	            {"type": "text", "text": "Extract all the text from this image:"},
	        ]},
	    ]
	    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

	    extracted_text = {}
	    for i, page_image in enumerate(images):
	        # Prepare the input
	        inputs = processor(
	            page_image,
	            input_text,
	            add_special_tokens=False,
	            return_tensors="pt",
	        ).to(model.device)

	        # Generate the output
	        output = model.generate(**inputs, max_new_tokens=1500)

	        # The generated sequence starts with the prompt tokens; drop them
	        # and the special tokens so only the model's answer is kept.
	        prompt_length = inputs["input_ids"].shape[-1]
	        page_text = processor.decode(
	            output[0][prompt_length:],
	            skip_special_tokens=True,
	        )
	        extracted_text[f"Page {i + 1}"] = page_text.strip()

	    return extracted_text

	def display_results(pdf_file):
	    """Process the PDF and build the values for the two output components.

	    Args:
	        pdf_file: The uploaded file, forwarded unchanged to ``process_pdf``.

	    Returns:
	        tuple: ``(checkboxes, text)`` where ``checkboxes`` is a component
	        update that lists one choice per extracted page with none selected,
	        and ``text`` is a single string with each page's text under its
	        ``"Page N"`` label.
	    """
	    extracted_text = process_pdf(pdf_file)

	    # The checkbox group is created with no choices, so the page labels
	    # must be sent as new choices; a bare {label: False} dict would not
	    # populate it.
	    page_labels = list(extracted_text)
	    checkboxes = gr.update(choices=page_labels, value=[])

	    # Render the key/value pairs as readable text rather than a dict repr.
	    text = "\n\n".join(
	        f"{label}:\n{page_text}" for label, page_text in extracted_text.items()
	    )

	    return checkboxes, text

	def create_interface():
	    """Assemble the Gradio Blocks UI and return it without launching.

	    Layout: a title, a row with the PDF upload and the trigger button, and
	    a row with the page checkbox group and the read-only text box. Clicking
	    the button runs ``display_results`` on the uploaded file and routes its
	    two return values to the checkbox group and the text box.
	    """
	    demo = gr.Blocks()

	    with demo:
	        gr.Markdown("# PDF OCR Extractor with Key-Value Pairs")

	        # Input row: file picker (PDF only) next to the action button.
	        with gr.Row():
	            upload = gr.File(file_types=[".pdf"], label="Upload PDF")
	            run_button = gr.Button("Extract Text")

	        # Output row: page selector and the extracted text.
	        with gr.Row():
	            page_selector = gr.CheckboxGroup(choices=[], label="Select Pages")
	            result_box = gr.Textbox(
	                interactive=False,
	                lines=10,
	                label="Extracted Text",
	            )

	        run_button.click(
	            fn=display_results,
	            inputs=[upload],
	            outputs=[page_selector, result_box],
	        )

	    return demo

	if __name__ == "__main__":
	    # Script entry point: build the UI, then start the Gradio server.
	    interface = create_interface()
	    interface.launch()