Spaces:

lightonai
/

LightOnOCR-1B-Demo

Running

App Files Files Community

LightOnOCR-1B-Demo / app.py

staghado

Update app.py

5b6bee3 verified about 1 month ago

raw

history blame

7.3 kB

	#!/usr/bin/env python3
	import os
	import json
	import base64
	import requests
	import gradio as gr
	from PIL import Image
	from io import BytesIO
	import pypdfium2 as pdfium

	# URL that OCR requests are posted to, and the model name sent with each request.
	ENDPOINT = os.getenv("VLLM_ENDPOINT")
	MODEL = os.getenv("VLLM_MODEL")

	# Refuse to start when either setting is missing or empty.
	if not (ENDPOINT and MODEL):
	    raise ValueError("VLLM_ENDPOINT and VLLM_MODEL environment variables must be set.")


	def image_to_base64(image):
	    """Encode an image as a base64 PNG string.

	    Args:
	        image: Image object exposing ``mode``, ``convert`` and ``save``
	            (the callers in this file pass PIL images).

	    Returns:
	        The PNG-encoded image as a base64 ``str``.
	    """
	    # Modes handed to the PNG encoder unchanged. RGBA is deliberately absent
	    # (it has always been flattened to RGB here). Any other mode, such as the
	    # CMYK produced by some JPEG uploads, is converted to RGB first because
	    # saving it as PNG would otherwise fail.
	    passthrough_modes = ("1", "L", "LA", "I", "I;16", "I;16B", "P", "RGB")
	    if image.mode not in passthrough_modes:
	        image = image.convert("RGB")

	    buffered = BytesIO()
	    image.save(buffered, format="PNG")
	    return base64.b64encode(buffered.getvalue()).decode("utf-8")


	def render_pdf_page(page, max_resolution=1540, scale=2.77):
	    """Render a PDF page to a PIL image, capping its pixel size.

	    Args:
	        page: Page object providing ``get_size()`` and ``render()``.
	        max_resolution: Largest allowed width or height of the result, in pixels.
	        scale: Preferred render scale; reduced when the page would otherwise
	            exceed ``max_resolution`` in either dimension.

	    Returns:
	        The image produced by ``page.render(...).to_pil()``.
	    """
	    page_w, page_h = page.get_size()

	    # How much each axis would have to shrink to fit; values above 1 mean
	    # the page already fits at the preferred scale.
	    fit_w = max_resolution / (page_w * scale)
	    fit_h = max_resolution / (page_h * scale)
	    shrink = min(1, fit_w, fit_h)

	    bitmap = page.render(scale=scale * shrink, rev_byteorder=True)
	    return bitmap.to_pil()


	def process_pdf(pdf_path, page_num=1):
	    """Render one page of a PDF file to an image.

	    Args:
	        pdf_path: Path of the PDF file to open.
	        page_num: 1-based page number; values outside the document are
	            clamped to the first or last page.

	    Returns:
	        A tuple ``(image, total_pages, page_number)`` where ``page_number``
	        is the 1-based number of the page that was actually rendered.

	    Raises:
	        ValueError: If the document contains no pages.
	    """
	    pdf = pdfium.PdfDocument(pdf_path)
	    try:
	        total_pages = len(pdf)
	        if total_pages == 0:
	            raise ValueError("PDF has no pages.")

	        page_idx = min(max(int(page_num) - 1, 0), total_pages - 1)
	        img = render_pdf_page(pdf[page_idx])
	    finally:
	        # Release the document even when opening the page or rendering fails.
	        pdf.close()

	    return img, total_pages, page_idx + 1


	def process_input(file_input, temperature, page_num):
	    """Run OCR on an uploaded image or PDF page and stream the text back.

	    This generator is used as a Gradio event handler. Every yielded tuple has
	    five items, in the order of the handler's outputs: rendered Markdown,
	    raw text, processing info, preview image and the page slider.

	    Args:
	        file_input: Path of the uploaded file, or an object whose ``name``
	            attribute holds that path; ``None`` when nothing is uploaded.
	        temperature: Sampling temperature forwarded in the request payload.
	        page_num: 1-based page to extract when the upload is a PDF.

	    Yields:
	        The text accumulated so far after each streamed chunk, or a single
	        status/error tuple when the input cannot be processed.
	    """
	    import logging

	    log = logging.getLogger(__name__)
	    # (connect, read) timeouts in seconds so a stalled server cannot hang
	    # the handler forever.
	    request_timeout = (10, 300)

	    if file_input is None:
	        yield "Please upload an image or PDF first.", "", "", None, gr.update()
	        return

	    file_path = file_input if isinstance(file_input, str) else file_input.name

	    # Failure details go to the log; the UI only shows the generic message.
	    if file_path.lower().endswith('.pdf'):
	        try:
	            image_to_process, total_pages, actual_page = process_pdf(file_path, int(page_num))
	        except Exception:
	            log.exception("Could not render the requested PDF page")
	            yield "Error processing PDF", "", "", None, gr.update()
	            return
	        page_info = f"Processing page {actual_page} of {total_pages}"
	    else:
	        try:
	            image_to_process = Image.open(file_path)
	        except Exception:
	            log.exception("Could not open the uploaded image")
	            yield "Error opening image", "", "", None, gr.update()
	            return
	        page_info = "Processing image"

	    accumulated_response = ""
	    try:
	        # Encoding sits inside the try so that a failure is reported in the
	        # UI instead of escaping the handler.
	        encoded_image = image_to_base64(image_to_process)
	        payload = {
	            "model": MODEL,
	            "messages": [{
	                "role": "user",
	                "content": [
	                    {"type": "text", "text": ""},
	                    {
	                        "type": "image_url",
	                        "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
	                    },
	                ],
	            }],
	            "temperature": temperature,
	            "stream": True,
	        }

	        # The context manager releases the connection on every exit path,
	        # including the early break on '[DONE]'.
	        with requests.post(
	            ENDPOINT,
	            headers={"Content-Type": "application/json"},
	            data=json.dumps(payload),
	            stream=True,
	            timeout=request_timeout,
	        ) as response:
	            response.raise_for_status()

	            for raw_line in response.iter_lines():
	                if not raw_line:
	                    continue

	                line = raw_line.decode('utf-8')
	                # Accept the event prefix with or without a trailing space.
	                if line.startswith('data:'):
	                    line = line[len('data:'):]
	                line = line.strip()
	                if line == '[DONE]':
	                    break

	                try:
	                    chunk = json.loads(line)
	                except json.JSONDecodeError:
	                    # Skip lines that are not JSON.
	                    continue

	                choices = chunk.get('choices') if isinstance(chunk, dict) else None
	                if not choices:
	                    continue
	                content_delta = (choices[0].get('delta') or {}).get('content')
	                if not content_delta:
	                    continue

	                first_chunk = not accumulated_response
	                accumulated_response += content_delta
	                # The preview image is only sent with the first chunk.
	                preview = image_to_process if first_chunk else gr.update()
	                yield accumulated_response, accumulated_response, page_info, preview, gr.update()

	        if not accumulated_response:
	            # Nothing was streamed: still refresh the UI so that output
	            # from a previous run is not mistaken for this one.
	            yield "", "", page_info, image_to_process, gr.update()
	    except Exception:
	        log.exception("OCR request failed")
	        yield "Error", "Error", page_info, image_to_process, gr.update()


	def update_slider(file_input):
	    """Return a slider update matching the newly selected file.

	    Args:
	        file_input: Path of the uploaded file, or an object whose ``name``
	            attribute holds that path; ``None`` when the upload was cleared.

	    Returns:
	        A ``gr.update`` that resets the slider value to 1 and sets its
	        maximum to the PDF page count, to 1 for non-PDF files, or to the
	        default of 20 when no file is selected or the PDF cannot be read.
	    """
	    default_max = 20

	    if file_input is None:
	        return gr.update(maximum=default_max, value=1)

	    file_path = file_input if isinstance(file_input, str) else file_input.name

	    if not file_path.lower().endswith('.pdf'):
	        return gr.update(maximum=1, value=1)

	    try:
	        pdf = pdfium.PdfDocument(file_path)
	        try:
	            total_pages = len(pdf)
	        finally:
	            pdf.close()
	    except Exception:
	        # Best effort: an unreadable PDF falls back to the default range.
	        return gr.update(maximum=default_max, value=1)

	    # Keep the maximum at or above the slider's minimum of 1.
	    return gr.update(maximum=max(total_pages, 1), value=1)


	# Build the Gradio interface and bind it to the module-level name `demo`.
	# NOTE(review): indentation was lost in this copy of the file, so the nesting
	# of the `with` layout blocks below cannot be read from it; restore it from
	# the original before running. TODO confirm.
	with gr.Blocks(title="📖 Image/PDF OCR", theme=gr.themes.Soft()) as demo:
	gr.Markdown("""
	# 📖 Image/PDF to Text Extraction

	💡 How to use:
	1. Upload an image or PDF
	2. For PDFs: select which page to extract (1-20)
	3. Adjust temperature if needed
	4. Click "Extract Text"

	Note: The Markdown rendering for tables is not always correct, check the raw output for complex tables!
	""")

	# Upload widget, preview, page/temperature controls and action buttons.
	with gr.Row():
	with gr.Column(scale=1):
	file_input = gr.File(
	label="🖼️ Upload Image or PDF",
	file_types=[".pdf", ".png", ".jpg", ".jpeg"],
	type="filepath"
	)
	rendered_image = gr.Image(
	label="📄 Preview",
	type="pil",
	height=400,
	interactive=False
	)
	num_pages = gr.Slider(
	minimum=1,
	maximum=20,
	value=1,
	step=1,
	label="PDF: Page Number",
	info="Select which page to extract"
	)
	page_info = gr.Textbox(
	label="Processing Info",
	value="",
	interactive=False
	)
	temperature = gr.Slider(
	minimum=0.1,
	maximum=1.0,
	value=0.2,
	step=0.05,
	label="Temperature"
	)
	submit_btn = gr.Button("Extract Text", variant="primary")
	clear_btn = gr.Button("Clear", variant="secondary")

	# Rendered Markdown view of the extracted text.
	with gr.Column(scale=2):
	output_text = gr.Markdown(
	label="📄 Extracted Text (Rendered)",
	value="Extracted text will appear here..."
	)

	# Raw, unrendered model output with a copy button.
	with gr.Row():
	with gr.Column():
	raw_output = gr.Textbox(
	label="Raw Markdown Output",
	placeholder="Raw text will appear here...",
	lines=20,
	max_lines=30,
	show_copy_button=True
	)

	# Run process_input on click; each tuple it yields fills these five outputs in order.
	submit_btn.click(
	fn=process_input,
	inputs=[file_input, temperature, num_pages],
	outputs=[output_text, raw_output, page_info, rendered_image, num_pages]
	)

	# Re-range the page slider whenever the selected file changes.
	file_input.change(
	fn=update_slider,
	inputs=[file_input],
	outputs=[num_pages]
	)

	# Reset the upload and every output; the tuple order follows the outputs list.
	clear_btn.click(
	fn=lambda: (None, "Extracted text will appear here...", "", "", None, 1),
	outputs=[file_input, output_text, raw_output, page_info, rendered_image, num_pages]
	)


	# Start the Gradio app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()