# LightOnOCR-1B-Demo / app_space.py
# Author: DocUA
# Commit a25a813: Unified project structure: app_space.py for ZeroGPU, root README metadata
#!/usr/bin/env python3
"""
Gradio web interface for LightOnOCR-1B specialized for Hugging Face Spaces.
"""
import os
import sys
import gradio as gr
from pathlib import Path
from PIL import Image
import pypdfium2 as pdfium
import spaces
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent))
from backends.pytorch_backend import PyTorchBackend
# Global backend singleton; created and loaded by load_backend() at import
# time and reused across requests (module-level state persists between
# Gradio calls within one worker process).
BACKEND = None
def load_backend():
    """Create and initialize the global PyTorch backend, once.

    Safe to call repeatedly: the backend is constructed and its model
    loaded only on the first invocation; later calls just return the
    existing instance.

    Returns:
        The module-level PyTorchBackend singleton.
    """
    global BACKEND
    if BACKEND is not None:
        return BACKEND
    print("Loading PyTorch backend...")
    BACKEND = PyTorchBackend()
    # Load the model eagerly at startup. For ZeroGPU the standard pattern
    # is: initialize the model globally, then use it inside a @spaces.GPU
    # decorated function (ZeroGPU swaps it in/out of GPU memory).
    BACKEND.load_model()
    print(f"Backend loaded: {BACKEND.get_backend_info()}")
    return BACKEND
# Initialize globally for ZeroGPU: pay the model-load cost once at import
# time so each request only runs inference.
load_backend()
def render_pdf_page(page, scale=2.0):
    """Rasterize a single pypdfium2 page into a PIL image.

    Args:
        page: A pypdfium2 page object.
        scale: Zoom factor for rendering (higher = larger image).

    Returns:
        The rendered page as a PIL image.
    """
    bitmap = page.render(scale=scale, rev_byteorder=True)
    return bitmap.to_pil()
def process_pdf(pdf_path, num_pages=1, scale=2.0):
    """Render the first pages of a PDF to PIL images.

    Args:
        pdf_path: Path (string) to the PDF file.
        num_pages: Requested number of pages; coerced to int (Gradio
            sliders can deliver floats) and capped at 10.
        scale: Render scale forwarded to render_pdf_page.

    Returns:
        Tuple (images, total_pages): the rendered PIL images and the
        total page count of the document.
    """
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        total_pages = len(pdf)
        # Hard cap at 10 pages to bound GPU time per request.
        pages_to_process = min(int(num_pages), total_pages, 10)
        images = [render_pdf_page(pdf[i], scale=scale) for i in range(pages_to_process)]
    finally:
        # Fix: close the document even when rendering raises; previously
        # the handle leaked on any per-page exception.
        pdf.close()
    return images, total_pages
@spaces.GPU(duration=120)  # generous budget: OCR over full pages is slow
def run_inference(image, max_tokens):
    """Run OCR on one PIL image inside the ZeroGPU execution context.

    Uses temperature=0.0 (greedy decoding) for deterministic output.

    Args:
        image: PIL image to transcribe.
        max_tokens: Generation cap forwarded to the backend.

    Returns:
        The extracted text produced by the backend.
    """
    # Backend is normally loaded at import time; this is a safety net in
    # case the module-level initialization was skipped.
    backend = BACKEND if BACKEND is not None else load_backend()
    return backend.process_image(image, temperature=0.0, max_tokens=max_tokens)
def process_input(file_input, scale, max_tokens, num_pages):
    """Generator: OCR an uploaded image or PDF, streaming progress.

    Args:
        file_input: Filepath string (from gr.File with type="filepath")
            or a file-like object exposing a .name attribute.
        scale: PDF render scale from the UI slider.
        max_tokens: Generation cap forwarded to the backend.
        num_pages: Number of PDF pages to process. May arrive as a float
            from the Gradio slider; coerced to int before use (fix:
            a float here would crash range() inside process_pdf).

    Yields:
        5-tuples (status, markdown_text, raw_text, page_info, preview_image)
        matching the outputs wired to submit_btn.click.
    """
    if file_input is None:
        yield "Idle", "Please upload an image or PDF first.", "", "", None
        return
    images_to_process = []
    page_info = ""
    display_image = None
    file_path = Path(file_input) if isinstance(file_input, str) else Path(file_input.name)
    if not file_path.exists():
        yield "Error", f"File not accessible: {file_path}", "", "", None
        return
    # Load images: either render PDF pages or open a single image file.
    if file_path.suffix.lower() == '.pdf':
        try:
            images_to_process, total_pages = process_pdf(str(file_path), int(num_pages), scale)
            if len(images_to_process) == 0:
                yield "Error", "Could not extract pages from PDF.", "", "", None
                return
            display_image = images_to_process[0]
            page_info = f"Processing {len(images_to_process)} of {total_pages} pages"
        except Exception as e:
            yield "Error", f"Error processing PDF: {str(e)}", "", "", None
            return
    else:
        try:
            img = Image.open(file_path)
            # Fix: force pixel data into memory now. PIL opens lazily, so
            # without this a corrupt file would raise later, outside this
            # guarded branch; loading also lets the file handle close.
            img.load()
            images_to_process = [img]
            display_image = img
            page_info = "Processing image"
        except Exception as e:
            yield "Error", f"Error opening image: {str(e)}", "", "", None
            return
    # Run OCR page by page, streaming partial results to the UI.
    try:
        yield "Processing...", "Processing images...", "", page_info, display_image
        all_texts = []
        for i, img in enumerate(images_to_process):
            try:
                print(f"Processing page {i+1}/{len(images_to_process)}...")
                # Run inference on GPU (temperature hardcoded to 0.0)
                text = run_inference(img, max_tokens=max_tokens)
                all_texts.append(text.strip())
                # Update progress after each completed page
                full_text = "\n\n---\n\n".join(all_texts)
                yield "Processing...", full_text, full_text, page_info, display_image
            except Exception as e:
                # Best-effort: record the per-page failure inline and
                # continue with the remaining pages.
                error_msg = f"Error on page {i+1}: {str(e)}"
                print(f"ERROR: {error_msg}")
                all_texts.append(f"[{error_msg}]")
                continue
        # Final result
        final_text = "\n\n---\n\n".join(all_texts)
        yield "Complete", final_text, final_text, page_info, display_image
    except Exception as e:
        error_msg = f"Error during processing: {str(e)}"
        yield "Error", error_msg, "", page_info, display_image
# Create Gradio interface: left column = upload/preview/settings,
# right column = status + extracted text (rendered markdown and raw).
with gr.Blocks(title="📖 LightOnOCR-1B Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 📖 LightOnOCR-1B - OCR Demo
        Upload an image or PDF to extract text. Running on ZeroGPU with PyTorch.
        """
    )
    with gr.Row():
        # Left column: file upload, page preview, and tuning controls.
        with gr.Column(scale=1):
            file_input = gr.File(
                label="🖼️ Upload Image or PDF",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"],
                # "filepath" means process_input receives a path string.
                type="filepath"
            )
            # Shows the first processed page as a preview.
            rendered_image = gr.Image(
                label="📄 Preview",
                type="pil",
                height=300,
                interactive=False
            )
            with gr.Accordion("⚙️ Settings", open=True):
                # PDF rasterization scale, forwarded to process_pdf.
                scale_slider = gr.Slider(
                    minimum=1.0,
                    maximum=3.0,
                    value=2.0,
                    step=0.5,
                    label="PDF Scale",
                    info="Higher = better quality, slower"
                )
                # Generation cap, forwarded to run_inference.
                max_tokens_slider = gr.Slider(
                    minimum=256,
                    maximum=2048,
                    value=1024,
                    step=256,
                    label="Max Tokens",
                    info="Lower = faster, may cut off long text"
                )
                # Page count; process_pdf additionally caps this at 10.
                num_pages = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=1,
                    step=1,
                    label="PDF Pages",
                    info="Number of pages to process (max 10)"
                )
            page_info = gr.Textbox(
                label="Processing Info",
                value="",
                interactive=False
            )
            submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
            clear_btn = gr.Button("🗑️ Clear", variant="secondary")
        # Right column: status line plus the two output views.
        with gr.Column(scale=2):
            status_display = gr.Textbox(
                label="Status",
                value="Idle",
                interactive=False
            )
            with gr.Tabs():
                with gr.Tab("📄 Rendered"):
                    output_text = gr.Markdown(
                        value="*Extracted text will appear here...*",
                        height=600
                    )
                with gr.Tab("📝 Raw Text"):
                    raw_output = gr.Textbox(
                        placeholder="Raw text will appear here...",
                        lines=25,
                        show_copy_button=True
                    )
    # Event handlers
    # process_input is a generator, so these outputs update incrementally
    # as each page finishes.
    submit_btn.click(
        fn=process_input,
        inputs=[file_input, scale_slider, max_tokens_slider, num_pages],
        outputs=[status_display, output_text, raw_output, page_info, rendered_image]
    )
    # Reset all six wired components to their initial values.
    clear_btn.click(
        fn=lambda: ("Idle", None, "*Extracted text will appear here...*", "", "", None),
        outputs=[status_display, file_input, output_text, raw_output, page_info, rendered_image]
    )
# Standard script entry point: launch the Gradio server.
if __name__ == "__main__":
    demo.launch()