#!/usr/bin/env python3 import os import json import base64 import requests import gradio as gr from PIL import Image from io import BytesIO import pypdfium2 as pdfium ENDPOINT = os.environ.get("VLLM_ENDPOINT") MODEL = os.environ.get("VLLM_MODEL") if not ENDPOINT or not MODEL: raise ValueError("VLLM_ENDPOINT and VLLM_MODEL environment variables must be set.") def image_to_base64(image): buffered = BytesIO() if image.mode == 'RGBA': image = image.convert('RGB') image.save(buffered, format="PNG") return base64.b64encode(buffered.getvalue()).decode("utf-8") def render_pdf_page(page, max_resolution=1280, scale=2.77): width, height = page.get_size() pixel_width = width * scale pixel_height = height * scale resize_factor = min(max_resolution / pixel_width, max_resolution / pixel_height) target_scale = scale * resize_factor return page.render(scale=target_scale, rev_byteorder=True).to_pil() def process_pdf(pdf_path, num_pages=1): pdf = pdfium.PdfDocument(pdf_path) total_pages = len(pdf) pages_to_process = min(num_pages, total_pages, 5) images = [] for i in range(pages_to_process): page = pdf[i] img = render_pdf_page(page) images.append(img) pdf.close() return images, total_pages def process_input(file_input, temperature, num_pages): if file_input is None: yield "Please upload an image or PDF first.", "", "", None return images_to_process = [] page_info = "" display_image = None file_path = file_input if isinstance(file_input, str) else file_input.name if file_path.lower().endswith('.pdf'): try: images_to_process, total_pages = process_pdf(file_path, num_pages) if len(images_to_process) == 0: yield "Error: Could not extract pages from PDF.", "", "", None return display_image = images_to_process[0] if len(images_to_process) == 1: page_info = f"Processing page 1 of {total_pages}" else: page_info = f"Processing {len(images_to_process)} pages of {total_pages}" except Exception as e: yield f"Error processing PDF: {str(e)}", "", "", None return else: try: img = Image.open(file_path) images_to_process = [img] display_image = img page_info = "Processing image" except Exception as e: yield f"Error opening image: {str(e)}", "", "", None return content = [{"type": "text", "text": ""}] for img in images_to_process: try: b64_image = image_to_base64(img) content.append({ "type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_image}"} }) except Exception as e: yield f"Error encoding image: {str(e)}", "", "", display_image return payload = { "model": MODEL, "messages": [ { "role": "user", "content": content } ], "temperature": temperature, "stream": True } try: response = requests.post( ENDPOINT, headers={"Content-Type": "application/json"}, data=json.dumps(payload), stream=True ) response.raise_for_status() accumulated_response = "" first_chunk = True for line in response.iter_lines(): if line: line = line.decode('utf-8') if line.startswith('data: '): line = line[6:] if line.strip() == '[DONE]': break try: chunk = json.loads(line) if 'choices' in chunk and len(chunk['choices']) > 0: delta = chunk['choices'][0].get('delta', {}) content_delta = delta.get('content', '') if content_delta: accumulated_response += content_delta if first_chunk: yield accumulated_response, accumulated_response, page_info, display_image first_chunk = False else: yield accumulated_response, accumulated_response, page_info, gr.update() except json.JSONDecodeError: continue except Exception as e: error_msg = f"Error: {str(e)}" yield error_msg, error_msg, page_info, display_image with gr.Blocks(title="📖 Image/PDF OCR", theme=gr.themes.Soft()) as demo: gr.Markdown( """ # 📖 Image/PDF to Text Extraction **💡 How to use:** 1. Upload an image or PDF 2. For PDFs: choose how many pages to process (1-5, default is 1) 3. Adjust temperature if needed 4. Click "Extract Text" """ ) with gr.Row(): with gr.Column(scale=1): file_input = gr.File( label="🖼️ Upload Image or PDF", file_types=[".pdf", ".png", ".jpg", ".jpeg"], type="filepath" ) rendered_image = gr.Image( label="📄 Preview (First Page)", type="pil", height=400, interactive=False ) num_pages = gr.Slider( minimum=1, maximum=5, value=1, step=1, label="PDF: Number of Pages to Process", info="Only applies to PDF files (max 5 pages)" ) page_info = gr.Textbox( label="Processing Info", value="", interactive=False ) temperature = gr.Slider( minimum=0.1, maximum=1.0, value=0.2, step=0.05, label="Temperature" ) submit_btn = gr.Button("Extract Text", variant="primary") clear_btn = gr.Button("Clear", variant="secondary") with gr.Column(scale=2): output_text = gr.Markdown( label="📄 Extracted Text (Rendered)", value="