staghado's picture
Update app.py
20bdd1c verified
raw
history blame
7.58 kB
#!/usr/bin/env python3
import os
import json
import base64
import requests
import gradio as gr
from PIL import Image
from io import BytesIO
import pypdfium2 as pdfium
# Server configuration, read once from the environment at import time.
# ENDPOINT is the URL that process_input() POSTs to; MODEL is sent as the
# "model" field of the request payload.
ENDPOINT = os.getenv("VLLM_ENDPOINT")
MODEL = os.getenv("VLLM_MODEL")

# Fail fast: refuse to start when either value is missing or empty.
if not (ENDPOINT and MODEL):
    raise ValueError("VLLM_ENDPOINT and VLLM_MODEL environment variables must be set.")
def image_to_base64(image):
    """Encode an image as a base64 string of its PNG representation.

    Args:
        image: A PIL image (any object exposing ``mode``, ``convert`` and
            ``save`` in the PIL style works).

    Returns:
        The PNG bytes of the image, base64-encoded and returned as ``str``.
    """
    # RGBA is flattened to RGB, as before. CMYK and YCbCr (typical for
    # print-oriented JPEGs) cannot be written as PNG at all, so they are
    # converted too instead of failing inside ``save``.
    if image.mode in ("RGBA", "CMYK", "YCbCr"):
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
def render_pdf_page(page, max_resolution=1540, scale=2.77):
    """Render one PDF page to a PIL image, capping its pixel dimensions.

    The page is rendered at ``scale`` unless that would make either side
    larger than ``max_resolution`` pixels; in that case the scale is reduced
    just enough for the longer side to fit.

    Args:
        page: Page object exposing ``get_size()`` and ``render()``.
        max_resolution: Upper bound, in pixels, for width and height.
        scale: Preferred render scale applied to the page size.

    Returns:
        Whatever ``page.render(...).to_pil()`` returns for the chosen scale.
    """
    page_w, page_h = page.get_size()

    # Shrink factor each side would need to fit; 1 means "no shrinking".
    fit_w = max_resolution / (page_w * scale)
    fit_h = max_resolution / (page_h * scale)
    effective_scale = scale * min(1, fit_w, fit_h)

    bitmap = page.render(scale=effective_scale, rev_byteorder=True)
    return bitmap.to_pil()
def process_pdf(pdf_path, num_pages=1, max_pages=5):
    """Render the leading pages of a PDF to images.

    Args:
        pdf_path: Path of the PDF file to open.
        num_pages: Requested number of pages, counted from the first page.
        max_pages: Hard upper bound on the number of rendered pages,
            regardless of ``num_pages``.

    Returns:
        A ``(images, total_pages)`` tuple: the rendered pages in document
        order, and the total number of pages in the document.
    """
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        total_pages = len(pdf)
        # int(): a UI slider may hand over a float (e.g. 2.0), which
        # range() would reject.
        pages_to_process = min(int(num_pages), total_pages, int(max_pages))
        images = [render_pdf_page(pdf[i]) for i in range(pages_to_process)]
    finally:
        # Release the document even when rendering a page fails.
        pdf.close()
    return images, total_pages
# (connect, read) timeouts in seconds for the streaming request. The read
# timeout bounds the wait between two received chunks, not the whole
# generation, so long outputs are fine as long as data keeps arriving.
_REQUEST_TIMEOUT = (10, 300)


def _build_payload(images, temperature):
    """Build the streaming request body: an empty text part plus one PNG data URL per image."""
    content = [{"type": "text", "text": ""}]
    content.extend(
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{image_to_base64(img)}"},
        }
        for img in images
    )
    return {
        "model": MODEL,
        "messages": [{"role": "user", "content": content}],
        "temperature": temperature,
        "stream": True,
    }


def _iter_content_deltas(response):
    """Yield every non-empty ``choices[0].delta.content`` string from a streamed response."""
    for raw_line in response.iter_lines():
        if not raw_line:
            continue
        line = raw_line.decode("utf-8")
        if line.startswith("data: "):
            line = line[len("data: "):]
        if line.strip() == "[DONE]":
            break
        try:
            chunk = json.loads(line)
        except json.JSONDecodeError:
            # Lines that are not JSON are skipped.
            continue
        choices = chunk.get("choices") if isinstance(chunk, dict) else None
        if not choices:
            continue
        delta = choices[0].get("delta") or {}
        content_delta = delta.get("content")
        if content_delta:
            yield content_delta


def process_input(file_input, temperature, num_pages):
    """Send an uploaded image or PDF to the model endpoint and stream back the text.

    This is a generator used as a Gradio event handler. Every yielded value is
    a 4-tuple ``(rendered_text, raw_text, page_info, preview_image)`` matching
    the output components the handler is wired to.

    Args:
        file_input: Path of the uploaded file, an object with a ``name``
            attribute holding that path, or ``None`` when nothing is uploaded.
        temperature: Sampling temperature forwarded in the request payload.
        num_pages: Number of PDF pages to process; ignored for images.

    Yields:
        Progressive updates: the text accumulated so far (twice, for the
        rendered and raw views), a short status string, and the preview
        image (sent once; later updates leave the preview untouched).
        Failures are reported as a single tuple whose first element is an
        error message; no exception is propagated to the caller.
    """
    if file_input is None:
        yield "Please upload an image or PDF first.", "", "", None
        return

    file_path = file_input if isinstance(file_input, str) else file_input.name

    if file_path.lower().endswith(".pdf"):
        try:
            images_to_process, total_pages = process_pdf(file_path, num_pages)
        except Exception as e:
            yield f"Error processing PDF: {e}", "", "", None
            return
        if not images_to_process:
            yield "Error: Could not extract pages from PDF.", "", "", None
            return
        if len(images_to_process) == 1:
            page_info = f"Processing page 1 of {total_pages}"
        else:
            page_info = f"Processing {len(images_to_process)} pages of {total_pages}"
    else:
        try:
            images_to_process = [Image.open(file_path)]
        except Exception as e:
            yield f"Error opening image: {e}", "", "", None
            return
        page_info = "Processing image"

    # The first page/image doubles as the preview shown in the UI.
    display_image = images_to_process[0]

    try:
        payload = _build_payload(images_to_process, temperature)
    except Exception as e:
        yield f"Error encoding image: {e}", "", "", display_image
        return

    try:
        # The context manager releases the streamed connection on every
        # exit path: normal completion, errors, and the generator being
        # abandoned mid-stream.
        with requests.post(
            ENDPOINT,
            headers={"Content-Type": "application/json"},
            data=json.dumps(payload),
            stream=True,
            timeout=_REQUEST_TIMEOUT,
        ) as response:
            response.raise_for_status()

            accumulated_response = ""
            preview_sent = False
            for content_delta in _iter_content_deltas(response):
                accumulated_response += content_delta
                if preview_sent:
                    # Do not resend the image with every text update.
                    yield accumulated_response, accumulated_response, page_info, gr.update()
                else:
                    yield accumulated_response, accumulated_response, page_info, display_image
                    preview_sent = True

            if not preview_sent:
                # The stream carried no text: still publish status and preview.
                yield "", "", page_info, display_image
    except Exception as e:
        # UI boundary: report any request/stream failure instead of raising.
        error_msg = f"Error: {e}"
        yield error_msg, error_msg, page_info, display_image
# Gradio interface. Emoji in the title, heading and labels are restored from
# mis-decoded UTF-8 bytes.
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation was lost; in particular the raw-output row is placed directly
# under the Blocks root (below the two-column row) - confirm against the
# deployed layout.
with gr.Blocks(title="📖 Image/PDF OCR", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "\n"
        "# 📖 Image/PDF to Text Extraction\n"
        "**💡 How to use:**\n"
        "1. Upload an image or PDF\n"
        "2. For PDFs: choose how many pages to process (1-5, default is 1)\n"
        "3. Adjust temperature if needed\n"
        '4. Click "Extract Text"\n'
        "Note: The Markdown rendering for tables is not always correct, check the raw output for complex tables!\n"
    )

    with gr.Row():
        # Left column: upload, preview and request settings.
        with gr.Column(scale=1):
            file_input = gr.File(
                label="🖼️ Upload Image or PDF",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"],
                type="filepath",
            )
            rendered_image = gr.Image(
                label="📄 Preview (First Page)",
                type="pil",
                height=400,
                interactive=False,
            )
            num_pages = gr.Slider(
                minimum=1,
                maximum=5,
                value=1,
                step=1,
                label="PDF: Number of Pages to Process",
                info="Only applies to PDF files (max 5 pages)",
            )
            page_info = gr.Textbox(
                label="Processing Info",
                value="",
                interactive=False,
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.2,
                step=0.05,
                label="Temperature",
            )
            submit_btn = gr.Button("Extract Text", variant="primary")
            clear_btn = gr.Button("Clear", variant="secondary")

        # Right column: the extracted text rendered as Markdown.
        with gr.Column(scale=2):
            output_text = gr.Markdown(
                label="📄 Extracted Text (Rendered)",
                value="<div style='min-height: 600px; padding: 10px; border: 1px solid #e0e0e0; border-radius: 4px; background-color: #f9f9f9;'><em>Extracted text will appear here...</em></div>",
                height=600,
            )

    # Unrendered model output, for cases where the Markdown view is off.
    with gr.Row():
        with gr.Column():
            raw_output = gr.Textbox(
                label="Raw Markdown Output",
                placeholder="Raw text will appear here...",
                lines=20,
                max_lines=30,
                show_copy_button=True,
            )

    # process_input is a generator, so the outputs update as text streams in.
    submit_btn.click(
        fn=process_input,
        inputs=[file_input, temperature, num_pages],
        outputs=[output_text, raw_output, page_info, rendered_image],
    )

    # Reset every component; the tuple order matches ``outputs`` below.
    clear_btn.click(
        fn=lambda: (None, "", "", "", None, 1),
        outputs=[file_input, output_text, raw_output, page_info, rendered_image, num_pages],
    )
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()