Spaces:
Running
Running
File size: 7,360 Bytes
111a99e 2fcfad9 111a99e 9f9c33b 2fcfad9 6804c82 111a99e b38e046 111a99e 6804c82 9f9c33b 2fcfad9 a036cd1 2fcfad9 20bdd1c 6804c82 20bdd1c 6804c82 a02a7ea 6804c82 6a172b5 72c3b35 6804c82 a02a7ea 6804c82 6a172b5 6804c82 a02a7ea a036cd1 0c74f80 d5a7e96 2fcfad9 6804c82 6a172b5 0c74f80 6804c82 a036cd1 6804c82 72c3b35 a02a7ea 0c74f80 a02a7ea 6804c82 0c74f80 6804c82 a036cd1 0c74f80 a02a7ea a036cd1 0c74f80 a036cd1 72c3b35 6804c82 a036cd1 0c74f80 a036cd1 2fcfad9 111a99e 72c3b35 111a99e 9f9c33b 111a99e 9f9c33b 111a99e a02a7ea 111a99e b77caf3 111a99e 6804c82 a02a7ea 111a99e b77caf3 0c74f80 9f9c33b a608f20 6804c82 72c3b35 2fcfad9 111ff5f 6804c82 a036cd1 6804c82 111ff5f 0c74f80 a02a7ea 0c74f80 eed9900 0c74f80 a02a7ea 6a172b5 a02a7ea 6a172b5 a02a7ea 6a172b5 111ff5f 2fcfad9 3c5f2af 2fcfad9 111ff5f 6804c82 111ff5f d5a7e96 72c3b35 111ff5f d5a7e96 6804c82 d5a7e96 111ff5f 6804c82 a02a7ea 0c74f80 2fcfad9 111ff5f 72c3b35 a02a7ea 111ff5f 9f9c33b 111a99e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
#!/usr/bin/env python3
import os
import json
import base64
import requests
import gradio as gr
from PIL import Image
from io import BytesIO
import pypdfium2 as pdfium
# Request URL and model identifier, taken from the VLLM_ENDPOINT and
# VLLM_MODEL environment variables.
ENDPOINT = os.getenv("VLLM_ENDPOINT")
MODEL = os.getenv("VLLM_MODEL")

# Both settings are mandatory: abort at import time when either one is
# missing or empty.
if not (ENDPOINT and MODEL):
    raise ValueError("VLLM_ENDPOINT and VLLM_MODEL environment variables must be set.")
def image_to_base64(image):
    """Serialize an image to PNG and return the bytes as a base64 string.

    Args:
        image: A PIL-style image object exposing ``mode``, ``convert(mode)``
            and ``save(fp, format=...)``.

    Returns:
        The PNG-encoded image as base64 text (``str``, no data-URL prefix).
    """
    # Modes that are written to PNG unchanged. RGBA is deliberately absent:
    # it is flattened to RGB, as it always was. Every other mode (for
    # example CMYK, which JPEG uploads can carry) is also converted to RGB
    # so that the PNG save below does not fail on it.
    passthrough_modes = ("1", "L", "LA", "I", "I;16", "P", "RGB")
    if image.mode not in passthrough_modes:
        image = image.convert('RGB')

    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
def render_pdf_page(page, max_resolution=1540, scale=2.77):
    """Render one PDF page to a PIL image, capping its pixel dimensions.

    The page is rendered at ``scale`` unless that would make either side
    longer than ``max_resolution`` pixels; in that case the scale is reduced
    just enough for the longer side to fit. The scale is never increased.

    Args:
        page: Page object providing ``get_size()`` and ``render(...)``.
        max_resolution: Largest allowed side length of the output, in pixels.
        scale: Preferred render scale applied to the page size.

    Returns:
        Whatever ``to_pil()`` of the rendered bitmap returns.
    """
    page_w, page_h = page.get_size()

    # Ratio of the pixel budget to each scaled side; the literal 1 in the
    # min() keeps small pages from being upscaled.
    shrink = min(
        1,
        max_resolution / (page_w * scale),
        max_resolution / (page_h * scale),
    )

    bitmap = page.render(scale=scale * shrink, rev_byteorder=True)
    return bitmap.to_pil()
def process_pdf(pdf_path, num_pages=1, max_pages=5):
    """Render the leading pages of a PDF file to images.

    Args:
        pdf_path: Path of the PDF document to open.
        num_pages: Requested number of pages, counted from the first page.
            Coerced with ``int()``.
        max_pages: Hard upper bound on the number of pages rendered per call.

    Returns:
        A tuple ``(images, total_pages)``: the rendered pages in document
        order, and the total number of pages in the document.
    """
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        total_pages = len(pdf)
        # Never render more than requested, more than exist, or more than
        # the cap allows.
        pages_to_process = min(int(num_pages), total_pages, max_pages)
        images = [render_pdf_page(pdf[index]) for index in range(pages_to_process)]
    finally:
        # Release the document even when opening a page or rendering fails.
        pdf.close()
    return images, total_pages
def _iter_content_deltas(lines):
    """Yield the text fragments found in a stream of ``data: ...`` lines.

    ``lines`` is an iterable of raw byte lines. Empty lines, lines without
    the ``data: `` prefix and payloads that are not valid JSON are skipped.
    Iteration stops at the ``[DONE]`` sentinel.
    """
    for raw_line in lines:
        if not raw_line:
            continue
        line = raw_line.decode('utf-8')
        if not line.startswith('data: '):
            continue
        data = line[6:]  # strip the 'data: ' prefix
        if data.strip() == '[DONE]':
            return
        try:
            chunk = json.loads(data)
        except json.JSONDecodeError:
            # Best effort: ignore lines that are not complete JSON.
            continue
        choices = chunk.get('choices') if isinstance(chunk, dict) else None
        if not choices:
            continue
        # A chunk may carry "delta": null; treat that like an empty delta.
        delta = choices[0].get('delta') or {}
        fragment = delta.get('content')
        if fragment:
            yield fragment


def process_input(file_input, temperature, num_pages, request_timeout=(10, 300)):
    """Extract text from an uploaded image or PDF, streaming the result.

    This is a generator. Every yielded item is a 4-tuple
    ``(rendered_text, raw_text, page_info, preview)`` matching the four
    output components of the submit button. ``preview`` is the first image on
    the first streamed chunk and ``gr.update()`` on the following ones, so
    the preview is only sent once.

    Args:
        file_input: Path string of the uploaded file, an object with a
            ``name`` attribute holding that path, or ``None``.
        temperature: Sampling temperature placed in the request payload.
        num_pages: Number of PDF pages to process; ignored for images.
        request_timeout: ``timeout`` value passed to ``requests.post``. The
            default allows 10 s to connect and 300 s between received bytes,
            so a dead endpoint cannot block the handler forever.

    Yields:
        Status tuples as described above. Failures are reported as an error
        message in the text positions instead of being raised.
    """
    if file_input is None:
        yield "Please upload an image or PDF first.", "", "", None
        return

    file_path = file_input if isinstance(file_input, str) else file_input.name

    if file_path.lower().endswith('.pdf'):
        try:
            images_to_process, total_pages = process_pdf(file_path, int(num_pages))
        except Exception as e:
            yield f"Error processing PDF: {e}", "", "", None
            return
        if not images_to_process:
            yield "Error: Could not extract pages from PDF.", "", "", None
            return
        display_image = images_to_process[0]
        if len(images_to_process) == 1:
            page_info = f"Processing page 1 of {total_pages}"
        else:
            page_info = f"Processing {len(images_to_process)} pages of {total_pages}"
    else:
        try:
            img = Image.open(file_path)
        except Exception as e:
            yield f"Error opening image: {e}", "", "", None
            return
        images_to_process = [img]
        display_image = img
        page_info = "Processing image"

    # One text instruction followed by one image part per page.
    content = [{"type": "text", "text": "Extract the text from this image."}]
    try:
        for img in images_to_process:
            b64_image = image_to_base64(img)
            content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{b64_image}"}
            })
    except Exception as e:
        yield f"Error encoding image: {e}", "", "", display_image
        return

    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": content}],
        "temperature": temperature,
        "stream": True
    }

    accumulated_response = ""
    preview_sent = False
    try:
        # The context manager closes the streamed connection on every exit
        # path, including errors and an abandoned generator.
        with requests.post(
            ENDPOINT,
            headers={"Content-Type": "application/json"},
            data=json.dumps(payload),
            stream=True,
            timeout=request_timeout,
        ) as response:
            response.raise_for_status()
            for fragment in _iter_content_deltas(response.iter_lines()):
                accumulated_response += fragment
                if preview_sent:
                    yield accumulated_response, accumulated_response, page_info, gr.update()
                else:
                    yield accumulated_response, accumulated_response, page_info, display_image
                    preview_sent = True
    except Exception as e:
        error_msg = f"Error: {e}"
        yield error_msg, error_msg, page_info, display_image
        return

    if not preview_sent:
        # The stream ended without any text; report it instead of leaving
        # the outputs silently unchanged.
        yield "No text was returned by the model.", "", page_info, display_image
# Gradio user interface: component layout plus the wiring of the two buttons.
# NOTE(review): the nesting below is reconstructed; the raw-output row is
# assumed to sit underneath the two-column row - confirm against the deployed
# layout.
# NOTE(review): the "π..." prefixes in titles and labels look like
# mis-decoded emoji; they are kept exactly as found.

# Introductory help text rendered at the top of the page.
_INTRO_MARKDOWN = """
# π Image/PDF to Text Extraction
**π‘ How to use:**
1. Upload an image or PDF
2. For PDFs: choose how many pages to process (1-5, default is 1)
3. Adjust temperature if needed
4. Click "Extract Text"
**Note:** The Markdown rendering for tables is not always correct, check the raw output for complex tables!
"""

# Initial content of the rendered-output pane, restored by the Clear button.
_OUTPUT_PLACEHOLDER = "*Extracted text will appear here...*"

with gr.Blocks(title="π Image/PDF OCR", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Left column: upload, preview and request settings.
        with gr.Column(scale=1):
            file_input = gr.File(
                label="πΌοΈ Upload Image or PDF",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"],
                type="filepath",
            )
            rendered_image = gr.Image(
                label="π Preview (First Page)",
                type="pil",
                height=400,
                interactive=False,
            )
            num_pages = gr.Slider(
                label="PDF: Number of Pages to Process",
                info="Only applies to PDF files (max 5 pages)",
                minimum=1,
                maximum=5,
                step=1,
                value=1,
            )
            page_info = gr.Textbox(
                label="Processing Info",
                value="",
                interactive=False,
            )
            temperature = gr.Slider(
                label="Temperature",
                minimum=0.1,
                maximum=1.0,
                step=0.05,
                value=0.2,
            )
            submit_btn = gr.Button("Extract Text", variant="primary")
            clear_btn = gr.Button("Clear", variant="secondary")

        # Right column: extracted text rendered as Markdown.
        with gr.Column(scale=2):
            output_text = gr.Markdown(
                label="π Extracted Text (Rendered)",
                value=_OUTPUT_PLACEHOLDER,
            )

    # Raw, copyable model output.
    with gr.Row():
        with gr.Column():
            raw_output = gr.Textbox(
                label="Raw Markdown Output",
                placeholder="Raw text will appear here...",
                lines=20,
                max_lines=30,
                show_copy_button=True,
            )

    # Run the extraction; process_input streams updates into these outputs.
    extraction_outputs = [output_text, raw_output, page_info, rendered_image]
    submit_btn.click(
        fn=process_input,
        inputs=[file_input, temperature, num_pages],
        outputs=extraction_outputs,
    )

    # Reset the upload, both text outputs, the info box, the preview and the
    # page slider. The temperature slider is not reset.
    reset_targets = [file_input, output_text, raw_output, page_info, rendered_image, num_pages]
    clear_btn.click(
        fn=lambda: (None, _OUTPUT_PLACEHOLDER, "", "", None, 1),
        outputs=reset_targets,
    )
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()