Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -240,7 +240,7 @@ def image_to_data_uri(image):
|
|
| 240 |
return f"data:image/png;base64,{b64}"
|
| 241 |
|
| 242 |
|
| 243 |
-
def extract_text_via_vllm(image, model_name, temperature=0.2, stream=False):
|
| 244 |
"""Extract text from image using vLLM endpoint."""
|
| 245 |
config = MODEL_REGISTRY.get(model_name)
|
| 246 |
if config is None:
|
|
@@ -277,7 +277,7 @@ def extract_text_via_vllm(image, model_name, temperature=0.2, stream=False):
|
|
| 277 |
response = client.chat.completions.create(
|
| 278 |
model=model_id,
|
| 279 |
messages=messages,
|
| 280 |
-
max_tokens=
|
| 281 |
temperature=temperature if temperature > 0 else 0.0,
|
| 282 |
top_p=0.9,
|
| 283 |
stream=True,
|
|
@@ -299,7 +299,7 @@ def extract_text_via_vllm(image, model_name, temperature=0.2, stream=False):
|
|
| 299 |
response = client.chat.completions.create(
|
| 300 |
model=model_id,
|
| 301 |
messages=messages,
|
| 302 |
-
max_tokens=
|
| 303 |
temperature=temperature if temperature > 0 else 0.0,
|
| 304 |
top_p=0.9,
|
| 305 |
stream=False,
|
|
@@ -331,13 +331,13 @@ def render_bbox_with_crops(raw_output, source_image):
|
|
| 331 |
|
| 332 |
|
| 333 |
@spaces.GPU
|
| 334 |
-
def extract_text_from_image(image, model_name, temperature=0.2, stream=False):
|
| 335 |
"""Extract text from image using LightOnOCR model."""
|
| 336 |
# Check if model has a vLLM endpoint configured
|
| 337 |
config = MODEL_REGISTRY.get(model_name, {})
|
| 338 |
if config.get("vllm_endpoint"):
|
| 339 |
# Use vLLM endpoint instead of local model
|
| 340 |
-
yield from extract_text_via_vllm(image, model_name, temperature, stream)
|
| 341 |
return
|
| 342 |
|
| 343 |
# Get model and processor from cache or load
|
|
@@ -375,7 +375,7 @@ def extract_text_from_image(image, model_name, temperature=0.2, stream=False):
|
|
| 375 |
|
| 376 |
generation_kwargs = dict(
|
| 377 |
**inputs,
|
| 378 |
-
max_new_tokens=
|
| 379 |
temperature=temperature if temperature > 0 else 0.0,
|
| 380 |
top_p=0.9,
|
| 381 |
top_k=0,
|
|
@@ -421,7 +421,7 @@ def extract_text_from_image(image, model_name, temperature=0.2, stream=False):
|
|
| 421 |
yield cleaned_text
|
| 422 |
|
| 423 |
|
| 424 |
-
def process_input(file_input, model_name, temperature, page_num, enable_streaming):
|
| 425 |
"""Process uploaded file (image or PDF) and extract text with optional streaming."""
|
| 426 |
if file_input is None:
|
| 427 |
yield "Please upload an image or PDF first.", "", "", None, gr.update()
|
|
@@ -458,7 +458,7 @@ def process_input(file_input, model_name, temperature, page_num, enable_streamin
|
|
| 458 |
try:
|
| 459 |
# Extract text using LightOnOCR with optional streaming
|
| 460 |
for extracted_text in extract_text_from_image(
|
| 461 |
-
image_to_process, model_name, temperature, stream=enable_streaming
|
| 462 |
):
|
| 463 |
# For bbox models, render cropped images inline
|
| 464 |
if has_bbox:
|
|
@@ -574,6 +574,14 @@ State-of-the-art OCR on OlmOCR-Bench, ~9× smaller and faster than competitors.
|
|
| 574 |
value=True,
|
| 575 |
info="Show text progressively as it's generated",
|
| 576 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 577 |
submit_btn = gr.Button("Extract Text", variant="primary")
|
| 578 |
clear_btn = gr.Button("Clear", variant="secondary")
|
| 579 |
|
|
@@ -620,7 +628,6 @@ State-of-the-art OCR on OlmOCR-Bench, ~9× smaller and faster than competitors.
|
|
| 620 |
outputs=[file_input],
|
| 621 |
)
|
| 622 |
|
| 623 |
-
|
| 624 |
with gr.Row():
|
| 625 |
with gr.Column():
|
| 626 |
raw_output = gr.Textbox(
|
|
@@ -630,15 +637,18 @@ State-of-the-art OCR on OlmOCR-Bench, ~9× smaller and faster than competitors.
|
|
| 630 |
max_lines=30,
|
| 631 |
)
|
| 632 |
|
| 633 |
-
|
| 634 |
# Event handlers
|
| 635 |
submit_btn.click(
|
| 636 |
fn=process_input,
|
| 637 |
-
inputs=[file_input, model_selector, temperature, num_pages, enable_streaming],
|
| 638 |
outputs=[output_text, raw_output, page_info, rendered_image, num_pages],
|
| 639 |
)
|
| 640 |
|
| 641 |
-
file_input.change(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
|
| 643 |
model_selector.change(
|
| 644 |
fn=get_model_info_text, inputs=[model_selector], outputs=[model_info]
|
|
@@ -654,6 +664,7 @@ State-of-the-art OCR on OlmOCR-Bench, ~9× smaller and faster than competitors.
|
|
| 654 |
"",
|
| 655 |
None,
|
| 656 |
1,
|
|
|
|
| 657 |
),
|
| 658 |
outputs=[
|
| 659 |
file_input,
|
|
@@ -664,9 +675,10 @@ State-of-the-art OCR on OlmOCR-Bench, ~9× smaller and faster than competitors.
|
|
| 664 |
page_info,
|
| 665 |
rendered_image,
|
| 666 |
num_pages,
|
|
|
|
| 667 |
],
|
| 668 |
)
|
| 669 |
|
| 670 |
|
| 671 |
if __name__ == "__main__":
|
| 672 |
-
demo.launch(theme=gr.themes.Soft(), ssr_mode=False)
|
|
|
|
| 240 |
return f"data:image/png;base64,{b64}"
|
| 241 |
|
| 242 |
|
| 243 |
+
def extract_text_via_vllm(image, model_name, temperature=0.2, stream=False, max_tokens=2048):
|
| 244 |
"""Extract text from image using vLLM endpoint."""
|
| 245 |
config = MODEL_REGISTRY.get(model_name)
|
| 246 |
if config is None:
|
|
|
|
| 277 |
response = client.chat.completions.create(
|
| 278 |
model=model_id,
|
| 279 |
messages=messages,
|
| 280 |
+
max_tokens=max_tokens,
|
| 281 |
temperature=temperature if temperature > 0 else 0.0,
|
| 282 |
top_p=0.9,
|
| 283 |
stream=True,
|
|
|
|
| 299 |
response = client.chat.completions.create(
|
| 300 |
model=model_id,
|
| 301 |
messages=messages,
|
| 302 |
+
max_tokens=max_tokens,
|
| 303 |
temperature=temperature if temperature > 0 else 0.0,
|
| 304 |
top_p=0.9,
|
| 305 |
stream=False,
|
|
|
|
| 331 |
|
| 332 |
|
| 333 |
@spaces.GPU
|
| 334 |
+
def extract_text_from_image(image, model_name, temperature=0.2, stream=False, max_tokens=2048):
|
| 335 |
"""Extract text from image using LightOnOCR model."""
|
| 336 |
# Check if model has a vLLM endpoint configured
|
| 337 |
config = MODEL_REGISTRY.get(model_name, {})
|
| 338 |
if config.get("vllm_endpoint"):
|
| 339 |
# Use vLLM endpoint instead of local model
|
| 340 |
+
yield from extract_text_via_vllm(image, model_name, temperature, stream, max_tokens)
|
| 341 |
return
|
| 342 |
|
| 343 |
# Get model and processor from cache or load
|
|
|
|
| 375 |
|
| 376 |
generation_kwargs = dict(
|
| 377 |
**inputs,
|
| 378 |
+
max_new_tokens=max_tokens,
|
| 379 |
temperature=temperature if temperature > 0 else 0.0,
|
| 380 |
top_p=0.9,
|
| 381 |
top_k=0,
|
|
|
|
| 421 |
yield cleaned_text
|
| 422 |
|
| 423 |
|
| 424 |
+
def process_input(file_input, model_name, temperature, page_num, enable_streaming, max_output_tokens):
|
| 425 |
"""Process uploaded file (image or PDF) and extract text with optional streaming."""
|
| 426 |
if file_input is None:
|
| 427 |
yield "Please upload an image or PDF first.", "", "", None, gr.update()
|
|
|
|
| 458 |
try:
|
| 459 |
# Extract text using LightOnOCR with optional streaming
|
| 460 |
for extracted_text in extract_text_from_image(
|
| 461 |
+
image_to_process, model_name, temperature, stream=enable_streaming, max_tokens=max_output_tokens
|
| 462 |
):
|
| 463 |
# For bbox models, render cropped images inline
|
| 464 |
if has_bbox:
|
|
|
|
| 574 |
value=True,
|
| 575 |
info="Show text progressively as it's generated",
|
| 576 |
)
|
| 577 |
+
max_output_tokens = gr.Slider(
|
| 578 |
+
minimum=256,
|
| 579 |
+
maximum=8192,
|
| 580 |
+
value=2048,
|
| 581 |
+
step=256,
|
| 582 |
+
label="Max Output Tokens",
|
| 583 |
+
info="Maximum number of tokens to generate",
|
| 584 |
+
)
|
| 585 |
submit_btn = gr.Button("Extract Text", variant="primary")
|
| 586 |
clear_btn = gr.Button("Clear", variant="secondary")
|
| 587 |
|
|
|
|
| 628 |
outputs=[file_input],
|
| 629 |
)
|
| 630 |
|
|
|
|
| 631 |
with gr.Row():
|
| 632 |
with gr.Column():
|
| 633 |
raw_output = gr.Textbox(
|
|
|
|
| 637 |
max_lines=30,
|
| 638 |
)
|
| 639 |
|
|
|
|
| 640 |
# Event handlers
|
| 641 |
submit_btn.click(
|
| 642 |
fn=process_input,
|
| 643 |
+
inputs=[file_input, model_selector, temperature, num_pages, enable_streaming, max_output_tokens],
|
| 644 |
outputs=[output_text, raw_output, page_info, rendered_image, num_pages],
|
| 645 |
)
|
| 646 |
|
| 647 |
+
file_input.change(
|
| 648 |
+
fn=update_slider_and_preview,
|
| 649 |
+
inputs=[file_input],
|
| 650 |
+
outputs=[num_pages, rendered_image],
|
| 651 |
+
)
|
| 652 |
|
| 653 |
model_selector.change(
|
| 654 |
fn=get_model_info_text, inputs=[model_selector], outputs=[model_info]
|
|
|
|
| 664 |
"",
|
| 665 |
None,
|
| 666 |
1,
|
| 667 |
+
2048,
|
| 668 |
),
|
| 669 |
outputs=[
|
| 670 |
file_input,
|
|
|
|
| 675 |
page_info,
|
| 676 |
rendered_image,
|
| 677 |
num_pages,
|
| 678 |
+
max_output_tokens,
|
| 679 |
],
|
| 680 |
)
|
| 681 |
|
| 682 |
|
| 683 |
if __name__ == "__main__":
|
| 684 |
+
demo.launch(theme=gr.themes.Soft(), ssr_mode=False)
|