Bapt120 committed on
Commit
2e8b279
·
verified ·
1 Parent(s): 1e8cd84

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -13
app.py CHANGED
@@ -240,7 +240,7 @@ def image_to_data_uri(image):
240
  return f"data:image/png;base64,{b64}"
241
 
242
 
243
- def extract_text_via_vllm(image, model_name, temperature=0.2, stream=False):
244
  """Extract text from image using vLLM endpoint."""
245
  config = MODEL_REGISTRY.get(model_name)
246
  if config is None:
@@ -277,7 +277,7 @@ def extract_text_via_vllm(image, model_name, temperature=0.2, stream=False):
277
  response = client.chat.completions.create(
278
  model=model_id,
279
  messages=messages,
280
- max_tokens=2048,
281
  temperature=temperature if temperature > 0 else 0.0,
282
  top_p=0.9,
283
  stream=True,
@@ -299,7 +299,7 @@ def extract_text_via_vllm(image, model_name, temperature=0.2, stream=False):
299
  response = client.chat.completions.create(
300
  model=model_id,
301
  messages=messages,
302
- max_tokens=2048,
303
  temperature=temperature if temperature > 0 else 0.0,
304
  top_p=0.9,
305
  stream=False,
@@ -331,13 +331,13 @@ def render_bbox_with_crops(raw_output, source_image):
331
 
332
 
333
  @spaces.GPU
334
- def extract_text_from_image(image, model_name, temperature=0.2, stream=False):
335
  """Extract text from image using LightOnOCR model."""
336
  # Check if model has a vLLM endpoint configured
337
  config = MODEL_REGISTRY.get(model_name, {})
338
  if config.get("vllm_endpoint"):
339
  # Use vLLM endpoint instead of local model
340
- yield from extract_text_via_vllm(image, model_name, temperature, stream)
341
  return
342
 
343
  # Get model and processor from cache or load
@@ -375,7 +375,7 @@ def extract_text_from_image(image, model_name, temperature=0.2, stream=False):
375
 
376
  generation_kwargs = dict(
377
  **inputs,
378
- max_new_tokens=2048,
379
  temperature=temperature if temperature > 0 else 0.0,
380
  top_p=0.9,
381
  top_k=0,
@@ -421,7 +421,7 @@ def extract_text_from_image(image, model_name, temperature=0.2, stream=False):
421
  yield cleaned_text
422
 
423
 
424
- def process_input(file_input, model_name, temperature, page_num, enable_streaming):
425
  """Process uploaded file (image or PDF) and extract text with optional streaming."""
426
  if file_input is None:
427
  yield "Please upload an image or PDF first.", "", "", None, gr.update()
@@ -458,7 +458,7 @@ def process_input(file_input, model_name, temperature, page_num, enable_streamin
458
  try:
459
  # Extract text using LightOnOCR with optional streaming
460
  for extracted_text in extract_text_from_image(
461
- image_to_process, model_name, temperature, stream=enable_streaming
462
  ):
463
  # For bbox models, render cropped images inline
464
  if has_bbox:
@@ -574,6 +574,14 @@ State-of-the-art OCR on OlmOCR-Bench, ~9× smaller and faster than competitors.
574
  value=True,
575
  info="Show text progressively as it's generated",
576
  )
 
 
 
 
 
 
 
 
577
  submit_btn = gr.Button("Extract Text", variant="primary")
578
  clear_btn = gr.Button("Clear", variant="secondary")
579
 
@@ -620,7 +628,6 @@ State-of-the-art OCR on OlmOCR-Bench, ~9× smaller and faster than competitors.
620
  outputs=[file_input],
621
  )
622
 
623
-
624
  with gr.Row():
625
  with gr.Column():
626
  raw_output = gr.Textbox(
@@ -630,15 +637,18 @@ State-of-the-art OCR on OlmOCR-Bench, ~9× smaller and faster than competitors.
630
  max_lines=30,
631
  )
632
 
633
-
634
  # Event handlers
635
  submit_btn.click(
636
  fn=process_input,
637
- inputs=[file_input, model_selector, temperature, num_pages, enable_streaming],
638
  outputs=[output_text, raw_output, page_info, rendered_image, num_pages],
639
  )
640
 
641
- file_input.change(fn=update_slider_and_preview, inputs=[file_input], outputs=[num_pages, rendered_image])
 
 
 
 
642
 
643
  model_selector.change(
644
  fn=get_model_info_text, inputs=[model_selector], outputs=[model_info]
@@ -654,6 +664,7 @@ State-of-the-art OCR on OlmOCR-Bench, ~9× smaller and faster than competitors.
654
  "",
655
  None,
656
  1,
 
657
  ),
658
  outputs=[
659
  file_input,
@@ -664,9 +675,10 @@ State-of-the-art OCR on OlmOCR-Bench, ~9× smaller and faster than competitors.
664
  page_info,
665
  rendered_image,
666
  num_pages,
 
667
  ],
668
  )
669
 
670
 
671
  if __name__ == "__main__":
672
- demo.launch(theme=gr.themes.Soft(), ssr_mode=False)
 
240
  return f"data:image/png;base64,{b64}"
241
 
242
 
243
+ def extract_text_via_vllm(image, model_name, temperature=0.2, stream=False, max_tokens=2048):
244
  """Extract text from image using vLLM endpoint."""
245
  config = MODEL_REGISTRY.get(model_name)
246
  if config is None:
 
277
  response = client.chat.completions.create(
278
  model=model_id,
279
  messages=messages,
280
+ max_tokens=max_tokens,
281
  temperature=temperature if temperature > 0 else 0.0,
282
  top_p=0.9,
283
  stream=True,
 
299
  response = client.chat.completions.create(
300
  model=model_id,
301
  messages=messages,
302
+ max_tokens=max_tokens,
303
  temperature=temperature if temperature > 0 else 0.0,
304
  top_p=0.9,
305
  stream=False,
 
331
 
332
 
333
  @spaces.GPU
334
+ def extract_text_from_image(image, model_name, temperature=0.2, stream=False, max_tokens=2048):
335
  """Extract text from image using LightOnOCR model."""
336
  # Check if model has a vLLM endpoint configured
337
  config = MODEL_REGISTRY.get(model_name, {})
338
  if config.get("vllm_endpoint"):
339
  # Use vLLM endpoint instead of local model
340
+ yield from extract_text_via_vllm(image, model_name, temperature, stream, max_tokens)
341
  return
342
 
343
  # Get model and processor from cache or load
 
375
 
376
  generation_kwargs = dict(
377
  **inputs,
378
+ max_new_tokens=max_tokens,
379
  temperature=temperature if temperature > 0 else 0.0,
380
  top_p=0.9,
381
  top_k=0,
 
421
  yield cleaned_text
422
 
423
 
424
+ def process_input(file_input, model_name, temperature, page_num, enable_streaming, max_output_tokens):
425
  """Process uploaded file (image or PDF) and extract text with optional streaming."""
426
  if file_input is None:
427
  yield "Please upload an image or PDF first.", "", "", None, gr.update()
 
458
  try:
459
  # Extract text using LightOnOCR with optional streaming
460
  for extracted_text in extract_text_from_image(
461
+ image_to_process, model_name, temperature, stream=enable_streaming, max_tokens=max_output_tokens
462
  ):
463
  # For bbox models, render cropped images inline
464
  if has_bbox:
 
574
  value=True,
575
  info="Show text progressively as it's generated",
576
  )
577
+ max_output_tokens = gr.Slider(
578
+ minimum=256,
579
+ maximum=8192,
580
+ value=2048,
581
+ step=256,
582
+ label="Max Output Tokens",
583
+ info="Maximum number of tokens to generate",
584
+ )
585
  submit_btn = gr.Button("Extract Text", variant="primary")
586
  clear_btn = gr.Button("Clear", variant="secondary")
587
 
 
628
  outputs=[file_input],
629
  )
630
 
 
631
  with gr.Row():
632
  with gr.Column():
633
  raw_output = gr.Textbox(
 
637
  max_lines=30,
638
  )
639
 
 
640
  # Event handlers
641
  submit_btn.click(
642
  fn=process_input,
643
+ inputs=[file_input, model_selector, temperature, num_pages, enable_streaming, max_output_tokens],
644
  outputs=[output_text, raw_output, page_info, rendered_image, num_pages],
645
  )
646
 
647
+ file_input.change(
648
+ fn=update_slider_and_preview,
649
+ inputs=[file_input],
650
+ outputs=[num_pages, rendered_image],
651
+ )
652
 
653
  model_selector.change(
654
  fn=get_model_info_text, inputs=[model_selector], outputs=[model_info]
 
664
  "",
665
  None,
666
  1,
667
+ 2048,
668
  ),
669
  outputs=[
670
  file_input,
 
675
  page_info,
676
  rendered_image,
677
  num_pages,
678
+ max_output_tokens,
679
  ],
680
  )
681
 
682
 
683
  if __name__ == "__main__":
684
+ demo.launch(theme=gr.themes.Soft(), ssr_mode=False)