Spaces:

tachiwin
/

document-ocr

Running

App Files Files Community

Luis J Camargo committed 1 day ago

Commit

610efd0

1 Parent(s): b7745a8

feat: Implement streaming OCR results by converting `run_inference` to a generator and updating UI wrappers to consume partial outputs.

Browse files

Files changed (1) hide show

app.py +47 -31

app.py CHANGED Viewed

@@ -173,20 +173,22 @@ def _escape_inequalities_in_math(md: str) -> str:
 def run_inference(img_path, task_type="ocr", progress=gr.Progress()):
     if not PADDLE_AVAILABLE:
-        return "❌ Paddle backend not installed.", "", "", ""
     if pipeline is None:
-        return "❌ Pipeline is not initialized. Check server logs for error details.", "", "", ""
     if not img_path:
-        return "⚠️ No image provided.", "", "", ""
     try:
         logger.info(f"--- Inference Start: {task_type} ---")
         progress(0, desc="📦 Initializing inference engine...")
         output = pipeline.predict(input=img_path)
         logger.info(f"Output object type: {type(output)}")
-        logger.info(f"Output object: {output}")
         md_content = ""
         json_content = ""
@@ -196,52 +198,60 @@ def run_inference(img_path, task_type="ocr", progress=gr.Progress()):
         run_output_dir = os.path.join(OUTPUT_DIR, run_id)
         os.makedirs(run_output_dir, exist_ok=True)
-        logger.info(f"will iterate")
-        progress(0.2, desc="🔍 Parsing document structure...")
         for i, res in enumerate(output):
             logger.info(f"Processing segment {i+1}...")
-            progress((i + 1) / 5, desc=f"✍️ Recognizing content (segment {i+1})...")
-            # Save results
-            res.save_to_json(save_path=run_output_dir)
-            res.save_to_markdown(save_path=run_output_dir)
             res.print()
-            # Read back generated files
-            fnames = os.listdir(run_output_dir)
             for fname in fnames:
-                fpath = os.path.join(run_output_dir, fname)
                 if fname.endswith(".md"):
                     with open(fpath, 'r', encoding='utf-8') as f:
                         content = f.read()
-                        if content not in md_content:
-                            md_content += content + "\n\n"
                 elif fname.endswith(".json"):
                     with open(fpath, 'r', encoding='utf-8') as f:
                         content = f.read()
-                        if content not in json_content:
-                            json_content += content + "\n\n"
                 elif fname.endswith((".png", ".jpg", ".jpeg")) and ("res" in fname or "vis" in fname):
                     vis_src = image_to_base64_data_url(fpath)
                     vis_html += f'<div style="margin-bottom:20px; border: 2px solid #10b981; border-radius: 12px; overflow: hidden; background:white;">'
                     vis_html += f'<img src="{vis_src}" alt="Vis {i+1}" style="width:100%;">'
                     vis_html += f'</div>'
             logger.info(f"Finished processing segment {i+1}")
         if not md_content:
             md_content = "⚠️ Finished but no content was recognized."
-        md_preview = _escape_inequalities_in_math(md_content)
         logger.info("--- Inference Finished Successfully ---")
-        progress(1.0, desc="✅ Recovery complete")
-        return md_preview, md_content, vis_html, json_content
     except Exception as e:
         logger.error(f"❌ Inference Error: {e}")
         logger.error(traceback.format_exc())
-        return f"❌ Error: {str(e)}", "", "", ""
 # --- UI Components ---
@@ -281,7 +291,7 @@ with gr.Blocks() as demo:
             with gr.Row():
                 with gr.Column(scale=5):
                     file_doc = gr.Image(label="Upload Image", type="filepath")
-                    btn_parse = gr.Button("� Start Parsing", variant="primary")
                     with gr.Row():
                         chart_switch = gr.Checkbox(label="Chart OCR", value=True)
                         unwarp_switch = gr.Checkbox(label="Unwarping", value=False)
@@ -296,9 +306,11 @@ with gr.Blocks() as demo:
                             md_raw_doc = gr.Code(language="markdown")
             def parse_doc_wrapper(fp, ch, uw):
-                if not fp: return "⚠️ Please upload an image.", "", ""
-                res_preview, res_raw, res_vis, res_json = run_inference(fp, task_type="Document")
-                return res_preview, res_vis, res_raw
             btn_parse.click(
                 parse_doc_wrapper,
@@ -325,9 +337,11 @@ with gr.Blocks() as demo:
                             md_raw_vl = gr.Code(language="markdown")
             def run_vl_wrapper(fp, prompt):
-                if not fp: return "⚠️ Please upload an image.", ""
-                res_preview, res_raw, _, _ = run_inference(fp, task_type=prompt)
-                return res_preview, res_raw
             for btn, prompt in [(btn_ocr, "Text"), (btn_formula, "Formula"), (btn_table, "Table")]:
                 btn.click(
@@ -352,9 +366,11 @@ with gr.Blocks() as demo:
                             json_spot = gr.Code(label="JSON", language="json")
             def run_spotting_wrapper(fp):
-                if not fp: return "", ""
-                _, _, vis, js = run_inference(fp, task_type="Spotting")
-                return vis, js
             btn_run_spot.click(
                 run_spotting_wrapper,

 def run_inference(img_path, task_type="ocr", progress=gr.Progress()):
     if not PADDLE_AVAILABLE:
+        yield "❌ Paddle backend not installed.", "", "", ""
+        return
     if pipeline is None:
+        yield "❌ Pipeline is not initialized. Check server logs for error details.", "", "", ""
+        return
     if not img_path:
+        yield "⚠️ No image provided.", "", "", ""
+        return
     try:
         logger.info(f"--- Inference Start: {task_type} ---")
         progress(0, desc="📦 Initializing inference engine...")
         output = pipeline.predict(input=img_path)
         logger.info(f"Output object type: {type(output)}")
         md_content = ""
         json_content = ""
         run_output_dir = os.path.join(OUTPUT_DIR, run_id)
         os.makedirs(run_output_dir, exist_ok=True)
+        logger.info(f"Inference generator ready. Starting iteration...")
+        progress(0.1, desc="🔍 Document preprocessing...")
         for i, res in enumerate(output):
             logger.info(f"Processing segment {i+1}...")
+            # Use dynamic progress increment
+            p_val = min(0.1 + (i + 1) * 0.15, 0.95)
+            progress(p_val, desc=f"✍️ Recognizing content (segment {i+1})...")
+            # Save results to unique dir
+            seg_dir = os.path.join(run_output_dir, f"seg_{i}")
+            os.makedirs(seg_dir, exist_ok=True)
+            res.save_to_json(save_path=seg_dir)
+            res.save_to_markdown(save_path=seg_dir)
             res.print()
+            # Gather files specifically from this segment
+            fnames = os.listdir(seg_dir)
             for fname in fnames:
+                fpath = os.path.join(seg_dir, fname)
                 if fname.endswith(".md"):
                     with open(fpath, 'r', encoding='utf-8') as f:
                         content = f.read()
+                        md_content += content + "\n\n"
                 elif fname.endswith(".json"):
                     with open(fpath, 'r', encoding='utf-8') as f:
                         content = f.read()
+                        json_content += content + "\n\n"
                 elif fname.endswith((".png", ".jpg", ".jpeg")) and ("res" in fname or "vis" in fname):
                     vis_src = image_to_base64_data_url(fpath)
                     vis_html += f'<div style="margin-bottom:20px; border: 2px solid #10b981; border-radius: 12px; overflow: hidden; background:white;">'
                     vis_html += f'<img src="{vis_src}" alt="Vis {i+1}" style="width:100%;">'
                     vis_html += f'</div>'
+            # Yield partial results to keep UI alive
+            partial_md = _escape_inequalities_in_math(md_content)
+            yield partial_md, md_content, vis_html, json_content
             logger.info(f"Finished processing segment {i+1}")
         if not md_content:
             md_content = "⚠️ Finished but no content was recognized."
+            yield md_content, md_content, "", ""
+        else:
+            final_md = _escape_inequalities_in_math(md_content)
+            progress(1.0, desc="✅ Complete")
+            yield final_md, md_content, vis_html, json_content
         logger.info("--- Inference Finished Successfully ---")
     except Exception as e:
         logger.error(f"❌ Inference Error: {e}")
         logger.error(traceback.format_exc())
+        yield f"❌ Error: {str(e)}", "", "", ""
+        return
 # --- UI Components ---
             with gr.Row():
                 with gr.Column(scale=5):
                     file_doc = gr.Image(label="Upload Image", type="filepath")
+                    btn_parse = gr.Button("🔍 Start Parsing", variant="primary")
                     with gr.Row():
                         chart_switch = gr.Checkbox(label="Chart OCR", value=True)
                         unwarp_switch = gr.Checkbox(label="Unwarping", value=False)
                             md_raw_doc = gr.Code(language="markdown")
             def parse_doc_wrapper(fp, ch, uw):
+                if not fp:
+                    yield "⚠️ Please upload an image.", "", ""
+                    return
+                for res_preview, res_raw, res_vis, res_json in run_inference(fp, task_type="Document"):
+                    yield res_preview, res_vis, res_raw
             btn_parse.click(
                 parse_doc_wrapper,
                             md_raw_vl = gr.Code(language="markdown")
             def run_vl_wrapper(fp, prompt):
+                if not fp:
+                    yield "⚠️ Please upload an image.", ""
+                    return
+                for res_preview, res_raw, _, _ in run_inference(fp, task_type=prompt):
+                    yield res_preview, res_raw
             for btn, prompt in [(btn_ocr, "Text"), (btn_formula, "Formula"), (btn_table, "Table")]:
                 btn.click(
                             json_spot = gr.Code(label="JSON", language="json")
             def run_spotting_wrapper(fp):
+                if not fp:
+                    yield "", ""
+                    return
+                for _, _, vis, js in run_inference(fp, task_type="Spotting"):
+                    yield vis, js
             btn_run_spot.click(
                 run_spotting_wrapper,