Spaces:

tachiwin
/

document-ocr

Running

App Files Files Community

Luis J Camargo committed 1 day ago

Commit

acf8835

1 Parent(s): 4bdfa9b

feat: Add Gradio UI and inference logic for audio language classification with a custom Whisper encoder.

Browse files

Files changed (2) hide show

app.py +71 -83
default.yaml +104 -0

app.py CHANGED Viewed

@@ -22,10 +22,10 @@ logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT, handlers=[logging
 logger = logging.getLogger("TachiwinDocOCR")
 CUSTOM_MODEL_PATH = "tachiwin/Tachiwin-OCR-1.5"
-# The CLI generated filename is usually {pipeline_name}.yaml
-INTERNAL_CONFIG_FILE = "PaddleOCR-VL.yaml"
-# Our final working file
-FINAL_CONFIG_FILE = "custom_pipeline_config.yaml"
 OUTPUT_DIR = "output"
 os.makedirs(OUTPUT_DIR, exist_ok=True)
@@ -42,7 +42,7 @@ PADDLE_AVAILABLE = False
 try:
     import paddle
     import paddlex
-    from paddleocr import PaddleOCRVL
     PADDLE_AVAILABLE = True
     logger.info(f"Paddle libraries loaded. PaddleX version: {getattr(paddlex, '__version__', 'Unknown')}")
 except ImportError as e:
@@ -60,80 +60,69 @@ def setup_pipeline():
         return
     try:
-        logger.info("Starting setup_pipeline...")
-        # 1. Generate default config via CLI
-        if not os.path.exists(FINAL_CONFIG_FILE):
-            logger.info("Generating default configuration via paddlex CLI...")
-            # Command: paddlex --get_pipeline_config PaddleOCR-VL --save_path ./
             try:
-                result = subprocess.run(
                     ["paddlex", "--get_pipeline_config", "PaddleOCR-VL", "--save_path", "./"],
                     capture_output=True, text=True, check=True
                 )
-                logger.info(f"CLI Output: {result.stdout}")
-            except subprocess.CalledProcessError as e:
-                logger.error(f"CLI Error: {e.stderr}")
-                # If CLI fails, we can't proceed with custom model easily without a template
-                raise e
-            # The file generated is likely PaddleOCR-VL.yaml
-            if os.path.exists(INTERNAL_CONFIG_FILE):
-                os.rename(INTERNAL_CONFIG_FILE, FINAL_CONFIG_FILE)
-                logger.info(f"Renamed {INTERNAL_CONFIG_FILE} to {FINAL_CONFIG_FILE}")
-            else:
-                logger.error(f"Expected config file {INTERNAL_CONFIG_FILE} was not found after CLI execution.")
-                # List files to see what was created
-                logger.info(f"Current directory files: {os.listdir('.')}")
-                raise FileNotFoundError(f"Config file {INTERNAL_CONFIG_FILE} not found.")
-        # 2. Load and Modify Config
-        logger.info(f"Loading configuration from {FINAL_CONFIG_FILE}")
-        with open(FINAL_CONFIG_FILE, 'r', encoding='utf-8') as f:
             config_data = yaml.safe_load(f)
-        logger.info("Modifying configuration with custom model path...")
-        # Search and update VLRecognition model_dir
         updated = False
-        if 'SubModules' in config_data:
-            if 'VLRecognition' in config_data['SubModules']:
-                config_data['SubModules']['VLRecognition']['model_dir'] = CUSTOM_MODEL_PATH
-                updated = True
-        if not updated:
-            # Deep search fallback
-            def deep_update(d):
-                count = 0
-                for k, v in d.items():
-                    if k == 'VLRecognition' and isinstance(v, dict):
                         v['model_dir'] = CUSTOM_MODEL_PATH
-                        count += 1
-                    elif isinstance(v, dict):
-                        count += deep_update(v)
-                return count
-            updated = deep_update(config_data) > 0
-        if updated:
-            logger.info(f"Successfully updated VLRecognition model_dir to {CUSTOM_MODEL_PATH}")
-        else:
-            logger.warning("Could not find VLRecognition sub-module in the configuration to update its path.")
-        with open(FINAL_CONFIG_FILE, 'w', encoding='utf-8') as f:
-            yaml.dump(config_data, f, default_flow_style=False)
-        # Log final YAML for verification
-        logger.info("--- UPDATED YAML CONFIG ---")
         print(yaml.dump(config_data, default_flow_style=False))
-        logger.info("--- END UPDATED YAML ---")
-        # 3. Initialize pipeline
-        logger.info(f"Initializing PaddleOCRVL with config: {FINAL_CONFIG_FILE}")
-        pipeline = PaddleOCRVL(pipeline_config=FINAL_CONFIG_FILE)
-        logger.info("PaddleOCRVL initialized successfully.")
     except Exception as e:
-        logger.error(f"CRITICAL: Failed to setup pipeline: {e}")
         logger.error(traceback.format_exc())
 # Initial setup
@@ -193,11 +182,14 @@ def update_preview_visibility(path_or_url: Optional[str]) -> Dict:
 # --- Inference Logic ---
 def run_inference(img_path, task_type="ocr"):
-    if not PADDLE_AVAILABLE or pipeline is None:
-        return "❌ Paddle backend not available. Check initialization logs.", "", "", ""
     if not img_path:
-        return "⚠️ Please upload an image.", "", "", ""
     try:
         logger.info(f"--- Inference Start: {task_type} ---")
@@ -212,12 +204,10 @@ def run_inference(img_path, task_type="ocr"):
         os.makedirs(run_output_dir, exist_ok=True)
         for i, res in enumerate(output):
-            # Save results
             res.save_to_json(save_path=run_output_dir)
             res.save_to_markdown(save_path=run_output_dir)
             res.print()
-            # Read back generated files
             fnames = os.listdir(run_output_dir)
             for fname in fnames:
                 fpath = os.path.join(run_output_dir, fname)
@@ -229,7 +219,7 @@ def run_inference(img_path, task_type="ocr"):
                         json_content += f.read() + "\n\n"
                 elif fname.endswith((".png", ".jpg", ".jpeg")) and "res" in fname:
                     vis_src = image_to_base64_data_url(fpath)
-                    vis_html += f'<div style="margin-bottom:20px; border: 2px solid #10b981; border-radius: 12px; overflow: hidden;">'
                     vis_html += f'<img src="{vis_src}" alt="Vis {i+1}" style="width:100%;">'
                     vis_html += f'</div>'
@@ -237,16 +227,15 @@ def run_inference(img_path, task_type="ocr"):
             md_content = "⚠️ Finished but no content was recognized."
         md_preview = _escape_inequalities_in_math(md_content)
-        logger.info("--- Inference Finished ---")
         return md_preview, md_content, vis_html, json_content
     except Exception as e:
-        logger.error(f"Inference Error: {e}")
         logger.error(traceback.format_exc())
         return f"❌ Error: {str(e)}", "", "", ""
 # --- UI Components ---
-# (Keeping previous UI logic)
 custom_css = """
 body, .gradio-container { font-family: 'Inter', system-ui, sans-serif; }
@@ -257,9 +246,10 @@ body, .gradio-container { font-family: 'Inter', system-ui, sans-serif; }
     color: white;
     border-radius: 1.5rem;
     margin-bottom: 2rem;
 }
 .app-header h1 { color: white !important; font-weight: 800; font-size: 2.5rem; }
-.notice { background: #f0fdf4; border: 1px solid #bbf7d0; color: #166534; padding: 1rem; border-radius: 1rem; margin-bottom: 2rem; }
 .output-box { border: 1px solid #e2e8f0 !important; border-radius: 1rem !important; }
 """
@@ -268,22 +258,22 @@ with gr.Blocks(theme=gr.themes.Ocean(), css=custom_css) as demo:
         """
         <div class="app-header">
             <h1>🌎 Tachiwin Document Parsing OCR 🦡</h1>
-            <p>Fine-tuned for the 68 Indigenous Languages of Mexico</p>
         </div>
         """
     )
     with gr.Row(elem_classes=["notice"]):
-        gr.Markdown(f"**Engine:** PaddleOCRVL 1.5 | **Model:** `{CUSTOM_MODEL_PATH}`")
     with gr.Tabs():
-        # Document Parsing Tab
         with gr.Tab("📄 Full Document Parsing"):
             with gr.Row():
                 with gr.Column(scale=5):
-                    file_doc = gr.File(label="Upload Image", type="filepath")
                     preview_doc_html = gr.HTML(visible=False)
-                    btn_parse = gr.Button("� Start Parsing", variant="primary")
                     with gr.Row():
                         chart_switch = gr.Checkbox(label="Chart OCR", value=True)
                         unwarp_switch = gr.Checkbox(label="Unwarping", value=False)
@@ -305,14 +295,13 @@ with gr.Blocks(theme=gr.themes.Ocean(), css=custom_css) as demo:
             btn_parse.click(parse_doc_wrapper, [file_doc, chart_switch, unwarp_switch], [md_preview_doc, vis_image_doc, md_raw_doc])
-        # Element Recognition Tab
         with gr.Tab("🧩 Specific Recognition"):
             with gr.Row():
                 with gr.Column(scale=5):
                     file_vl = gr.File(label="Upload Element", type="filepath")
                     preview_vl_html = gr.HTML(visible=False)
                     with gr.Row():
-                        btn_ocr = gr.Button("Text OCR", variant="secondary")
                         btn_formula = gr.Button("Formula", variant="secondary")
                         btn_table = gr.Button("Table", variant="secondary")
@@ -332,7 +321,6 @@ with gr.Blocks(theme=gr.themes.Ocean(), css=custom_css) as demo:
             for btn, prompt in [(btn_ocr, "Text"), (btn_formula, "Formula"), (btn_table, "Table")]:
                 btn.click(run_vl_wrapper, [file_vl, gr.State(prompt)], [md_preview_vl, md_raw_vl])
-        # Spotting Tab
         with gr.Tab("📍 Feature Spotting"):
             with gr.Row():
                 with gr.Column(scale=5):
@@ -343,7 +331,7 @@ with gr.Blocks(theme=gr.themes.Ocean(), css=custom_css) as demo:
                 with gr.Column(scale=7):
                     with gr.Tabs():
                         with gr.Tab("🖼️ Detection"):
-                            vis_image_spot = gr.HTML('<div style="text-align:center; color:#94a3b8; padding: 50px;">Bboxes view.</div>')
                         with gr.Tab("💾 JSON Feed"):
                             json_spot = gr.Code(label="JSON", language="json")
@@ -355,7 +343,7 @@ with gr.Blocks(theme=gr.themes.Ocean(), css=custom_css) as demo:
             btn_run_spot.click(run_spotting_wrapper, file_spot, [vis_image_spot, json_spot])
-    gr.Markdown("--- \n *May the indigenous languages of Mexico never be lost. Tachiwin Project.*")
 if __name__ == "__main__":
     demo.queue().launch()

 logger = logging.getLogger("TachiwinDocOCR")
 CUSTOM_MODEL_PATH = "tachiwin/Tachiwin-OCR-1.5"
+# The YAML file provided by the user or generated
+CONFIG_FILE = "default.yaml"
+# Fallback generated if default.yaml doesn't exist
+GENERATED_CONFIG = "PaddleOCR-VL.yaml"
 OUTPUT_DIR = "output"
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 try:
     import paddle
     import paddlex
+    from paddlex import create_pipeline
     PADDLE_AVAILABLE = True
     logger.info(f"Paddle libraries loaded. PaddleX version: {getattr(paddlex, '__version__', 'Unknown')}")
 except ImportError as e:
         return
     try:
+        logger.info("🚀 Starting Tachiwin Doc OCR Pipeline Setup...")
+        target_config = None
+        # Use existing default.yaml if present
+        if os.path.exists(CONFIG_FILE):
+            logger.info(f"✅ Found existing configuration: {CONFIG_FILE}")
+            target_config = CONFIG_FILE
+        else:
+            logger.info(f"⚠️ {CONFIG_FILE} not found. Generating default configuration via paddlex CLI...")
             try:
+                subprocess.run(
                     ["paddlex", "--get_pipeline_config", "PaddleOCR-VL", "--save_path", "./"],
                     capture_output=True, text=True, check=True
                 )
+                if os.path.exists(GENERATED_CONFIG):
+                    target_config = GENERATED_CONFIG
+                    logger.info(f"✅ Generated {target_config}")
+                else:
+                    logger.error(f"❌ CLI generation failed to produce {GENERATED_CONFIG}")
+                    logger.info(f"Directory contents: {os.listdir('.')}")
+                    return
+            except Exception as e:
+                logger.error(f"❌ Failed to run paddlex CLI: {e}")
+                return
+        # Load and verify/update config
+        logger.info(f"📄 Loading YAML from {target_config}...")
+        with open(target_config, 'r', encoding='utf-8') as f:
             config_data = yaml.safe_load(f)
+        # Update model_dir if it's not set correctly
         updated = False
+        def update_config(d):
+            nonlocal updated
+            for k, v in d.items():
+                if k == 'VLRecognition' and isinstance(v, dict):
+                    if v.get('model_dir') != CUSTOM_MODEL_PATH:
+                        logger.info(f"🔧 Updating VLRecognition model_dir: {v.get('model_dir')} -> {CUSTOM_MODEL_PATH}")
                         v['model_dir'] = CUSTOM_MODEL_PATH
+                        updated = True
+                elif isinstance(v, dict):
+                    update_config(v)
+        update_config(config_data)
+        if updated:
+            with open(target_config, 'w', encoding='utf-8') as f:
+                yaml.dump(config_data, f, default_flow_style=False)
+            logger.info(f"💾 Updated configuration saved to {target_config}")
+        # Log the config being used
+        logger.info(f"--- [START] {target_config} CONTENT ---")
         print(yaml.dump(config_data, default_flow_style=False))
+        logger.info(f"--- [END] {target_config} CONTENT ---")
+        # Initialize pipeline using the recommended PaddleX way
+        logger.info(f"⚙️ Initializing pipeline with create_pipeline(pipeline={target_config})")
+        # According to help: create_pipeline can take a path to yaml
+        pipeline = create_pipeline(pipeline=target_config)
+        logger.info("✨ Pipeline initialized successfully!")
     except Exception as e:
+        logger.error(f"🔥 CRITICAL: Pipeline Setup Failed")
         logger.error(traceback.format_exc())
 # Initial setup
 # --- Inference Logic ---
 def run_inference(img_path, task_type="ocr"):
+    if not PADDLE_AVAILABLE:
+        return "❌ Paddle backend not installed.", "", "", ""
+    if pipeline is None:
+        return "❌ Pipeline is not initialized. Check server logs for error details.", "", "", ""
     if not img_path:
+        return "⚠️ No image provided.", "", "", ""
     try:
         logger.info(f"--- Inference Start: {task_type} ---")
         os.makedirs(run_output_dir, exist_ok=True)
         for i, res in enumerate(output):
             res.save_to_json(save_path=run_output_dir)
             res.save_to_markdown(save_path=run_output_dir)
             res.print()
             fnames = os.listdir(run_output_dir)
             for fname in fnames:
                 fpath = os.path.join(run_output_dir, fname)
                         json_content += f.read() + "\n\n"
                 elif fname.endswith((".png", ".jpg", ".jpeg")) and "res" in fname:
                     vis_src = image_to_base64_data_url(fpath)
+                    vis_html += f'<div style="margin-bottom:20px; border: 2px solid #10b981; border-radius: 12px; overflow: hidden; background:white;">'
                     vis_html += f'<img src="{vis_src}" alt="Vis {i+1}" style="width:100%;">'
                     vis_html += f'</div>'
             md_content = "⚠️ Finished but no content was recognized."
         md_preview = _escape_inequalities_in_math(md_content)
+        logger.info("--- Inference Finished Successfully ---")
         return md_preview, md_content, vis_html, json_content
     except Exception as e:
+        logger.error(f"❌ Inference Error: {e}")
         logger.error(traceback.format_exc())
         return f"❌ Error: {str(e)}", "", "", ""
 # --- UI Components ---
 custom_css = """
 body, .gradio-container { font-family: 'Inter', system-ui, sans-serif; }
     color: white;
     border-radius: 1.5rem;
     margin-bottom: 2rem;
+    box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1);
 }
 .app-header h1 { color: white !important; font-weight: 800; font-size: 2.5rem; }
+.notice { background: #f0fdf4; border: 1px solid #bbf7d0; color: #166534; padding: 1rem; border-radius: 1rem; margin-bottom: 2rem; font-weight: 500;}
 .output-box { border: 1px solid #e2e8f0 !important; border-radius: 1rem !important; }
 """
         """
         <div class="app-header">
             <h1>🌎 Tachiwin Document Parsing OCR 🦡</h1>
+            <p>Advancing linguistic rights with state-of-the-art document parsing</p>
         </div>
         """
     )
     with gr.Row(elem_classes=["notice"]):
+        status_text = "Initialized" if pipeline else "Initializing/Failed"
+        gr.Markdown(f"**⚡ Status:** {status_text} | **Model:** `{CUSTOM_MODEL_PATH}` | **Hardware:** CPU")
     with gr.Tabs():
         with gr.Tab("📄 Full Document Parsing"):
             with gr.Row():
                 with gr.Column(scale=5):
+                    file_doc = gr.File(label="Upload Document", type="filepath")
                     preview_doc_html = gr.HTML(visible=False)
+                    btn_parse = gr.Button("🔍 Start Parsing", variant="primary")
                     with gr.Row():
                         chart_switch = gr.Checkbox(label="Chart OCR", value=True)
                         unwarp_switch = gr.Checkbox(label="Unwarping", value=False)
             btn_parse.click(parse_doc_wrapper, [file_doc, chart_switch, unwarp_switch], [md_preview_doc, vis_image_doc, md_raw_doc])
         with gr.Tab("🧩 Specific Recognition"):
             with gr.Row():
                 with gr.Column(scale=5):
                     file_vl = gr.File(label="Upload Element", type="filepath")
                     preview_vl_html = gr.HTML(visible=False)
                     with gr.Row():
+                        btn_ocr = gr.Button("Text", variant="secondary")
                         btn_formula = gr.Button("Formula", variant="secondary")
                         btn_table = gr.Button("Table", variant="secondary")
             for btn, prompt in [(btn_ocr, "Text"), (btn_formula, "Formula"), (btn_table, "Table")]:
                 btn.click(run_vl_wrapper, [file_vl, gr.State(prompt)], [md_preview_vl, md_raw_vl])
         with gr.Tab("📍 Feature Spotting"):
             with gr.Row():
                 with gr.Column(scale=5):
                 with gr.Column(scale=7):
                     with gr.Tabs():
                         with gr.Tab("🖼️ Detection"):
+                            vis_image_spot = gr.HTML('<div style="text-align:center; color:#94a3b8; padding: 50px;">Visual detection here.</div>')
                         with gr.Tab("💾 JSON Feed"):
                             json_spot = gr.Code(label="JSON", language="json")
             btn_run_spot.click(run_spotting_wrapper, file_spot, [vis_image_spot, json_spot])
+    gr.Markdown("--- \n *Tachiwin Project: Indigenous Languages of Mexico.*")
 if __name__ == "__main__":
     demo.queue().launch()

default.yaml ADDED Viewed

	@@ -0,0 +1,104 @@

+Serving:
+  extra:
+    max_num_input_imgs: null
+SubModules:
+  LayoutDetection:
+    batch_size: 8
+    layout_merge_bboxes_mode:
+      0: union
+      1: union
+      2: union
+      3: large
+      4: union
+      5: large
+      6: large
+      7: union
+      8: union
+      9: union
+      10: union
+      11: union
+      12: union
+      13: union
+      14: union
+      15: large
+      16: union
+      17: large
+      18: union
+      19: union
+      20: union
+      21: union
+      22: union
+      23: union
+      24: union
+    layout_nms: true
+    layout_unclip_ratio:
+    - 1.0
+    - 1.0
+    model_dir: null
+    model_name: PP-DocLayoutV2
+    module_name: layout_detection
+    threshold:
+      0: 0.5
+      1: 0.5
+      2: 0.5
+      3: 0.5
+      4: 0.5
+      5: 0.4
+      6: 0.4
+      7: 0.5
+      8: 0.5
+      9: 0.5
+      10: 0.5
+      11: 0.5
+      12: 0.5
+      13: 0.5
+      14: 0.5
+      15: 0.4
+      16: 0.5
+      17: 0.4
+      18: 0.5
+      19: 0.5
+      20: 0.45
+      21: 0.5
+      22: 0.4
+      23: 0.4
+      24: 0.5
+  VLRecognition:
+    batch_size: -1
+    genai_config:
+      backend: native
+    model_dir: tachiwin/Tachiwin-OCR-1.5
+    model_name: PaddleOCR-VL-0.9B
+    module_name: vl_recognition
+SubPipelines:
+  DocPreprocessor:
+    SubModules:
+      DocOrientationClassify:
+        batch_size: 8
+        model_dir: null
+        model_name: PP-LCNet_x1_0_doc_ori
+        module_name: doc_text_orientation
+      DocUnwarping:
+        model_dir: null
+        model_name: UVDoc
+        module_name: image_unwarping
+    batch_size: 8
+    pipeline_name: doc_preprocessor
+    use_doc_orientation_classify: true
+    use_doc_unwarping: true
+batch_size: 64
+format_block_content: false
+markdown_ignore_labels:
+- number
+- footnote
+- header
+- header_image
+- footer
+- footer_image
+- aside_text
+merge_layout_blocks: true
+pipeline_name: PaddleOCR-VL
+use_chart_recognition: false
+use_doc_preprocessor: false
+use_layout_detection: true
+use_queues: true